# <span style="color:#ff1414"> BEDtools analysis. </span>

This is a script to answer research questions outlined elsewhere. In summary, this script:

1. compares methylation results between different methylation-callers, and between different methylation sequencing methods.

2. compares methylation between genes and non-gene regions

3. compares methylation between transposons and non-repetitive regions

4. compares transposons and genes


Note:
- PB/pb = PacBio
- ONT/ont = Oxford Nanopore Technology
- NP = Nanopolish

In [2]:
import pybedtools
import scipy

import matplotlib.patches as mpatches
import pandas as pd
import numpy as np # need for  stats

from scipy.stats import wilcoxon
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

In [3]:
# load modules
import os
import glob
import pprint
from pybedtools import BedTool
from scipy.stats import spearmanr

In [168]:

DIRS['TE_SF'] = os.path.join(DIRS['COVERAGE'], 'superfamily_files')
DIRS['RNA'] = os.path.join(DIRS['BASE2'], 'rna_counts')
DIRS['edgeR'] = os.path.join(DIRS['RNA'], 'edgeR_output')
DIRS['TRIALS'] = os.path.join(DIRS['RNA'], 'trials_tsv')
DIRS['MEAN_STD'] = os.path.join(DIRS['RNA'], 'average_and_stdev')
DIRS['GENE'] = os.path.join(DIRS['COVERAGE'], 'gene_level')
DIRS['GENE_BODY'] = os.path.join(DIRS['GENE'], 'gene_body')
DIRS['BOTH_U_D'] = os.path.join(DIRS['GENE'], 'both_upstream_downstream')
DIRS['DOWN_STR'] = os.path.join(DIRS['GENE'], 'downstream')
DIRS['UP_STR'] = os.path.join(DIRS['GENE'], 'upstream')
DIRS['TSS'] = os.path.join(DIRS['GENE'], 'tss_6mA_only')

In [7]:
DIRS['BASE1'] = '/home/anjuni/methylation_calling/pacbio'

DIRS['BED_INPUT'] = os.path.join(DIRS['BASE2'], 'bedtools_output', 'sequencing_comparison')
DIRS['GFF_INPUT'] = os.path.join(DIRS['BASE2'], 'gff_output')
DIRS['FIGURES'] = os.path.join(DIRS['BASE2'], 'figures')

DIRS['I_FROM_C'] = os.path.join(DIRS['BASE2'], 'bedtools_output', 'intersects_from_cutoffs')
DIRS['BED_CUTOFFS'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', 'cutoffs')
DIRS['6MA_CUTOFFS'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', 'cutoffs_6mA')
DIRS['5MC_CUTOFFS'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', 'cutoffs_5mC')

In [6]:
#First we need to define the base dirs
DIRS = {}
DIRS['BASE2'] = '/home/anjuni/analysis'
DIRS['FEATURES'] = os.path.join(DIRS['BASE2'], 'coverage', 'feature_files')
DIRS['RAND'] = os.path.join(DIRS['BASE2'], 'coverage', 'randomisation')
DIRS['WINDOW_OUTPUT'] = os.path.join(DIRS['BASE2'], 'windows')
DIRS['WINDOW_INPUT'] = os.path.join(DIRS['BASE2'], 'input_for_windows')
DIRS['REF'] = '/home/anjuni/Pst_104_v13_assembly/'
DIRS['COVERAGE'] = os.path.join(DIRS['BASE2'], 'coverage')

In [9]:
#Quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs does not exist


In [79]:
#Make filepaths
bed_file_list = [fn for fn in glob.iglob('%s/*.bed' % DIRS['BED_INPUT'], recursive=True)]
gff_file_list = [fn for fn in glob.iglob('%s/*anno.gff3' % DIRS['GFF_INPUT'], recursive=True)]
te_file_list = [fn for fn in glob.iglob('%s/*.gff' % DIRS['GFF_INPUT'], recursive=True)]

In [80]:
#Check that the list works
print(*bed_file_list, sep='\n')
print(*gff_file_list, sep='\n')
print(*te_file_list, sep='\n')

/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_tombo_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_CpG_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_CpG_np_tombo.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_hc_tombo_sorted.CpG.plus.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_s_nanopolish.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/nanopolish_rerun_subtract.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_hc_nanopolish_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_plus_CpG_tombo_np.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/nano_plus_tombo_overlap.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5mC_tombo_sorted.bed
/home/anjuni/analysis/bedtools_output/sequencing_comparison/5m

## <span style='color:#ffa347'> 10. Expression of methylation machinery throughout Pst life cycle. <span/>

DNMT1/MASC2
query: XP_001833175.2, len = 1253

h_subject: Pst104E_20230, len = 1248

Score = 206 bits (523), Expect = 5e-54, Method: Compositional matrix adjust. Identities = 241/925 (26%), Positives = 372/925 (40%), Gaps = 178/925 (19%)

p_subject: Pst104E_04293, len = 1248

Score = 206 bits (523), Expect = 5e-54, Method: Compositional matrix adjust. Identities = 241/925 (26%), Positives = 372/925 (40%), Gaps = 178/925 (19%)

RAD8
query: XP_001831325.2, len = 2184

h_subject: Pst104E_28179, len = 2204

Score = 1158 bits (2996), Expect = 0.0, Method: Compositional matrix adjust. Identities = 643/1386 (46%), Positives = 837/1386 (60%), Gaps = 76/1386 (5%)

p_subject: Pst104E_12497, len = 1248

DNMT1/MASC2
- h_subject: Pst104E_20230   //   gene_model_hcontig_0009_24.226
- p_subject: Pst104E_04293   //   gene_model_pcontig_009.363
    
RAD8
- h_subject: Pst104E_28179   //   gene_model_hcontig_0052_06.60
- p_subject: Pst104E_12497   //   gene_model_pcontig_052.64


In [None]:
# write this function later!!
def grep_gene_name()

In [168]:
# make dicts of BLAST results
blast_h = { 'DNMT1/MASC2' : 'gene_model_hcontig_0009_24.226', 'RAD8' : 'gene_model_hcontig_0052_06.60' }
blast_p = { 'DNMT1/MASC2' : 'gene_model_pcontig_009.363', 'RAD8' : 'gene_model_pcontig_052.64' }

In [169]:
# make rnaseq dict
rnaseq_df_dict = {}
for fn in glob.iglob('%s/*_gene_rpkm_average.tsv' % DIRS['MEAN_STD'], recursive=True):
    if len(fn) == 87:
        df = pd.read_csv(fn, header=0, sep='\t')
        rnaseq_df_dict[fn.split('/')[-1]] = df

In [180]:
rnaseq_fn_dict = {}
for fn in glob.iglob('%s/*_gene_rpkm_average.tsv' % DIRS['MEAN_STD'], recursive=True):
    if len(fn) == 87:
        rnaseq_fn_dict[fn.split('/')[-1]] = fn

In [181]:
rnaseq_fn_dict

{'Pst_104E_v13_h_gene_rpkm_average.tsv': '/home/anjuni/analysis/rna_counts/average_and_stdev/Pst_104E_v13_h_gene_rpkm_average.tsv',
 'Pst_104E_v13_p_gene_rpkm_average.tsv': '/home/anjuni/analysis/rna_counts/average_and_stdev/Pst_104E_v13_p_gene_rpkm_average.tsv'}

In [166]:
DIRS['MEAN_STD']

'/home/anjuni/analysis/rna_counts/average_and_stdev'

In [171]:
rnaseq_df_dict['Pst_104E_v13_h_gene_rpkm_average.tsv'].head()

Unnamed: 0,gene_ID,GS,HE,IT0,IT6,IT9,UG
0,gene_model_hcontig_0000_03.1,0.487427,0.0,0.0,0.0,2.377196,0.384531
1,gene_model_hcontig_0000_03.2,0.577405,144.208849,0.0,1016.181477,456.791691,0.44698
2,gene_model_hcontig_0000_03.3,564.783136,52.086774,24.713948,235.779282,239.133947,316.475087
3,gene_model_hcontig_0000_03.4,0.0,0.167865,0.0,0.115347,0.179739,0.0
4,EVM prediction%2hcontig_0000_003.5,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
for rkey, rvalue in rnaseq_fn.items():
    df = pd.read_csv(rvalue, sep='\t', header = 0)

df.head()
type(df)

pandas.core.frame.DataFrame

In [173]:
def blast_exp_df(blast_dict, rnaseq_df):
    for bkey, bvalue in blast_dict.items():
        for rkey, rvalue in rnaseq_

SyntaxError: invalid syntax (<ipython-input-173-bc07e9909af0>, line 3)

In [189]:
for bkey, bvalue in blast_dict.items():
        for rkey, rvalue in rnaseq_fn.items():
            df = pd.read_csv(rvalue, sep='\t', header = 0)
            for index, row in df.iterrows():
                if bvalue == row['gene_ID']:
                    pprint.pprint(row)

gene_ID    gene_model_hcontig_0009_24.226
GS                                14.0442
HE                                 10.199
IT0                                     0
IT6                               8.27879
IT9                               13.4791
UG                                8.75372
Name: 5163, dtype: object
gene_ID    gene_model_hcontig_0052_06.60
GS                               25.5531
HE                               5.27593
IT0                              4.24689
IT6                              23.0232
IT9                              43.3039
UG                               23.2504
Name: 14740, dtype: object


In [2554]:
for bkey, bvalue in blast_dict.items():
        for rkey, rvalue in rnaseq_fn.items():
            df = pd.read_csv(rvalue, sep='\t', header = 0)
            for index, row in df.iterrows():
                if bvalue == row['gene_ID']:
                    pprint.pprint(row)

gene_ID    gene_model_pcontig_009.363
GS                             12.535
HE                            9.15761
IT0                                 0
IT6                           7.68013
IT9                           12.3386
UG                            7.73364
Name: 5174, dtype: object
gene_ID    gene_model_pcontig_052.64
GS                           22.8899
HE                           4.73403
IT0                          3.78453
IT6                          21.4138
IT9                          39.8084
UG                           20.6473
Name: 15076, dtype: object


In [199]:
blast_dict = blast_h
rnaseq_fn = {}
rnaseq_fn['Pst_104E_v13_h_gene_rpkm_average.tsv'] = rnaseq_fn_dict['Pst_104E_v13_h_gene_rpkm_average.tsv']

In [None]:
# getting stdev

In [194]:
rnaseq_fn_dict = {}
for fn in glob.iglob('%s/*_gene_rpkm_stdev.tsv' % DIRS['MEAN_STD'], recursive=True):
    if len(fn) == 85:
        rnaseq_fn_dict[fn.split('/')[-1]] = fn

In [195]:
rnaseq_fn_dict

{'Pst_104E_v13_h_gene_rpkm_stdev.tsv': '/home/anjuni/analysis/rna_counts/average_and_stdev/Pst_104E_v13_h_gene_rpkm_stdev.tsv',
 'Pst_104E_v13_p_gene_rpkm_stdev.tsv': '/home/anjuni/analysis/rna_counts/average_and_stdev/Pst_104E_v13_p_gene_rpkm_stdev.tsv'}

In [200]:
for bkey, bvalue in blast_dict.items():
        for rkey, rvalue in rnaseq_fn.items():
            df = pd.read_csv(rvalue, sep='\t', header = 0)
            for index, row in df.iterrows():
                if bvalue == row['gene_ID']:
                    pprint.pprint(row)

gene_ID    gene_model_hcontig_0009_24.226
GS                               0.710032
HE                               0.244654
IT0                                     0
IT6                               1.12791
IT9                               1.75511
UG                                1.15206
Name: 5163, dtype: object
gene_ID    gene_model_hcontig_0052_06.60
GS                               2.11856
HE                              0.382341
IT0                              6.00601
IT6                              2.55504
IT9                              11.9945
UG                              0.954619
Name: 14740, dtype: object


In [201]:
blast_dict = blast_p
rnaseq_fn = {}
rnaseq_fn['Pst_104E_v13_p_gene_rpkm_average.tsv'] = rnaseq_fn_dict['Pst_104E_v13_p_gene_rpkm_average.tsv']

In [184]:
rnaseq_fn

{'Pst_104E_v13_h_gene_rpkm_average.tsv':                                    gene_ID          GS          HE        IT0  \
 0             gene_model_hcontig_0000_03.1    0.487427    0.000000   0.000000   
 1             gene_model_hcontig_0000_03.2    0.577405  144.208849   0.000000   
 2             gene_model_hcontig_0000_03.3  564.783136   52.086774  24.713948   
 3             gene_model_hcontig_0000_03.4    0.000000    0.167865   0.000000   
 4       EVM prediction%2hcontig_0000_003.5    0.000000    0.000000   0.000000   
 5       EVM prediction%2hcontig_0000_003.6    0.000000    0.000000   0.000000   
 6             gene_model_hcontig_0000_03.7   10.457924    0.176728   0.000000   
 7             gene_model_hcontig_0000_03.8    0.000000   31.659461   0.000000   
 8             gene_model_hcontig_0000_03.9    0.005005    0.000000   0.000000   
 9            gene_model_hcontig_0000_03.10    0.000000    0.000000   0.000000   
 10           gene_model_hcontig_0000_03.11    0.035380   

In [202]:
for bkey, bvalue in blast_dict.items():
        for rkey, rvalue in rnaseq_fn.items():
            df = pd.read_csv(rvalue, sep='\t', header = 0)
            for index, row in df.iterrows():
                if bvalue == row['gene_ID']:
                    pprint.pprint(row)

gene_ID    gene_model_pcontig_009.363
GS                           0.626692
HE                           0.208539
IT0                                 0
IT6                           1.11119
IT9                            1.5958
UG                            1.02412
Name: 5174, dtype: object
gene_ID    gene_model_pcontig_052.64
GS                           1.90044
HE                           0.33067
IT0                          5.35214
IT6                          2.06825
IT9                          11.0626
UG                          0.833817
Name: 15076, dtype: object


In [218]:
# getting all values
rnaseq_fn_dict = {}
for fn in glob.iglob('%s/*_gene_repRpkmMatrix_featureCounts.tsv' % DIRS['TRIALS'], recursive=True):
    if len(fn) == 95:
        rnaseq_fn_dict[fn.split('/')[-1]] = fn

In [219]:
rnaseq_fn_dict

{'Pst_104E_v13_h_gene_repRpkmMatrix_featureCounts.tsv': '/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_h_gene_repRpkmMatrix_featureCounts.tsv',
 'Pst_104E_v13_p_gene_repRpkmMatrix_featureCounts.tsv': '/home/anjuni/analysis/rna_counts/trials_tsv/Pst_104E_v13_p_gene_repRpkmMatrix_featureCounts.tsv'}

In [220]:
blast_dict = blast_h
rnaseq_fn = {}
rnaseq_fn['Pst_104E_v13_h_gene_repRpkmMatrix_featureCounts.tsv'] = rnaseq_fn_dict['Pst_104E_v13_h_gene_repRpkmMatrix_featureCounts.tsv']

In [222]:
blast_dict = blast_p
rnaseq_fn = {}
rnaseq_fn['Pst_104E_v13_p_gene_repRpkmMatrix_featureCounts.tsv'] = rnaseq_fn_dict['Pst_104E_v13_p_gene_repRpkmMatrix_featureCounts.tsv']

In [223]:
for bkey, bvalue in blast_dict.items():
        for rkey, rvalue in rnaseq_fn.items():
            df = pd.read_csv(rvalue, sep='\t', header = 0)
            for index, row in df.iterrows():
                if bvalue == row['gene_ID']:
                    pprint.pprint(row)

gene_ID    gene_model_pcontig_009.363
GS_1                          13.4194
GS_2                          12.0437
GS_3                          12.1418
HE_1                          9.45151
HE_2                          9.03185
HE_3                          8.98946
IT0_1                               0
IT0_2                               0
IT0_3                               0
IT6_1                         8.02585
IT6_2                         8.83485
IT6_3                          6.1797
IT9_1                         10.5812
IT9_2                          11.991
IT9_3                         14.4435
UG_1                          8.34558
UG_2                          6.29084
UG_3                          8.56451
Name: 5174, dtype: object
gene_ID    gene_model_pcontig_052.64
GS_1                         24.0007
GS_2                          20.215
GS_3                         24.4539
HE_1                         4.28587
HE_2                         5.07378
HE_3                         4

In [None]:
# pull out the gene names for these locus tags from the annotation files

# test for significant difference between alleles on both contigs

## <span style='color:#ff4f14'>  12. Data processing for tables and graphs. <span/>

In [None]:
# Make cutoff files for Nanopolish and SMRTLink, for the table

# Saving out a csv file of cutoff vs. num_sites_file1, num_sites_file2, num_overlap_sites, %_overlap for each initial file
# Already done for tombo 6mA and all overlaps, need to do for NP, Tombo CpG and PB cutoff files
# The csv will be formatted in excel :)

# Ask Ben how to format headings properly

header_6mA = ['Cutoff', 'Nanopore', 'PacBio', 'Overlapping sites', 'Percentage Overlap']

header_5mC = ['Cutoff', 'Tombo', 'Nanopolish', 'Overlapping sites', 'Percentage Overlap']

for akey, avalue in dict_a.items():
    for bkey, bvalue in dict_b.items():
        linakey= []
        if akey == bkey:
            cutoff = akey[x:x]
            line.append(cutoff, avalue, bvalue)
            