# <span style="color:plum"> BEDtools analysis. </span>

This is a script to answer research questions outlined elsewhere. In summary, this script:

1. compares methylation results between different methylation-callers, and between different methylation sequencing methods.

2. compares methylation between genes and non-gene regions

3. compares methylation between transposons and non-repetitive regions

4. compares transposons and genes

In [1]:
import pybedtools
from pybedtools import BedTool
import os
import glob
import pprint
import numpy # need for p-value stats
import scipy

In [5]:
#First we need to define the base dirs
DIRS ={}
DIRS['BASE1'] = '/home/anjuni/methylation_calling/pacbio'
DIRS['BASE2'] = '/home/anjuni/analysis'
DIRS['BED_INPUT'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', 'filtered_bed')
DIRS['GFF_INPUT'] = os.path.join(DIRS['BASE2'], 'gff_output')
DIRS['WINDOW_OUTPUT'] = os.path.join(DIRS['BASE2'], 'windows')

In [6]:
#Quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

In [8]:
#Make filepaths
bed_file_list = [fn for fn in glob.iglob('%s/*.bed' % DIRS['BED_INPUT'], recursive=True)]
gff_file_list = [fn for fn in glob.iglob('%s/*anno.gff3' % DIRS['GFF_INPUT'], recursive=True)]
te_file_list = [fn for fn in glob.iglob('%s/*.gff' % DIRS['GFF_INPUT'], recursive=True)]

In [9]:
#Check that the list works
print(*bed_file_list, sep='\n')
print(*gff_file_list, sep='\n')
print(*te_file_list, sep='\n')

/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_tombo_sorted.cutoff.0.5.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_tombo_sorted.cutoff.0.4.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_tombo_sorted.cutoff.0.6.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_prob_smrtlink_sorted.cutoff.0.95.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.1.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_tombo_sorted.cutoff.0.2.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.0.9.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_tombo_sorted.cutoff.1.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.0.4.bed
/home/an

In [10]:
# maybe don't use these coz the keys look bad haha
BED = {}
for file in bed_file_list:
    name = (file[63:-4])
    bed_file = BedTool(file)
    BED[name] = bed_file

In [11]:
pprint.pprint(BED) # see if it works

{'filtered_bed/5mC_nanopolish_sorted.cutoff.0.1': <BedTool(/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.0.1.bed)>,
 'filtered_bed/5mC_nanopolish_sorted.cutoff.0.2': <BedTool(/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.0.2.bed)>,
 'filtered_bed/5mC_nanopolish_sorted.cutoff.0.3': <BedTool(/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.0.3.bed)>,
 'filtered_bed/5mC_nanopolish_sorted.cutoff.0.4': <BedTool(/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.0.4.bed)>,
 'filtered_bed/5mC_nanopolish_sorted.cutoff.0.5': <BedTool(/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/5mC_nanopolish_sorted.cutoff.0.5.bed)>,
 'filtered_bed/5mC_nanopolish_sorted.cutoff.0.6': <BedTool(/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/fil

In [18]:
GFF = {}
for file in gff_file_list:
    name = (file[46:-5])
    gff_file = BedTool(file)
    GFF[name] = gff_file

In [19]:
pprint.pprint(GFF) # see if it works

{'h_ctg_combined_sorted_anno': <BedTool(/home/anjuni/analysis/gff_output/Pst_104E_v13_h_ctg_combined_sorted_anno.gff3)>,
 'p_ctg_combined_sorted_anno': <BedTool(/home/anjuni/analysis/gff_output/Pst_104E_v13_p_ctg_combined_sorted_anno.gff3)>,
 'ph_ctg_combined_sorted_anno': <BedTool(/home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg_combined_sorted_anno.gff3)>}


In [20]:
TE = {}
for file in te_file_list:
    name = (file[46:-4])
    te_file = BedTool(file)
    TE[name] = te_file

In [21]:
# might have to sort these TE files by position. ask Ben if they are
pprint.pprint(TE) # see if it works 

{'h_ctg.REPET.sorted.filtered.superfamily': <BedTool(/home/anjuni/analysis/gff_output/Pst_104E_v13_h_ctg.REPET.sorted.filtered.superfamily.gff)>,
 'p_ctg.REPET.sorted.filtered.superfamily': <BedTool(/home/anjuni/analysis/gff_output/Pst_104E_v13_p_ctg.REPET.sorted.filtered.superfamily.gff)>,
 'ph_ctg.REPET.sorted.filtered.superfamily': <BedTool(/home/anjuni/analysis/gff_output/Pst_104E_v13_ph_ctg.REPET.sorted.filtered.superfamily.gff)>}


## <span style='color:deeppink'> 1. Comparing methylation sequencing methods <span/>

In [36]:
%%bash

# find overlap between 6mA from PacBio and Nanopore

pb=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_prob_smrtlink_sorted.cutoff.0.99.bed
ont=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_tombo_sorted.cutoff.0.99.bed
out1=/home/anjuni/analysis/bedtools_output/sequencing_comparison/6mA_pb_ont_cutoff_0.99.bed
out2=/home/anjuni/analysis/bedtools_output/sequencing_comparison/6mA_ont_pb_cutoff_0.99.bed
echo $pb
echo $ont

bedtools intersect -a $pb -b $ont > $out1
bedtools intersect -a $ont -b $pb > $out2

/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_prob_smrtlink_sorted.cutoff.0.99.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_tombo_sorted.cutoff.0.99.bed


In [37]:
%%bash

#check how many overlapping sites there were

cd /home/anjuni/analysis/bedtools_output/sequencing_comparison/
less 6mA_pb_ont_cutoff_0.99.bed | wc
less 6mA_ont_pb_cutoff_0.99.bed | wc

    507    3042   25373
    507    3042   18730


In [38]:
%%bash

# try again with all the tombo sites

pb=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_prob_smrtlink_sorted.cutoff.0.99.bed
ont=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_tombo_sorted.cutoff.0.1.bed
out1=/home/anjuni/analysis/bedtools_output/sequencing_comparison/6mA_pb_ont_cutoff_0.1.bed
out2=/home/anjuni/analysis/bedtools_output/sequencing_comparison/6mA_ont_pb_cutoff_0.1.bed
echo $pb
echo $ont

bedtools intersect -a $pb -b $ont > $out1
bedtools intersect -a $ont -b $pb > $out2

/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_prob_smrtlink_sorted.cutoff.0.99.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/filtered_bed/6mA_tombo_sorted.cutoff.0.1.bed


In [42]:
88932/(153000000 / 4)

0.002325019607843137

random set overlapping the A's in genome. chi squared test

In [39]:
%%bash

#check how many overlapping sites there were

cd /home/anjuni/analysis/bedtools_output/sequencing_comparison/
less 6mA_pb_ont_cutoff_0.1.bed | wc
less 6mA_ont_pb_cutoff_0.1.bed | wc

  77694  466164 3937561
  77694  466164 3100531


There are more sites when using the low confidence sites from tombo, compared to only using only high confidence sites from both

## <span style='color:limegreen'> 2. Comparing methylation detection methods <span/>