# Testing out BEDTools analysis steps

This notebook contains code that was used to plan out and troubleshoot issues in the main BEDTools analysis folder. It has the following parts:

- 3A. Cutoffs from intersect files
- 4. Windows
- 5B. Running coverage on test dataset (pcontig_019)
- 5C, 5D. Testing out loop to save coverage files


In [None]:
import glob
import os
import pprint

import numpy # need for p-value stats
import numpy as np
import pandas as pd
import pybedtools
import scipy
from pybedtools import BedTool
from scipy.stats import chisquare

In [None]:
# Base directories used throughout the notebook.
# BASE1 and BASE2 are the two roots; the remaining entries (except REF) are sub-folders of BASE2.
DIRS = {
    'BASE1': '/home/anjuni/methylation_calling/pacbio',
    'BASE2': '/home/anjuni/analysis',
}
DIRS.update({
    key: os.path.join(DIRS['BASE2'], *parts)
    for key, parts in (
        ('BED_INPUT', ('bedtools_output', 'sequencing_comparison')),
        ('GFF_INPUT', ('gff_output',)),
        ('WINDOW_OUTPUT', ('windows',)),
        ('WINDOW_INPUT', ('input_for_windows',)),
    )
})
DIRS['REF'] = '/home/anjuni/Pst_104_v13_assembly/'

In [None]:
# Two-group Venn diagram of Nanopore vs PacBio sites.
# Site counts entered by hand:
#   PacBio sites:      88932
#   Overlapping sites: 84733
#   Nanopore sites:    83451878
# NOTE(review): `venn2` and `plt` are not imported in the visible import cell —
# confirm they are in scope before running this cell.

c = 84733                       # sites present in both sets
a, b = 83451878 - c, 88932 - c  # Nanopore-only, PacBio-only
venn2(subsets=(a, b, c), set_labels=('Nanopore', 'Pacbio'))
plt.show()

## <span style='color:#8a14ff'> 3. Making cutoff files. </span>

### <span style='color:#8a14ff'> 3.A Making cutoff files for overlapping files from previous section. </span>

In [None]:
%%bash

# Copy the tombo hc files into the 'sequencing_comparison' folder, next to the other
# overlapped files, to continue the analysis from one place.
# Abort if the source directory is missing so cp never runs from the wrong directory.
cd /home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/ || exit 1
cp 5mC_hc_tombo_sorted.bed ~/analysis/bedtools_output/sequencing_comparison/
cp 6mA_hc_tombo_sorted.bed ~/analysis/bedtools_output/sequencing_comparison/

In [None]:
%%bash

# Move the alternative bed intersect files and the older tombo-np intersect files
# into a separate 'alt_bed' folder.
# The original note expects 4 files to remain for the rest of the analysis.
# Abort if the directory is missing so nothing is moved from the wrong location.
cd /home/anjuni/analysis/bedtools_output/sequencing_comparison/ || exit 1
# -p keeps the cell re-runnable: no error when alt_bed already exists.
mkdir -p alt_bed
mv 6mA_pb_ont.bed alt_bed/
mv *np_tombo* alt_bed/
mv 5mC_tombo_np.bed alt_bed/
mv 5mC_CpG_tombo_np.bed alt_bed/

In [None]:
# File paths for both 6mA files, both CpG files, and the tombo file.
# NOTE(review): the bash cell that creates 'alt_bed' moves 6mA_pb_ont.bed,
# 5mC_CpG_tombo_np.bed and anything matching *np_tombo* out of this folder —
# confirm the intended run order before filtering these paths.
bed_file_list = [
    os.path.join('/home/anjuni/analysis/bedtools_output/sequencing_comparison', bed_name)
    for bed_name in (
        '6mA_ont_pb.bed',
        '6mA_pb_ont.bed',
        '5mC_CpG_tombo_np.bed',
        '5mC_CpG_np_tombo.bed',
        '5mC_hc_tombo_sorted.bed',
    )
]

In [None]:
# Score cutoffs to filter by, listed from highest to lowest
cutoff_list = [
    1.00, 0.99, 0.95, 0.90,
    0.80, 0.70, 0.60, 0.50,
    0.40, 0.30, 0.20, 0.10,
]

In [None]:
# Predicate passed to BedTool.filter by filter_by_cutoffs
def score_filter(feature, L):
    """Return True when the feature's score, read as a float, is at least L."""
    score = float(feature.score)
    return score >= L

def filter_by_cutoffs(bed_files, cutoffs, initial_file_path, final_file_path):
    """Write one score-filtered copy of every BED file for every cutoff.

    For each path in ``bed_files`` and each value in ``cutoffs``, the features
    accepted by ``score_filter`` (score >= cutoff) are saved under a name of
    the form ``<input without .bed>.cutoff.<cutoff to 2 d.p.>.bed``, with
    ``initial_file_path`` swapped for ``final_file_path`` in the output path.

    Args:
        bed_files: iterable of input BED file paths.
        cutoffs: iterable of numeric score cutoffs.
        initial_file_path: path fragment of the input location to replace.
        final_file_path: path fragment of the output location to substitute.

    Returns:
        None. Output files are written via ``BedTool.saveas``.
    """
    suffix = '.bed'
    for bed_path in bed_files:
        pybed_object = BedTool(bed_path)
        # Strip only a trailing '.bed'. str.replace would also rewrite a '.bed'
        # occurring earlier in the path, and would leave a path without the
        # suffix unchanged so every cutoff would overwrite the same file.
        stem = bed_path[:-len(suffix)] if bed_path.endswith(suffix) else bed_path
        for x in cutoffs:
            cutoff_name = '.cutoff.{:.2f}.bed'.format(x)
            out_filename = (stem + cutoff_name).replace(initial_file_path, final_file_path)
            # filter() is applied afresh for each cutoff and written straight to disk
            pybed_object.filter(score_filter, x).saveas(out_filename)

In [None]:
# Filter every file in bed_file_list at every cutoff; outputs go to the
# 'cutoffs_from_intersects' folder instead of 'sequencing_comparison'.
initial_fp = '/home/anjuni/analysis/bedtools_output/sequencing_comparison/'
final_fp = '/home/anjuni/analysis/bedtools_output/cutoffs_from_intersects/'
filter_by_cutoffs(
    bed_files=bed_file_list,
    cutoffs=cutoff_list,
    initial_file_path=initial_fp,
    final_file_path=final_fp,
)

In [None]:
%%bash

### Optional
# Copy the original files into the cutoff folders as well, and rename them as the
# 0.00 cutoff, if this was not done earlier.
src=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files

# Abort if a target folder is missing so cp/mv never act on the wrong directory.
cd "${src}/cutoffs_5mC" || exit 1
for x in "${src}"/5*
do
cp "${x}" .
done

mv 5mC_hc_nanopolish_sorted.bed 5mC_hc_nanopolish_sorted.cutoff.0.00.bed
mv 5mC_hc_tombo_sorted.CpG.plus.bed 5mC_hc_tombo_sorted.CpG.plus.cutoff.0.00.bed
mv 5mC_hc_tombo_sorted.bed 5mC_hc_tombo_sorted.cutoff.0.00.bed

cd "${src}/cutoffs_6mA" || exit 1
for y in "${src}"/6*
do
cp "${y}" .
done

mv 6mA_prob_smrtlink_sorted.bed 6mA_prob_smrtlink_sorted.cutoff.0.00.bed
mv 6mA_hc_tombo_sorted.bed 6mA_hc_tombo_sorted.cutoff.0.00.bed

In [None]:
### Optional
# Post-hoc: run the intersect cutoffs on the original 0.00-cutoff files.
# NOTE(review): intersect_cutoffs is not defined in the visible cells —
# confirm it is in scope before running.

# Single-file input lists for each caller
ont_final = [
    '/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_6mA/6mA_hc_tombo_sorted.cutoff.0.00.bed',
]
pb_final = [
    '/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_6mA/6mA_prob_smrtlink_sorted.cutoff.0.00.bed',
]
tmb_final = [
    '/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_5mC/5mC_hc_tombo_sorted.CpG.plus.cutoff.0.00.bed',
]
np_final = [
    '/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_5mC/5mC_hc_nanopolish_sorted.cutoff.0.00.bed',
]

# 6mA first (tombo vs smrtlink), then 5mC (tombo vs nanopolish)
intersect_cutoffs(ont_final, pb_final, 1, '6mA', 'tombo', 'smrtlink')
intersect_cutoffs(tmb_final, np_final, 1, '5mC', 'tombo', 'nanopolish')

### <span style='color:#8a14ff'> 3.B Making random distributions of the same size as all the cutoff files. </span>

In [None]:
def subset_sites(all_site_fn, number_of_subset_sites, out_fn):
    """Write a random subset of the rows of a tab-separated site file.

    Args:
        all_site_fn: path to a header-less, tab-separated file with one site per row.
        number_of_subset_sites: how many distinct rows to draw.
        out_fn: path of the tab-separated output file (no header, no index).

    Raises:
        ValueError: if more rows are requested than the input file contains
            (raised by ``np.random.choice`` when sampling without replacement).
    """
    # Load the whole file once into memory rather than re-reading it per site.
    df = pd.read_csv(all_site_fn, sep='\t', header=None)
    # replace=False: np.random.choice samples WITH replacement by default, which
    # would let the same site appear more than once in the "subset".
    random_array = np.random.choice(df.shape[0], number_of_subset_sites, replace=False)
    random_array.sort()  # write the chosen rows in their original file order
    df.iloc[random_array, :].to_csv(out_fn, header=None, index=None, sep='\t')  # iloc[rows, columns]

In [None]:
# testing out the subset_function
# NOTE(review): `all_a` is only assigned in a later cell and `count_dict` is not
# defined in any visible cell — confirm both are in scope before running.

# on an input file
all_site_fn = all_a
# number_of_all_sites is not used again in this cell
number_of_all_sites = count_dict['6mA_tombo_sorted.bed']
number_of_subset_sites = count_dict['6mA_prob_smrtlink_sorted.cutoff.0.99.bed']
out_fn = '6mA_prob_smrtlink_sorted.cutoff.0.99.bed'.replace('.bed', '.rand_1.bed')
subset_sites(all_site_fn, number_of_subset_sites, out_fn)

# on 10 sites only
subset_sites(all_site_fn, 10, 'test_10.bed')

#on a list of subset sizes
# NOTE(review): list arguments are only handled by the list-aware subset_sites
# defined in a later cell; the first output name 'test_10.bed' is paired with 50
# sites here — presumably a leftover name, confirm.
subset_sites(all_site_fn, [50, 11], ['test_10.bed','test_11.bed'])

In [None]:
# List-based alternative to the dictionary of site counts
def count_sites_list(file_list):
    """Count the lines in each file and return the counts in input order."""
    count_list = []
    for path in file_list:
        with open(path) as handle:
            # one line per site, so the line count is the site count
            count_list.append(sum(1 for _ in handle))
    return count_list

In [None]:
# subset with list instead of dict
def subset_sites(all_site_fn, number_of_subset_sites, out_fn):
    """Write one or several random subsets of the rows of a tab-separated site file.

    Args:
        all_site_fn: path to a header-less, tab-separated file with one site per row.
        number_of_subset_sites: an int (one subset) or a list of ints (several subsets).
        out_fn: an output path (str) or a list of output paths, matching
            ``number_of_subset_sites`` position by position.

    Raises:
        TypeError: if the two arguments are not an int/str pair or a list/list pair.
        ValueError: if the two lists differ in length, or if more rows are
            requested than the input file contains.
    """
    # Load the whole file once into memory rather than re-reading it per subset.
    df = pd.read_csv(all_site_fn, sep='\t', header=None)

    if isinstance(number_of_subset_sites, list) and isinstance(out_fn, list):
        if len(number_of_subset_sites) != len(out_fn):
            raise ValueError('number_of_subset_sites and out_fn must have the same length')
        jobs = list(zip(number_of_subset_sites, out_fn))
    elif isinstance(number_of_subset_sites, (int, np.integer)) and isinstance(out_fn, str):
        jobs = [(number_of_subset_sites, out_fn)]
    else:
        # Previously an unsupported combination silently wrote nothing.
        raise TypeError('expected (int, str) or (list, list) for number_of_subset_sites and out_fn')

    for n, out in jobs:
        # replace=False: the default samples WITH replacement and would duplicate sites.
        random_array = np.random.choice(df.shape[0], n, replace=False)
        random_array.sort()  # keep the chosen rows in their original file order
        # Write to this subset's own path (`out`), not to the whole `out_fn` list.
        df.iloc[random_array, :].to_csv(out, header=None, index=None, sep='\t')

In [None]:
# Test the dictionary-based subset function on two cutoff files.
# NOTE(review): intersect_count_dict is assigned in a later cell and
# new_subset_sites is not defined in the visible cells — confirm both are in scope.

# subset sizes, keyed by arbitrary labels
test_n = {
    'any': intersect_count_dict['6mA_tombo_smrtlink.cutoff.0.99.bed'],
    'thing': intersect_count_dict['6mA_tombo_smrtlink.cutoff.0.95.bed'],
}

# output paths, using the same labels as test_n
test_f = {
    'any': '/home/anjuni/analysis/coverage/test_randomisation/6mA_tombo_smrtlink.cutoff.0.99_rand.bed',
    'thing': '/home/anjuni/analysis/coverage/test_randomisation/6mA_tombo_smrtlink.cutoff.0.95_rand.bed',
}

all_a = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files', '6mA_tombo_sorted.bed')

new_subset_sites(all_a, test_n, test_f)

In [None]:
# Making dictionaries by group instead of modification
# NOTE(review): count_sites and out_rand_files are not defined in the visible cells,
# and intersect_files / tombo_files are only assigned in a later cell — confirm
# they are in scope before running.

# Make dictionaries of the file name and file size
intersect_count_dict = count_sites(intersect_files)
tombo_count_dict = count_sites(tombo_files)

pprint.pprint(intersect_count_dict)
pprint.pprint(tombo_count_dict)

# Make dictionaries of all out files
intersect_rand_dict = out_rand_files(intersect_files)
tombo_rand_dict = out_rand_files(tombo_files)

pprint.pprint(intersect_rand_dict)
pprint.pprint(tombo_rand_dict)

In [None]:
# Alternate method
# NOTE(review): intersected_5mC/6mA, hc_tombo_5mC/6mA, all_c, all_a, all_cpg and
# count_sites are not defined in the visible cells above — confirm they are in scope.

# Combined lists of intersect (cutoff) files and tombo files
intersect_files = intersected_5mC + intersected_6mA
tombo_files = hc_tombo_5mC + hc_tombo_6mA

# Map each file name (the part after the last '/') to its full path
cutoff_fn_dict = {}
for i in intersect_files:
    cutoff_fn_dict[i.rsplit('/', 1)[-1]] = i

tombo_fn_dict = {}
for i in tombo_files:
    tombo_fn_dict[i.rsplit('/', 1)[-1]] = i

# Site counts for total C, total A, and total (+) strand CpG in the genome
all_count_dict = count_sites([all_c, all_a, all_cpg])
print(all_count_dict)

In [None]:
# Scratch test: chi-square comparison of one observed cutoff file against its
# randomised counterpart.
# NOTE(review): intersect_6mA_dict and intersect_6mA_rand_dict are not defined in
# the visible cells — confirm they are in scope.
obs = intersect_6mA_dict['6mA_tombo_smrtlink.cutoff.0.99.bed']
exp = intersect_6mA_rand_dict['6mA_tombo_smrtlink.cutoff.0.99.bed']

print(obs)
print(exp)

df1 = pd.read_csv(obs, sep='\t', header = None)
# Fixed: the expected frame was previously read from `obs` as well, so the test
# compared the observed file with itself.
df2 = pd.read_csv(exp, sep='\t', header = None)

df1.head()

# Column 4 (fifth column) of each file is compared — presumably the score column; confirm.
obss = df1[4]
expp = df2[4]

len(expp)

stat, p = chisquare(obss, expp)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Try storing the (statistic, p-value) pair in a dictionary
dict1 = {}
dict1['name'] = stat, p

dict1['name']

# Rebinds `stat` to the whole result object returned by chisquare
stat = chisquare(obss, expp)

## <span style='color:#144fff'> 4. Making windows. </span>

In [None]:
# File paths for the window BED files, keyed by window label.
# window_bed_dict starts empty here and is not filled in the visible cells.
window_bed_dict = {}
window_fn_dict = {
    label: os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w%s.bed' % label)
    for label in ('100kb', '30kb', '10kb', '100kb_s20kb', '30kb_s6kb', '10kb_s2kb')
}
genome_size_f_fn = os.path.join(DIRS['WINDOW_INPUT'], 'Pst_104E_v13_ph_ctg.sorted.genome_file')

## <span style='color:#148aff'> 5. Coverage analysis of methylation with gene annotation files. </span>

### <span style='color:#148aff'> 5.B Run coverage analysis on test dataset (pcontig_019). </span>

In [None]:
# File paths of the feature files: genes, TE, effectors and two 6mA methylation files.
# NOTE(review): gene_fn, te_fn, ont_6mA and pb_6mA are not defined in the visible
# cells — confirm they are in scope.
feature_fn_dict = {
    'genes': gene_fn,
    'TE': te_fn,
    'effector': os.path.join(DIRS['WINDOW_INPUT'], 'Pst_104E_v13_ph_ctg.effectors.gff3' ),
    'ont_6mA_0.10': ont_6mA[0],
    'pb_6mA_0.10': pb_6mA[0],
}

In [None]:
# Print the feature file-path dictionary to check its contents (original note: it works)
pprint.pprint(feature_fn_dict)

In [None]:
# Make a dictionary of feature files:
# wrap every path in feature_fn_dict in a BedTool, keeping the same keys
feature_bed_dict = {}
for key, value in feature_fn_dict.items():
    feature_bed_dict[key] = BedTool(value)
    
# Print the BedTool dictionary to check its contents (original note: it works)
pprint.pprint(feature_bed_dict)

In [None]:
%%bash
# Make a subset of windows from pcontig_019 as a test dataset
# Abort if the windows folder is missing so grep never runs in the wrong directory.
cd /home/anjuni/analysis/windows/ || exit 1
# Create the output folder if needed so the redirect below cannot fail on a fresh run.
mkdir -p test_windows
for x in *.bed
do
# Strip only the trailing '.bed' extension to get the base name.
name=${x%.bed}
echo "${name}"
grep 'pcontig_019' "${x}" > "test_windows/${name}.pcontig_019.bed"
done

In [None]:
# File paths of the pcontig_019 test windows, keyed by window size
test_window_fn_dict = {
    size: os.path.join(DIRS['WINDOW_OUTPUT'], 'test_windows',
                       'Pst_104E_v13_ph_ctg_w%s.pcontig_019.bed' % size)
    for size in ('100kb', '10kb', '30kb')
}

# Matching BedTool objects under the same keys
test_window_bed_dict = {}
for key, value in test_window_fn_dict.items():
    test_window_bed_dict[key] = BedTool(value)

pprint.pprint(test_window_bed_dict)

In [None]:
# File paths of the pcontig_019 sliding test windows, keyed by window label
test_sliding_window_fn_dict = {
    label: os.path.join(DIRS['WINDOW_OUTPUT'], 'test_windows',
                        'Pst_104E_v13_ph_ctg_w%s.pcontig_019.bed' % label)
    for label in ('100kb_s20kb', '10kb_s2kb', '30kb_s6kb')
}

# Matching BedTool objects under the same keys
test_sliding_window_bed_dict = {}
for key, value in test_sliding_window_fn_dict.items():
    test_sliding_window_bed_dict[key] = BedTool(value)

pprint.pprint(test_sliding_window_bed_dict)

In [None]:
%%bash
# Test out overlaps for the test dataset on the command line, to see what the output looks like (works)
# Abort if the folder is missing so the output is never written somewhere unexpected.
cd /home/anjuni/analysis/windows/test_windows || exit 1
methyl=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_6mA
# Output file name for the 100kb windows vs the tombo 6mA 0.10-cutoff file
ont_6mA_100kb=100kb_6mA_hc_tombo_0.10.bed

coverageBed -a Pst_104E_v13_ph_ctg_w100kb.pcontig_019.bed -b "${methyl}/6mA_hc_tombo_sorted.cutoff.0.10.bed" > "${ont_6mA_100kb}"

In [None]:
%%bash
# Test out the -hist option of coverageBed, to see what the output looks like. No need to use it
# Abort if the folder is missing so the output is never written somewhere unexpected.
cd /home/anjuni/analysis/windows/test_windows || exit 1
methyl=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_6mA
coverageBed -a Pst_104E_v13_ph_ctg_w100kb.pcontig_019.bed -b "${methyl}/6mA_hc_tombo_sorted.cutoff.0.10.bed" -hist > h100kb_6mA_hc_tombo_0.10.bed

# Per the bedtools documentation, -hist reports a coverage histogram for each feature
# in A plus a summary histogram for all features in A; the summary rows are the
# "all" rows seen at the bottom of the output.

### <span style='color:#148aff'> 5.C Testing out Ben's pybedtools coverage function. </span>

In [None]:
# Test out Ben's function to see if it's easier?
# Run coverage of the 100kb test windows against the 'ont_6mA_0.10' feature file,
# convert the result to a dataframe (which supplies column headings) and keep
# columns 0-3 and 6.
# (the function kwarg .coverage(F=0.1) indicates minimum fraction overlap)
# NOTE(review): test_feature_fn_dict is only defined in a later cell (section 5.D) —
# that cell must be run first on a fresh kernel.
tmp_df = test_window_bed_dict['100kb'].coverage(test_feature_fn_dict['ont_6mA_0.10']).to_dataframe().iloc[:,[0,1,2,3,6]]

In [None]:
# Preview the first rows of the coverage dataframe to check the selected columns
tmp_df.head()

In [None]:
# Give the count and fraction columns descriptive headings
# ('name' and 'thickStart' are the default names assigned by to_dataframe)
tmp_df = tmp_df.rename(
    columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'}
)
tmp_df.head()

In [None]:
# Derive the output path from the feature path: add the window size and
# '.overlap' before '.bed', and swap the input folder for 'test_coverage'
tmp_fn = (
    test_feature_fn_dict['ont_6mA_0.10']
    .replace('.bed', '.%s.overlap.bed' % '100kb')
    .replace('test_feature_files', 'test_coverage')
)
print(tmp_fn)

In [None]:
# Dictionary of overlap results: file name (last path component) -> dataframe
feature_overlap_df_dict = {tmp_fn.split('/')[-1]: tmp_df}
pprint.pprint(feature_overlap_df_dict)

In [None]:
# Save the coverage table as a tab-separated file at tmp_fn
# (original note: pybedtools has more decimal places than bash bedtools)
tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None) # no headers or row names in csv

### <span style='color:#148aff'> 5.D Using Ben's pybedtools coverage function on test window dataset. </span>

In [None]:
# Dictionary of test feature file paths, to make the coverage loop easier.
# The necessary feature files were moved to their own folder first;
# all feature files will need to be moved to a folder for the actual analysis.

DIRS['TEST_COV'] = os.path.join(DIRS['BASE2'], 'coverage/test_feature_files')
test_feature_fn_dict = {
    label: os.path.join(DIRS['TEST_COV'], filename)
    for label, filename in (
        ('genes', 'Pst_104E_v13_ph_ctg.anno.sorted.gff3'),
        ('TE', 'Pst_104E_v13_ph_ctg.TE.sorted.gff3'),
        ('effector', 'Pst_104E_v13_ph_ctg.effectors.gff3'),
        ('ont_6mA_0.10', '6mA_hc_tombo_sorted.cutoff.0.10.bed'),
        ('pb_6mA_0.10', '6mA_prob_smrtlink_sorted.cutoff.0.10.bed'),
        ('tmb_5mC_0.10', '5mC_hc_tombo_sorted.cutoff.0.10.bed'),
        ('tmb_cpg_5mC_0.10', '5mC_hc_tombo_sorted.CpG.cutoff.0.10.bed'),
        ('np_5mC_0.10', '5mC_hc_nanopolish_sorted.cutoff.0.10.bed'),
    )
}

# Print the dictionary to check it
pprint.pprint(test_feature_fn_dict)

In [None]:
# Make a dictionary of bed test feature files and view:
# every path in test_feature_fn_dict is wrapped in a BedTool under the same key
test_feature_bed_dict = {}
for key, value in test_feature_fn_dict.items():
    test_feature_bed_dict[key] = BedTool(value)

pprint.pprint(test_feature_bed_dict)

In [None]:
# Run coverage of every test window set against every test feature file and save each table.
# The output name is built from the CURRENT feature path. The earlier version tested
# the suffix of the stale `tmp_fn` left over from the previous iteration or cell, which
# raised NameError on a fresh kernel and could leave the output path un-updated.
test_feature_overlap_df_dict = {}
for wkey, wbed in test_window_bed_dict.items():
    for fkey, fbed in test_feature_bed_dict.items():
        # to_dataframe supplies column headings; keep columns 0-3 and 6
        tmp_df = wbed.coverage(fbed).to_dataframe().iloc[:,[0,1,2,3,6]]
        tmp_df = tmp_df.rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'}) # rename headings
        # Drop the feature file's own extension (.bed or .gff3) and append the window label
        tmp_fn = os.path.splitext(test_feature_fn_dict[fkey])[0] + '.%s.overlap.bed' % wkey
        tmp_fn = tmp_fn.replace('test_feature_files', 'test_coverage') # write into the coverage folder
        test_feature_overlap_df_dict[tmp_fn.split('/')[-1]] = tmp_df # file name as key and dataframe as value for overlap dict
        tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None) # save to a csv(pybedtools outputs more d.p. than BEDTools)

In [None]:
# The 0.10-cutoff methylation files were too low-quality, so coverage is also
# collected for the higher-stringency 0.50 and 0.90 cutoff files.
hc_feature_fn_dict = {
    label: os.path.join(DIRS['TEST_COV'], filename)
    for label, filename in (
        ('ont_6mA_0.90', '6mA_hc_tombo_sorted.cutoff.0.90.bed'),
        ('ont_6mA_0.50', '6mA_hc_tombo_sorted.cutoff.0.50.bed'),
        ('pb_6mA_0.90', '6mA_prob_smrtlink_sorted.cutoff.0.90.bed'),
        ('pb_6mA_0.50', '6mA_prob_smrtlink_sorted.cutoff.0.50.bed'),
        ('tmb_5mC_0.50', '5mC_hc_tombo_sorted.cutoff.0.50.bed'),
        ('tmb_cpg_5mC_0.50', '5mC_hc_tombo_sorted.CpG.cutoff.0.50.bed'),
        ('np_5mC_0.50', '5mC_hc_nanopolish_sorted.cutoff.0.50.bed'),
        ('tmb_5mC_0.90', '5mC_hc_tombo_sorted.cutoff.0.90.bed'),
        ('tmb_cpg_5mC_0.90', '5mC_hc_tombo_sorted.CpG.cutoff.0.90.bed'),
        ('np_5mC_0.90', '5mC_hc_nanopolish_sorted.cutoff.0.90.bed'),
    )
}

In [None]:
# Wrap every higher-stringency feature path in a BedTool under the same key, then view
hc_feature_bed_dict = {}
for key, value in hc_feature_fn_dict.items():
    hc_feature_bed_dict[key] = BedTool(value)

pprint.pprint(hc_feature_bed_dict)

In [None]:
# Coverage of every test window set by every higher-stringency methylation file
hc_test_feature_overlap_df_dict = {}
for wkey, wbed in test_window_bed_dict.items():
    for fkey, fbed in hc_feature_bed_dict.items():
        # to_dataframe supplies headings; keep columns 0-3 and 6 and rename the last two
        tmp_df = (
            wbed.coverage(fbed)
            .to_dataframe()
            .iloc[:, [0, 1, 2, 3, 6]]
            .rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'})
        )
        # output path: add the window label before '.bed' and swap the folder
        tmp_fn = (
            hc_feature_fn_dict[fkey]
            .replace('.bed', '.%s.overlap.bed' % wkey)
            .replace('test_feature_files', 'test_coverage')
        )
        # file name -> dataframe, then save (pybedtools outputs more d.p. than BEDTools)
        hc_test_feature_overlap_df_dict[tmp_fn.split('/')[-1]] = tmp_df
        tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None)

In [None]:
# Running coverage for sliding windows.
# Dictionary of test feature file paths, to make the coverage loop easier.
# The necessary feature files were moved to their own folder first;
# all feature files will need to be moved to one folder for the actual analysis.

DIRS['TEST_COV'] = os.path.join(DIRS['BASE2'], 'coverage/test_feature_files')
test_sliding_feature_fn_dict = {
    label: os.path.join(DIRS['TEST_COV'], filename)
    for label, filename in (
        ('genes', 'Pst_104E_v13_ph_ctg.anno.sorted.gff3'),
        ('TE', 'Pst_104E_v13_ph_ctg.TE.sorted.gff3'),
        ('effector', 'Pst_104E_v13_ph_ctg.effectors.gff3'),
        ('tmb_6mA_0.90', '6mA_hc_tombo_sorted.cutoff.0.90.bed'),
        ('tmb_6mA_0.50', '6mA_hc_tombo_sorted.cutoff.0.50.bed'),
        ('tmb_5mC_0.90', '5mC_hc_tombo_sorted.cutoff.0.90.bed'),
        ('tmb_5mC_0.50', '5mC_hc_tombo_sorted.cutoff.0.50.bed'),
        ('tmb_cpg_5mC_0.90', '5mC_hc_tombo_sorted.CpG.cutoff.0.90.bed'),
        ('np_5mC_0.90', '5mC_hc_nanopolish_sorted.cutoff.0.90.bed'),
        ('tmb_cpg_5mC_0.50', '5mC_hc_tombo_sorted.CpG.cutoff.0.50.bed'),
        ('np_5mC_0.50', '5mC_hc_nanopolish_sorted.cutoff.0.50.bed'),
        ('pb_6mA_0.90', '6mA_prob_smrtlink_sorted.cutoff.0.90.bed'),
        ('pb_6mA_0.50', '6mA_prob_smrtlink_sorted.cutoff.0.50.bed'),
    )
}

In [None]:
# Convert file paths to bed files:
# every path in test_sliding_feature_fn_dict is wrapped in a BedTool under the same key
test_sliding_feature_bed_dict = {}
for key, value in test_sliding_feature_fn_dict.items():
    test_sliding_feature_bed_dict[key] = BedTool(value)

pprint.pprint(test_sliding_feature_bed_dict)

In [None]:
# Run coverage of every sliding test window set against every feature file and save each table.
# The output name is built from the CURRENT feature path. The earlier version tested
# the suffix of the stale `tmp_fn` left over from the previous iteration or cell, which
# raised NameError on a fresh kernel and could leave the output path un-updated.
test_sliding_feature_overlap_df_dict = {}
for wkey, wbed in test_sliding_window_bed_dict.items():
    for fkey, fbed in test_sliding_feature_bed_dict.items():
        # to_dataframe supplies column headings; keep columns 0-3 and 6
        tmp_df = wbed.coverage(fbed).to_dataframe().iloc[:,[0,1,2,3,6]]
        tmp_df = tmp_df.rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'}) # rename headings
        # Drop the feature file's own extension (.bed for methylation files,
        # .gff3 for gene/transposon/effector files) and append the window label
        tmp_fn = os.path.splitext(test_sliding_feature_fn_dict[fkey])[0] + '.%s.overlap.bed' % wkey
        tmp_fn = tmp_fn.replace('test_feature_files', 'test_sliding_coverage') # write into the sliding coverage folder
        test_sliding_feature_overlap_df_dict[tmp_fn.split('/')[-1]] = tmp_df # file name as key and dataframe as value for overlap dict
        tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None) # save to a csv(pybedtools outputs more d.p. than BEDTools)

In [None]:
# Run overlaps between windows and features, saving each table twice:
# a header-less '.overlap.bed' and a '.overlap.circabed' that keeps the column headings.
# NOTE(review): window_bed_dict is created empty in the visible cells and never
# filled — confirm it is populated before relying on this loop.
feature_overlap_df_dict = {}
for wkey, wbed in window_bed_dict.items():
    for fkey, fbed in feature_bed_dict.items():
        tmp_df = wbed.coverage(fbed, F=0.1).to_dataframe().iloc[:,[0,1,2,3,6]] #(F=0.1 indicates minimum fraction overlap)
        tmp_df = tmp_df.rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'})
        # Strip only the file extension. str.replace('bed', ...) rewrote every 'bed'
        # in the path (including folder names) and was a no-op for .gff3 inputs,
        # so the coverage table was written over the input feature file.
        feature_root = os.path.splitext(feature_fn_dict[fkey])[0]
        tmp_fn = '%s.%s.overlap.bed' % (feature_root, wkey)
        feature_overlap_df_dict[tmp_fn.split('/')[-1]] = tmp_df
        tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None)
        tmp_fn = '%s.%s.overlap.circabed' % (feature_root, wkey)
        tmp_df.to_csv(tmp_fn, sep='\t', index=None)

In [None]:
# Make list of file names to be dictionary keys and check if it worked (it did!)
# NOTE(review): all_methylation_files is not defined in the visible cells.
# Each name is the path with its first 58 characters dropped — presumably the
# length of a shared directory prefix; confirm against the actual paths.
methylation_file_names = []
for file in all_methylation_files:
    name = file[58:]
    methylation_file_names.append(name)
# The names are sorted here; all_methylation_files itself is left in its original order.
methylation_file_names.sort()
   
pprint.pprint(methylation_file_names) 

In [None]:
# Make dictionary of BedTool objects keyed by file name, then print it to check
all_methylation_bed_dict = {}
for file in all_methylation_files:
    # Derive the key from the same path that is loaded (same 58-character prefix
    # slice used to build methylation_file_names). Pairing the separately sorted
    # name list with this list by index attached names to the wrong files
    # whenever all_methylation_files was not already in sorted order.
    all_methylation_bed_dict[file[58:]] = BedTool(file)

pprint.pprint(all_methylation_bed_dict)