# Testing out BEDTools analysis steps

This notebook contains code that was used to plan out and troubleshoot issues in the main BEDTools analysis folder. It has the following parts:

- 3A. Cutoffs from intersect files
- 4. Windows
- 5B. Running coverage on test dataset (pcontig_019)
- 5C. 5D. Testing out loop to save coverage files


In [None]:
import pybedtools
from pybedtools import BedTool
import os
import glob
import pprint
import numpy # need for p-value stats
import scipy

In [None]:
# Base directories that every other path in this notebook is built from.
DIRS = {
    'BASE1': '/home/anjuni/methylation_calling/pacbio',
    'BASE2': '/home/anjuni/analysis',
}
# Input/output folders derived from the analysis base (BASE2).
DIRS.update({
    'BED_INPUT': os.path.join(DIRS['BASE2'], 'bedtools_output', 'sequencing_comparison'),
    'GFF_INPUT': os.path.join(DIRS['BASE2'], 'gff_output'),
    'WINDOW_OUTPUT': os.path.join(DIRS['BASE2'], 'windows'),
    'WINDOW_INPUT': os.path.join(DIRS['BASE2'], 'input_for_windows'),
})
# Reference assembly folder (stored with its trailing slash).
DIRS['REF'] = '/home/anjuni/Pst_104_v13_assembly/'

## <span style='color:#8a14ff'> 3. Making cutoff files. </span>

### <span style='color:#8a14ff'> 3.A Making cutoff files for overlapping files from previous section. </span>

In [None]:
%%bash

# Copy (cp, so the originals stay in place) the tombo hc files into the
# 'sequencing_comparison' folder with the other overlapped files to continue analysis.
# Stop if the source folder is missing instead of copying from whatever the current directory is.
cd /home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/ || exit 1
cp 5mC_hc_tombo_sorted.bed ~/analysis/bedtools_output/sequencing_comparison/
cp 6mA_hc_tombo_sorted.bed ~/analysis/bedtools_output/sequencing_comparison/

In [None]:
%%bash

# Move the alternative bed intersect files and older tombo-np intersect files to a separate folder
# There will be 4 files remaining that will be used for the rest of the analysis
# Stop if the folder is missing so nothing is moved relative to the wrong directory.
cd /home/anjuni/analysis/bedtools_output/sequencing_comparison/ || exit 1
# -p: do not fail when alt_bed already exists (e.g. when the cell is re-run)
mkdir -p alt_bed
# The trailing slash makes mv fail instead of renaming a file to 'alt_bed' if the folder is absent
mv 6mA_pb_ont.bed alt_bed/
mv *np_tombo* alt_bed/
mv 5mC_tombo_np.bed alt_bed/
mv 5mC_CpG_tombo_np.bed alt_bed/

In [None]:
# Full paths of the intersect files to filter: both 6mA files, both CpG files, and the tombo file.
# NOTE(review): the preceding bash cell moves 6mA_pb_ont.bed, 5mC_CpG_tombo_np.bed and everything
# matching *np_tombo* into alt_bed/ - confirm these paths still exist before running the filter.
bed_file_list = [
    '/home/anjuni/analysis/bedtools_output/sequencing_comparison/' + bed_name
    for bed_name in (
        '6mA_ont_pb.bed',
        '6mA_pb_ont.bed',
        '5mC_CpG_tombo_np.bed',
        '5mC_CpG_np_tombo.bed',
        '5mC_hc_tombo_sorted.bed',
    )
]

In [None]:
# Score cutoffs, from most to least stringent; each one is applied to every BED file.
cutoff_list = [
    1.00, 0.99, 0.95, 0.90,
    0.80, 0.70, 0.60, 0.50,
    0.40, 0.30, 0.20, 0.10,
]

In [None]:
# Define function to filter
def score_filter(feature, L):
    """Return True if the feature's score is greater than or equal to the cutoff L.

    Used as the predicate passed to ``BedTool.filter`` in ``filter_by_cutoffs``.

    Args:
        feature: object with a ``score`` attribute that ``float()`` can convert.
        L: minimum score (inclusive) a feature needs to be kept.

    Returns:
        bool: ``float(feature.score) >= L``.

    Raises:
        ValueError: if ``feature.score`` cannot be converted to a float.
    """
    return float(feature.score) >= L

def filter_by_cutoffs(bed_files, cutoffs, initial_file_path, final_file_path):
    """Filter each BED file at each cutoff and save one output file per (file, cutoff) pair.

    For an input ``<name>.bed`` and cutoff ``x`` the output is ``<name>.cutoff.<x>.bed`` with
    ``x`` formatted to two decimals, and with ``initial_file_path`` replaced by
    ``final_file_path`` in the path. Features are kept when ``score_filter`` returns True.

    Args:
        bed_files: iterable of BED file paths.
        cutoffs: iterable of numeric score cutoffs.
        initial_file_path: substring of the input path to replace.
        final_file_path: replacement substring, i.e. where the outputs are written.

    Returns:
        None. Output files are written as a side effect.
    """
    bed_ext = '.bed'
    for bed_path in bed_files:
        pybed_object = BedTool(bed_path)
        # Only swap the trailing extension; a blanket str.replace would also rewrite any
        # other '.bed' occurring earlier in the path. Inputs without the extension get the
        # suffix appended so each cutoff still has its own output file.
        stem = bed_path[:-len(bed_ext)] if bed_path.endswith(bed_ext) else bed_path
        out_stem = stem.replace(initial_file_path, final_file_path)
        # Make sure the destination folder exists before saving into it.
        out_dir = os.path.dirname(out_stem)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        for cutoff in cutoffs:
            out_filename = '{}.cutoff.{:.2f}.bed'.format(out_stem, cutoff)
            pybed_object.filter(score_filter, cutoff).saveas(out_filename)

In [None]:
# Filter every intersect file at every cutoff. Outputs keep the input file name but are
# written under 'cutoffs_from_intersects' instead of 'sequencing_comparison'.
initial_fp = '/home/anjuni/analysis/bedtools_output/sequencing_comparison/'
final_fp = '/home/anjuni/analysis/bedtools_output/cutoffs_from_intersects/'
filter_by_cutoffs(
    bed_files=bed_file_list,
    cutoffs=cutoff_list,
    initial_file_path=initial_fp,
    final_file_path=final_fp,
)

## <span style='color:#144fff'> 4. Making windows. </span>

In [None]:
# Paths of the window BED files, keyed by window size (plus slide size for sliding windows).
window_fn_dict = {
    label: os.path.join(DIRS['WINDOW_OUTPUT'], 'Pst_104E_v13_ph_ctg_w%s.bed' % label)
    for label in ('100kb', '30kb', '10kb', '100kb_s20kb', '30kb_s6kb', '10kb_s2kb')
}
# Created empty here; nothing is added to it in this cell.
window_bed_dict = {}
# Path of the sorted genome file kept in the window-input folder.
genome_size_f_fn = os.path.join(DIRS['WINDOW_INPUT'], 'Pst_104E_v13_ph_ctg.sorted.genome_file')

## <span style='color:#148aff'> 5. Coverage analysis of methylation with gene annotation files. </span>

### <span style='color:#148aff'> 5.B Run coverage analysis on test dataset (pcontig_019). </span>

In [None]:
# File paths of the feature files (genes, TE, effectors, methylation), keyed by feature name.
# NOTE(review): gene_fn, te_fn, ont_6mA and pb_6mA are not defined in the part of the notebook
# reviewed here - they must already exist in the kernel or this cell raises NameError.
feature_fn_dict = {
    'genes': gene_fn,
    'TE': te_fn,
    'effector': os.path.join(DIRS['WINDOW_INPUT'], 'Pst_104E_v13_ph_ctg.effectors.gff3'),
    'ont_6mA_0.10': ont_6mA[0],
    'pb_6mA_0.10': pb_6mA[0],
}

In [None]:
# Pretty-print the feature file-path dictionary to check each key maps to the intended path
pprint.pprint(feature_fn_dict)

In [None]:
# Wrap every feature file path in a BedTool, keeping the same keys as feature_fn_dict
feature_bed_dict = {
    feature_name: BedTool(feature_path)
    for feature_name, feature_path in feature_fn_dict.items()
}

# Pretty-print the BedTool dictionary to check it was built for every feature
pprint.pprint(feature_bed_dict)

In [None]:
%%bash
# Make a subset of windows from pcontig_019 as a test dataset
# Stop if the windows folder is missing rather than globbing in the wrong directory.
cd /home/anjuni/analysis/windows/ || exit 1
# The output folder must exist before the redirect below can write into it
mkdir -p test_windows
for x in *.bed
do
    # Strip the '.bed' extension to get the base name of the window file
    name="${x%.bed}"
    echo "${name}"
    # Keep only the lines mentioning pcontig_019
    grep 'pcontig_019' "${x}" > "test_windows/${name}.pcontig_019.bed"
done

In [None]:
# File paths of the pcontig_019 test windows, keyed by window size
test_window_fn_dict = {
    size: os.path.join(
        DIRS['WINDOW_OUTPUT'], 'test_windows',
        'Pst_104E_v13_ph_ctg_w%s.pcontig_019.bed' % size,
    )
    for size in ('100kb', '10kb', '30kb')
}

# Matching dictionary of BedTool objects, one per test window file
test_window_bed_dict = {
    size: BedTool(window_path) for size, window_path in test_window_fn_dict.items()
}

pprint.pprint(test_window_bed_dict)

In [None]:
# File paths of the pcontig_019 sliding test windows, keyed by '<window>_s<slide>'
test_sliding_window_fn_dict = {
    label: os.path.join(
        DIRS['WINDOW_OUTPUT'], 'test_windows',
        'Pst_104E_v13_ph_ctg_w%s.pcontig_019.bed' % label,
    )
    for label in ('100kb_s20kb', '10kb_s2kb', '30kb_s6kb')
}

# Matching dictionary of BedTool objects, one per sliding test window file
test_sliding_window_bed_dict = {
    label: BedTool(window_path)
    for label, window_path in test_sliding_window_fn_dict.items()
}

pprint.pprint(test_sliding_window_bed_dict)

In [None]:
%%bash
# Test out overlaps for test dataset on command line, to see what output looks like (works)
# Stop if the test window folder is missing so the output is not written elsewhere.
cd /home/anjuni/analysis/windows/test_windows || exit 1
# Folder holding the 6mA methylation BED files filtered by cutoff
methyl=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_6mA
# Output file for the coverage of the 100kb windows by the tombo 6mA sites (cutoff 0.10)
ont_6mA_100kb=100kb_6mA_hc_tombo_0.10.bed

coverageBed -a Pst_104E_v13_ph_ctg_w100kb.pcontig_019.bed -b "${methyl}/6mA_hc_tombo_sorted.cutoff.0.10.bed" > "${ont_6mA_100kb}"

In [None]:
%%bash
# Test out the histogram function in coverageBed, to see what output looks like. No need to use it
# Stop if the test window folder is missing so the output is not written elsewhere.
cd /home/anjuni/analysis/windows/test_windows || exit 1
# Folder holding the 6mA methylation BED files filtered by cutoff
methyl=/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/cutoffs_6mA
coverageBed -a Pst_104E_v13_ph_ctg_w100kb.pcontig_019.bed -b "${methyl}/6mA_hc_tombo_sorted.cutoff.0.10.bed" -hist > h100kb_6mA_hc_tombo_0.10.bed

# Observed: the output ends with rows labelled 'all'. Per the bedtools documentation, -hist
# reports a coverage histogram for each feature in A plus a summary histogram for all features in A.

### <span style='color:#148aff'> 5.C Testing out Ben's pybedtools coverage function. </span>

In [None]:
# Try Ben's pybedtools approach: coverage of the 100kb test windows by the ONT 6mA (0.10) file,
# converted to a dataframe so the columns get headings.
# (the function kwarg .coverage(F=0.1) indicates minimum fraction overlap)
# NOTE(review): test_feature_fn_dict is only defined in a later cell (section 5.D); run that
# cell first or this one raises NameError on a fresh kernel.
tmp_df = (
    test_window_bed_dict['100kb']
    .coverage(test_feature_fn_dict['ont_6mA_0.10'])
    .to_dataframe()
    .iloc[:, [0, 1, 2, 3, 6]]  # keep only columns 0-3 and 6 of the coverage output
)

In [None]:
# Preview the first rows of the coverage dataframe to check its columns
tmp_df.head()

In [None]:
# Give the two coverage columns meaningful headings, then preview the result
tmp_df = tmp_df.rename(
    columns={
        'name': 'overlap_count',
        'thickStart': 'overlap_fraction',
    }
)
tmp_df.head()

In [None]:
# Build the output path: insert the window size before the extension and switch the
# folder from 'test_feature_files' to 'test_coverage'
tmp_fn = (
    test_feature_fn_dict['ont_6mA_0.10']
    .replace('.bed', '.{}.overlap.bed'.format('100kb'))
    .replace('test_feature_files', 'test_coverage')
)
print(tmp_fn)

In [None]:
# Dictionary of overlap results: output file name (without folders) -> coverage dataframe
feature_overlap_df_dict = {tmp_fn.rsplit('/', 1)[-1]: tmp_df}
pprint.pprint(feature_overlap_df_dict)

In [None]:
# Write the coverage dataframe to tmp_fn as tab-separated text
# (note: pybedtools has more decimal places than bash bedtools)
tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None) # no headers or row names in csv

### <span style='color:#148aff'> 5.D Using Ben's pybedtools coverage function on test window dataset. </span>

In [None]:
# Dictionary of test feature file paths, so the coverage loop can iterate over them.
# The necessary feature files were moved to their own folder first.
# All feature files will need to be moved to one folder for the actual analysis.

DIRS['TEST_COV'] = os.path.join(DIRS['BASE2'], 'coverage/test_feature_files')
test_feature_fn_dict = {
    feature_name: os.path.join(DIRS['TEST_COV'], file_name)
    for feature_name, file_name in (
        ('genes', 'Pst_104E_v13_ph_ctg.anno.sorted.gff3'),
        ('TE', 'Pst_104E_v13_ph_ctg.TE.sorted.gff3'),
        ('effector', 'Pst_104E_v13_ph_ctg.effectors.gff3'),
        ('ont_6mA_0.10', '6mA_hc_tombo_sorted.cutoff.0.10.bed'),
        ('pb_6mA_0.10', '6mA_prob_smrtlink_sorted.cutoff.0.10.bed'),
        ('tmb_5mC_0.10', '5mC_hc_tombo_sorted.cutoff.0.10.bed'),
        ('tmb_cpg_5mC_0.10', '5mC_hc_tombo_sorted.CpG.cutoff.0.10.bed'),
        ('np_5mC_0.10', '5mC_hc_nanopolish_sorted.cutoff.0.10.bed'),
    )
}

# Show the resulting paths
pprint.pprint(test_feature_fn_dict)

In [None]:
# Wrap each test feature file in a BedTool (same keys as test_feature_fn_dict) and view the result
test_feature_bed_dict = {
    feature_name: BedTool(feature_path)
    for feature_name, feature_path in test_feature_fn_dict.items()
}

pprint.pprint(test_feature_bed_dict)

In [None]:
# Coverage of every test window set by every test feature file; one output file per pair.
# The output name is derived from the CURRENT feature file's own extension (.bed or .gff3),
# so the loop does not rely on a tmp_fn left over from an earlier cell or iteration.
test_feature_overlap_df_dict = {}
for wkey, wbed in test_window_bed_dict.items():
    for fkey, fbed in test_feature_bed_dict.items():
        # Keep columns 0-3 and 6 of the coverage output and give the last two readable headings
        tmp_df = wbed.coverage(fbed).to_dataframe().iloc[:, [0, 1, 2, 3, 6]]
        tmp_df = tmp_df.rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'})
        # Swap the feature file's extension for '.<window>.overlap.bed'
        feature_root = os.path.splitext(test_feature_fn_dict[fkey])[0]
        tmp_fn = '%s.%s.overlap.bed' % (feature_root, wkey)
        # Write into the 'test_coverage' folder instead of the input folder
        tmp_fn = tmp_fn.replace('test_feature_files', 'test_coverage')
        # File name (without folders) as key and dataframe as value
        test_feature_overlap_df_dict[tmp_fn.split('/')[-1]] = tmp_df
        # Tab-separated, no header or index (pybedtools outputs more d.p. than BEDTools)
        tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None)

In [None]:
# The 0.10-cutoff methylation files were too low-quality, so coverage is also collected for
# the more stringent 0.50 and 0.90 cutoff files listed here.
hc_feature_fn_dict = {
    feature_name: os.path.join(DIRS['TEST_COV'], file_name)
    for feature_name, file_name in (
        ('ont_6mA_0.90', '6mA_hc_tombo_sorted.cutoff.0.90.bed'),
        ('ont_6mA_0.50', '6mA_hc_tombo_sorted.cutoff.0.50.bed'),
        ('pb_6mA_0.90', '6mA_prob_smrtlink_sorted.cutoff.0.90.bed'),
        ('pb_6mA_0.50', '6mA_prob_smrtlink_sorted.cutoff.0.50.bed'),
        ('tmb_5mC_0.50', '5mC_hc_tombo_sorted.cutoff.0.50.bed'),
        ('tmb_cpg_5mC_0.50', '5mC_hc_tombo_sorted.CpG.cutoff.0.50.bed'),
        ('np_5mC_0.50', '5mC_hc_nanopolish_sorted.cutoff.0.50.bed'),
        ('tmb_5mC_0.90', '5mC_hc_tombo_sorted.cutoff.0.90.bed'),
        ('tmb_cpg_5mC_0.90', '5mC_hc_tombo_sorted.CpG.cutoff.0.90.bed'),
        ('np_5mC_0.90', '5mC_hc_nanopolish_sorted.cutoff.0.90.bed'),
    )
}

In [None]:
# Wrap each higher-stringency feature file in a BedTool (same keys as hc_feature_fn_dict)
hc_feature_bed_dict = {
    feature_name: BedTool(feature_path)
    for feature_name, feature_path in hc_feature_fn_dict.items()
}

pprint.pprint(hc_feature_bed_dict)

In [None]:
# Coverage of every test window set by every higher-stringency methylation file
hc_test_feature_overlap_df_dict = {}
for wkey, wbed in test_window_bed_dict.items():
    for fkey, fbed in hc_feature_bed_dict.items():
        # Coverage as a dataframe, keeping columns 0-3 and 6
        tmp_df = wbed.coverage(fbed).to_dataframe().iloc[:, [0, 1, 2, 3, 6]]
        # Readable headings for the count and fraction columns
        tmp_df = tmp_df.rename(
            columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'}
        )
        # Output path: window size inserted before the extension, folder switched to 'test_coverage'
        tmp_fn = (
            hc_feature_fn_dict[fkey]
            .replace('.bed', '.{}.overlap.bed'.format(wkey))
            .replace('test_feature_files', 'test_coverage')
        )
        # File name (without folders) as key and dataframe as value
        hc_test_feature_overlap_df_dict[tmp_fn.rsplit('/', 1)[-1]] = tmp_df
        # Tab-separated, no header or index (pybedtools outputs more d.p. than BEDTools)
        tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None)

In [None]:
# Running coverage for sliding windows.
# Dictionary of test feature file paths, so the coverage loop can iterate over them.
# The necessary feature files were moved to their own folder first.
# All feature files will need to be moved to one folder for the actual analysis.

DIRS['TEST_COV'] = os.path.join(DIRS['BASE2'], 'coverage/test_feature_files')
test_sliding_feature_fn_dict = {
    feature_name: os.path.join(DIRS['TEST_COV'], file_name)
    for feature_name, file_name in (
        ('genes', 'Pst_104E_v13_ph_ctg.anno.sorted.gff3'),
        ('TE', 'Pst_104E_v13_ph_ctg.TE.sorted.gff3'),
        ('effector', 'Pst_104E_v13_ph_ctg.effectors.gff3'),
        ('tmb_6mA_0.90', '6mA_hc_tombo_sorted.cutoff.0.90.bed'),
        ('tmb_6mA_0.50', '6mA_hc_tombo_sorted.cutoff.0.50.bed'),
        ('tmb_5mC_0.90', '5mC_hc_tombo_sorted.cutoff.0.90.bed'),
        ('tmb_5mC_0.50', '5mC_hc_tombo_sorted.cutoff.0.50.bed'),
        ('tmb_cpg_5mC_0.90', '5mC_hc_tombo_sorted.CpG.cutoff.0.90.bed'),
        ('np_5mC_0.90', '5mC_hc_nanopolish_sorted.cutoff.0.90.bed'),
        ('tmb_cpg_5mC_0.50', '5mC_hc_tombo_sorted.CpG.cutoff.0.50.bed'),
        ('np_5mC_0.50', '5mC_hc_nanopolish_sorted.cutoff.0.50.bed'),
        ('pb_6mA_0.90', '6mA_prob_smrtlink_sorted.cutoff.0.90.bed'),
        ('pb_6mA_0.50', '6mA_prob_smrtlink_sorted.cutoff.0.50.bed'),
    )
}

In [None]:
# Wrap each feature file path in a BedTool (same keys as test_sliding_feature_fn_dict)
test_sliding_feature_bed_dict = {
    feature_name: BedTool(feature_path)
    for feature_name, feature_path in test_sliding_feature_fn_dict.items()
}

pprint.pprint(test_sliding_feature_bed_dict)

In [None]:
# Coverage of every sliding test window set by every feature file; one output file per pair.
# The output name is derived from the CURRENT feature file's own extension (.bed for
# methylation files, .gff3 for gene/transposon/effector files), so the loop does not rely
# on a tmp_fn left over from an earlier cell or iteration.
test_sliding_feature_overlap_df_dict = {}
for wkey, wbed in test_sliding_window_bed_dict.items():
    for fkey, fbed in test_sliding_feature_bed_dict.items():
        # Keep columns 0-3 and 6 of the coverage output and give the last two readable headings
        tmp_df = wbed.coverage(fbed).to_dataframe().iloc[:, [0, 1, 2, 3, 6]]
        tmp_df = tmp_df.rename(columns={'name': 'overlap_count', 'thickStart': 'overlap_fraction'})
        # Swap the feature file's extension for '.<window>.overlap.bed'
        feature_root = os.path.splitext(test_sliding_feature_fn_dict[fkey])[0]
        tmp_fn = '%s.%s.overlap.bed' % (feature_root, wkey)
        # Write into the 'test_sliding_coverage' folder instead of the input folder
        tmp_fn = tmp_fn.replace('test_feature_files', 'test_sliding_coverage')
        # File name (without folders) as key and dataframe as value
        test_sliding_feature_overlap_df_dict[tmp_fn.split('/')[-1]] = tmp_df
        # Tab-separated, no header or index (pybedtools outputs more d.p. than BEDTools)
        tmp_df.to_csv(tmp_fn, sep='\t', header=None, index=None)