# <span style="color:plum"> BEDtools data prep. </span>

This is a script to filter BED files of methylation sites using different cutoffs.

1. Make a list of cutoffs.
2. Filter each BED file with every cutoff in the list, writing one filtered file per cutoff.

In [23]:
import pybedtools
from pybedtools import BedTool
import os
import glob
import pprint
import numpy # needed for p-value stats
import scipy

In [24]:
# Directory layout used by the notebook. The two base directories are given
# literally; the BED input/output locations are derived from them below.
DIRS = {
    'BASE1': '/home/anjuni/methylation_calling/pacbio',
    'BASE2': '/home/anjuni/analysis',
}
# Sorted BED files to be filtered live under <BASE1>/input/sorted_bed_files.
DIRS['BED_INPUT'] = os.path.join(DIRS['BASE1'], 'input', 'sorted_bed_files')
# Filtered results go into a 'filtered_bed' subdirectory of the input directory.
DIRS['BED_OUTPUT'] = os.path.join(DIRS['BED_INPUT'], 'filtered_bed')

In [25]:
# Quick check that every configured directory exists; report the ones that
# are missing instead of failing, so all problems are listed in one pass.
for dir_path in DIRS.values():
    if os.path.exists(dir_path):
        continue
    print('%s does not exist' % dir_path)

In [48]:
# Collect the paths of all .bed files directly inside the BED input directory.
# NOTE: recursive=True only changes matching when the pattern contains '**',
# so with this pattern subdirectories are not searched.
bed_file_list = list(glob.iglob('%s/*.bed' % DIRS['BED_INPUT'], recursive=True))

In [49]:
# Sanity check: show the collected BED paths, one per line.
print('\n'.join(bed_file_list))

/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/5mC_tombo_sorted.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/5mC_nanopolish_sorted.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/6mA_tombo_sorted.bed
/home/anjuni/methylation_calling/pacbio/input/sorted_bed_files/6mA_prob_smrtlink_sorted.bed


In [44]:
# Score cutoffs to filter with, from strictest to most permissive.
# The first entry is deliberately the integer 1 (not 1.0): the cutoff is
# turned into text with str() when output filenames are built.
cutoff_list = [
    1,
    0.99,
    0.95,
    0.9,
    0.8,
    0.7,
    0.6,
    0.5,
    0.4,
    0.3,
    0.2,
    0.1,
]

In [38]:
# Predicate used to filter BED features by score
def score_filter(feature, L):
    """Return True if the feature's score is greater than or equal to L.

    feature -- object exposing a ``score`` attribute that float() can convert.
    L       -- numeric cutoff; features scoring below it are rejected.
    """
    score = float(feature.score)
    return score >= L

def filter_by_cutoffs(bed_files, cutoffs, output_dir=None):
    """Filter each BED file by every cutoff and save one output file per pair.

    For an input ``<dir>/<name>.bed`` and a cutoff ``c`` the features kept by
    ``score_filter(feature, c)`` are saved as ``<name>.cutoff.<c>.bed``.

    bed_files  -- iterable of paths to BED files.
    cutoffs    -- iterable of numeric score cutoffs; each one is rendered
                  with str() in the output filename.
    output_dir -- directory to write into. When None (the default) the files
                  go into a 'filtered_bed' subdirectory next to each input
                  file, which for the notebook's sorted_bed_files inputs is
                  the same location the previous hard-coded path produced.

    Returns None; the results are the files written to disk.
    """
    for bed_path in bed_files:
        source_dir, source_name = os.path.split(bed_path)
        # Split off only the final extension so a '.bed' appearing elsewhere
        # in the path can never be rewritten by accident.
        stem, extension = os.path.splitext(source_name)

        if output_dir is None:
            target_dir = os.path.join(source_dir, 'filtered_bed')
        else:
            target_dir = output_dir
        # Saving would fail if the target directory were missing.
        os.makedirs(target_dir, exist_ok=True)

        pybed_object = BedTool(bed_path)
        for cutoff in cutoffs:
            out_name = '%s.cutoff.%s%s' % (stem, str(cutoff), extension)
            out_filename = os.path.join(target_dir, out_name)
            pybed_object.filter(score_filter, cutoff).saveas(out_filename)

In [None]:
# Filter every collected BED file with every cutoff in the list.
filter_by_cutoffs(bed_files=bed_file_list, cutoffs=cutoff_list)

In [42]:
# Testing out the function on one file
# NOTE: this rebinds bed_file_list to a single file, so any later cell that
# uses bed_file_list will only see this one path.
bed_file_list = [os.path.join(DIRS['BED_INPUT'], '6mA_prob_smrtlink_sorted.bed')]

# Reuse filter_by_cutoffs rather than repeating its loop; the cutoffs come
# from cutoff_list, which is the list this notebook actually defines.
filter_by_cutoffs(bed_file_list, cutoff_list)