# <span style='color:#ff4747'> Sorting_all_files_and_filtering_Tombo_files </span>

### This is a script of file conversions to go between the 'Methylation_data_prep' notebook and the 'BEDTools_data_prep' notebook.

The steps in this script are:

1. Sort all bed files
2. Filter tombo files to make high-confidence files with no zero-probability sites.

### <span style='color:#ff7547'> 1. Sort all bed files </span>

In [None]:
%%bash

# Sort all bed files produced by the 'Methylation_data_prep' notebook and
# move the sorted copies into the sorted_bed_files folder.

in_dir=~/methylation_calling/pacbio/input/bed_files
out_dir=~/methylation_calling/pacbio/input/sorted_bed_files

# Go to the folder containing all bed files; stop if it is missing so that
# nothing below runs in the wrong directory.
# contains: 5mC_nanopolish.bed , 5mC_hc_nanopolish.bed , 6mA_tombo.bed , 5mC_tombo.bed , 6mA_prob_smrtlink.bed
cd "$in_dir" || exit 1

# Make sure the destination exists so that 'mv' moves the files into it
# instead of failing (or renaming a single file to that name).
mkdir -p "$out_dir"

for bed in *.bed; do
    # Skip the literal pattern when nothing matches, and skip sorted files
    # left behind by an earlier, interrupted run.
    [ -e "$bed" ] || continue
    case "$bed" in
        *_sorted.bed) continue ;;
    esac
    echo "${bed}"
    # ${bed%.bed} drops the trailing ".bed" before the new suffix is added.
    sortBed -i "${bed}" > "${bed%.bed}_sorted.bed"
done

mv -- *_sorted.bed "$out_dir"/

### <span style='color:#ffa347'> 2. Filter tombo files to make high-confidence files with no zero-probability sites. </span>

In [None]:
import pybedtools
from pybedtools import BedTool
import os
import glob
import pprint
import numpy # needed for last few bedtools functions
import scipy

In [None]:
# Base directories: the project root and the folder holding the sorted BED inputs
DIRS = {'BASE': '/home/anjuni/methylation_calling/pacbio'}
DIRS['BED_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'sorted_bed_files')

In [None]:
# Output files for the filtered (high-confidence) tombo data.
# DIRS['BED_OUT'] holds the output *directory*; storing the file paths under
# that key would overwrite one with the other and make the directory check
# test a file that has not been written yet.
DIRS['BED_OUT'] = os.path.join(DIRS['BASE'], 'output', 'intersected_bed_files')
hc_tombo_m5c = os.path.join(DIRS['BED_OUT'], '5mC_hc_tombo_sorted.bed')
hc_tombo_m6a = os.path.join(DIRS['BED_OUT'], '6mA_hc_tombo_sorted.bed')

In [None]:
# Quick check: print every configured path, flagging the ones missing on disk
for path in DIRS.values():
    print(path if os.path.exists(path) else '%s does not exist' % path)

In [None]:
# Collect the path of every .bed file found in the sorted-input directory
bed_file_list = glob.glob('%s/*.bed' % DIRS['BED_INPUT'], recursive=True)

In [None]:
# Show the collected paths, one per line, to confirm the list was built
print('\n'.join(bed_file_list))

In [None]:
# Build a dictionary mapping each bed file's base name (file name without
# its directory and without the ".bed" extension) to a BedTool for that file.
BED = {}
for file in bed_file_list:
    # Derive the key from the path itself rather than slicing at a fixed
    # character offset, which is only correct for one exact directory length.
    name = os.path.splitext(os.path.basename(file))[0]
    bed_file = BedTool(file)
    BED[name] = bed_file

In [None]:
# Pretty-print the dictionary to confirm the expected keys are present
pprint.PrettyPrinter().pprint(BED)

In [None]:
# define function to filter
def score_filter(feature, L):
    """Return True if the feature's score is strictly greater than L.

    Written as a predicate for ``BedTool.filter``, which calls it once per
    feature and keeps the features for which it returns True.

    Args:
        feature: Object exposing a ``score`` attribute that ``float()`` can
            convert (a number or a numeric string).
        L: Threshold; a score equal to L does not pass.

    Returns:
        bool: ``float(feature.score) > L``.

    Raises:
        ValueError: If ``feature.score`` is a string that cannot be parsed
            as a float.
    """
    return float(feature.score) > L

In [None]:
# Keep only the sites whose score is above zero, for both tombo data sets
filtered_tombo_m5c, filtered_tombo_m6a = (
    BED[key].filter(score_filter, 0)
    for key in ('5mC_tombo_sorted', '6mA_tombo_sorted')
)

In [None]:
# Save each filtered tombo set to its high-confidence output file.
# Create the destination folders first: writing fails if they are missing.
os.makedirs(os.path.dirname(hc_tombo_m5c), exist_ok=True)
os.makedirs(os.path.dirname(hc_tombo_m6a), exist_ok=True)
filtered_tombo_m5c.saveas(hc_tombo_m5c)
filtered_tombo_m6a.saveas(hc_tombo_m6a)