# <span style="color:violet"> Methylation analysis data prep text</span>

<span style="color:orange">Converting tombo and nanopolish outputs into gff format for easy use with gene model files. text</span>

1. Open wig files and view them.
2. Convert each individual WIG into a BED.
3. Combine (append) the two 5mC GFFs and the two 6mA GFFs, as there are separate GFFs for the (+) and (-) strands.
4. Sort by contig number and chromosome position in ascending order.
    - This is needed because the (-) strand file will simply be appended to the bottom of the (+) strand file.

In [1]:
import os
import pysam
import pandas as pd
import glob
import wiggelen
from Bio import SeqIO

In [2]:
# Base directory of the project, followed by the input/output sub-directories
# that all live under <BASE>/input/.
DIRS = {'BASE': '/home/anjuni/methylation_calling/pacbio'}
DIRS.update({
    key: os.path.join(DIRS['BASE'], 'input', subdir)
    for key, subdir in (
        ('WIG_INPUT', 'tombo_wig'),
        ('TSV_INPUT', 'nanopolish_tsv'),
        ('GFF_INPUT', 'smrtlink_gff'),
        ('BED_OUT', 'bed_files'),
    )
})

In [3]:
# Quick check that every directory exists: print the path when it is present,
# otherwise print a "does not exist" message for it.
for value in DIRS.values():
    print(value if os.path.exists(value) else '%s does not exist' % value)

/home/anjuni/methylation_calling/pacbio
/home/anjuni/methylation_calling/pacbio/input/tombo_wig
/home/anjuni/methylation_calling/pacbio/input/nanopolish_tsv
/home/anjuni/methylation_calling/pacbio/input/smrtlink_gff
/home/anjuni/methylation_calling/pacbio/input/bed_files


In [128]:
#Define functions
def wig_to_bed(input_wig, out_bed=None):
    """Convert a WIG file from tombo into a 6-column BED file for BEDtools analysis.

    The modification label and the strand are read from the file name, which
    is expected to look like ``<mod>.fraction_modified_reads.<plus|minus>.wig``
    (e.g. ``5mC.fraction_modified_reads.plus.wig``). Parsing the base name,
    rather than relying on the length of the full path and fixed character
    offsets, keeps the function working when the input directory changes.

    Each output line is: region, start (position - 1), end (position),
    modification label, WIG value, strand.

    Args:
        input_wig: Path to the WIG file.
        out_bed: Optional path of the BED file to write. Defaults to
            ``<mod>_<plus|minus>_tombo.bed`` inside ``DIRS['BED_OUT']``.

    Raises:
        ValueError: If the file name contains neither ``.plus.`` nor
            ``.minus.``, so the strand cannot be determined.
    """
    wig_name = os.path.basename(input_wig)
    # Files for the same strand exist for both 6mA and 5mC, so the
    # modification label (first dot-separated token) is carried into the BED.
    mod = wig_name.split('.', 1)[0]

    if '.plus.' in wig_name:
        strand, strand_label = '+', 'plus'
    elif '.minus.' in wig_name:
        strand, strand_label = '-', 'minus'
    else:
        raise ValueError(
            "Cannot determine strand from WIG file name %r: expected "
            "'.plus.' or '.minus.' in the name" % wig_name
        )

    if out_bed is None:
        out_bed = os.path.join(DIRS['BED_OUT'], '%s_%s_tombo.bed' % (mod, strand_label))

    with open(input_wig) as wig_file, open(out_bed, 'w') as bed_file:
        for region, position, value in wiggelen.walk(wig_file):
            # The WIG position becomes the BED end; the BED start is position - 1.
            print('%s\t%d\t%d\t%s\t%s\t%s' % (region, int(position) - 1, position, mod, value, strand), file=bed_file)

In [131]:
def tsv_to_bed(input_tsv, output_bed=None):
    """Convert a nanopolish methylation-frequency TSV into a BED file for BEDtools analysis.

    Sites with no methylation (frequency = 0) are also written, because the
    TSV lists all CpG sites as possible 5mC sites and every CpG site is needed
    to determine methylation frequency.

    Only the first line is treated as a header, and only when its second
    column is not a number. Data rows are therefore kept even when the contig
    name itself starts with ``chr``. Blank lines are skipped.

    Each output line is: column 1, column 2 minus one, column 3, ``5mC``,
    column 7 (the frequency), and a fixed ``+`` strand.

    Args:
        input_tsv: Path to the TSV file.
        output_bed: Optional path of the BED file to write. Defaults to
            ``5mC_nanopolish.bed`` inside ``DIRS['BED_OUT']``.
    """
    if output_bed is None:
        # Maybe change this to CpG_nanopolish.bed, depending on analysis.
        output_bed = os.path.join(DIRS['BED_OUT'], '5mC_nanopolish.bed')

    with open(input_tsv) as tsv_file, open(output_bed, 'w') as bed_file:
        for line_number, line in enumerate(tsv_file):
            fields = line.rstrip().split('\t')
            if fields == ['']:
                continue  # blank line
            if line_number == 0 and (len(fields) < 2 or not fields[1].isdigit()):
                continue  # column headings
            # NOTE(review): start - 1 assumes the TSV start column is 1-based;
            # confirm against the nanopolish documentation that its
            # coordinates are not already 0-based.
            print('%s\t%s\t%s\t%s\t%s\t%s' % (fields[0], int(fields[1]) - 1, fields[2], '5mC', fields[6], '+'), file=bed_file)

In [156]:
def tsv_to_bed_hc(input_tsv, output_bed=None):
    """Convert only sites with a non-zero methylation frequency from a nanopolish TSV into BED.

    These non-zero sites are what this notebook calls "high confidence"
    methylation sites.

    Only the first line is treated as a header, and only when its second
    column is not a number, so contigs whose names start with ``chr`` are not
    dropped. The frequency (column 7) is compared numerically, so ``0``,
    ``0.0`` and ``0.000`` are all treated as zero. Blank lines are skipped.

    Each output line is: column 1, column 2 minus one, column 3, ``5mC``,
    column 7 as written in the TSV, and a fixed ``+`` strand.

    Args:
        input_tsv: Path to the TSV file.
        output_bed: Optional path of the BED file to write. Defaults to
            ``5mC_hc_nanopolish.bed`` inside ``DIRS['BED_OUT']``.

    Raises:
        ValueError: If the frequency column of a data row is not a number.
    """
    if output_bed is None:
        output_bed = os.path.join(DIRS['BED_OUT'], '5mC_hc_nanopolish.bed')

    with open(input_tsv) as tsv_file, open(output_bed, 'w') as bed_file:
        for line_number, line in enumerate(tsv_file):
            fields = line.rstrip().split('\t')
            if fields == ['']:
                continue  # blank line
            if line_number == 0 and (len(fields) < 2 or not fields[1].isdigit()):
                continue  # column headings
            if float(fields[6]) == 0.0:
                continue  # unmethylated site
            # NOTE(review): start - 1 assumes the TSV start column is 1-based;
            # confirm against the nanopolish documentation.
            print('%s\t%s\t%s\t%s\t%s\t%s' % (fields[0], int(fields[1]) - 1, fields[2], '5mC', fields[6], '+'), file=bed_file)

In [168]:
def gff_to_bed(input_gff, output_bed=None, feature_type='m6A'):
    """Convert identified 6mA sites from a SMRTLink GFF file into a BED file for BEDtools analysis.

    Only rows whose type (column 3) equals ``feature_type`` are written.
    Excluding just the generic ``modified_base`` rows is not enough, because
    any other identified type in the file would then be written with the
    ``6mA`` label. Lines starting with ``#`` and blank lines are skipped.

    Each output line is: column 1, column 4 minus one, column 4, ``6mA``,
    column 6 (score) and column 7 (strand).

    Args:
        input_gff: Path to the GFF file.
        output_bed: Optional path of the BED file to write. Defaults to
            ``6mA_smrtlink.bed`` inside ``DIRS['BED_OUT']``.
        feature_type: Value of the GFF type column to keep.
    """
    if output_bed is None:
        output_bed = os.path.join(DIRS['BED_OUT'], '6mA_smrtlink.bed')

    with open(input_gff) as gff_file, open(output_bed, 'w') as bed_file:
        for line in gff_file:
            if line.startswith('#'):
                continue  # GFF header / directive
            fields = line.rstrip().split('\t')
            if fields == ['']:
                continue  # blank line
            if fields[2] != feature_type:
                continue  # only select identified 6mA sites, not general modifications
            print('%s\t%s\t%s\t%s\t%s\t%s' % (fields[0], int(fields[3]) - 1, fields[3], '6mA', fields[5], fields[6]), file=bed_file)

In [133]:
# Build the path of every input file (WIG, TSV and GFF)

# WIG files from tombo, collected in a list so they can be converted in a loop;
# the individual names are unpacked from that list.
wig_fh_list = [
    os.path.join(DIRS['WIG_INPUT'], wig_name)
    for wig_name in (
        '5mC.fraction_modified_reads.plus.wig',
        '5mC.fraction_modified_reads.minus.wig',
        '6mA.fraction_modified_reads.plus.wig',
        '6mA.fraction_modified_reads.minus.wig',
    )
]
m5c_plus_fh, m5c_minus_fh, m6a_plus_fh, m6a_minus_fh = wig_fh_list

# TSV file from nanopolish
m5c_nanopolish_fh = os.path.join(DIRS['TSV_INPUT'], 'Pst_104E_methylation_frequency_minimap2.tsv')

# GFF file from smrtlink
gff_smrt_fh = os.path.join(DIRS['GFF_INPUT'], 'file.gff')

In [5]:
# Sanity check: print the list of WIG file paths to confirm it was built correctly
print(wig_fh_list)

['/home/anjuni/methylation_calling/pacbio/input/tombo_wig/5mC.fraction_modified_reads.plus.wig', '/home/anjuni/methylation_calling/pacbio/input/tombo_wig/5mC.fraction_modified_reads.minus.wig', '/home/anjuni/methylation_calling/pacbio/input/tombo_wig/6mA.fraction_modified_reads.plus.wig', '/home/anjuni/methylation_calling/pacbio/input/tombo_wig/6mA.fraction_modified_reads.minus.wig']


In [129]:
# Convert every WIG file in wig_fh_list into BED format, one call per file
for file in wig_fh_list:
    wig_to_bed(file)

In [132]:
# Convert the nanopolish TSV file (all sites) to BED format
tsv_to_bed(m5c_nanopolish_fh)

In [157]:
# Convert only the high-confidence values in the nanopolish TSV file to BED format
tsv_to_bed_hc(m5c_nanopolish_fh)

In [169]:
# Convert the SMRTLink GFF file to BED format
gff_to_bed(gff_smrt_fh)

# <span style="color:violet"> Test functions for the first 10 lines of input files text</span>

In [123]:
def wig_to_bed_test(input_wig, out_bed=None, n_lines=10):
    """Test WIG to BED conversion on the first ``n_lines`` records of a WIG file.

    The modification label and the strand are read from the file name, which
    is expected to look like ``<mod>.fraction_modified_reads.<plus|minus>.wig``.
    Parsing the base name, rather than relying on the length of the full path
    and fixed character offsets, keeps the test working when the input
    directory changes.

    WARNING: the default output path is the same one ``wig_to_bed`` uses, so
    running this test overwrites a full conversion with a truncated file
    unless ``out_bed`` is given.

    Args:
        input_wig: Path to the WIG file.
        out_bed: Optional path of the BED file to write. Defaults to
            ``<mod>_<plus|minus>_tombo.bed`` inside ``DIRS['BED_OUT']``.
        n_lines: Maximum number of records to write.

    Raises:
        ValueError: If the file name contains neither ``.plus.`` nor
            ``.minus.``, so the strand cannot be determined.
    """
    wig_name = os.path.basename(input_wig)
    mod = wig_name.split('.', 1)[0]  # modification label, e.g. 5mC or 6mA

    if '.plus.' in wig_name:
        strand, strand_label = '+', 'plus'
    elif '.minus.' in wig_name:
        strand, strand_label = '-', 'minus'
    else:
        raise ValueError(
            "Cannot determine strand from WIG file name %r: expected "
            "'.plus.' or '.minus.' in the name" % wig_name
        )

    if out_bed is None:
        out_bed = os.path.join(DIRS['BED_OUT'], '%s_%s_tombo.bed' % (mod, strand_label))

    with open(input_wig) as wig_file, open(out_bed, 'w') as bed_file:
        written = 0
        for region, position, value in wiggelen.walk(wig_file):
            if written >= n_lines:
                break
            # The WIG position becomes the BED end; the BED start is position - 1.
            print('%s\t%d\t%d\t%s\t%s\t%s' % (region, int(position) - 1, position, mod, value, strand), file=bed_file)
            written += 1

In [None]:
# Profile the 10-record test conversion on each WIG file.
# %prun is an IPython magic, so this cell only runs inside IPython/Jupyter.
for file in wig_fh_list:
    %prun wig_to_bed_test(file) # works!

In [None]:
def tsv_to_bed_test(input_tsv, output_bed=None, n_lines=10):
    """Test TSV to BED conversion on the first ``n_lines`` data rows of the TSV file.

    Only the first line is treated as a header, and only when its second
    column is not a number, so data rows are kept even when the contig name
    starts with ``chr``. Blank lines are skipped.

    WARNING: the default output path is the same one ``tsv_to_bed`` uses, so
    running this test overwrites a full conversion with a truncated file
    unless ``output_bed`` is given.

    Args:
        input_tsv: Path to the TSV file.
        output_bed: Optional path of the BED file to write. Defaults to
            ``5mC_nanopolish.bed`` inside ``DIRS['BED_OUT']``.
        n_lines: Maximum number of BED records to write.
    """
    if output_bed is None:
        output_bed = os.path.join(DIRS['BED_OUT'], '5mC_nanopolish.bed')

    with open(input_tsv) as tsv_file, open(output_bed, 'w') as bed_file:
        written = 0
        for line_number, line in enumerate(tsv_file):
            if written >= n_lines:
                break
            fields = line.rstrip().split('\t')
            if fields == ['']:
                continue  # blank line
            if line_number == 0 and (len(fields) < 2 or not fields[1].isdigit()):
                continue  # column headings
            print('%s\t%s\t%s\t%s\t%s\t%s' % (fields[0], int(fields[1]) - 1, fields[2], '5mC', fields[6], '+'), file=bed_file)
            written += 1

In [124]:
# Run the 10-record TSV test conversion on the nanopolish file
tsv_to_bed_test(m5c_nanopolish_fh) # works!

    

In [152]:
def tsv_to_bed_hc_test(input_tsv, output_bed=None, n_lines=10):
    """Test high-confidence TSV to BED conversion on the first ``n_lines`` non-zero sites.

    Only the first line is treated as a header, and only when its second
    column is not a number, so contigs whose names start with ``chr`` are not
    dropped. The frequency (column 7) is compared numerically, so ``0``,
    ``0.0`` and ``0.000`` are all treated as zero. Blank lines are skipped.

    WARNING: the default output path is the same one ``tsv_to_bed_hc`` uses,
    so running this test overwrites a full conversion with a truncated file
    unless ``output_bed`` is given.

    Args:
        input_tsv: Path to the TSV file.
        output_bed: Optional path of the BED file to write. Defaults to
            ``5mC_hc_nanopolish.bed`` inside ``DIRS['BED_OUT']``.
        n_lines: Maximum number of BED records to write.

    Raises:
        ValueError: If the frequency column of a data row is not a number.
    """
    if output_bed is None:
        output_bed = os.path.join(DIRS['BED_OUT'], '5mC_hc_nanopolish.bed')

    with open(input_tsv) as tsv_file, open(output_bed, 'w') as bed_file:
        written = 0
        for line_number, line in enumerate(tsv_file):
            if written >= n_lines:
                break
            fields = line.rstrip().split('\t')
            if fields == ['']:
                continue  # blank line
            if line_number == 0 and (len(fields) < 2 or not fields[1].isdigit()):
                continue  # column headings
            if float(fields[6]) == 0.0:
                continue  # unmethylated site
            print('%s\t%s\t%s\t%s\t%s\t%s' % (fields[0], int(fields[1]) - 1, fields[2], '5mC', fields[6], '+'), file=bed_file)
            written += 1

In [153]:
# Run the 10-record high-confidence TSV test conversion on the nanopolish file
tsv_to_bed_hc_test(m5c_nanopolish_fh) # works!

In [150]:
#This works, but I want to convert the Phred score into a probability value.
#This script writes the Phred score to a BED file, while another script will write a probability value to a BED file.
def gff_to_bed_test(input_gff, output_bed=None, n_lines=10, feature_type='m6A'):
    """Test GFF to BED conversion on the first ``n_lines`` identified 6mA sites of the GFF file.

    Only rows whose type (column 3) equals ``feature_type`` are written.
    Excluding just the generic ``modified_base`` rows is not enough, because
    any other identified type in the file would then be written with the
    ``6mA`` label. Lines starting with ``#`` and blank lines are skipped.

    WARNING: the default output path is the same one ``gff_to_bed`` uses, so
    running this test overwrites a full conversion with a truncated file
    unless ``output_bed`` is given.

    Args:
        input_gff: Path to the GFF file.
        output_bed: Optional path of the BED file to write. Defaults to
            ``6mA_smrtlink.bed`` inside ``DIRS['BED_OUT']``.
        n_lines: Maximum number of BED records to write.
        feature_type: Value of the GFF type column to keep.
    """
    if output_bed is None:
        output_bed = os.path.join(DIRS['BED_OUT'], '6mA_smrtlink.bed')

    with open(input_gff) as gff_file, open(output_bed, 'w') as bed_file:
        written = 0
        for line in gff_file:
            if written >= n_lines:
                break
            if line.startswith('#'):
                continue  # GFF header / directive
            fields = line.rstrip().split('\t')
            if fields == ['']:
                continue  # blank line
            if fields[2] != feature_type:
                continue  # only select identified 6mA sites, not general modifications
            print('%s\t%s\t%s\t%s\t%s\t%s' % (fields[0], int(fields[3]) - 1, fields[3], '6mA', fields[5], fields[6]), file=bed_file)
            written += 1

In [151]:
# Run the 10-record GFF test conversion on the SMRTLink file
gff_to_bed_test(gff_smrt_fh) # works!

In [195]:
#write a probability value (instead of a Phred score) to a BED file.
#the number of decimal places can be set with the `decimals` argument
def gff_prob_to_bed_test(input_gff, output_bed=None, n_lines=10, decimals=None, feature_type='m6A'):
    """Test GFF to BED conversion, writing a probability instead of the Phred score.

    The score Q in column 6 is written as ``1 - 10 ** (-Q / 10)``. Only rows
    whose type (column 3) equals ``feature_type`` are written. Excluding just
    the generic ``modified_base`` rows is not enough, because any other
    identified type in the file would then be written with the ``6mA`` label.
    Lines starting with ``#`` and blank lines are skipped.

    Args:
        input_gff: Path to the GFF file.
        output_bed: Optional path of the BED file to write. Defaults to
            ``6mA_prob_smrtlink.bed`` inside ``DIRS['BED_OUT']``.
        n_lines: Maximum number of BED records to write.
        decimals: Number of decimal places for the probability. ``None``
            (the default) writes the full floating-point value. High scores
            round to 1 when few decimals are requested.
        feature_type: Value of the GFF type column to keep.

    Raises:
        ValueError: If the score column of a selected row is not a number.
    """
    if output_bed is None:
        output_bed = os.path.join(DIRS['BED_OUT'], '6mA_prob_smrtlink.bed')

    with open(input_gff) as gff_file, open(output_bed, 'w') as bed_file:
        written = 0
        for line in gff_file:
            if written >= n_lines:
                break
            if line.startswith('#'):
                continue  # GFF header / directive
            fields = line.rstrip().split('\t')
            if fields == ['']:
                continue  # blank line
            if fields[2] != feature_type:
                continue  # only select identified 6mA sites, not general modifications
            probability = 1 - 10 ** (-float(fields[5]) / 10)
            if decimals is not None:
                probability = '%.*f' % (decimals, probability)
            print('%s\t%s\t%s\t%s\t%s\t%s' % (fields[0], int(fields[3]) - 1, fields[3], '6mA', probability, fields[6]), file=bed_file)
            written += 1

In [196]:
# Run the 10-record GFF-to-probability test conversion on the SMRTLink file
gff_prob_to_bed_test(gff_smrt_fh) #works! just figure out decimal places