# Methylation analysis data prep

Converting tombo and nanopolish outputs into gff format for easy use with gene model files.

1. Open wig files and view them.
2. Convert each individual wig into a gff.
3. Combine or append the two 5mc gff's and the two 6ma gff's, as there are separate gff's for the (+) and (-) strands.
4. Put the contig number and chromosome position in ascending order.
    -This is because the (-) strand file will just be added to the bottom of the (+) strand file. 

In [136]:
import os
import pysam
import pandas as pd
import glob
from Bio import SeqIO

In [140]:
#first we need to define the base dirs
DIRS ={}
DIRS['BASE'] = '/home/anjuni/methylation_calling/pacbio'
DIRS['WIG_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'tombo_wig' )
DIRS['TSV_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'nanopolish_tsv') 
DIRS['GFF_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'smrtlink_gff')
DIRS['BED_OUT'] = os.path.join(DIRS['BASE'], 'input', 'bed_files')

In [141]:
#quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)
    else:
        print(value)

/home/anjuni/methylation_calling/pacbio
/home/anjuni/methylation_calling/pacbio/input/tombo_wig
/home/anjuni/methylation_calling/pacbio/input/nanopolish_tsv
/home/anjuni/methylation_calling/pacbio/input/smrtlink_gff
/home/anjuni/methylation_calling/pacbio/input/bed_files


In [30]:
#define functions


def wig_to_gff(input_wig_file, output_gff_file):
    """Convert WIG files from tombo into gff files for BEDtools analysis."""
    
def tsv_to_gff(input_tsv_file, output_gff_file):
    """Convert TSV files from nanopolish into gff files for BEDtools analysis."""
    
    
    

In [31]:
### Do something in this format
#the outfile for the fastq files mapping to pcontig_019
fastq_out_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_aln.fastq')

#we generate an new file and write out all the aligned reads in fastq format
#we added in an save guard to save out each read only once as it appears that pysam provides some reads in duplicate.
saved_reads = []
with open(fastq_out_fh, mode='w') as fastq_out:
    for read in contig_19_reads_in_bam:
        if read.query_name not in saved_reads:
            print('@%s' % read.query_name, file=fastq_out)
            print('%s' % read.get_forward_sequence(), file=fastq_out)
            print('+', file=fastq_out)
            print('%s' % read.get_forward_qualities(), file=fastq_out)
            saved_reads.append(read.query_name)

KeyError: 'FASTQ_OUT'

In [36]:
#make file handles for each wig and tsv file and output folder

#wig files from tombo
m5c_plus_fh = os.path.join(DIRS['WIG_INPUT'], '5mC.fraction_modified_reads.plus.wig')
m5c_minus_fh = os.path.join(DIRS['WIG_INPUT'], '5mC.fraction_modified_reads.minus.wig')
m6a_plus_fh = os.path.join(DIRS['WIG_INPUT'], '6mA.fraction_modified_reads.plus.wig')
m6a_minus_fh = os.path.join(DIRS['WIG_INPUT'], '6mA.fraction_modified_reads.minus.wig')

#make a directory of wig file handles
wig_fh_list = [m5c_plus_fh, m5c_minus_fh, m6a_plus_fh, m6a_minus_fh]

#tsv files from nanopolish
m5c_nanopolish_fh = os.path.join(DIRS['TSV_INPUT'], 'Pst_104E_methylation_frequency_minimap2.tsv')

#output folders
#may need to find a way to make a list of these and use this list in the conversion function
#or don't use this at all. do everything in the function.
#gff_fh = os.path.join(DIRS['GFF_OUT'], 'ref_pcontig_019.fasta')


In [33]:
#check if this works
print(wig_fh_list)

['/home/anjuni/methylation_calling/pacbio/input/tombo_wig/5mC.fraction_modified_reads.plus.wig', '/home/anjuni/methylation_calling/pacbio/input/tombo_wig/5mC.fraction_modified_reads.minus.wig', '/home/anjuni/methylation_calling/pacbio/input/tombo_wig/6mA.fraction_modified_reads.plus.wig', '/home/anjuni/methylation_calling/pacbio/input/tombo_wig/6mA.fraction_modified_reads.minus.wig']


In [74]:
f  =open(m5c_minus_fh)

In [80]:
count = 0
for line in f:
    if count < 14:
        print(line)
        count += 1 
    else:
        break

245 0.4167

248 0.4167

250 0.5

254 0.1667

257 0.3333

259 0.3333

260 0.3333

264 0.25

268 0.1667

269 0.1667

276 0.3333

285 0.5833

290 0.5

291 0.5



In [81]:
f.close()

In [128]:
bed_nanopolish_out_fn = os.path.join(DIRS['BED_OUT'], 'nanopolish.bed')

In [132]:
#test first ten lines to convert tsv to bed
#it worked, so clean up the code, write is as a function and test the first 10 lines again
#then run on the whole file
with open(m5c_nanopolish_fh) as myfile:
    with open(bed_nanopolish_out_fn, 'w') as out_fh:
        count = 0
        for line in myfile:
            if line.startswith('chr'):
                continue
            else:
                if count < 11:
                    line = line.rstrip()
                    new = line.split('\t')
                    print('%s\t%s\t%s\t%s\t%s\t%s' % (new[0], int(new[1])-1, new[2], '5mC_nano', new[6], '+'), file=out_fh)
                    count += 1 
                else:
                    break

In [43]:
!head {m5c_nanopolish_fh}

chromosome	start	end	num_cpgs_in_group	called_sites	called_sites_methylated	methylated_frequency	group_sequence
hcontig_000_003	42	42	1	16	0	0.000	TCCAGCGCTAT
hcontig_000_003	148	148	1	22	0	0.000	split-group
hcontig_000_003	157	157	1	22	0	0.000	split-group
hcontig_000_003	167	167	1	22	0	0.000	split-group
hcontig_000_003	175	175	1	22	0	0.000	split-group
hcontig_000_003	182	182	1	22	0	0.000	split-group
hcontig_000_003	200	200	1	17	0	0.000	split-group
hcontig_000_003	202	202	1	17	0	0.000	split-group
hcontig_000_003	215	215	1	16	0	0.000	split-group


In [86]:
print(myfile)

<_io.TextIOWrapper name='/home/anjuni/methylation_calling/pacbio/input/nanopolish_tsv/Pst_104E_methylation_frequency_minimap2.tsv' mode='r' encoding='UTF-8'>


In [48]:
tsv = m5c_nanopolish_fh
output = test.gff

for x in tsv:
    print(x)

/
h
o
m
e
/
a
n
j
u
n
i
/
m
e
t
h
y
l
a
t
i
o
n
_
c
a
l
l
i
n
g
/
p
a
c
b
i
o
/
i
n
p
u
t
/
n
a
n
o
p
o
l
i
s
h
_
t
s
v
/
P
s
t
_
1
0
4
E
_
m
e
t
h
y
l
a
t
i
o
n
_
f
r
e
q
u
e
n
c
y
_
m
i
n
i
m
a
p
2
.
t
s
v


In [52]:
!head {m5c_minus_fh}

track type=wiggle_0 name="5mC_fraction_modified_reads_rev_strand" description="5mC fraction_modified_reads rev_strand"
variableStep chrom=pcontig_005 span=1
16 0.25
17 0.25
20 0.375
22 0.375
25 0.5
31 0.4444
37 0.1818
39 0.2727


In [54]:
import wiggelen

In [152]:
#test first 10 lines to convert wig to bed
#first convert to a text file to get all the contig and position data out
#it is hard to directly put the contig number next to the position in a bed file
#could just read wiggelen documentation and find a way to put the contig name in front of every position number
#remember to minus 1 from the start position as bed is 0-based
#maybe wig is 0-based too. check that first.
out_fn = os.path.join(DIRS['BED_OUT'], 'm5c_minus.txt')
with open(out_fn, 'w') as out_fh:
    count=0
    for x in wiggelen.walk(open(m5c_minus_fh)):
        if count < 10:
            print('%s\t%d\t%s' % x, file=out_fh)
            count += 1
        else:
            break