### Preparation for running Nanopore methylation calling on NCI

This script does the following:

- pull out all aligned fastq for each contig using previously mapped Nanopores reads with Minimap2
- pack them up 
- pull out all corresponding fast5 files and pack them up

In [3]:
!pip install pysam



In [4]:
import os
import pysam
import pandas as pd
import glob
import tarfile #compress fast5
from Bio import SeqIO

In [5]:
#first we need to define the base dirs
DIRS ={}
#DIRS['BASE'] = '/home/ap/mock_up/methylation_calling/nanopore' #home computer. hash out later
DIRS['BASE'] = '/home/anjuni/methylation_calling/nanopore' #fisher
DIRS['BAM_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'minimap2_alignments' )
DIRS['FAST5_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'all_fast5') #fix for all runs and copy from nci to right directory
DIRS['FASTQ_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_fastq')
DIRS['FAST5_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_fast5')
DIRS['REF_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_ref')
#fix this here for reference
DIRS['REF'] = '/home/anjuni/Pst_104_v13_assembly/' # Pst_104E_v13_ph_ctg.fa

In [6]:
#define functions

#to compress the fast5 reads mapping contig 19 to a tar.gz file
def make_tarfile(output_filename, file_list):
    with tarfile.open(output_filename, "w:gz") as tar:
        for file in file_list:
            tar.add(file, arcname=os.path.basename(file))

In [7]:
seq_sum_albacore_fh = os.path.join(DIRS['FAST5_INPUT'], 'albacore_fastq/Pst79_run1-4_1d_sequencing_summary.txt')
#only read in the first two columns instead of everything (file name and read ID)
seq_sum_df = pd.read_csv(seq_sum_albacore_fh, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


seq_sum_df.head # check if it shows the file name and read ID

In [40]:
#quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

In [41]:
#we generated the BAM file handle (file path)
bam_fh = os.path.join(DIRS['BAM_INPUT'], 'Pst79_run1_1d_pass.minimap2.out.bam')

In [42]:
#just an example to loop over content of a folder
for x in os.listdir(DIRS['BAM_INPUT']):
    if x.endswith('.bam'):
        print(x)

Pst79_run1_1d_pass.minimap2.out.bam


In [43]:
#make a list of paths for BAM files
bam_fh_list = [os.path.join(DIRS['BAM_INPUT'], x) for x in os.listdir(DIRS['BAM_INPUT']) if x.endswith('.bam')]

######fix this here for reference. use the same as for mapping the long reads
reference_fh = os.path.join(DIRS['REF'], 'Pst_104E_v13_ph_ctg.fa')

split_reference_fh = os.path.join(DIRS['REF_OUT'], 'ref_pcontig_019.fasta') #output folder

In [33]:
print(split_reference_fh)

/home/anjuni/methylation_calling/nanopore/input/split_ref/ref_pcontig_019.fasta


In [44]:
#now we 'read' in an bam file. Really we generated an AlignmentFile object 
bam_file = pysam.AlignmentFile(bam_fh, "rb")

In [45]:
for seq in  SeqIO.parse(reference_fh, 'fasta'):
    if seq.id == 'pcontig_019':
        SeqIO.write(seq, split_reference_fh, 'fasta')

In [46]:
#get all the reads for contig_19
contig_19_reads = [read for read in bam_file.fetch(contig='pcontig_019')]

#this provides the name for all references in the bam file
bam_file.references

In [47]:
#the outfile for the fastq files mapping to pcontig_019
fastq_out_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_aln.fastq')

In [48]:
#we generate an new file and write out all the aligned reads in fastq format
#we added in an save guard to save out each read only once as it appears that pysam provides some reads in duplicate.
saved_reads = []
with open(fastq_out_fh, mode='w') as fastq_out:
    for read in contig_19_reads:
        if read.query_name not in saved_reads:
            print('@%s' % read.query_name, file=fastq_out)
            print('%s' % read.query, file=fastq_out)
            print('+', file=fastq_out)
            print('%s' % read.qqual, file=fastq_out)
            saved_reads.append(read.query_name)

In [49]:
#we briefly check the longest aligned read
max_lenght = 0
with pysam.FastxFile(fastq_out_fh) as fh:
    for entry in fh:
        if len(entry.sequence) > max_lenght:
            max_lenght = len(entry.sequence)
print('Longest alinged read to contig_19 is %i long'  % max_lenght)

Longest alinged read to contig_19 is 60786 long


In [50]:
#reduce the size of the dataframe to only contain the filename and read_id column thats all we need
small_df = seq_sum_df.iloc[:, [0,1]].copy()

In [51]:
small_df.head()

Unnamed: 0,filename,read_id
0,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,12756ced-c758-42f7-a359-ba477a20b3a1
1,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,15749e40-174c-4bd3-9341-8cb37fde491b
2,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,23703e5f-144e-405b-aba1-b50ce853ecd9
3,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,24dd4f66-0a77-4a45-8b5f-8beb2d1576b0
4,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,2f4921ba-e9a3-44da-90d0-53815af5fcd7


In [52]:
#get all the fast5 names for the reads that map to congtig 19
fast5_names_contig_19 = list(small_df[small_df.read_id.isin(saved_reads)]['filename'])

In [53]:
fast5_names_contig_19[0] #check first element

'rsb0001410_20170719_FAH05512_MN21513_mux_scan_Pst79_1_10K_47691_read_101_ch_109_strand.fast5'

In [54]:
#this looks for fast5s recursively in all the Fast5_input folder
all_fast5s = [fn for fn in glob.iglob('%s/**/*.fast5' % DIRS['FAST5_INPUT'], recursive=True)]

In [55]:
#this gets the whole path of the fast5s that map to contig_19
contig_19_fast5s_fh = [x for x in all_fast5s if x.split('/')[-1] in fast5_names_contig_19]

In [56]:
len(contig_19_fast5s_fh)

570

In [62]:
fast5_names

NameError: name 'fast5_names' is not defined

In [59]:
len(fast5_names_contig_19)

1481

In [60]:
len(fast5_names)

NameError: name 'fast5_names' is not defined

In [63]:
seq_sum_df.columns

Index(['filename', 'read_id', 'run_id', 'channel', 'start_time', 'duration',
       'num_events', 'passes_filtering', 'template_start',
       'num_events_template', 'template_duration', 'num_called_template',
       'sequence_length_template', 'mean_qscore_template',
       'strand_score_template', 'calibration_strand_genome_template',
       'calibration_strand_identity_template',
       'calibration_strand_accuracy_template', 'aligned_speed_bps_template'],
      dtype='object')

In [64]:
seq_sum_df[seq_sum_df.read_id == read_1.query_name]['filename'].to_string().split(' ')[-1]

NameError: name 'read_1' is not defined

In [65]:
fast5_mapped = os.path.join(DIRS["FAST5_OUT"],'pcontig_019_aln_fast5.tar.gz')

In [None]:
#make iteration with all mapped fast5

In [66]:
make_tarfile(fast5_mapped, contig_19_fast5s_fh)

In [67]:
test_fastq = pysam.FastxFile(fastq_out_fh)

In [68]:
for entry in test_fastq:
    print(entry.name)

09acac21-2d15-4bf3-8bc2-ec3019a0dfd0
6ebcee2a-c476-40a8-8c88-aea182197e6d
e65f8dfc-7ebc-46c9-9979-92050a26ab6f
24b800ef-1d57-4e67-aa55-58e25a0ea239
63d910b7-e9d2-47da-8936-b633c2a1203b
bdb640c1-c386-4bd4-af8c-93dbb1b67051
e80fffda-9c3c-4eb1-8c99-e22b2dc6576c
292e8fa0-ddaf-40db-a31a-f3031cec3b41
8adaf86d-bc1a-4194-bb2b-0cafcc6653aa
f2520fc2-8002-49ad-a23f-1adbabe4b5bf
4e0428dd-189d-4dfe-903e-4a374ad712d0
ded0e71c-5e1a-4554-adb9-74b25d9517ca
b1a136d1-32ea-4076-9045-7f4634172e36
7243dbfa-9335-4622-8df5-55331e587142
126d57c2-28ca-4a41-8d3a-7196a131ab1e
a3676727-5877-496e-8d2b-133929fcb81d
979fb493-568f-41a3-90dd-933604d42e88
d619cf71-b45b-4d13-8cb7-c0efe1d5034e
198e5d3d-e458-47af-a337-89b31fff4b49
aa8ae8bf-035e-41d7-86c8-c7192b379d5f
84d39c13-7fca-4d4f-959c-36083ff1a683
8fd1d8b5-12c7-4033-a9ee-0258a0bf5636
e9fe4d3b-956d-4e32-92e0-58b5e2bd92b1
1e580fe2-cb58-4853-b565-a1f761887452
0ab6822e-653d-4a40-8f8a-da5b1c8c9a62
e1d93ea7-74a4-4644-9258-65f92e8d8bfa
1be55195-8f56-4860-9bc5-8254c1668b73
9

In [69]:
test_fastq.close()

In [70]:
read_1 = contig_19_reads[0]

In [71]:
len(read_1.seq) == len(read_1.qual)

True

read_1.qual

In [73]:
fastq_out_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_aln.fastq')

In [74]:
fastq_out = open(fastq_out_fh, mode='w')

In [75]:
fastq_out.write(read_1.header)

TypeError: write() argument must be str, not pysam.libcalignmentfile.AlignmentHeader

In [76]:
read_1.seq

'AGTTACGTATTGTTGGCGTCTGTTTGTGTTTAAACTGAGGTCCTAAGTCAGAGAGGTTCCTGGTACTCACTGGCAATACCCCCATATTCAAAGCATTCCCTTCACCTATTTGTCTACAGACCTCTCATTGAGAAAAATAGCACCTACAGTACCATAATTTAAGAGAGGGTGTAATAAATTCTCTCTCTGTTTTTGGTTGCCTATAACTTCCTCCTATTAATTGAGTTTTGTTGCTCACAGGCGTTGGAGCATCACAAGGCGGTTTTGCCAAGACTGCAACATTATTTTTCTCGACGTGGACCTGTGCGCGCAGCAAATCCTAAAAGCGCAAAGTGCATGTCCGCTTTTTTAAGACTTTTTTATTTTATTAATCAAGTCAGATGTTTCTGATTTTTCCCTCACTTTTATGTTCTCAAGCTTTTTCCCCACTTTTCTTTTTCTCTCTTTTGTTTCCCCCTGCCTTTTTCCTTTCTTCTCTCTTTTATTATTTCAGATTATCTCTTTTTTTTTATTTTACTTTTTCTTACTTATTTTTTTTTTCTTGTTTTTGCTGAGCTATATGCACAACTACATTTATTTTCATACACAACAACAAAAGTTGAAAGACAGAGTGTGTACCTTAAGATTTTGGAGGGTGTGCACCTGCTTCTTGGAGGCACACTCTGAGCACAATTTTTGGTGTAGCACACCTTGTGCACTCTTAAGAATCAGGAGTGCACAGTCCATGCACTCCATTTTGGAGGCACACTCCAGCACTCCAACCCAACATTTTTGGCAACTAATCTTGAGGCAGGATTGTGCACTCCAAGAATCTTGAGGTACATGTATTGTGTCGGCTTGTGTGCACTCCAACTTCCCCTTAGGCTTCGAGATCACATGAAGCCCTGATTTTCCCACCTTCCCGCCTTCAGCACAGCAAAAACCACATCCGCGAATAAGACATATCCAAGATATTAGAAGATATTGATCTCATTTGACTTTCTTTTATGATTCCATCAT

In [77]:
read_1.qual

'""\'&#"$##(*\'#\'\'(#-/*+*%)\'%$&&*-05*&#%%#$$(+&&%)))),*+)&.2*0\',)/---,./.-41*20(\'&,.,")%\'\'%#"$%"##"#&(())&+*%++-5+&(\'"#"#%&(+***-05-/(%+/2=4$7569843/)/+,,*%,\'4,548*/(**)53,%%$"$#(-(*&\'&\'&\'.134).35,(5\'3,3.-+,+/$#&\'+*+/2.8-6.5,+*\'$0/,&#$&##$%"-2-*,*)%\'$&++*)$%%$$$"$%"(($%\'\'+\'67643144\')*($(,.-($%$1\'%$#+,)+(#$#%$&%$")&&09)(1+#"(,)%"&&.1($"$"+)106+\'%*5<<;7.5+*(066:940=<<7):%8($\'%\'\',+.(\'$$#)($#)(3;90*,/,)(,$(+,)))(.).\'\'$#%%)2)(%-/+/1+775)+)/.**%\'&\'$)39/+.,)#%\'\')*,-))0--)%$%%%"&((\'%&$,59129.\'-(&"#%,//2\'#$$(7:889991**/2(&"\'34.)$)$+#/.*-9:>;=<<:0&*.(18<7+,)*0--&&#%%-..10:,-%$"##),(106,*/*.*)\'.+$/&&702,$%)""#$$*,-+(\'$***%,+,%%,/+.6:3*#&"&\'+%..2((\'(%\'#\'&%%+%-*-(%****&,/10-,)##$$(-7-%&$&%%&"/*-//)/%$))&%"##&,3+3//..30++)-$126446/-,*)&()\',))+:6(+(\'&%)()(%(&\'(\'\')%+.-5-2.+2(+,578221.800$+0)\',\'-(,,)*.--;/++))(\'$&%*2+7--*+\'*\'(*)***,++-4,+()+).\'$*)*(&%$$*&#))%("$$%\'))()&&+&(*&-\')&"%&#$#%\'(&&/5.+*-95//+#&#%(*()&%&&&$&$%$*+74/2<<<26545/.,%""$3-#716:8

In [78]:
read_1.header

<pysam.libcalignmentfile.AlignmentHeader at 0x7f4d7326f618>

In [79]:
read_1.qname

'09acac21-2d15-4bf3-8bc2-ec3019a0dfd0'

In [80]:
read_1.query_name

'09acac21-2d15-4bf3-8bc2-ec3019a0dfd0'

In [81]:
read_1.qqual

'44\')*($(,.-($%$1\'%$#+,)+(#$#%$&%$")&&09)(1+#"(,)%"&&.1($"$"+)106+\'%*5<<;7.5+*(066:940=<<7):%8($\'%\'\',+.(\'$$#)($#)(3;90*,/,)(,$(+,)))(.).\'\'$#%%)2)(%-/+/1+775)+)/.**%\'&\'$)39/+.,)#%\'\')*,-))0--)%$%%%"&((\'%&$,59129.\'-(&"#%,//2\'#$$(7:889991**/2(&"\'34.)$)$+#/.*-9:>;=<<:0&*.(18<7+,)*0--&&#%%-..10:,-%$"##),(106,*/*.*)\'.+$/&&702,$%)""#$$*,-+(\'$***%,+,%%,/+.6:3*#&"&\'+%..2((\'(%\'#\'&%%+%-*-(%****&,/10-,)##$$(-7-%&$&%%&"/*-//)/%$))&%"##&,3+3//..30++)-$126446/-,*)&()\',))+:6(+(\'&%)()(%(&\'(\'\')%+.-5-2.+2(+,578221.800$+0)\',\'-(,,)*.--;/++))(\'$&%*2+7--*+\'*\'(*)***,++-4,+()+).\'$*)*(&%$$*&#))%("$$%\'))()&&+&(*&-\')&"%&#$#%\'(&&/5.+*-95//+#&#%(*()&%&&&$&$%$*+74/2<<<26545/.,%""$3-#716:898::0:0;//-/2018;660/.4*3322343<<2-,%+,+$);<2712**0)26561106,+8;:92\'%$(\'.01-1.4=;3:213(1,**)%#%%%,&),-,*)(&"$&+*\'%),(),-6940-*&+\',2*+(/*+#%\'"&)270..%$#,/5569=700/\'&\'%(8\'.#3/0.,*((\'$$$#$#%((*.*."#$()*2)+*)16.)*%&$$&\')2.231.4**&\'\'"(%0+**"#"%#&%&&%21/:<8--,-)0\'\'#*)-,)-.08.)(\'(&$#**(\'&

In [82]:
read_1.query_sequence

'AGTTACGTATTGTTGGCGTCTGTTTGTGTTTAAACTGAGGTCCTAAGTCAGAGAGGTTCCTGGTACTCACTGGCAATACCCCCATATTCAAAGCATTCCCTTCACCTATTTGTCTACAGACCTCTCATTGAGAAAAATAGCACCTACAGTACCATAATTTAAGAGAGGGTGTAATAAATTCTCTCTCTGTTTTTGGTTGCCTATAACTTCCTCCTATTAATTGAGTTTTGTTGCTCACAGGCGTTGGAGCATCACAAGGCGGTTTTGCCAAGACTGCAACATTATTTTTCTCGACGTGGACCTGTGCGCGCAGCAAATCCTAAAAGCGCAAAGTGCATGTCCGCTTTTTTAAGACTTTTTTATTTTATTAATCAAGTCAGATGTTTCTGATTTTTCCCTCACTTTTATGTTCTCAAGCTTTTTCCCCACTTTTCTTTTTCTCTCTTTTGTTTCCCCCTGCCTTTTTCCTTTCTTCTCTCTTTTATTATTTCAGATTATCTCTTTTTTTTTATTTTACTTTTTCTTACTTATTTTTTTTTTCTTGTTTTTGCTGAGCTATATGCACAACTACATTTATTTTCATACACAACAACAAAAGTTGAAAGACAGAGTGTGTACCTTAAGATTTTGGAGGGTGTGCACCTGCTTCTTGGAGGCACACTCTGAGCACAATTTTTGGTGTAGCACACCTTGTGCACTCTTAAGAATCAGGAGTGCACAGTCCATGCACTCCATTTTGGAGGCACACTCCAGCACTCCAACCCAACATTTTTGGCAACTAATCTTGAGGCAGGATTGTGCACTCCAAGAATCTTGAGGTACATGTATTGTGTCGGCTTGTGTGCACTCCAACTTCCCCTTAGGCTTCGAGATCACATGAAGCCCTGATTTTCCCACCTTCCCGCCTTCAGCACAGCAAAAACCACATCCGCGAATAAGACATATCCAAGATATTAGAAGATATTGATCTCATTTGACTTTCTTTTATGATTCCATCAT

In [83]:
read_1.query

'AACATTATTTTTCTCGACGTGGACCTGTGCGCGCAGCAAATCCTAAAAGCGCAAAGTGCATGTCCGCTTTTTTAAGACTTTTTTATTTTATTAATCAAGTCAGATGTTTCTGATTTTTCCCTCACTTTTATGTTCTCAAGCTTTTTCCCCACTTTTCTTTTTCTCTCTTTTGTTTCCCCCTGCCTTTTTCCTTTCTTCTCTCTTTTATTATTTCAGATTATCTCTTTTTTTTTATTTTACTTTTTCTTACTTATTTTTTTTTTCTTGTTTTTGCTGAGCTATATGCACAACTACATTTATTTTCATACACAACAACAAAAGTTGAAAGACAGAGTGTGTACCTTAAGATTTTGGAGGGTGTGCACCTGCTTCTTGGAGGCACACTCTGAGCACAATTTTTGGTGTAGCACACCTTGTGCACTCTTAAGAATCAGGAGTGCACAGTCCATGCACTCCATTTTGGAGGCACACTCCAGCACTCCAACCCAACATTTTTGGCAACTAATCTTGAGGCAGGATTGTGCACTCCAAGAATCTTGAGGTACATGTATTGTGTCGGCTTGTGTGCACTCCAACTTCCCCTTAGGCTTCGAGATCACATGAAGCCCTGATTTTCCCACCTTCCCGCCTTCAGCACAGCAAAAACCACATCCGCGAATAAGACATATCCAAGATATTAGAAGATATTGATCTCATTTGACTTTCTTTTATGATTCCATCATGTTATTTTTCAGTCATTATGAAAATTATTCCTGTCTCTGGTGAACCCACTCAAAAGTTGTGCACAGAGTGTGCACTCCATGACTTTTTGGAGTGCACACCCATGCACTCCAATAATCTTGGAAAGTGCACCAACCTGCCACCCACTTTTGGTGTGAGAAATCACTTGAGGCATAGAGTGTGCAGACTCCAAAAATGTTGAGTGGCATGGATTGTGCACTCCACAATTCTTGAGTGCACAAGGTGTGCAGCCACCAAAAGTTGTAAGCTTCAGTGCAC

In [84]:
for read in contig_19_reads:
    print('@%s' % read.query_name, file=fastq_out)
    print('%s' % read.query, file=fastq_out)
    print('+', file=fastq_out)
    print('%s' % read.qqual, file=fastq_out)

In [85]:
fastq_out.close()

In [86]:
len(contig_19_reads)

1651