### Preparation for running Nanopore methylation calling on NCI

This script does the following:

- pull out all aligned fastq reads for each contig, using Nanopore reads previously mapped with Minimap2
- pack them up 
- pull out all corresponding fast5 files and pack them up

In [56]:
import os
import pysam
import pandas as pd
import glob
import tarfile #compress fast5
from Bio import SeqIO

In [84]:
#first we need to define the base dirs
DIRS ={}
#DIRS['BASE'] = '/home/ap/mock_up/methylation_calling/nanopore' #home computer. hash out later
DIRS['BASE'] = '/home/anjuni/methylation_calling/nanopore' #fisher
DIRS['BAM_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'minimap2_alignments' )
DIRS['FAST5_TOMBO'] = os.path.join(DIRS['BASE'], 'input', 'all_fast5', 'tombo_fast5')
DIRS['FAST5_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'all_fast5')
DIRS['FASTQ_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_fastq')
DIRS['FAST5_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_fast5')
DIRS['REF_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_ref')

#fix this here for reference
DIRS['REF'] = '/home/anjuni/Pst_104_v13_assembly/' # Pst_104E_v13_ph_ctg.fa

In [58]:
#define functions

#to compress the fast5 reads mapping contig 19 to a tar.gz file
def make_tarfile(output_filename, file_list):
    with tarfile.open(output_filename, "w:gz") as tar:
        count=0
        for file in file_list:
            tar.add(file, arcname=os.path.basename(file))
            count=count+1
        print(count)

In [59]:
##Get headings

seq_sum_albacore_fh = os.path.join(DIRS['FAST5_INPUT'], 'albacore_fastq/Pst79_run1-4_1d_sequencing_summary.txt')
#only read in the first two columns instead of everything (file name and read ID)
seq_sum_df = pd.read_csv(seq_sum_albacore_fh, sep='\t')

#reduce the size of the seq_sum dataframe to only contain the filename and read_id column thats all we need
small_df = seq_sum_df.iloc[:, [0,1]].copy()


  interactivity=interactivity, compiler=compiler, result=result)


seq_sum_df.head # check if it shows the file name and read ID only. (shows all heading)

In [60]:
small_df.head() #check format of dataframe

Unnamed: 0,filename,read_id
0,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,12756ced-c758-42f7-a359-ba477a20b3a1
1,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,15749e40-174c-4bd3-9341-8cb37fde491b
2,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,23703e5f-144e-405b-aba1-b50ce853ecd9
3,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,24dd4f66-0a77-4a45-8b5f-8beb2d1576b0
4,rsb0001410_20170719_FAH05512_MN21513_mux_scan_...,2f4921ba-e9a3-44da-90d0-53815af5fcd7


In [61]:
#quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

In [62]:
##Make file paths for BAM files and reference genome

#we generated the BAM file handle (file path)
bam_pass_fh = os.path.join(DIRS['BAM_INPUT'], 'Pst79_run1-4_1d_pass.minimap2.out.bam')
bam_fail_fh = os.path.join(DIRS['BAM_INPUT'], 'Pst79_run1-4_1d_fail.minimap2.out.bam')

#make a list of paths for BAM files to get mapped reads (for all 4 runs)
bam_fh_list = [os.path.join(DIRS['BAM_INPUT'], x) for x in os.listdir(DIRS['BAM_INPUT']) if x.endswith('.bam')]

######fix this here for reference. use the same as for mapping the long reads
reference_fh = os.path.join(DIRS['REF'], 'Pst_104E_v13_ph_ctg.fa')

split_reference_fh = os.path.join(DIRS['REF_OUT'], 'tombo_pcontig_019.fasta') #output folder

In [14]:
#print(bam_fh_list) #check file paths

In [31]:
#just an example to loop over content of a folder
for x in os.listdir(DIRS['BAM_INPUT']):
    if x.endswith('.bam') or x.endswith('.bai'):
        print(x)

Pst79_run1-4_1d_fail.minimap2.out.bam
Pst79_run1-4_1d_pass.minimap2.out.bam
Pst79_run1-4_1d_pass.minimap2.out.bam.bai
Pst79_run1-4_1d_fail.minimap2.out.bam.bai


In [16]:
#print(bam_fh_list) #check for all BAM files
#print(split_reference_fh) #check path

In [63]:
#now we 'read' in an bam file. Really we generated an AlignmentFile object 
bam_pass_file = pysam.AlignmentFile(bam_pass_fh, "rb")
bam_fail_file = pysam.AlignmentFile(bam_fail_fh, "rb")

In [64]:
bam_file_list = [bam_pass_file, bam_fail_file]

In [65]:
bam_file_list

[<pysam.libcalignmentfile.AlignmentFile at 0x7f17a3a41f28>,
 <pysam.libcalignmentfile.AlignmentFile at 0x7f178befa268>]

In [None]:
#write a fasta file with all contig_19 reads in reference genome
for seq in  SeqIO.parse(reference_fh, 'fasta'):
    if seq.id == 'pcontig_019':
        SeqIO.write(seq, split_reference_fh, 'fasta')

In [66]:
#get all the reads (not fastq) for contig_19 from the BAM file
contig_19_reads_in_bam = []

count_fail = 0
count_pass = 0

for index,bam in enumerate(bam_file_list):
    for read in bam.fetch(contig='pcontig_019'):
        if index==0:
            count_pass = count_pass + 1
        else:
            count_fail = count_fail + 1
        contig_19_reads_in_bam.append(read) 

In [67]:
len(contig_19_reads_in_bam)

8030

In [90]:
contig_19_reads_in_bam[1]

<pysam.libcalignedsegment.AlignedSegment at 0x7f178beff4c8>

In [69]:
print('This is the number of fail reads mapped:%i\nThis is the number of pass reads mapped:%i' % (count_fail, count_pass))

This is the number of fail reads mapped:2408
This is the number of pass reads mapped:5622


In [24]:
#this provides the name for all references in the bam file
#bam_file.references

In [91]:
fastq_out_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_tombo1.fastq')

In [92]:
#the outfile for the ids of fastqs mapping to pcontig_019
read_id_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_tombo.read_id')

#we generate an new file and write out all the aligned reads in fastq format
#we added in an save guard to save out each read only once as it appears that pysam provides some reads in duplicate.
saved_reads = []
with open(read_id_fh, mode='w') as out:
    for read in contig_19_reads_in_bam:
        if read.query_name not in saved_reads:
            print(read.query_name, file=out)
            saved_reads.append(read.query_name)

In [93]:
albacore_fastq_fh = os.path.join('/home/anjuni/methylation_calling/nanopore/input/all_fastq/albacore_fastq', 'Pst79_run1-4_1d_all.fastq')

In [94]:
bbmap_DIR = '/home/anjuni/scripts/BBMap/sh'

In [95]:
#Matching fastq to fast5 note: single-threaded, low memory
!filterbyname.sh in={albacore_fastq_fh} out={fastq_out_fh} names={read_id_fh} include=t

java -ea -Xmx285828m -cp /home/anjuni/anaconda3/opt/bbmap-37.95/current/ driver.FilterReadsByName in=/home/anjuni/methylation_calling/nanopore/input/all_fastq/albacore_fastq/Pst79_run1-4_1d_all.fastq out=/home/anjuni/methylation_calling/nanopore/input/split_fastq/pcontig_019_tombo1.fastq names=/home/anjuni/methylation_calling/nanopore/input/split_fastq/pcontig_019_tombo.read_id include=t
Executing driver.FilterReadsByName [in=/home/anjuni/methylation_calling/nanopore/input/all_fastq/albacore_fastq/Pst79_run1-4_1d_all.fastq, out=/home/anjuni/methylation_calling/nanopore/input/split_fastq/pcontig_019_tombo1.fastq, names=/home/anjuni/methylation_calling/nanopore/input/split_fastq/pcontig_019_tombo.read_id, include=t]

Input is being processed as unpaired
Time:               95.614 seconds.
Reads Processed:    1095024 	11.45k reads/sec
Bases Processed:    8055376818 	84.25m bases/sec
Reads Out:          7167
Bases Out:          128010080


In [96]:
len(saved_reads)

7167

In [97]:
#we briefly check the longest aligned read
max_len = 0
with pysam.FastxFile(fastq_out_fh) as fh:
    for entry in fh:
        if len(entry.sequence) > max_len:
            max_len = len(entry.sequence)
print('Longest alinged read to contig_19 is %i long'  % max_len)

Longest alinged read to contig_19 is 130111 long


# FIX TOMBO FAST5 INPUT PATH WITH ***/**/*/

In [98]:
### Making fast5 files with the same filenames as fastq, for methylation-calling

#get all the fast5 filenames for the reads that map to contig 19
fast5_names_contig_19 = list(small_df[small_df.read_id.isin(saved_reads)]['filename'])

#this looks for fast5s recursively in all the Fast5_input folder
all_fast5s = [fn for fn in glob.iglob('%s/**/*.fast5' % DIRS['FAST5_TOMBO'], recursive=True)]

#this gets the whole path of the fast5s that map to contig_19
fast5s_contig_19_fh = [x for x in all_fast5s if x.split('/')[-1] in fast5_names_contig_19]

In [101]:
fast5_names_contig_19[7166] #check first element

'rsb_u5580079_20170828_FAH05280_MN16968_sequencing_run_Pst79_RSB_OpenDay_Run_4_55730_read_1672_ch_48_strand.fast5'

In [105]:
fast5s_contig_19_fh[7166]

IndexError: list index out of range

In [52]:
len(saved_reads)

7167

In [54]:
len(set(fast5_names_contig_19))

7167

#make iteration with all mapped fast5

fast5_mapped=[]
for x in all_fast5s:
    fast5_mapped.append(os.path.join(DIRS["FAST5_OUT"],'pcontig_019_aln_fast5.tar.gz'))

In [55]:
### tarzip the mapped fast5 and move to outfolder
fast5_mapped = os.path.join(DIRS["FAST5_OUT"],'pcontig_019_tombo_fast5.tar.gz')
make_tarfile(fast5_mapped, fast5s_contig_19_fh)

0


#checks

#format of fastq BAM files mapped to contig_19
read_1 = contig_19_reads[0]

#seq_sum_df.columns # to find name column

#show the last line (???)
seq_sum_df[seq_sum_df.read_id == read_1.query_name]['filename'].to_string().split(' ')[-1]


test_fastq = pysam.FastxFile(fastq_out_fh) #BAM file fastq mapping to contig_19
test_fastq.close()

fastq_out.write(read_1.header) # (???)


#Using methods to show fastq lines for fast5
#read_1.header #header (???)

#read_1.qname #deprecated template name (line 1)
#read_1.query_name #template name (line 1)

#read_1.seq #deprecated read fastq seq
#read_1.query_sequence #read fastq sequence
#read_1.query #aligned sequence (line 2)

#read_1.qual #quality score
#read_1.qqual #alignment quality score (line 4)

len(read_1.query) == len(read_1.qqual) #seq and quality score are the same length