### Preparation for running Nanopore methylation calling on NCI

This script does the following:

- pull out all aligned fastq for each contig using previously mapped Nanopores reads with Minimap2
- pack them up 
- pull out all corresponding fast5 files and pack them up

In [1]:
!pip install pysam



In [4]:
import os
import pysam
import pandas as pd
import glob
import tarfile #compress fast5
from Bio import SeqIO

In [5]:
#first we need to define the base dirs
DIRS ={}
#DIRS['BASE'] = '/home/ap/mock_up/methylation_calling/nanopore' #home computer. hash out later
DIRS['BASE'] = '/home/anjuni/methylation_calling/nanopore' #fisher
DIRS['BAM_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'minimap2_alignments' )
DIRS['FAST5_INPUT'] = os.path.join(DIRS['BASE'], 'input', 'all_fast5') #fix for all runs and copy from nci to right directory
DIRS['FASTQ_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_fastq')
DIRS['FAST5_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_fast5')
DIRS['REF_OUT'] = os.path.join(DIRS['BASE'], 'input', 'split_ref')
#fix this here for reference
DIRS['REF'] = '/home/anjuni/Pst_104_v13_assembly/' # Pst_104E_v13_ph_ctg.fa

In [6]:
#define functions

#to compress the fast5 reads mapping contig 19 to a tar.gz file
def make_tarfile(output_filename, file_list):
    with tarfile.open(output_filename, "w:gz") as tar:
        for file in file_list:
            tar.add(file, arcname=os.path.basename(file))

In [7]:
##Get headings

seq_sum_albacore_fh = os.path.join(DIRS['FAST5_INPUT'], 'albacore_fastq/Pst79_run1-4_1d_sequencing_summary.txt')
#only read in the first two columns instead of everything (file name and read ID)
seq_sum_df = pd.read_csv(seq_sum_albacore_fh, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


seq_sum_df.head # check if it shows the file name and read ID

In [8]:
#quick chech if directories exist
for value in DIRS.values():
    if not os.path.exists(value):
        print('%s does not exist' % value)

In [9]:
##Make file paths for BAM files and reference genome

#we generated the BAM file handle (file path)
bam_fh = os.path.join(DIRS['BAM_INPUT'], 'Pst79_run1_1d_pass.minimap2.out.bam')

#make a list of paths for BAM files to get mapped reads (for all 4 runs)
bam_fh_list = [os.path.join(DIRS['BAM_INPUT'], x) for x in os.listdir(DIRS['BAM_INPUT']) if x.endswith('.bam')]

######fix this here for reference. use the same as for mapping the long reads
reference_fh = os.path.join(DIRS['REF'], 'Pst_104E_v13_ph_ctg.fa')

split_reference_fh = os.path.join(DIRS['REF_OUT'], 'ref_pcontig_019.fasta') #output folder

In [10]:
#just an example to loop over content of a folder
for x in os.listdir(DIRS['BAM_INPUT']):
    if x.endswith('.bam') or x.endswith('.bai'):
        print(x)

Pst79_run1_1d_pass.minimap2.out.bam
Pst79_run1_1d_pass.minimap2.out.bam.bai


In [11]:
#print(bam_fh_list) #check for all BAM files
#print(split_reference_fh) #check path

In [12]:
#now we 'read' in an bam file. Really we generated an AlignmentFile object 
bam_file = pysam.AlignmentFile(bam_fh, "rb")

In [13]:
#write a fasta file with all contig_19 reads in reference genome
for seq in  SeqIO.parse(reference_fh, 'fasta'):
    if seq.id == 'pcontig_019':
        SeqIO.write(seq, split_reference_fh, 'fasta')

In [14]:
#get all the reads for contig_19 from the BAM file
contig_19_reads = [read for read in bam_file.fetch(contig='pcontig_019')]

In [82]:
len(contig_19_reads)

1651

In [15]:
#this provides the name for all references in the bam file
#bam_file.references

In [83]:
#the outfile for the fastq files mapping to pcontig_019
fastq_out_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_aln.fastq')

#we generate an new file and write out all the aligned reads in fastq format
#we added in an save guard to save out each read only once as it appears that pysam provides some reads in duplicate.
saved_reads = []
with open(fastq_out_fh, mode='w') as fastq_out:
    for read in contig_19_reads:
        if read.query_name not in saved_reads:
            print('@%s' % read.query_name, file=fastq_out)
            print('%s' % read.query, file=fastq_out)
            print('+', file=fastq_out)
            print('%s' % read.qqual, file=fastq_out)
            saved_reads.append(read.query_name)

In [17]:
#we briefly check the longest aligned read
max_lenght = 0
with pysam.FastxFile(fastq_out_fh) as fh:
    for entry in fh:
        if len(entry.sequence) > max_lenght:
            max_lenght = len(entry.sequence)
print('Longest alinged read to contig_19 is %i long'  % max_lenght)

Longest alinged read to contig_19 is 60786 long


In [18]:
### Making fast5 files with the same filenames as fastq, for methylation-calling

#reduce the size of the seq_sum dataframe to only contain the filename and read_id column thats all we need
small_df = seq_sum_df.iloc[:, [0,1]].copy()

In [29]:
#small_df.head() #check format of dataframe

In [19]:
#get all the fast5 filenames for the reads that map to congtig 19
fast5_names_contig_19 = list(small_df[small_df.read_id.isin(saved_reads)]['filename'])

#this looks for fast5s recursively in all the Fast5_input folder
all_fast5s = [fn for fn in glob.iglob('%s/**/*.fast5' % DIRS['FAST5_INPUT'], recursive=True)]

#this gets the whole path of the fast5s that map to contig_19
contig_19_fast5s_fh = [x for x in all_fast5s if x.split('/')[-1] in fast5_names_contig_19]

In [20]:
#fast5_names_contig_19[0] #check first element

'rsb0001410_20170719_FAH05512_MN21513_mux_scan_Pst79_1_10K_47691_read_101_ch_109_strand.fast5'

In [23]:
len(contig_19_fast5s_fh)

570

In [31]:
fast5_names

NameError: name 'fast5_names' is not defined

In [70]:
len(fast5_names_contig_19)

1481

In [26]:
len(fast5_names)

NameError: name 'fast5_names' is not defined

#make iteration with all mapped fast5

fast5_mapped=[]
for x in all_fast5s:
    fast5_mapped.append(os.path.join(DIRS["FAST5_OUT"],'pcontig_019_aln_fast5.tar.gz'))

In [37]:
### tarzip the mapped fast5 and move to outfolder
fast5_mapped = os.path.join(DIRS["FAST5_OUT"],'pcontig_019_aln_fast5.tar.gz')
make_tarfile(fast5_mapped, contig_19_fast5s_fh)

In [None]:
#format of fastq BAM files mapped to contig_19
read_1 = contig_19_reads[0]

#seq_sum_df.columns # to find name column

#show the last line (???)
seq_sum_df[seq_sum_df.read_id == read_1.query_name]['filename'].to_string().split(' ')[-1]

In [51]:
test_fastq = pysam.FastxFile(fastq_out_fh) #BAM file fastq mapping to contig_19
test_fastq.close()

#shows all filenames
for entry in test_fastq:
    print(entry.name)

In [73]:
fastq_out.write(read_1.header) # (???)

TypeError: write() argument must be str, not pysam.libcalignmentfile.AlignmentHeader

In [62]:
#Using methods to show fastq lines for fast5
#read_1.header #header (???)

In [63]:
#read_1.qname #deprecated template name (line 1)
#read_1.query_name #template name (line 1)

In [64]:
#read_1.seq #deprecated read fastq seq
#read_1.query_sequence #read fastq sequence
#read_1.query #aligned sequence (line 2)

In [61]:
#read_1.qual #quality score
#read_1.qqual #alignment quality score (line 4)

In [None]:
len(read_1.query) == len(read_1.qqual) #seq and quality score are the same length

In [85]:
#help(read_1)

In [84]:
fastq_out = open(fastq_out_fh, mode='w')

for read in contig_19_reads:
    print('@%s' % read.query_name, file=fastq_out)
    print('%s' % read.query, file=fastq_out)
    print('+', file=fastq_out)
    print('%s' % read.qqual, file=fastq_out)

fastq_out.close()

In [81]:
len(contig_19_reads)

1651