### Preparation for running Nanopore methylation calling on NCI

This script does the following:

- pull out all aligned reads as fastq per contig (this notebook handles `pcontig_019`), using Nanopore reads previously mapped with Minimap2
- pack them up
- find all corresponding fast5 files and pack them into a tar.gz archive

In [None]:
!pip install pysam

In [1]:
import os
import pysam
import pandas as pd
import glob
from Bio import SeqIO

In [None]:
# Central registry of every directory this notebook reads from or writes to.
# All working directories sit under <BASE>/input.
DIRS = {'BASE': '/home/ap/mock_up/methylation_calling/nanopore'}
DIRS.update({
    key: os.path.join(DIRS['BASE'], 'input', subdir)
    for key, subdir in (
        ('BAM_INPUT', 'minimap2_alignments'),
        ('FAST5_INPUT', 'all_fast5'),
        ('FASTQ_OUT', 'split_fastq'),
        ('FAST5_OUT', 'split_fast5'),
        ('REF_OUT', 'split_ref'),
    )
})
# TODO: fill in the directory that holds the reference; still empty.
DIRS['REF'] = ''

In [None]:
# Path of the tab-separated sequencing summary inside the fast5 input folder.
seq_sum_albacore_fh = os.path.join(DIRS['FAST5_INPUT'], 'Pst79_run1-4_1d_sequencing_summary.txt')
# Only read in the first two columns instead of everything: the rest of the
# notebook only looks at columns 0 and 1 (used as `filename` and `read_id`),
# so the remaining columns would just cost memory and parse time.
seq_sum_df = pd.read_csv(seq_sum_albacore_fh, sep='\t', usecols=[0, 1])

In [None]:
# Quick sanity check: report every configured directory that is missing on disk.
for dir_path in DIRS.values():
    if os.path.exists(dir_path):
        continue
    print('%s does not exist' % dir_path)

In [None]:
# Path of the BAM file used in the rest of the notebook.
# Despite the `_fh` suffix this is a plain path string, not an open file handle.
bam_fh = os.path.join(DIRS['BAM_INPUT'], 'Pst79_run1_1d_pass.minimap2.out.bam')

In [None]:
# Example only: loop over the content of a folder and print each BAM file name.
for entry_name in os.listdir(DIRS['BAM_INPUT']):
    if not entry_name.endswith('.bam'):
        continue
    print(entry_name)

In [None]:
# Full paths of every *.bam file in the alignment input folder.
bam_fh_list = [os.path.join(DIRS['BAM_INPUT'], x) for x in os.listdir(DIRS['BAM_INPUT']) if x.endswith('.bam')]

# TODO: fix this here for the reference; use the same reference that the long
# reads were mapped against.
# NOTE(review): no file name is passed to os.path.join, so this is just
# DIRS['REF'] itself. It has to be completed before the reference can be parsed.
reference_fh = os.path.join(DIRS['REF'], )

# Output path for the FASTA file that will hold only pcontig_019.
split_reference_fh = os.path.join(DIRS['REF_OUT'], 'ref_pcontig_019.fasta')

In [None]:
# Open the BAM file. This does not load the alignments; it creates a pysam
# AlignmentFile object ("rb" = read a binary BAM file) to query later.
bam_file = pysam.AlignmentFile(bam_fh, "rb")

In [None]:
# Scan the whole reference FASTA and write the record with id 'pcontig_019'
# to its own FASTA file.
for record in SeqIO.parse(reference_fh, 'fasta'):
    if record.id != 'pcontig_019':
        continue
    SeqIO.write(record, split_reference_fh, 'fasta')

In [None]:
# Collect everything pysam returns for pcontig_019 into a list.
contig_19_reads = list(bam_file.fetch(contig='pcontig_019'))

In [None]:
# Output path (a string, not an open handle) of the FASTQ file that receives
# the reads mapping to pcontig_019.
fastq_out_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_aln.fastq')

In [None]:
# Create a new file and write out all the aligned reads in FASTQ format.
# Safeguard: each read name is written only once, because pysam hands back
# some reads more than once (presumably secondary/supplementary alignments --
# TODO confirm). Only the first occurrence of a name is written.
# NOTE(review): `query` and `qqual` look like legacy pysam attribute names;
# confirm in the pysam docs whether they cover the full read or only its
# aligned part before relying on this FASTQ downstream.
saved_reads = []  # names in the order written; reused later for the fast5 lookup
_seen_read_names = set()  # same names as a set, so the duplicate test stays O(1)
with open(fastq_out_fh, mode='w') as fastq_out:
    for read in contig_19_reads:
        if read.query_name in _seen_read_names:
            continue
        print('@%s' % read.query_name, file=fastq_out)
        print('%s' % read.query, file=fastq_out)
        print('+', file=fastq_out)
        print('%s' % read.qqual, file=fastq_out)
        _seen_read_names.add(read.query_name)
        saved_reads.append(read.query_name)

In [None]:
# Brief check: length of the longest read in the FASTQ file written above
# (0 when the file holds no entries).
with pysam.FastxFile(fastq_out_fh) as fh:
    max_lenght = max((len(entry.sequence) for entry in fh), default=0)
print('Longest alinged read to contig_19 is %i long'  % max_lenght)

In [None]:
# Reduce the dataframe to its first two columns (used below as `filename` and
# `read_id`); that is all we need. `.copy()` makes small_df an independent
# dataframe instead of a selection tied to seq_sum_df.
small_df = seq_sum_df.iloc[:, [0,1]].copy()

In [None]:
small_df.head()

In [None]:
# Fast5 file names of all rows whose read_id belongs to a read that was
# written out for contig 19.
maps_to_contig_19 = small_df.read_id.isin(saved_reads)
fast5_names_contig_19 = list(small_df[maps_to_contig_19]['filename'])

In [None]:
fast5_names_contig_19[0]

In [None]:
# Recursively find every *.fast5 file below the fast5 input folder.
fast5_pattern = '%s/**/*.fast5' % DIRS['FAST5_INPUT']
all_fast5s = list(glob.iglob(fast5_pattern, recursive=True))

In [None]:
# Full paths of the fast5 files whose file name belongs to a read mapping to
# contig_19. The wanted names are put into a set once, so each path costs a
# constant-time lookup instead of a scan through the whole name list.
_wanted_fast5_names = set(fast5_names_contig_19)
contig_19_fast5s_fh = [x for x in all_fast5s if x.split('/')[-1] in _wanted_fast5_names]

In [None]:
len(contig_19_fast5s_fh)

In [None]:
# NOTE(review): `fast5_names` is not assigned anywhere in the visible notebook
# (only `fast5_names_contig_19` is) -- presumably left over from a deleted
# cell; this raises NameError in a fresh kernel.
fast5_names

In [None]:
len(fast5_names_contig_19)

In [None]:
# NOTE(review): `fast5_names` is not assigned anywhere in the visible notebook
# -- presumably a leftover; this raises NameError in a fresh kernel.
len(fast5_names)

In [None]:
seq_sum_df.columns

In [None]:
# Look up the fast5 file name for a single read: select the rows whose read_id
# equals read_1's name, render the `filename` column as text and keep the last
# space-separated token.
# NOTE(review): `read_1` is only assigned in a later cell, so this cell fails
# on a fresh top-to-bottom run.
seq_sum_df[seq_sum_df.read_id == read_1.query_name]['filename'].to_string().split(' ')[-1]

In [None]:
# Output path of the tar.gz archive that will hold the fast5 files of the
# reads mapping to pcontig_019.
fast5_mapped = os.path.join(DIRS["FAST5_OUT"],'pcontig_019_aln_fast5.tar.gz')

In [None]:
#TODO: make this an iteration over all mapped fast5 files

In [None]:
#compress the fast5 reads mapping contig 19 to a tar.gz file
import tarfile

def make_tarfile(output_filename, file_list):
    """Pack the given files into a gzip-compressed tar archive.

    Every entry is stored under its base name only, so the archive is flat
    and the directory part of each source path is dropped.

    Args:
        output_filename: Path of the archive to create (opened in "w:gz" mode).
        file_list: Iterable of paths to add to the archive.
    """
    with tarfile.open(output_filename, mode="w:gz") as archive:
        for source_path in file_list:
            flat_name = os.path.basename(source_path)
            archive.add(source_path, arcname=flat_name)


In [None]:
# Write the fast5 files of the reads mapping to contig 19 into the tar.gz archive.
make_tarfile(fast5_mapped, contig_19_fast5s_fh)

In [None]:
# Scratch: reopen the written FASTQ file with pysam to inspect it. The handle
# is closed in a separate cell further down.
test_fastq = pysam.FastxFile(fastq_out_fh)

In [None]:
# Scratch: print the name of every entry in the FASTQ file.
for entry in test_fastq:
    print(entry.name)

In [None]:
# Release the FASTQ handle opened for the inspection above.
test_fastq.close()

In [None]:
# Scratch: take the first alignment as an example to explore its attributes.
# Raises IndexError if nothing was fetched for the contig.
read_1 = contig_19_reads[0]

In [None]:
len(read_1.seq) == len(read_1.qual)

In [None]:
read_1.qual

In [None]:
# Scratch: same FASTQ output path as defined earlier in the notebook.
fastq_out_fh = os.path.join(DIRS['FASTQ_OUT'], 'pcontig_019_aln.fastq')

In [None]:
# NOTE(review): mode='w' truncates the file. This is the same path the
# de-duplicated FASTQ was written to earlier, so running this scratch cell
# wipes that output. The handle also stays open until the separate close()
# cell at the end of the notebook runs.
fastq_out = open(fastq_out_fh, mode='w')

In [None]:
# NOTE(review): scratch experiment. `read_1.header` is presumably a pysam
# header object rather than a string, in which case write() raises TypeError
# -- confirm against the pysam docs.
fastq_out.write(read_1.header)

In [None]:
read_1.seq

In [None]:
read_1.qual

In [None]:
read_1.header

In [None]:
read_1.qname

In [None]:
read_1.query_name

In [None]:
read_1.qqual

In [None]:
read_1.query_sequence

In [None]:
read_1.query

In [None]:
# Scratch version of the FASTQ writer: emits one four-line record per
# alignment, without any duplicate check.
for aln in contig_19_reads:
    record_lines = (
        '@%s' % aln.query_name,
        '%s' % aln.query,
        '+',
        '%s' % aln.qqual,
    )
    print(*record_lines, sep='\n', file=fastq_out)

In [None]:
# Close the manually opened FASTQ handle so buffered records reach the disk.
fastq_out.close()

In [None]:
len(contig_19_reads)