# Import libraries 

In [5]:
from os.path import expanduser
from importlib.machinery import SourceFileLoader
from tqdm.notebook import tqdm

import pandas as pd 
import numpy as np
import re
import os
import itertools 

import multiprocessing as mp 
import ete3
import subprocess

from matplotlib import cm
import matplotlib as mpl
import matplotlib.pyplot as plt
import random

# load custom flu and ngs libraries
# folder where custom libraries are saved
laeb_lib = expanduser("./python_lib")
flu_common_path = "%s/flu_common.py"%(laeb_lib)
ngs_pipeline_path = "%s/laeb_ngs_pipeline.py"%(laeb_lib)
# import each helper module directly from its source file
fc = SourceFileLoader('fc', flu_common_path).load_module()
ngs = SourceFileLoader('ngs', ngs_pipeline_path).load_module()

# Inputs 

In [12]:
# inputs 
# file path to data folder - fastq files to be analysed must be in {data folder}/raw
data_folder = './data' 
# path to the reference fasta file (kept under ./input here)
ref_fasta_fname = './input/H1N1pdm09_Cali09.fasta' 
# CSV file containing the CDS (coding sequence) regions of each gene segment
# (numbering should be based on that of the given reference sequence)
cds_coords = "./input/CDS_H1N1pdm09_Cali09.csv"
# CSV handed to ngs.initialisation as `nucpos_shift`
# (presumably nucleotide-position offsets for the CDS numbering -- TODO confirm against laeb_ngs_pipeline.py)
nucpos_shift = "./input/CDS_shift_H1N1pdm09_Cali09.csv"
# file path to metadata file. 
meta_fname = './results/metadata_w_covbreadth.csv' 

threadnum = 4 # number of CPU threads for parallelization 

# variant calling options
# NOTE: these four values are also formatted into the name of the variant-call CSV that is read below,
# so they must match the settings used when that file was produced.
min_cov = 50 # minimum coverage 
# fills the "MinFreq" field of the variant-call file name
# (presumably the minimum number of reads supporting a variant -- TODO confirm)
min_var_freq = 0
min_var_prop = 0.02 # minimum variant proportion 
err_tol = 0.01 # threshold to which variant called could result from base calling error 
# a sample is analysed only if the breadth of every gene segment is strictly above this value
min_breadth = 0.7 # min breadth of gene segment to be mapped for further analysis 

In [13]:
# presets 
reffasta = ref_fasta_fname

# initialise
# NOTE(review): `nucpos_shift` enters this call as the CSV path set in the inputs cell and is
# rebound to whatever ngs.initialisation returns in that slot. Re-running this cell on its own
# therefore passes the returned object instead of the path -- re-run the inputs cell first.
gene_to_proteinorf, influenza_gene_len, sorted_refnames, nucpos_shift = ngs.initialisation(cds_coords, reffasta, laeb_lib, nucpos_shift=nucpos_shift)
# bare last expression: rendered as the cell output
gene_to_proteinorf


Initialising CDS coordinates...

Check translated protein sequences...
PB2 MERIKELRDLMSQSRTREILTKTTVDHMAIIKKYTSGRQEKNPALRMKWMMAMRYPITADKRIMDMIPERNEQGQTLWSKTNDAGSDRVMVSPLAVTWWNRNGPTTSTVHYPKVYKTYFEKVERLKHGTFGPVHFRNQVKIRRRVDTNPGHADLSAKEAQDVIMEVVFPNEVGARILTSESQLAITKEKKEELQDCKIAPLMVAYMLERELVRKTRFLPVAGGTGSVYIEVLHLTQGTCWEQMYTPGGEVRNDDVDQSLIIAARNIVRRAAVSADPLASLLEMCHSTQIGGVRMVDILRQNPTEEQAVDICKAAIGLRISSSFSFGGFTFKRTSGSSVKKEEEVLTGNLQTLKIRVHEGYEEFTMVGRRATAILRKATRRLIQLIVSGRDEQSIAEAIIVAMVFSQEDCMIKAVRGDLNFVNRANQRLNPMHQLLRHFQKDAKVLFQNWGIESIDNVMGMIGILPDMTPSTEMSLRGIRVSKMGVDEYSSTERVVVSIDRFLRVRDQRGNVLLSPEEVSETQGTEKLTITYSSSMMWEINGPESVLVNTYQWIIRNWEIVKIQWSQDPTMLYNKMEFEPFQSLVPKATRSRYSGFVRTLFQQMRDVLGTFDTVQIIKLLPFAAAPPEQSRMQFSSLTVNVRGSGLRILVRGNSPVFNYNKATKRLTVLGKDAGALTEDPDEGTSGVESAVLRGFLILGKEDKRYGPALSINELSNLAKGEKANVLIGQGDVVLVMKRKRDSSILTDSQTATKRIRMAIN*
PB1 MDVNPTLLFLKIPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRTHQYSEKGKWTTNTETGAPQLNPIDGPLPEDNEPSGYAQTDCVLEAMAFLEESHPGIFENSCLETMEVVQQTRVDKLTQGRQTYDWTLNRNQPAATALANTIEVFRSNGLTANE

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,aa,frame
gene,nuc,protein,Unnamed: 3_level_1,Unnamed: 4_level_1
1-PB2,1,PB2,1,1
1-PB2,2,PB2,1,2
1-PB2,3,PB2,1,3
1-PB2,4,PB2,2,1
1-PB2,5,PB2,2,2
...,...,...,...,...
8-NS,834,NEP,126,2
8-NS,835,NEP,126,3
8-NS,836,NEP,127,1
8-NS,837,NEP,127,2


# Read metadata and variant calling files

In [14]:
# metadata
## the metadata table must carry a 'sampid' column, which is used as the sample identifier
meta_df = pd.read_csv(meta_fname)
sorted_sampid = sorted(set(meta_df['sampid']))
meta_df = meta_df.set_index("sampid")
display(meta_df.head())

# variant calls -- the file name encodes the thresholds chosen in the inputs cell
variant_call_fname = "./results/variant_call_MinCoV%i_MinProp%.2f_MinFreq%i_ErrTol%.2f.csv"%(min_cov, min_var_prop, min_var_freq, err_tol)
variant_call_df = pd.read_csv(variant_call_fname).set_index(["sampid", 'gene'])
display(variant_call_df.head())

Unnamed: 0_level_0,subject_id,enrolD,household,date,project,short_name,CT,age,sex,timepoint,...,ost_ed,ost_days,1-PB2,2-PB1,3-PA,4-HA,5-NP,6-NA,7-M,8-NS
sampid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3-20,11-001,0,,13/11/09,Hanoi,11-001-d0,17.0,10.0,Female,2.0,...,11.0,1;11,1.0,0.978012,1.0,0.970606,1.0,1.0,1.0,1.0
3-22,11-002,0,,17/11/09,Hanoi,11-002-d0,17.28,18.0,Female,4.0,...,13.0,3;13,1.0,1.0,1.0,0.985303,1.0,1.0,1.0,1.0
3-24,11-006,2,,12/11/09,Hanoi,11-006-d2,27.42,38.5,Male,4.0,...,9.0,1;9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3-1,11-1015,0,,22/10/09,Hanoi,11-1015-d0,25.0,39.4,Female,2.0,...,7.0,3;7,0.945175,0.956025,1.0,1.0,1.0,1.0,1.0,1.0
3-3,11-1015,4,,27/10/09,Hanoi,11-1015-d4,33.09,39.4,Female,7.0,...,7.0,3;7,0.320175,0.098505,0.198047,0.764844,0.832999,1.0,1.0,1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,nucpos,nuc_var,nuc_prop,nuc_freq,nuc_consensus,nuc_consensus_freq,nuc_coverage,protein,aapos,HA_num_type,aa_var,aa_prop,aa_freq,expected_aa_consensus,aa_consensus,aa_consensus_freq,aa_coverage,codon_pos,codon,nonsyn
sampid,gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2-17,3-PA,204,A,0.153846,14,G,77,91,PA,68,,P,0.162791,14.0,P,P,72.0,86.0,3,CCA,0.0
2-17,3-PA,204,A,0.153846,14,G,77,91,PA-X,68,,P,0.162791,14.0,P,P,72.0,86.0,3,CCA,0.0
2-17,3-PA,1321,C,0.142857,12,A,72,84,PA,441,,L,0.148148,12.0,M,M,69.0,81.0,1,CTG,1.0
2-17,3-PA,1848,A,0.130435,12,G,80,92,PA,616,,S,0.134831,12.0,S,S,77.0,89.0,3,TCA,0.0
2-17,4-HA,249,A,0.044776,3,G,64,67,HA,75,H3,E,0.044776,3.0,E,E,64.0,67.0,3,GAA,0.0


# Get consensus and putative minority haplotype sequences

In [15]:
# make sure the output folder for sequences / trees exists
os.makedirs("./reference", exist_ok=True)

def get_consensus_minority_seq(sampid):
    """Build the consensus and putative minority-haplotype sequences of one sample.

    Reads ./results/map_nuc_results_<sampid>.csv and relies on the notebook-level
    objects `meta_df`, `variant_call_df`, `sorted_refnames`, `influenza_gene_len`
    and `min_breadth`.

    Parameters
    ----------
    sampid
        Sample identifier: an index label of `meta_df` and a first-level index
        label of `variant_call_df`.

    Returns
    -------
    list of dict
        One row per sequence with keys "sampid", "gene", "seq" and "seqtype".
        "seqtype" is "C" for the consensus sequence and "M1", "M2", ... for the
        minority haplotypes. The list is empty when any gene segment of the
        sample does not have a breadth strictly above `min_breadth`.

    Notes
    -----
    * Positions 1..influenza_gene_len[gene] without a row in the mapping results
      are written as "-".
    * A minority haplotype carries the called variant at every variant position
      of the gene; when a position has several variants, every combination
      across positions is emitted (itertools.product), so the number of rows
      grows multiplicatively with multi-variant positions.
    """
    gene_to_consensus_seq_row = []

    # check that all gene segments satisfy min_breadth
    if not (meta_df.loc[sampid, sorted_refnames] > min_breadth).all():
        return gene_to_consensus_seq_row

    # read map_nuc_results
    map_nuc_results = pd.read_csv('./results/map_nuc_results_%s.csv'%(sampid))
    map_nuc_results = map_nuc_results.set_index(["Gene", "Position"])
    # one (gene, position) -> base dictionary gives constant-time lookups instead of
    # rebuilding and scanning the full index list for every position
    consensus_lookup = map_nuc_results["Consensus"].to_dict()

    # variant calls of this sample; a sample without any called variant is simply
    # absent from variant_call_df
    try:
        sample_vcf = variant_call_df.loc[sampid]
    except KeyError:
        sample_vcf = None

    # sorted so that the row order is reproducible between runs
    for gene in sorted(set(map_nuc_results.index.get_level_values(0))):
        gene_positions = range(1, influenza_gene_len[gene]+1)

        # add consensus sequence
        consensus_bases = [consensus_lookup.get((gene, position), "-") for position in gene_positions]
        gene_to_consensus_seq_row.append({"sampid":sampid, "gene":gene, "seq":"".join(consensus_bases), "seqtype":"C"})

        # add putative minority haplotype sequence(s); nothing to add when the
        # sample, or this gene of the sample, has no called variants
        if sample_vcf is None:
            continue
        try:
            gene_sample_vcf = sample_vcf.loc[gene].copy()
        except KeyError:
            continue

        # a single matching row comes back as a Series - turn it into a one-row frame
        if isinstance(gene_sample_vcf, pd.Series):
            gene_sample_vcf = gene_sample_vcf.to_frame().T

        # the same nucleotide variant can be listed once per overlapping protein
        gene_sample_vcf = gene_sample_vcf.drop_duplicates(["nucpos", "nuc_var"])

        # position -> list of variant nucleotides (more than one for multi-variant sites)
        pos_to_var = {}
        for pos, nuc_var in zip(gene_sample_vcf["nucpos"], gene_sample_vcf["nuc_var"]):
            pos_to_var.setdefault(pos, []).append(nuc_var)

        minseq = [pos_to_var[pos] if pos in pos_to_var else [base] for pos, base in zip(gene_positions, consensus_bases)]
        for midx, mseq in enumerate(itertools.product(*minseq)):
            gene_to_consensus_seq_row.append({"sampid":sampid, "gene":gene, "seq":"".join(mseq), "seqtype":"M%i"%(midx+1)})

    return gene_to_consensus_seq_row

# Consensus / minority sequences of all samples, cached on disk because they are slow to build.
consensus_csv = "./reference/consensus_sequences.csv"

if os.path.isfile(consensus_csv):
    # reuse the table saved by a previous run
    gene_to_consensus_seq = pd.read_csv(consensus_csv)
else:
    # one task per sample; the context manager shuts the worker pool down once all
    # results have been collected
    with mp.Pool(processes=threadnum) as pool:
        output = pool.map(get_consensus_minority_seq, sorted_sampid)

    # flatten the per-sample row lists (samples failing the breadth filter return an empty list)
    consensus_rows = [row for sample_rows in output for row in sample_rows]
    # explicit columns keep the table well-formed even when no sample passed the filter
    gene_to_consensus_seq = pd.DataFrame(consensus_rows, columns=["sampid", "gene", "seq", "seqtype"])

    # save to file
    gene_to_consensus_seq.to_csv(consensus_csv, index=False)

gene_to_consensus_seq = gene_to_consensus_seq.set_index(["sampid", "gene", "seqtype"])
display(gene_to_consensus_seq.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seq
sampid,gene,seqtype,Unnamed: 3_level_1
2-18,8-NS,C,ATGGACTCCAACACCATGTCAAGCTTTCAGGTAGACTGTTTCCTTT...
2-18,8-NS,M1,ATGGACTCCAACACCATGTCAAGCTTTCAGGTAGACTGTTTCCTTT...
2-18,5-NP,C,ATGGCGTCTCAAGGCACCAAACGATCATATGAACAAATGGAGACTG...
2-18,5-NP,M1,ATGGCGTCTCAAGGCACCAAACGATCATATGAACAAATGGAGACTG...
2-18,7-M,C,ATGAGTCTTCTAACCGAGGTCGAAACGTACGTTCTTTCTATCATCC...


# Concatenate consensus sequences of all gene segments 

In [19]:
reference_genome = fc.parsefasta(reffasta)

# name of the reference record = reference file name without its ".fasta" extension
ref_name = re.sub(r"\.fasta$", "", ref_fasta_fname.split("/")[-1])
wgs_fasta = "./reference/concatenated_wgs.fasta"
aligned_fasta = "./reference/mafft_concatenated_wgs.fasta"

# write fasta: reference first, then for every sample its concatenated consensus
# genome followed by its putative minority haplotype genomes
if not os.path.isfile(wgs_fasta):
    with open(wgs_fasta, "w") as fasta_out:
        fasta_out.write(">%s\n%s\n"%(ref_name,
                                     "".join([reference_genome[gene] for gene in sorted_refnames]).upper()))

        # sorted so that the record order is reproducible between runs
        for sampid in sorted(set(gene_to_consensus_seq.index.get_level_values(0))):
            sample_gene_to_consensus_seq = gene_to_consensus_seq.loc[sampid]

            # write concatenated consensus sequence
            consensus_by_gene = [sample_gene_to_consensus_seq.loc[(gene, "C"), "seq"] for gene in sorted_refnames]
            fasta_out.write(">%s\n%s\n"%("%s|%s"%(sampid, "C"), "".join(consensus_by_gene)))

            # write putative minority haplotype sequences
            for seqtype in sorted(set(sample_gene_to_consensus_seq.index.get_level_values(1))):
                if seqtype == "C":
                    continue

                # a gene without this minority haplotype falls back to its consensus sequence
                minority_by_gene = [
                    sample_gene_to_consensus_seq.loc[(gene, seqtype), "seq"]
                    if (gene, seqtype) in sample_gene_to_consensus_seq.index else consensus_seq
                    for gene, consensus_seq in zip(sorted_refnames, consensus_by_gene)
                ]
                fasta_out.write(">%s\n%s\n"%("%s|%s"%(sampid, seqtype), "".join(minority_by_gene)))

# align with mafft (writes the alignment to stdout, which is captured into the output file)
if not os.path.isfile(aligned_fasta):
    try:
        with open(aligned_fasta, "w") as aligned_out:
            subprocess.run(["mafft", wgs_fasta], stdout=aligned_out, check=True)
    except (OSError, subprocess.CalledProcessError):
        # do not leave an empty/partial alignment behind: the guard above would treat
        # it as a finished result on the next run
        if os.path.isfile(aligned_fasta):
            os.remove(aligned_fasta)
        raise

# Reconstruct phylogenetic tree

In [22]:
# build tree
# IQ-TREE names its outputs after the alignment given to -s, so the tree is written to
# ./reference/mafft_concatenated_wgs.fasta.treefile (the file read when the tree is loaded).
# Guard on that file so a finished run is not repeated.
if not os.path.isfile("./reference/mafft_concatenated_wgs.fasta.treefile"):
    cmd = ['iqtree', '-nt', 'AUTO', '-ntmax', str(threadnum),
           '-m', 'GTR+I+G4',
           '-s', './reference/mafft_concatenated_wgs.fasta']
    subprocess.call(cmd)

# Generate newick tree for printing and annotation dataframe

In [23]:
# generate ggtree dataframe and newick tree file

# name of the reference tip = reference file name without its ".fasta" extension
ref_name = re.sub(r"\.fasta$", "", ref_fasta_fname.split("/")[-1])
# subjects whose minority haplotypes keep their haplotype index in the tip label
numbered_minor_subjects = ("H296/S03", "H089/S04")

ete3_tree = ete3.Tree("./reference/mafft_concatenated_wgs.fasta.treefile")
ete3_tree.set_outgroup(ref_name)
ete3_tree.ladderize()

# one annotation row per tip
annotation_rows = []
for leaf in ete3_tree.get_leaves():
    if leaf.name == ref_name:
        # reference tip: keep its name as is, it has no subject
        print_name = leaf.name
        subject_id = ""
        seq_type = "Consensus"
    else:
        # sample tips are named "<sampid>|<seqtype>"
        sampid, stype = leaf.name.split("|")
        subject_id = meta_df.loc[sampid, "subject_id"]

        timepoint = meta_df.loc[sampid, "timepoint"]
        timepoint = "D-NaN" if pd.isna(timepoint) else "D%i"%(int(timepoint))

        if stype == "C":
            seq_type = "Consensus"
        elif subject_id in numbered_minor_subjects:
            minority_idx = int(re.search(r"M(\d+)$", leaf.name).group(1))
            seq_type = "Minor-%i"%(minority_idx)
        else:
            seq_type = "Minor"

        print_name = "_".join([subject_id, timepoint, seq_type])

    annotation_rows.append({"index":leaf.name, "print_name":print_name,
                            "show_name":1,
                            "show_tip":1 if seq_type == "Consensus" else 0,
                            "subject_id":subject_id})

# save for ggtree use
tree_annotations = pd.DataFrame(annotation_rows).set_index("index")
tree_annotations.to_csv("./reference/ggtree_concatenated_wgs.meta.csv")
ete3_tree.write(outfile="./reference/ggtree_concatenated_wgs.nwk", format=1)