# Import libraries

In [1]:
from os.path import expanduser
from importlib.machinery import SourceFileLoader
from tqdm.notebook import tqdm

import pandas as pd 
import numpy as np
import re
import os
import itertools 

import multiprocessing as mp 
import ete3
import subprocess

from matplotlib import cm
import matplotlib as mpl
import matplotlib.pyplot as plt
import random

# load custom flu and ngs libraries
laeb_lib = expanduser("../PYTHONLIB") # folder where custom libraries are saved

def _load_local_module(module_name, module_path):
    """Import the Python source file at ``module_path`` and return it as a module.

    Replaces ``SourceFileLoader(...).load_module()``, which is a deprecated
    importlib API, with the spec-based ``exec_module`` recipe.

    Parameters
    ----------
    module_name : str
        Name under which the module is registered in ``sys.modules``.
    module_path : str
        Path to the ``.py`` file to execute.

    Raises
    ------
    ImportError
        If no import spec/loader can be created for ``module_path``.
    """
    # function-scope imports keep this cell self-contained
    import importlib.util
    import sys

    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise ImportError("cannot load module %r from %s"%(module_name, module_path))

    module = importlib.util.module_from_spec(spec)
    # register before executing, as load_module() did, so the module can be found by name
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # do not leave a half-initialised module behind
        sys.modules.pop(module_name, None)
        raise
    return module

fc = _load_local_module('fc', "%s/flu_common.py"%(laeb_lib))
ngs = _load_local_module('ngs', "%s/laeb_ngs_pipeline.py"%(laeb_lib))



# Inputs 

In [2]:
# ---- input locations ----
# data folder; the fastq files to be analysed must sit in {data folder}/raw
data_folder = "./data"
# reference FASTA file (should be placed in the input folder)
ref_fasta_fname = "./input/H3N2_Bris07.fasta"
# CSV file with the coding sequence (CDS) regions of each gene segment;
# numbering should follow the reference sequence given above
cds_coords = "./input/CDS_H3N2_Bris07.csv"
# metadata file
meta_fname = "./results/metadata_w_covbreadth.csv"

# number of CPU threads for parallelization
threadnum = 4

# ---- variant calling options ----
# minimum coverage
min_cov = 100
# minimum number of variants to be called
min_var_freq = 0
# minimum variant proportion
min_var_prop = 0.02
# threshold to which a called variant could result from base calling error
err_tol = 0.01
# minimum breadth of a gene segment that must be mapped for further analysis
min_breadth = 0.7

In [3]:
# presets: the reference FASTA path is used under a shorter alias from here on
reffasta = ref_fasta_fname

# initialise the pipeline; the call prints its own progress log and returns
# the four lookup objects unpacked below
(
    gene_to_proteinorf,
    influenza_gene_len,
    sorted_refnames,
    nucpos_shift,
) = ngs.initialisation(cds_coords, reffasta, laeb_lib)


Initialising CDS coordinates...

Check translated protein sequences...
PB2 MERIKELRNLMSQSRTREILTKTTVDHMAIIKKYTSGRQEKNPSLRMKWMMAMKYPITADKRITEMVPERNEQGQTLWSKMSDAGSDRVMVSPLAVTWWNRNGPVTSTVHYPKVYKTYFDKVERLKHGTFGPVHFRNQVKIRRRVDINPGHADLSAKEAQDVIMEVVFPNEVGARILTSESQLTITKEKKEELRDCKISPLMVAYMLERELVRKTRFLPVAGGTSSIYIEVLHLTQGTCWEQMYTPGGGVRNDDVDQSLIIAARNIVRRAAVSADPLASLLEMCHSTQIGGTRMVDILRQNPTEEQAVDICKAAMGLRISSSFSFGGFTFKRTSGSSVKKEEEVLTGNLQTLKIRVHEGYEEFTMVGKRATAILRKATRRLVQLIVSGRDEQSIAEAIIVAMVFSQEDCMIKAVRGDLNFVNRANQRLNPMHQLLRHFQKDAKVLFQNWGVEHIDSVMGMIGVLPDMTPSTEMSMRGIRVSKMGVDEYSSTERVVVSIDRFLRVRDQRGNVLLSPEEVSETQGTERLTITYSSSMMWEINGPESVLVNTYQWIIRNWEAVKIQWSQNPAMLYNKMEFEPFQSLVPKAIRSQYSGFVRTLFQQMRDVLGTFDTTQIIKLLPFAAAPPKQSRMQFSSLTVNVRGSGMRILVRGNSPVFNYNKTTKRLTILGKDAGTLIEDPDESTSGVESAVLRGFLIIGKEDRRYGPALSINELSNLAKGEKANVLIGQGDVVLVMKRKRDSSILTDSQTATKRIRMAIN*
PB1 MDVNPTLLFIKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRTHQYSEKGKWTTNTETGAPQLNPIDGPLPEDNEPSGYAQTDCVLEAMAFLEESHPGIFENSCLETMEAVQQTRVDKLTQGRQTYDWTLNRNQPAATALANTIEVFRSNGLTANE

# Read metadata and variant calling files

In [4]:
# --- metadata ---
# the table must carry a 'sampid' column; it is the sample identifier
meta_df = pd.read_csv(meta_fname)
# unique sample identifiers in sorted order (taken before 'sampid' becomes the index)
sorted_sampid = sorted(set(meta_df['sampid']))

meta_df = meta_df.set_index("sampid")
display (meta_df.head())

# --- variant calls ---
# the file name encodes the variant calling options it was produced with
variant_call_fname = "./results/variant_call_MinCoV%i_MinProp%.2f_MinFreq%i_ErrTol%.2f.csv"%(
    min_cov, min_var_prop, min_var_freq, err_tol)
variant_call_df = pd.read_csv(variant_call_fname)
# index rows by sample, gene segment and nucleotide position
variant_call_df = variant_call_df.set_index(["sampid", 'gene', "nucpos"])
display (variant_call_df.head())

Unnamed: 0_level_0,subject_id,enrolD,idx,ct,SampleType,aliquot,run,primer_i5,primer_i7,enrol-onset,...,timepoint,patch,1-PB2,2-PB1,3-PA,4-HA,5-NP,6-NA,7-M,8-NS
sampid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H3N2_1205_T0_T_S517_N701_R,1205,0,R1,26,T,1,1,S517,N701,6.0,...,6.0,0,1.0,1.0,0.962963,1.0,1.0,0.844828,1.0,0.945946
H3N2_1205_T1_N_S517_N702_R,1205,1,R2,25,N,1,1,S517,N702,6.0,...,7.0,0,1.0,1.0,1.0,1.0,1.0,0.931034,1.0,1.0
H3N2_1205_T2_T_S517_N703_R,1205,2,R3,30,T,1,1,S517,N703,6.0,...,8.0,0,1.0,1.0,0.907407,0.985294,0.866667,0.844828,1.0,1.0
H3N2_1205_T3_T_S517_N704_R,1205,3,R4,29,T,1,1,S517,N704,6.0,...,9.0,0,1.0,1.0,0.962963,1.0,0.966667,0.844828,1.0,1.0
H3N2_1205_T4_T_S517_N705_R,1205,4,R5,28,T,1,1,S517,N705,6.0,...,10.0,0,1.0,1.0,0.962963,1.0,1.0,0.844828,1.0,1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,nuc_var,nuc_prop,nuc_freq,nuc_consensus,nuc_consensus_freq,nuc_coverage,protein,aapos,HA_num_type,aa_var,aa_prop,aa_freq,expected_aa_consensus,aa_consensus,aa_consensus_freq,aa_coverage,codon_pos,codon,nonsyn
sampid,gene,nucpos,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H3N2_1205_T0_T_S517_N701_R,1-PB2,1221,A,0.080098,293,G,3358,3658,PB2,407,,E,0.082117,287.0,E,E,3196.0,3495.0,3,GAA,1.0
H3N2_1205_T0_T_S517_N701_R,4-HA,939,G,0.07357,265,A,3336,3602,HA,297,H3,V,0.074179,262.0,V,V,3267.0,3532.0,3,GTG,1.0
H3N2_1205_T0_T_S517_N701_R,4-HA,1275,C,0.044221,88,T,1902,1990,HA,409,H3,L,0.042916,83.0,L,L,1839.0,1934.0,3,CTC,1.0
H3N2_1205_T0_T_S517_N701_R,5-NP,1058,T,0.03274,125,C,3689,3818,NP,353,,F,0.03327,122.0,S,S,3531.0,3667.0,2,TTT,0.0
H3N2_1205_T0_T_S517_N701_R,5-NP,1150,A,0.123601,276,G,1955,2233,NP,384,,R,0.124709,268.0,G,G,1871.0,2149.0,1,AGG,0.0


# Get consensus and putative minority haplotype sequences for each sample

In [5]:
# create the output folder for reference/consensus files; exist_ok avoids the
# check-then-create race of a separate isdir() test followed by mkdir()
os.makedirs("./reference", exist_ok=True)

def get_consensus_minority_seq(sampid):
    """Build consensus and putative minority haplotype sequences for one sample.

    Reads the notebook-level names ``meta_df``, ``sorted_refnames``,
    ``min_breadth``, ``variant_call_df`` and ``influenza_gene_len``, plus the
    per-sample file ``./results/map_nuc_results_<sampid>.csv``.

    Parameters
    ----------
    sampid : str
        Sample identifier (row label of ``meta_df``).

    Returns
    -------
    list of dict
        One record per sequence with keys ``sampid``, ``gene``, ``seq`` and
        ``seqtype``. ``seqtype`` is ``"C"`` for the consensus and ``"M1"``,
        ``"M2"``, ... for each combination of called variants. The list is
        empty when any gene segment of the sample does not exceed
        ``min_breadth``.
    """
    gene_to_consensus_seq_row = []

    # every gene segment must exceed min_breadth, otherwise the sample is skipped
    if not (meta_df.loc[sampid, sorted_refnames] > min_breadth).all():
        return gene_to_consensus_seq_row

    # read map_nuc_results
    map_nuc_results = pd.read_csv('./results/map_nuc_results_%s.csv'%(sampid))
    map_nuc_results = map_nuc_results.set_index(["Gene", "Position"])

    # variant calls of this sample; a sample without any called variant is not in the index
    try:
        sample_vcf = variant_call_df.loc[sampid]
    except KeyError:
        sample_vcf = None

    # unique() keeps file order, so the output rows are reproducible (a set is not)
    for gene in map_nuc_results.index.get_level_values(0).unique():
        gene_positions = range(1, influenza_gene_len[gene]+1)

        # add consensus sequence; positions without a mapped base become "-"
        # (one dict lookup per position instead of rebuilding the index list each time)
        consensus_by_pos = map_nuc_results.xs(gene, level=0)["Consensus"].to_dict()
        sequence = {position: consensus_by_pos.get(position, "-") for position in gene_positions}
        conseq = "".join(sequence[pos] for pos in gene_positions)
        gene_to_consensus_seq_row.append({"sampid":sampid, "gene":gene, "seq":conseq, "seqtype":"C"})

        # add putative minority haplotype sequence
        if sample_vcf is None:
            continue
        try:
            gene_sample_vcf = sample_vcf.loc[gene]
        except KeyError:
            # no variant called on this gene segment
            continue

        if isinstance(gene_sample_vcf, pd.Series):
            gene_sample_vcf = gene_sample_vcf.to_frame().T

        # "nucpos" is an index level of variant_call_df, so after the two .loc
        # selections it is the row index rather than a column; turn it back into
        # a column before de-duplicating on it
        if "nucpos" not in gene_sample_vcf.columns:
            gene_sample_vcf = gene_sample_vcf.rename_axis("nucpos").reset_index()
        gene_sample_vcf = gene_sample_vcf.drop_duplicates(["nucpos", "nuc_var"])

        # position -> list of variant nucleotides (several entries at multi-variant sites)
        pos_to_var = {}
        for pos, nuc_var in zip(gene_sample_vcf["nucpos"], gene_sample_vcf["nuc_var"]):
            pos_to_var.setdefault(pos, []).append(nuc_var)

        # candidate bases per position: the called variant(s) where present, the consensus base otherwise
        minseq = [pos_to_var[pos] if pos in pos_to_var else [sequence[pos]] for pos in gene_positions]
        # one haplotype per combination of variants; iterate lazily because the
        # number of combinations multiplies with every multi-variant position
        for midx, mseq in enumerate(itertools.product(*minseq), start=1):
            gene_to_consensus_seq_row.append({"sampid":sampid, "gene":gene, "seq":"".join(mseq), "seqtype":"M%i"%(midx)})

    return gene_to_consensus_seq_row

# reuse the cached table when present; otherwise compute every sample in parallel
if os.path.isfile("./reference/consensus_sequences.csv"):
    gene_to_consensus_seq = pd.read_csv("./reference/consensus_sequences.csv")
else:
    # the context manager shuts the worker processes down once all results are
    # in (the pool was previously never closed); map() preserves input order
    with mp.Pool(processes=threadnum) as pool:
        output = pool.map(get_consensus_minority_seq, sorted_sampid)

    # flatten the per-sample record lists, dropping samples that returned nothing;
    # explicit columns keep the set_index() below valid even with zero records
    gene_to_consensus_seq = pd.DataFrame(
        [row for sample_rows in output if sample_rows for row in sample_rows],
        columns=["sampid", "gene", "seq", "seqtype"])

    # save to file
    gene_to_consensus_seq.to_csv("./reference/consensus_sequences.csv", index=False)

gene_to_consensus_seq = gene_to_consensus_seq.set_index(["sampid", "gene", "seqtype"])
display (gene_to_consensus_seq.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,seq
sampid,gene,seqtype,Unnamed: 3_level_1
H3N2_1205_T0_T_S517_N701_R,3-PA,C,ATGGAAGATTTTGTGCGACAATGCTTCAACCCGATGATTGTCGAAC...
H3N2_1205_T0_T_S517_N701_R,4-HA,C,ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCG...
H3N2_1205_T0_T_S517_N701_R,4-HA,M1,ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCG...
H3N2_1205_T0_T_S517_N701_R,5-NP,C,ATGGCGTCCCAAGGCACCAAACGGTCTTATGAACAGATGGAAACTG...
H3N2_1205_T0_T_S517_N701_R,5-NP,M1,ATGGCGTCCCAAGGCACCAAACGGTCTTATGAACAGATGGAAACTG...


# Concatenate consensus sequences of all gene segments 

In [6]:
reference_genome = fc.parsefasta(reffasta)

# write fasta (skipped when the file already exists)
if not os.path.isfile("./reference/concatenated_wgs.fasta"):
    # reference record: header is the reference file name without ".fasta"
    # (raw string: "\." is an invalid escape in a normal string literal)
    reference_header = re.sub(r"\.fasta$", "", ref_fasta_fname.split("/")[-1])
    reference_sequence = "".join([reference_genome[gene] for gene in sorted_refnames]).upper()
    # strip this fixed 3' tail from the concatenated reference
    # NOTE(review): presumably trims the reference to the same length as the sample sequences - confirm
    reference_sequence = re.sub("TGATAAAAAACACCCTTGTTTCTAC$", "", reference_sequence)

    # Collect all records first and write them in one go at the end: a lookup
    # failure part-way through would otherwise leave a truncated FASTA that the
    # isfile() guard above accepts as complete on the next run.
    fasta_records = [(reference_header, reference_sequence)]

    # sorted() makes the record order reproducible (iterating a set is not)
    for sampid in sorted(set(gene_to_consensus_seq.index.get_level_values(0))):
        sample_gene_to_consensus_seq = gene_to_consensus_seq.loc[sampid]

        # concatenated consensus sequence, gene segments in sorted_refnames order
        consensus_by_gene = {gene: sample_gene_to_consensus_seq.loc[(gene, "C"), "seq"] for gene in sorted_refnames}
        fasta_records.append(("%s|%s"%(sampid, "C"),
                              "".join([consensus_by_gene[gene] for gene in sorted_refnames])))

        # putative minority haplotype sequences: for each seqtype use the gene's
        # sequence of that seqtype when it exists, else the gene's consensus
        minority_seqtypes = set(sample_gene_to_consensus_seq.index.get_level_values(1)) - {"C"}
        for seqtype in sorted(minority_seqtypes, key=lambda label: (len(label), label)):
            sequence = []
            for gene in sorted_refnames:
                if (gene, seqtype) in sample_gene_to_consensus_seq.index:
                    sequence.append(sample_gene_to_consensus_seq.loc[(gene, seqtype), "seq"])
                else:
                    sequence.append(consensus_by_gene[gene])
            fasta_records.append(("%s|%s"%(sampid, seqtype), "".join(sequence)))

    with open("./reference/concatenated_wgs.fasta", "w") as output:
        for header, sequence in fasta_records:
            output.write(">%s\n%s\n"%(header, sequence))

# Reconstruct phylogenetic tree

In [7]:
# build tree (skipped when the treefile already exists)
if not os.path.isfile("./reference/concatenated_wgs.fasta.treefile"):
    cmd = ['iqtree', '-nt', 'AUTO', '-ntmax', str(threadnum),
           '-m', 'GTR+I+G4',
           '-s', './reference/concatenated_wgs.fasta']
    # check=True raises CalledProcessError on a non-zero exit status, so a failed
    # run stops here instead of surfacing later as a missing treefile
    subprocess.run(cmd, check=True)

# Generate newick tree for printing and annotation dataframe

In [8]:
# generate ggtree dataframe and newick tree file
# get annotations
tree_annotations = []

# name of the reference tip = reference file name without ".fasta"
# (raw string: "\." is an invalid escape in a normal string literal)
outgroup_name = re.sub(r"\.fasta$", "", ref_fasta_fname.split("/")[-1])

# NOTE(review): this tree is read with ete3's default newick format, whereas the
# mixed-infection cell reads the same file with format=1 - confirm which is intended
ete3_tree = ete3.Tree("./reference/concatenated_wgs.fasta.treefile")
ete3_tree.set_outgroup(outgroup_name)
ete3_tree.ladderize()

for leaf in ete3_tree.get_leaves():
    # reference (outgroup) tip: keep its name and give it no subject.
    # Compared against the derived outgroup name rather than a hard-coded
    # literal so this stays in sync with ref_fasta_fname.
    if (leaf.name == outgroup_name):
        print_name = leaf.name
        subject_id = ""
        seq_type = "Consensus"
    else:
        # sample tips are named "<sampid>|<seqtype>" with sampid starting "H3N2_<subject id>"
        subject_match = re.search(r"H3N2_(\d+)", leaf.name)
        if subject_match is None:
            raise ValueError("cannot parse subject id from leaf name %r"%(leaf.name))
        subject_id = subject_match.group(1)
        sampid = re.sub(r"\|.+$", "", leaf.name)

        timepoint = meta_df.loc[sampid, "timepoint"]
        if pd.isna(timepoint):
            timepoint = "D-NaN"
        else:
            timepoint = "D%i"%(int(timepoint))

        if re.search("C$", leaf.name):
            seq_type = "Consensus"
        else:
            # only subject 1707 gets numbered minority labels
            # NOTE(review): presumably the only subject with several minority haplotypes - confirm
            if subject_id == "1707":
                minority_idx = int(re.search(r"M(\d+)$", leaf.name).group(1))
                seq_type = "Minor-%i"%(minority_idx)
            else:
                seq_type = "Minor"

        print_name = "_".join([subject_id, timepoint, seq_type])

    # show_tip is switched on for subjects 1673 and 1878 only
    tree_annotations.append({"index":leaf.name, "print_name":print_name,
                                 "show_name":1,
                                 "show_tip":1 if subject_id in ["1673", "1878"] else 0,
                                 "subject_id":subject_id})

# save for ggtree use
tree_annotations = pd.DataFrame(tree_annotations).set_index("index")
tree_annotations.to_csv("./reference/ggtree_concatenated_wgs.meta.csv")
ete3_tree.write(outfile="./reference/ggtree_concatenated_wgs.nwk", format=1)

# Parse concatenated genome tree and identify samples that may have derived from mixed infections/contaminated samples

In [9]:
tree = ete3.Tree('./reference/concatenated_wgs.fasta.treefile', format=1)
# raw string: "\." is an invalid escape in a normal string literal
tree.set_outgroup(re.sub(r"\.fasta$", "", ref_fasta_fname.split("/")[-1]))
tree.ladderize()

subject_to_ca = {}            # subject id -> tree node that is the common ancestor of all its tips
subject_header_rows = []      # one {'subject_id', 'header'} record per tree tip of a subject
sampid_mixed_infections = {}  # subject id -> tip names under its common ancestor (non-monophyletic subjects only)

analysed_sampids = set(gene_to_consensus_seq.index.get_level_values(0))

# sorted() gives a reproducible processing/printing order
for subject_id in sorted(set(meta_df['subject_id'])):

    subject_meta_df = meta_df[meta_df['subject_id']==subject_id]
    subject_sampid_analysed = sorted(set(subject_meta_df.index)&analysed_sampids)
    if (len(subject_sampid_analysed)) == 0:
        continue

    # tree tip names ("<sampid>|<seqtype>") of every sequence of this subject
    all_subject_headers = []
    for sampid in subject_sampid_analysed:
        sample_gene_to_consensus_seq = gene_to_consensus_seq.loc[sampid]
        all_subject_headers += ["%s|%s"%(sampid, seqtype) for seqtype in set(sample_gene_to_consensus_seq.index.get_level_values(1))]

    for header in all_subject_headers:
        subject_header_rows.append({'subject_id':subject_id, 'header':header})

    if len(all_subject_headers) == 1:
        # Given a single node, get_common_ancestor() pairs it with the tree
        # itself, so the subject would always look non-monophyletic. A lone tip
        # is its own ancestor.
        ca = tree.get_leaves_by_name(all_subject_headers[0])[0]
    else:
        ca = tree.get_common_ancestor(all_subject_headers)
    subject_to_ca[subject_id] = ca

    # all samples from subject are completely monophyletic
    if set(all_subject_headers) == set(ca.get_leaf_names()):
        continue
    else:
        # possibly mixed
        sampid_mixed_infections[subject_id] = ca.get_leaf_names()

subject_to_headers = pd.DataFrame(subject_header_rows, columns=['subject_id', 'header'])

for subject_id, clade_headers in sampid_mixed_infections.items():
    all_subject_headers = list(subject_to_headers[subject_to_headers['subject_id']==subject_id]['header'])
    other_subject_headers = list(set(clade_headers)-set(all_subject_headers))
    subject_ca = subject_to_ca[subject_id]
    # computed once per subject instead of once per foreign subject
    subject_clade_nodes = set(subject_ca.get_descendants())

    # subjects owning the foreign tips found inside this subject's clade
    other_subjects = set(subject_to_headers[subject_to_headers['header'].isin(other_subject_headers)]['subject_id'])

    # flag 1 when a foreign subject's own common ancestor is not a descendant of
    # this subject's common ancestor, i.e. that subject is not nested inside this clade
    mixed_boolean = int(any(subject_to_ca[other_subject] not in subject_clade_nodes
                            for other_subject in other_subjects))

    print (subject_id, mixed_boolean)

1418 0
1673 0
1682 1
1878 1


In [10]:
# subjects to inspect more closely
outlier_subjects = [1673, 1878]
for subject_id in outlier_subjects:
    # metadata rows of this subject (one per sample)
    subject_rows = meta_df[meta_df['subject_id']==subject_id]
    display (subject_rows)
    for sampid in subject_rows.index:
        # number of distinct (gene, position, variant nucleotide) calls in the sample
        sample_calls = variant_call_df.loc[sampid].reset_index()
        distinct_calls = sample_calls.drop_duplicates(['gene', 'nucpos', 'nuc_var'])
        print (subject_id, sampid, len(distinct_calls))

Unnamed: 0_level_0,subject_id,enrolD,idx,ct,SampleType,aliquot,run,primer_i5,primer_i7,enrol-onset,...,timepoint,patch,1-PB2,2-PB1,3-PA,4-HA,5-NP,6-NA,7-M,8-NS
sampid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H3N2_1673_T0_T_S507_N701_R,1673,0,R73,30,T,1,1,S507,N701,3.0,...,3.0,0,1.0,1.0,0.944444,1.0,1.0,0.844828,1.0,1.0
H3N2_1673_T1_N_S507_N702_R,1673,1,R74,30,N,1,1,S507,N702,3.0,...,4.0,0,1.0,1.0,1.0,1.0,1.0,0.982759,1.0,1.0
H3N2_1673_T2_T_S507_N703_R,1673,2,R75,34,T,1,1,S507,N703,3.0,...,5.0,0,1.0,1.0,0.87037,0.926471,1.0,0.844828,1.0,1.0


1673 H3N2_1673_T0_T_S507_N701_R 39
1673 H3N2_1673_T1_N_S507_N702_R 48
1673 H3N2_1673_T2_T_S507_N703_R 94


Unnamed: 0_level_0,subject_id,enrolD,idx,ct,SampleType,aliquot,run,primer_i5,primer_i7,enrol-onset,...,timepoint,patch,1-PB2,2-PB1,3-PA,4-HA,5-NP,6-NA,7-M,8-NS
sampid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H3N2_1878_T0_N_S517_N707_R,1878,0,R199,31,N,1,3,S517,N707,5.0,...,5.0,0,0.966667,1.0,0.962963,0.867647,0.75,1.0,1.0,1.0
H3N2_1878_T1_N_S517_N708_R,1878,1,R200,25,N,1,3,S517,N708,5.0,...,6.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
H3N2_1878_T3_N_S517_N709_R,1878,3,R201,33,N,1,3,S517,N709,5.0,...,8.0,0,0.966667,1.0,0.962963,1.0,0.966667,0.965517,1.0,1.0


1878 H3N2_1878_T0_N_S517_N707_R 9
1878 H3N2_1878_T1_N_S517_N708_R 10
1878 H3N2_1878_T3_N_S517_N709_R 75


# Rewrite metadata without likely mixed infection/contaminated samples

In [11]:
# samples dropped from the metadata: all three samples of subject 1673 and the
# T3 sample of subject 1878
excluded_sampids = [
    'H3N2_1673_T0_T_S507_N701_R',
    'H3N2_1673_T1_N_S507_N702_R',
    'H3N2_1673_T2_T_S507_N703_R',
    'H3N2_1878_T3_N_S517_N709_R',
]
keep_mask = ~meta_df.index.isin(excluded_sampids)
meta_df = meta_df[keep_mask]
meta_df.to_csv("./results/metadata_wo_mixed_infections.csv")