## Genetic comparison between the dominant strain and vaccine strain and reproducibly selected strains

## 0. General

### 0.1. Load libraries

In [1]:
import os, dendropy, math, sys
import pandas as pd, numpy as np
from Bio import SeqIO
import calendar

from datetime import date, timedelta
from dateutil.relativedelta import relativedelta



### 0.2. General variables

In [6]:
# 1-based position in the complete protein at which the mature protein starts
# (get_mutations maps complete position 17 to mature position 1)
start_mature_protein = 17
# number of amino acids in HA1, counted in mature-protein numbering
HA1_length_AA = 329 #in mature protein 
# number of amino acids kept from the translated coding sequence
protein_length = 567
# coding sequence length in nucleotides (3 * protein_length)
sequence_length = 1701

In [2]:
# hemisphere code ("nh" or "sh") whose flu seasons and vaccine strain names apply to each region
region_hemispheres = {"us":"nh", "europe":"nh", "aunz":"sh"}
# hemisphere codes used as the top-level keys of the per-hemisphere dictionaries
hemispheres = ["nh", "sh"]

In [3]:
# length (in months) of the window preceding the vaccine selection moment
pp = 24
# total duration (in months) counted from the vaccine selection moment
pdur = 16

# per hemisphere: season label -> (season start, season end) / vaccine selection date
flu_seasons = {h: {} for h in hemispheres}
vaccine_selection = {h: {} for h in hemispheres}

first_year, last_year = 2002, 2023
for y in range(first_year, last_year + 1):
    # northern hemisphere: the season spans two calendar years (Oct 1 - Apr 30);
    # the last year does not open a new northern season
    season = f"{y}-{y+1}"
    if y < last_year:
        flu_seasons["nh"][season] = (date(y, 10, 1), date(y + 1, 4, 30))
        # vaccine strain selection moment: February 1 of the season's first year
        vaccine_selection["nh"][season] = date(y, 2, 1)

    # southern hemisphere: the season lies within one calendar year (Mar 1 - Sep 30);
    # the first year has no southern season
    if y > first_year:
        flu_seasons["sh"][str(y)] = (date(y, 3, 1), date(y, 9, 30))
        # vaccine strain selection moment: September 1 of the preceding year
        vaccine_selection["sh"][str(y)] = date(y - 1, 9, 1)

# season_periods: per hemisphere, season label -> period label "YYMM-YYMM"
# period_dates:   period label -> (period start date, period end date)
season_periods = {h: {} for h in hemispheres}
period_dates = {}
for h, sd in flu_seasons.items():
    for season in sd:
        vsd = vaccine_selection[h][season]
        # NOTE(review): the original author flagged that the period definition may
        # not be fully correct; this has not been verified yet.
        # The window is the same for both hemispheres: it starts `pp` months before
        # the vaccine selection date and ends `pdur - 1` months after it.
        ps = vsd + relativedelta(months=-pp)
        pe = vsd + relativedelta(months=pdur - 1)

        # two-digit year followed by zero-padded month, e.g. "0002-0305"
        period = f"{ps:%y%m}-{pe:%y%m}"
        season_periods[h][season] = period
        period_dates[period] = (ps, pe)

# Per hemisphere, the last season that is still compared on HA1 only
early_season_cutoff = {'nh':'2011-2012', 'sh':'2011'}

# end date of each cut-off season, and the flat list of all seasons up to and
# including the cut-off (both hemispheres, in chronological order per hemisphere)
early_season_cutoff_dates = {}
early_seasons = []
for h, cutoff in early_season_cutoff.items():
    early_season_cutoff_dates[h] = flu_seasons[h][cutoff][-1]
    csons = list(flu_seasons[h])
    early_seasons.extend(csons[: csons.index(cutoff) + 1])

In [4]:
# Epitope positions grouped per site (keys "A"-"E"). get_mutations compares these
# against the mature-protein position of each mutation.
epitope_sites= {"A":[122,124,126,130,131,132,133,135,137,138,140,142,143,144,145,146,150,152,168], 
                "B":[128,129,155,156,157,158,159,163,165,186,187,188,189,190,192,193,194,196,197,198], 
                "C":[44,45,46,47,48,50,51,53,54,273,275,276,278,279,280,294,297,299,300,304,305,307,308,309,310,311,312], 
                "D":[96,102,103,117,121,167,170,171,172,173,174,175,176,177,179,182,201,203,207,208,209,212,213,214,215,216,217,218,219,226,227,228,229,230,238,240,242,244,246,247,248,], 
                "E":[57,59,62,63,67,75,78,80,81,82,83,86,87,88,91,92,94,109,260,261,262,265]}
# flat list of every epitope position, over all sites
epitope_positions = [s for sites in epitope_sites.values() for s in sites]

# RBS positions (presumably "receptor binding site" - confirm); also compared against
# the mature-protein position in get_mutations
# NOTE(review): 190 appears twice in this list; harmless for membership tests
rbs_positions = [98, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 144, 145, 146, 153, 154, 155, 156, 157, 158, 159,
                 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 193, 194, 195, 196, 219, 220, 221, 222, 223, 224, 
                 225, 226, 227, 228,]
# alternative, shorter RBS definition left in by the author (disabled):
#rbs_positions = [131, 132, 133, 134, 135, 136, 137, 138, 140, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228,]
# positions counted as "koel sites" in the summary; compared against the mature position
koel_sites = [145, 189, 193, 156, 159, 158, 155]

#amino acid list for coding (one-letter codes)
aa_list = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]

### 0.3. Functions

In [22]:
def get_mutations(ref, seq, sl="complete"):
    """Tabulate the amino-acid differences between two aligned sequences.

    The sequences are compared position by position over the length of ``ref``.
    Positions where either sequence holds ``"X"`` are skipped.

    Parameters
    ----------
    ref : sequence of single-character residues
        Reference sequence; its residues end up in the "dominant strain AA" column.
    seq : sequence of single-character residues
        Sequence compared against ``ref``; its residues end up in the "AA" column.
        Must be at least as long as ``ref``.
    sl : str, default "complete"
        ``"HA1"`` when both sequences start at the first residue of the mature
        protein (index 0 is mature position 1); any other value means the
        sequences are full length (index 0 is position 1, signal peptide included).

    Returns
    -------
    pandas.DataFrame
        One row per differing position with the columns "dominant strain AA",
        "AA", "position" (full-length numbering), "mature position", "protein",
        "epitope", "epitope site" (``pd.NA`` outside epitopes) and "RBS".

    Raises
    ------
    IndexError
        If ``seq`` is shorter than ``ref``.
    """
    # mature position -> epitope site; when a position is listed under several
    # sites the last site in `epitope_sites` wins
    site_of_position = {p: s for s, plist in epitope_sites.items() for p in plist}

    muts = []
    for i, b in enumerate(ref):
        if seq[i] == "X" or b == "X":
            continue
        if seq[i] == b:
            continue

        if sl == "HA1":
            pos = start_mature_protein + i
            mpos = i + 1
            protein = "HA1"
        else:  # sl == "complete"
            pos = i + 1
            mpos = pos - start_mature_protein + 1
            # HA1_length_AA is expressed in mature numbering, so the HA1/HA2
            # boundary has to be tested on the mature position, not on `pos`
            if mpos < 1:
                protein = "signal protein"
            elif mpos <= HA1_length_AA:
                protein = "HA1"
            else:
                protein = "HA2"

        ep = mpos in epitope_positions
        rbs = mpos in rbs_positions
        site = site_of_position.get(mpos, pd.NA) if ep else pd.NA
        muts.append([b, seq[i], pos, mpos, protein, ep, site, rbs])

    columns = ["dominant strain AA", "AA", "position", "mature position",
               "protein", "epitope", "epitope site", "RBS"]
    return pd.DataFrame.from_records(muts, columns=columns)

## 1. Get data

### 1.1. WHO vaccine strains

In [10]:
# Vaccine strain name per hemisphere ("nh"/"sh") and per season label
# (northern seasons are keyed "YYYY-YYYY", southern seasons "YYYY").
# Egg-based strain recommendation as listed on the WHO website.
vaccine_strain_names = {'nh': {'2024-2025':'A/Thailand/8/2022', '2023-2024':'A/Darwin/9/2021', '2022-2023':'A/Darwin/9/2021', '2021-2022':'A/Cambodia/e0826360/2020',
                               '2020-2021':'A/Hong Kong/2671/2019', '2019-2020':'A/Kansas/14/2017', '2018-2019':'A/Singapore/INFIMH-16-0019/2016', '2017-2018':'A/Hong Kong/4801/2014',
                               '2016-2017':'A/Hong Kong/4801/2014','2015-2016':'A/Switzerland/9715293/2013','2014-2015':'A/Texas/50/2012','2013-2014':'A/Texas/50/2012',
                               '2012-2013':'A/Victoria/361/2011','2011-2012':'A/Perth/16/2009','2010-2011':'A/Perth/16/2009','2009-2010':'A/Brisbane/10/2007',
                               '2008-2009':'A/Brisbane/10/2007','2007-2008':'A/Wisconsin/67/2005','2006-2007':'A/Wisconsin/67/2005','2005-2006':'A/California/7/2004',
                               '2004-2005':'A/Fujian/411/2002','2003-2004':'A/Moscow/10/99','2002-2003':'A/Moscow/10/99','2001-2002':'A/Moscow/10/99','2000-2001':'A/Moscow/10/99'},
                        'sh': {'2024':'A/Thailand/8/2022','2023':'A/Darwin/9/2021','2022':'A/Darwin/9/2021','2021':'A/Hong Kong/2671/2019','2020':'A/South Australia/34/2019',
                               '2019':'A/Switzerland/8060/2017','2018':'A/Singapore/INFIMH-16-0019/2016','2017':'A/Hong Kong/4801/2014','2016':'A/Hong Kong/4801/2014',
                               '2015':'A/Switzerland/9715293/2013','2014':'A/Texas/50/2012','2013':'A/Victoria/361/2011','2012':'A/Perth/16/2009','2011':'A/Perth/16/2009',
                               '2010':'A/Perth/16/2009','2009':'A/Brisbane/10/2007','2008':'A/Brisbane/10/2007','2007':'A/Wisconsin/67/2005','2006':'A/California/7/2004',
                               '2005':'A/Wellington/1/2004','2004':'A/Fujian/411/2002','2003':'A/Moscow/10/99','2002':'A/Moscow/10/99','2001':'A/Moscow/10/99',
                               '2000':'A/Moscow/10/99'}
                        }

vaccine_strain_dir = "../data/vaccine_strains/H3N2"

# 0-based slice boundaries of the protein segments. The mature protein starts at
# 1-based residue `start_mature_protein`, i.e. at residue index
# start_mature_protein - 1 and nucleotide index 3 * (start_mature_protein - 1),
# so that the nucleotide segments stay in frame with the amino-acid segments.
mature_start_aa = start_mature_protein - 1
ha1_end_aa = mature_start_aa + HA1_length_AA
mature_start_nuc = 3 * mature_start_aa
ha1_end_nuc = 3 * ha1_end_aa

# strain name -> {"nuc"/"aa" -> {segment name -> sequence}}
vaccine_strains = {}
for f in os.listdir(vaccine_strain_dir):
    # skip hidden entries (e.g. .DS_Store, .ipynb_checkpoints) that do not follow
    # the "<type>_<place>_<id>_<year>.<ext>" file naming and would break the parsing
    if f.startswith("."):
        continue

    # rebuild the strain name from the file name: "_" separates the name fields
    # and "-" stands for a space inside the second field
    strain = f.split(".")[0].split("_")
    strain[1] = strain[1].replace("-", " ")
    strain = "/".join(strain)

    # get full cds (first record of the file) and amino acid sequence
    cds = next(SeqIO.parse(os.path.join(vaccine_strain_dir, f), "fasta")).seq[:sequence_length]
    aa_seq = cds.translate()
    aa_seq = aa_seq[:protein_length]

    # get proteins - signal protein
    signal_prot_nuc = cds[:mature_start_nuc]
    signal_prot_aa = aa_seq[:mature_start_aa]
    # HA1
    HA1_nuc = cds[mature_start_nuc:ha1_end_nuc]
    HA1_aa = aa_seq[mature_start_aa:ha1_end_aa]
    # HA2
    HA2_nuc = cds[ha1_end_nuc:]
    HA2_aa = aa_seq[ha1_end_aa:]

    vaccine_strains[strain] = {"nuc":{"complete":cds, "signal protein":signal_prot_nuc, "HA1":HA1_nuc, "HA2":HA2_nuc, "mature protein":HA1_nuc+HA2_nuc},
                               "aa":{"complete":aa_seq, "signal protein":signal_prot_aa, "HA1":HA1_aa, "HA2":HA2_aa, "mature protein":HA1_aa+HA2_aa}}


### 1.2. Dominant strain

In [17]:
dominant_strain_file = '../data/dominant_strains.fasta'

# One row per FASTA record; the first two "_"-separated fields of the record id
# are used as region and season.
dominant_strain_rows = []
for record in SeqIO.parse(dominant_strain_file, "fasta"):
    id_fields = record.id.split("_")
    dominant_strain_rows.append([id_fields[0], id_fields[1], record.seq])

# table of sequences indexed (and sorted) by region and season
dominant_strains = pd.DataFrame.from_records(
    dominant_strain_rows, columns=["region", "season", "sequence"]
)
dominant_strains = dominant_strains.set_index(["region", "season"]).sort_index()

### 1.3. Reproducible selection strains

In [27]:
reproducible_selection_strain_file = "../data/reproducible_selection_strains.fasta"

# One row per FASTA record; the record id must consist of exactly four
# "_"-separated fields: region, season, months and timing.
selection_rows = []
for record in SeqIO.parse(reproducible_selection_strain_file, "fasta"):
    region, season, months, timing = record.id.split("_")
    selection_rows.append([region, season, months, timing, record.seq])

# table of sequences indexed (and sorted) by region, season and timing
reproducible_selection_strains = pd.DataFrame.from_records(
    selection_rows, columns=["region", "season", "months", "timing", "sequence"]
)
reproducible_selection_strains = (
    reproducible_selection_strains.set_index(["region", "season", "timing"]).sort_index()
)

## 2. Genetic comparison

### 2.1. Dominant strain vs. vaccine strain


In [57]:
# Mutations between the dominant strain and the vaccine strain, per region and season.
# The per-season tables are collected in a list and concatenated once, instead of
# growing the table inside a bare try/except (which would silently discard
# everything gathered so far if a later concat failed for any reason).
vaccine_mutation_frames = []
for region, h in region_hemispheres.items():
    for season in flu_seasons[h]:
        # early seasons are compared on HA1 only, later seasons on the complete protein
        sl = "HA1" if season in early_seasons else "complete"

        # get vaccine strain
        vs = vaccine_strains[vaccine_strain_names[h][season]]['aa'][sl]
        # get dominant strain
        ds = dominant_strains.loc[(region, season), "sequence"]

        # get mutations and tag them with region and season
        muts = get_mutations(ds, vs, sl)
        muts[["region", "season"]] = region,season
        vaccine_mutation_frames.append(muts)

vaccine_mutations = pd.concat(vaccine_mutation_frames, ignore_index=True)

# mutation label: <compared-strain AA><position><dominant-strain AA>
vaccine_mutations["mutation"] = [
    f"{aa}{pos}{dom_aa}"
    for aa, pos, dom_aa in zip(vaccine_mutations["AA"],
                               vaccine_mutations["position"],
                               vaccine_mutations["dominant strain AA"])
]
# same label in mature-protein numbering; undefined (pd.NA) for positions before
# the start of the mature protein
vaccine_mutations["H3 mutation"] = [
    f"{aa}{mpos}{dom_aa}" if mpos > 0 else pd.NA
    for aa, mpos, dom_aa in zip(vaccine_mutations["AA"],
                                vaccine_mutations["mature position"],
                                vaccine_mutations["dominant strain AA"])
]

vaccine_mutations = vaccine_mutations.set_index(["region", "season"]).sort_index()

### 2.2. Dominant strain vs. reproducible vaccine strain selection at WHO-timing

In [58]:
# Mutations between the dominant strain and the reproducibly selected strain at
# WHO-timing, per region and season.
# The per-season tables are collected in a list and concatenated once, instead of
# growing the table inside a bare try/except (which would silently discard
# everything gathered so far if a later concat failed for any reason).
timing = "WHO-timing"

rs_whotiming_frames = []
for region, h in region_hemispheres.items():
    for season in flu_seasons[h]:
        # get reproducible selection strain
        rs = reproducible_selection_strains.loc[(region, season, timing), "sequence"]
        # get dominant strain
        ds = dominant_strains.loc[(region, season), "sequence"]

        # early seasons are compared on HA1 only, later seasons on the complete protein
        sl = "HA1" if season in early_seasons else "complete"
        muts = get_mutations(ds, rs, sl)

        muts[["region", "season"]] = region,season
        rs_whotiming_frames.append(muts)

rs_whotiming_mutations = pd.concat(rs_whotiming_frames, ignore_index=True)

# mutation label: <compared-strain AA><position><dominant-strain AA>
rs_whotiming_mutations["mutation"] = [
    f"{aa}{pos}{dom_aa}"
    for aa, pos, dom_aa in zip(rs_whotiming_mutations["AA"],
                               rs_whotiming_mutations["position"],
                               rs_whotiming_mutations["dominant strain AA"])
]
# same label in mature-protein numbering; undefined (pd.NA) for positions before
# the start of the mature protein
rs_whotiming_mutations["H3 mutation"] = [
    f"{aa}{mpos}{dom_aa}" if mpos > 0 else pd.NA
    for aa, mpos, dom_aa in zip(rs_whotiming_mutations["AA"],
                                rs_whotiming_mutations["mature position"],
                                rs_whotiming_mutations["dominant strain AA"])
]

rs_whotiming_mutations = rs_whotiming_mutations.set_index(["region", "season"]).sort_index()


### 2.3. Dominant strain vs. reproducible vaccine strain selection at later timing

In [59]:
# Mutations between the dominant strain and the reproducibly selected strain at
# delayed timing, per region and season.
# The per-season tables are collected in a list and concatenated once, instead of
# growing the table inside a bare try/except (which would silently discard
# everything gathered so far if a later concat failed for any reason).
timing = "delayed-timing"

rs_later_frames = []
for region, h in region_hemispheres.items():
    for season in flu_seasons[h]:
        # get reproducible selection strain
        rs = reproducible_selection_strains.loc[(region, season, timing), "sequence"]
        # get dominant strain
        ds = dominant_strains.loc[(region, season), "sequence"]

        # early seasons are compared on HA1 only, later seasons on the complete protein
        sl = "HA1" if season in early_seasons else "complete"
        muts = get_mutations(ds, rs, sl)

        muts[["region", "season"]] = region,season
        rs_later_frames.append(muts)

rs_later_mutations = pd.concat(rs_later_frames, ignore_index=True)

# mutation label: <compared-strain AA><position><dominant-strain AA>
rs_later_mutations["mutation"] = [
    f"{aa}{pos}{dom_aa}"
    for aa, pos, dom_aa in zip(rs_later_mutations["AA"],
                               rs_later_mutations["position"],
                               rs_later_mutations["dominant strain AA"])
]
# same label in mature-protein numbering; undefined (pd.NA) for positions before
# the start of the mature protein
rs_later_mutations["H3 mutation"] = [
    f"{aa}{mpos}{dom_aa}" if mpos > 0 else pd.NA
    for aa, mpos, dom_aa in zip(rs_later_mutations["AA"],
                                rs_later_mutations["mature position"],
                                rs_later_mutations["dominant strain AA"])
]

rs_later_mutations = rs_later_mutations.set_index(["region", "season"]).sort_index()


## 3. Summary of results

In [60]:
# directory the summary and mutation CSV files are written to
summary_dir = "../analysis/genetic_comparison_results"

### 3.1. Number of mutations 
- number of total mutations
- number of mutations in classically defined epitope sites
- number of mutations in koel sites


In [61]:
# Per region, season and comparison: total number of mutations, number of
# mutations in epitope sites and number of mutations in koel sites.
comparison_tables = {"vaccine strain":vaccine_mutations,
                     "reproducible selection at WHO-timing":rs_whotiming_mutations,
                     "reproducible selection at delayed timing":rs_later_mutations}

genetic_comparison_summary = []
for region, h in region_hemispheres.items():
    for season in flu_seasons[h]:
        key = (region, season)
        for comparison, df in comparison_tables.items():
            # a (region, season) pair without any mutation has no rows in the table
            if key not in df.index:
                genetic_comparison_summary.append([region, season, comparison, 0, 0, 0])
                continue

            # list-of-keys selection always yields a DataFrame, also when the
            # pair matches a single row (a bare tuple key can yield a Series)
            sdf = df.loc[[key]]
            n_mutations = len(sdf)
            n_epitope_mutations = len(sdf[sdf["epitope"]==True])
            n_koel_sites = len(sdf[sdf["mature position"].isin(koel_sites)])
            genetic_comparison_summary.append([region, season, comparison, n_mutations, n_epitope_mutations, n_koel_sites])

genetic_comparison_summary = pd.DataFrame.from_records(genetic_comparison_summary, columns=["region", "season","comparison", "n mutations", "n epitope mutations", "n mutations in koel sites"])

In [62]:
# write the per-season summary table to summary_dir, without the row index
genetic_comparison_summary.to_csv(os.path.join(summary_dir, "genetic_comparison_summary.csv"), index=False)

### 3.2. individual mutations

In [63]:
# Write one CSV per comparison; spaces in the comparison label become
# underscores in the file name, and region/season are written as regular columns.
mutation_tables = {
    "vaccine strain": vaccine_mutations,
    "reproducible selection at WHO-timing": rs_whotiming_mutations,
    "reproducible selection at delayed timing": rs_later_mutations,
}
for comparison, df in mutation_tables.items():
    fn = f"mutations_dominant_strain_vs_{comparison.replace(' ', '_')}.csv"
    out_path = os.path.join(summary_dir, fn)
    flat = df.reset_index()
    flat.to_csv(out_path, index=False)