## Filter out molecular clock outliers

In [1]:
import os, subprocess, dendropy
from Bio import SeqIO
from utils import *
import pandas as pd
from datetime import datetime

In [2]:
# Input/output locations for the HA segment; the raw GISAID download is the starting point.
raw_fasta = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/raw/HA_gisaid_raw.fasta"  # unfiltered sequences (read only)
clean_fasta = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/outliers_removed/HA_gisaid_sequences_0019.fasta"  # written by the filtering step below
lq_ambig_ids = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/outliers_removed/low_quality_ambig_IDS.csv"  # NOTE(review): not referenced anywhere else in the visible notebook - confirm it is still needed
molecular_clock_outliers = "../data/to_drop/HA_gitr.csv"  # CSV of isolate ids to drop; read at the start and rewritten at the end of the notebook

segment = "HA"  # segment name passed to check_sequence_length and used in output file names

In [41]:
# Drop previously flagged isolates, too-short sequences and sequences with too
# many ambiguous nucleotides from the raw FASTA, and write the remainder to
# clean_fasta.
redo = True  # True: always regenerate clean_fasta, even if it already exists

mco = []  # [isolate_id, reason] rows: loaded from the outlier CSV, extended below
to_remove = []  # isolate ids already listed in the outlier CSV
if os.path.isfile(molecular_clock_outliers):
    with open(molecular_clock_outliers, "r") as fr:
        for l in fr:
            if "reason" in l:  # header row ("Isolate_Id,reason")
                continue
            row = l.strip("\n").split(",")
            mco.append(row)
            to_remove.append(row[0])

if redo or not os.path.isfile(clean_fasta):
    # Set lookup keeps the per-record membership test O(1); to_remove itself
    # stays a list because later cells use it.
    known_outliers = set(to_remove)

    records = []
    for r in SeqIO.parse(raw_fasta, "fasta"):
        isolate_id = r.id.split("|")[0]  # FASTA ids start with "<Isolate_Id>|"
        if isolate_id in known_outliers:
            continue

        seq = str(r.seq)
        # check_sequence_length / check_max_ambig come from utils; presumably
        # 0.95 is the minimum fraction of the expected segment length and 0.01
        # the maximum fraction of ambiguous bases - TODO confirm in utils.
        if not check_sequence_length(segment, seq, 0.95):
            mco.append([isolate_id, "too short"])
        elif not check_max_ambig(seq, 0.01):
            mco.append([isolate_id, "too many ambiguous nucleotides"])
        else:
            records.append(r)

    with open(clean_fasta, "w") as fw:
        SeqIO.write(records, fw, "fasta")

In [43]:
# Write a copy of the raw metadata without any isolate listed in mco.
clean_metadata = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/outliers_removed/HA_gisaid_metadata_0019.xlsx"
metadata = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/raw/metadata_gisaid_raw.csv"

metadata = pd.read_csv(metadata)

bad_ids = [row[0] for row in mco]  # first field of every mco row is the isolate id
metadata = metadata[~metadata["Isolate_Id"].isin(bad_ids)]
# index must be the boolean False: the string "False" is truthy and would
# still write the DataFrame index as an extra column.
metadata.to_excel(clean_metadata, index=False)


### run CDHIT
Manually doing this on command line:  
`cd-hit -i {outlier_removed_fasta} -o {cdhit_clus} -c 0.998`

In [31]:
# Path of the CD-HIT cluster report (.clstr) that is parsed in the next cell
cdhit_clus = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/cdhit_output/HA_clus.clstr"

In [32]:
# Read the CD-HIT .clstr report into clusters: {cluster header: [isolate ids]}.
with open(cdhit_clus, "r") as f:
    lines = [raw.rstrip("\n") for raw in f]

clusters = {}
for entry in lines:
    if not entry.startswith(">"):
        # Member line: the sequence name follows the last ">" and the
        # isolate id is everything before the first "|".
        name = entry.split(">")[-1]
        clusters[c].append(name.split("|")[0])
        continue
    # Header line (">Cluster N") opens a new, empty cluster.
    c = entry.lstrip(">")
    clusters[c] = []

In [33]:
# Load only the metadata columns used downstream and parse the collection
# dates (full YYYY-MM-DD dates are required by the explicit format).
metadata = pd.read_csv(
    "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/raw/metadata_gisaid_raw.csv",
    usecols=["Isolate_Id", "Isolate_Name", "Passage_History", "Location", "Collection_Date"],
)
parsed_dates = pd.to_datetime(metadata["Collection_Date"], format="%Y-%m-%d")
metadata["Collection_Date"] = parsed_dates

In [34]:
# Season table: label "YYZZ" -> [start, end], each season running from 1 May
# of year YY to 30 April of year ZZ, for 2000/01 up to and including 2018/19.
seasons = {
    f"{start_year % 100:02d}{(start_year + 1) % 100:02d}": [
        datetime(start_year, 5, 1),
        datetime(start_year + 1, 4, 30),
    ]
    for start_year in range(2000, 2019)
}


In [35]:
#split clusters per season
def determine_season(d, season_table=None):
    """
    Determine the flu season a date falls in.

    Parameters
    ----------
    d : datetime
        Date to classify.
    season_table : dict, optional
        Mapping of season label -> [start, end], ordered chronologically.
        Defaults to the module-level ``seasons`` table.

    Returns
    -------
    str or None
        Label of the first season whose end date is on or after ``d``, or
        None when ``d`` lies before the first start or after the last end.
    """
    if season_table is None:
        season_table = seasons

    spans = list(season_table.values())
    if not spans:
        return None

    if d > spans[-1][-1] or d < spans[0][0]:
        return None

    for label, span in season_table.items():
        # Inclusive comparison: a sample collected on the final day of a
        # season (30 April) belongs to that season, not to the next one.
        if d <= span[-1]:
            return label

    return None

# For every multi-member CD-HIT cluster, group its members by flu season and
# keep the earliest and latest collected isolate of each season.
clus_representatives = {s: {} for s in seasons}  # season -> cluster -> [earliest id(, latest id)]
season_clus_ids = {s: {} for s in seasons}  # season -> cluster -> all member ids in that season

singles = []  # ids of single-member clusters; they get no representative entry here

# One id -> collection date lookup instead of a full DataFrame scan per isolate.
# drop_duplicates keeps the first row per id, matching a ".values[0]" lookup.
# An id that is missing from the metadata raises KeyError.
collection_dates = metadata.drop_duplicates("Isolate_Id").set_index("Isolate_Id")["Collection_Date"]

for c, ids in clusters.items():

    if len(ids) == 1:
        singles.append(ids[0])
        continue

    # season -> [(date, id), ...] in cluster order; isolates collected outside
    # all defined seasons are ignored
    members_by_season = {}
    for sid in ids:
        stamp = collection_dates[sid]
        d = datetime(stamp.year, stamp.month, stamp.day)  # day precision, plain datetime
        cson = determine_season(d)
        if cson is None:
            continue
        members_by_season.setdefault(cson, []).append((d, sid))

    for cson, members in members_by_season.items():
        # min/max return the first extreme element, so ties resolve to the
        # member listed first in the cluster
        eldest = min(members, key=lambda m: m[0])[1]
        youngest = max(members, key=lambda m: m[0])[1]

        season_clus_ids[cson][c] = [sid for _, sid in members]

        # a single representative when earliest and latest are the same isolate
        clus_representatives[cson][c] = [eldest] if eldest == youngest else [eldest, youngest]



In [36]:
# Write one FASTA per batch of seasons containing the cluster representatives
# of those seasons.
clus_dir = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/cluster_seqs/"

season_sets = [["0001", "0102", "0203", "0304", "0405", "0506", "0607", "0708", "0809", "0910"],["1011", "1112", "1213", "1314", "1415"],["1516", "1617", "1718", "1819"]]

for s_set in season_sets:
    # Set (not list) so the per-record membership test below is O(1).
    ids = {sid for s in s_set for reps in clus_representatives[s].values() for sid in reps}

    records = []
    for r in SeqIO.parse(clean_fasta, "fasta"):
        if r.id.split("|")[0] not in ids:
            continue
        # Strip characters that are special in Newick from the header.
        # NOTE(review): a character is only removed from name/description when
        # it also occurs in r.id - confirm that this is intended.
        for ch in [":", ",", "(", ")", "'"]:
            if ch in r.id:
                r.id = r.id.replace(ch, "")
                r.name = r.name.replace(ch, "")
                r.description = r.description.replace(ch, "")
        records.append(r)

    # File name carries the first and last season of the batch.
    out_fasta = f"{segment}_gisaid_{s_set[0]}_{s_set[-1]}.fasta"
    with open(os.path.join(clus_dir, out_fasta), "w") as fw:
        SeqIO.write(records, fw, "fasta")

### construct MSA
MSA via MAFFT with the following command: `mafft --auto --thread 3 --keeplength --addfragments {cluster_file} {reference} > {alignment_file}`
The reference is removed afterwards, because the reference used is from 1968 and would distort the molecular clock.

In [37]:
# Drop the 1968 reference (the first record) from every alignment, in place.
alignment_dir =  "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/alignment/"

for f in os.listdir(alignment_dir):
    ff = os.path.join(alignment_dir,f)
    if not os.path.isfile(ff):  # skip sub-directories
        continue

    records = list(SeqIO.parse(ff, "fasta"))
    # Guard against empty files: records[0] would raise IndexError.
    # Files whose first record is not the reference are left untouched, so
    # re-running this cell is harmless.
    if records and "ref_1968" in records[0].id:
        with open(ff, "w") as fw:
            SeqIO.write(records[1:], fw, "fasta")

### construct tree
Exact precision is not needed and the trees should be built as fast as possible, so FastTree is used.  
Command used: `fasttree -gtr -nt {alignment} > {tree}`

### Tempest 
Getting dates for TempEst, which takes the dates in the order of the tree tips.  
Load the annotated trees in TempEst > best-fitting root > use the root-to-tip and residual plots to find the outliers.

In [38]:
# For every tree file that is not yet annotated, append "|<collection date>"
# to each leaf label and save the result as "<name>_annotated.tree".
tree_dir = "/Users/annelies/Desktop/flu-evolution/HA_gisaid_sequences/tree/"

for f in os.listdir(tree_dir):
    labels = []
    if not f.endswith(".tree") or "annotated" in f:
        continue

    tree_path = os.path.join(tree_dir, f)
    tree = dendropy.Tree.get(path=tree_path, schema="newick")
    # spaces in the parsed labels are turned back into underscores so the
    # labels can be matched against the raw tree text
    labels = [leaf.taxon.label.replace(" ", "_") for leaf in tree.leaf_node_iter()]

    # old label -> "old label|YYYY-MM-DD"
    label_dates = {}
    for label in labels:
        rid = label.split("|")[0]
        hit = metadata[metadata["Isolate_Id"]==rid]["Collection_Date"]
        d = hit.values[0].astype(str).split("T")[0]
        label_dates[label] = f"{label}|{d}"

    # only the first line of the tree file is read and rewritten
    with open(tree_path, "r") as fr:
        tree_line = fr.readline()

    for old_label in label_dates:
        tree_line = tree_line.replace(old_label, label_dates[old_label])

    annotated_path = os.path.join(tree_dir, f.replace(".tree", "_annotated.tree"))
    with open(annotated_path, "w") as fw:
        fw.write(tree_line)
        


## removing problematic clusters

In [39]:
#cluster "eldest" or "youngest" that was outlier according to our tempest analyses
# Isolate ids flagged as outliers per tree batch ("<first season>_<last season>").
# NOTE: every id must be followed by a comma - adjacent string literals are
# silently concatenated into one (non-existent) id. Duplicate ids are harmless.
tempest_outliers = {"0001_0910":["EPI_ISL_233238","EPI_ISL_302518", "EPI_ISL_302514", "EPI_ISL_3923", "EPI_ISL_3932", "EPI_ISL_3929", "EPI_ISL_16069",
                                 "EPI_ISL_70476", "EPI_ISL_28455", "EPI_ISL_105790", "EPI_ISL_80412", "EPI_ISL_145076", "EPI_ISL_233238", "EPI_ISL_302518",
                                 "EPI_ISL_17264942", "EPI_ISL_17264947", "EPI_ISL_17264956", "EPI_ISL_17264980", "EPI_ISL_11562", "EPI_ISL_6712",
                                 "EPI_ISL_117522", "EPI_ISL_302517","EPI_ISL_24854", "EPI_ISL_24834", "EPI_ISL_24860", "EPI_ISL_24842", "EPI_ISL_24813",
                                 "EPI_ISL_24865","EPI_ISL_93806", "EPI_ISL_4807", "EPI_ISL_4807", "EPI_ISL_115695", "EPI_ISL_356897", "EPI_ISL_12722961",
                                 "EPI_ISL_64650", "EPI_ISL_20437","EPI_ISL_12722959", "EPI_ISL_3927", "EPI_ISL_5461", "EPI_ISL_5464", "EPI_ISL_8559",
                                 "EPI_ISL_63362", "EPI_ISL_72835", "EPI_ISL_3927", "EPI_ISL_63363", "EPI_ISL_8516", "EPI_ISL_145204", "EPI_ISL_15548",
                                 "EPI_ISL_23395", "EPI_ISL_23132", "EPI_ISL_144599", "EPI_ISL_6858", "EPI_ISL_14243", "EPI_ISL_136594", "EPI_ISL_115625",
                                 "EPI_ISL_1726483", "EPI_ISL_167319", "EPI_ISL_167293", "EPI_ISL_125870", "EPI_ISL_93917", "EPI_ISL_158599", "EPI_ISL_93872",
                                 "EPI_ISL_84063", "EPI_ISL_5144", "EPI_ISL_14266", "EPI_ISL_14267", "EPI_ISL_5148", "EPI_ISL_93658", "EPI_ISL_17264833",],
                    "1011_1415":["EPI_ISL_134827","EPI_ISL_17264825","EPI_ISL_212165", "EPI_ISL_151100", "EPI_ISL_86074", "EPI_ISL_77053", "EPI_ISL_151115",
                                 "EPI_ISL_94644", "EPI_ISL_17264934", "EPI_ISL_83714", "EPI_ISL_100826", "EPI_ISL_94618", "EPI_ISL_88045", "EPI_ISL_17264984",
                                 "EPI_ISL_161477", "EPI_ISL_229118", "EPI_ISL_166393", "EPI_ISL_499026", "EPI_ISL_165317", "EPI_ISL_229124", "EPI_ISL_161475",
                                 "EPI_ISL_161476", "EPI_ISL_17265009", "EPI_ISL_88045", "EPI_ISL_17264984", "EPI_ISL_229118", "EPI_ISL_166393", "EPI_ISL_499026",
                                 "EPI_ISL_165317", "EPI_ISL_229124", "EPI_ISL_161475", "EPI_ISL_161476", "EPI_ISL_17265009", "EPI_ISL_156584", "EPI_ISL_17264829",
                                 "EPI_ISL_156582", "EPI_ISL_260562", "EPI_ISL_84091", "EPI_ISL_230784", "EPI_ISL_156583", "EPI_ISL_175493", "EPI_ISL_83276",
                                 "EPI_ISL_22911","EPI_ISL_229126","EPI_ISL_206284","EPI_ISL_206292", "EPI_ISL_194494", "EPI_ISL_176500", "EPI_ISL_188894",
                                 "EPI_ISL_194492","EPI_ISL_197887", "EPI_ISL_197873", "EPI_ISL_132030","EPI_ISL_96035", "EPI_ISL_116069", "EPI_ISL_101596",
                                 "EPI_ISL_132033", "EPI_ISL_197860","EPI_ISL_206283", "EPI_ISL_106816", "EPI_ISL_87887", "EPI_ISL_81375", "EPI_ISL_79316",
                                 "EPI_ISL_87925", "EPI_ISL_152268", "EPI_ISL_83169", "EPI_ISL_152278", "EPI_ISL_93774", "EPI_ISL_106818", "EPI_ISL_498937",
                                 "EPI_ISL_106342", "EPI_ISL_88039", "EPI_ISL_85578", "EPI_ISL_77796", "EPI_ISL_152172", "EPI_ISL_79670", "EPI_ISL_93664",
                                 "EPI_ISL_102988", "EPI_ISL_90824", "EPI_ISL_106813", "EPI_ISL_106822", "EPI_ISL_78726", "EPI_ISL_152096", "EPI_ISL_152363",
                                 "EPI_ISL_499022", "EPI_ISL_498935","EPI_ISL_272703", "EPI_ISL_94867", "EPI_ISL_89775", "EPI_ISL_94855", "EPI_ISL_128235",
                                 "EPI_ISL_145217","EPI_ISL_165154", "EPI_ISL_89793", "EPI_ISL_348412", "EPI_ISL_79445", "EPI_ISL_88017", "EPI_ISL_89576",
                                 "EPI_ISL_75787","EPI_ISL_83830", "EPI_ISL_263141", "EPI_ISL_130279", "EPI_ISL_77797", "EPI_ISL_79334", "EPI_ISL_88033",
                                 "EPI_ISL_91141","EPI_ISL_129164", "EPI_ISL_121531", "EPI_ISL_127843", "EPI_ISL_177908", "EPI_ISL_177906", "EPI_ISL_212164",
                                 "EPI_ISL_212166", "EPI_ISL_153288", "EPI_ISL_164586", "EPI_ISL_168732", "EPI_ISL_252426", "EPI_ISL_499027", "EPI_ISL_166232",
                                 "EPI_ISL_163350", "EPI_ISL_16973", "EPI_ISL_172835", "EPI_ISL_152429", "EPI_ISL_152831", "EPI_ISL_134334", "EPI_ISL_14284868",
                                 "EPI_ISL_121950", "EPI_ISL_152648", "EPI_ISL_152185", "EPI_ISL_129008", "EPI_ISL_83857", "EPI_ISL_79662", "EPI_ISL_16973",
                                 "EPI_ISL_140903",],
                    "1516_1819":["EPI_ISL_277234", "EPI_ISL_341299", "EPI_ISL_234477", "EPI_ISL_367163", "EPI_ISL_234475", "EPI_ISL_232047", "EPI_ISL_232044",
                                 "EPI_ISL_17264985", "EPI_ISL_329980", "EPI_ISL_378745", "EPI_ISL_536402", "EPI_ISL_278719", "EPI_ISL_278634", "EPI_ISL_329367",
                                 "EPI_ISL_366477", "EPI_ISL_344428", "EPI_ISL_334627", "EPI_ISL_278569", "EPI_ISL_334629", "EPI_ISL_536402", "EPI_ISL_278719",
                                 "EPI_ISL_278634", "EPI_ISL_329367", "EPI_ISL_366477", "EPI_ISL_344428", "EPI_ISL_334627", "EPI_ISL_278569","EPI_ISL_334629",
                                 "EPI_ISL_207685","EPI_ISL_282907", "EPI_ISL_328642", "EPI_ISL_207687", "EPI_ISL_207783", "EPI_ISL_328691", "EPI_ISL_220128",
                                 "EPI_ISL_283429", "EPI_ISL_272882", "EPI_ISL_4061575", "EPI_ISL_4062158", "EPI_ISL_344243", "EPI_ISL_367063", "EPI_ISL_327241",
                                 "EPI_ISL_309766", "EPI_ISL_346301", "EPI_ISL_2454697", "EPI_ISL_499029", "EPI_ISL_218060", "EPI_ISL_278549", "EPI_ISL_344272",
                                 "EPI_ISL_278603", "EPI_ISL_334512", "EPI_ISL_239631", "EPI_ISL_334506", "EPI_ISL_327194", "EPI_ISL_242514", "EPI_ISL_253937",
                                 "EPI_ISL_337309", "EPI_ISL_334500", "EPI_ISL_344241", "EPI_ISL_4061581", "EPI_ISL_309391", "EPI_ISL_344280", "EPI_ISL_406156",
                                 "EPI_ISL_4061593", "EPI_ISL_346310", "EPI_ISL_378193", "EPI_ISL_355892", "EPI_ISL_405914", "EPI_ISL_282916", "EPI_ISL_393707",
                                 "EPI_ISL_197923", "EPI_ISL_197959", "EPI_ISL_263252", "EPI_ISL_277591", "EPI_ISL_289902", "EPI_ISL_1222770", "EPI_ISL_206571",
                                 "EPI_ISL_197961", "EPI_ISL_323115", "EPI_ISL_258259", "EPI_ISL_257705", "EPI_ISL_273640", "EPI_ISL_301448", "EPI_ISL_301526",
                                 "EPI_ISL_400884", "EPI_ISL_321239", "EPI_ISL_396801", "EPI_ISL_198046", "EPI_ISL_198049", "EPI_ISL_267037", "EPI_ISL_267036",
                                 "EPI_ISL_202663", "EPI_ISL_15146370", "EPI_ISL_197990", "EPI_ISL_202672", "EPI_ISL_198070", "EPI_ISL_198078", "EPI_ISL_220062",
                                 "EPI_ISL_16706697", "EPI_ISL_267823", "EPI_ISL_380069", "EPI_ISL_364318", "EPI_ISL_363992", "EPI_ISL_232498", "EPI_ISL_215972",
                                 "EPI_ISL_297082", "EPI_ISL_213979", "EPI_ISL_207682", "EPI_ISL_225354", "EPI_ISL_297082", "EPI_ISL_263392", "EPI_ISL_209072",
                                 "EPI_ISL_220110", "EPI_ISL_202704", "EPI_ISL_312845", "EPI_ISL_378555", "EPI_ISL_4055854",
                                 "EPI_ISL_279062", "EPI_ISL_249373", "EPI_ISL_266002", "EPI_ISL_275707", "EPI_ISL_248413", "EPI_ISL_396301", "EPI_ISL_289717",
                                 "EPI_ISL_241350", "EPI_ISL_249185", "EPI_ISL_499037", "EPI_ISL_396298", "EPI_ISL_291271", "EPI_ISL_239118", "EPI_ISL_289715",
                                 "EPI_ISL_269432", "EPI_ISL_403392", "EPI_ISL_289025", "EPI_ISL_289025", "EPI_ISL_274944", "EPI_ISL_267677", "EPI_ISL_376101",
                                 "EPI_ISL_280158", "EPI_ISL_266046", "EPI_ISL_290276", "EPI_ISL_283425", "EPI_ISL_248000", "EPI_ISL_289023", "EPI_ISL_289024",
                                 "EPI_ISL_243998", "EPI_ISL_257383", "EPI_ISL_240553", "EPI_ISL_280778", "EPI_ISL_289716", "EPI_ISL_376100",
                                 "EPI_ISL_278538", "EPI_ISL_259009", "EPI_ISL_394022", "EPI_ISL_289544", "EPI_ISL_363118", "EPI_ISL_297935", "EPI_ISL_296470",
                                 "EPI_ISL_331251", "EPI_ISL_272732", "EPI_ISL_256055", "EPI_ISL_242498", "EPI_ISL_367054", "EPI_ISL_272733", "EPI_ISL_290598",
                                 "EPI_ISL_334507", "EPI_ISL_297921",

                                 ]
                         }

In [40]:

# Extend mco (the [isolate_id, reason] rows built by the filtering cell) with
# the TempEst outliers, then write the combined table back to the outlier CSV.

# isolate id -> cluster header, for every member of every CD-HIT cluster
reversed_clusters = {c:k for k,v in clusters.items() for c in v}

# isolate id -> (cluster header, season); stored as a tuple so that a "-" in
# a cluster header can never break the lookup
season_clus_ids_reversed = {i: (c, s) for s, d in season_clus_ids.items() for c, v in d.items() for i in v}

reason = "molecular clock outlier"
seen = {tuple(row) for row in mco}  # O(1) duplicate check instead of scanning mco

for outliers in tempest_outliers.values():
    for outlier in outliers:

        if outlier in season_clus_ids_reversed:
            # The outlier represents a cluster within one season: flag every
            # member of that cluster collected in that season.
            clus, cson = season_clus_ids_reversed[outlier]
            flagged = season_clus_ids[cson][clus]
        elif outlier in to_remove or outlier in reversed_clusters:
            # Already listed in the outlier file, or a cluster member without
            # a season entry: flag the isolate itself.
            flagged = [outlier]
        else:
            # Unknown id (not in any cluster, not listed before): nothing to flag.
            flagged = []

        for cid in flagged:
            key = (cid, reason)
            if key not in seen:
                seen.add(key)
                mco.append([cid, reason])

# NOTE: mco is rebound from a list of rows to a DataFrame here.
mco = pd.DataFrame.from_records(mco, columns=["Isolate_Id","reason"])
mco.to_csv(molecular_clock_outliers, index=False)
    
    
