In [1]:
### This code enables to detect waves, compute variants distribution, heterogeneity and entropy

### Libraries and functions preparation

In [2]:
#Useful libraries
import pandas as pd
import numpy as np
import datetime
from scipy.signal import find_peaks, peak_widths, peak_prominences
from statsmodels.tsa.seasonal import STL 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

In [408]:
#Useful functions
def get_sunday(date_in):
    today = date_in
    next_sunday = today + datetime.timedelta(days=6-today.weekday(), weeks=0)
    return(next_sunday)

def get_region(x):
    if len(x)>2:
        return(x[2])
    else:
        return(None)

def remove_from_list(L):
    if " " in L:
        return(L.remove(" "))
    else:
        return(L)

def jaccard_distance(prot_1, prot_2):
    union = 0
    inter = 0
    n = len(prot_1)
    for i in range(n):
        if ((prot_1[i] == 1) or (prot_2[i] == 1)) :
            union+=1

    m = len(prot_1)
    for i in range(m):
        if ((prot_1[i] == 1) and (prot_2[i] == 1)) :
            inter +=1

    if union > 0:
        j_distance = 1-inter/union
    
    else: 
        j_distance = 0
        
    return(j_distance)

#This function identify waves and their relevant information in every country
def find_waves(country,ratio_start,ratio_between_waves,ratio_end, distance_peaks):
    country_data = covid_data[covid_data["location"] == country ]
    country_data["date"] = pd.to_datetime(country_data["date"])
    country_data["week_date"] = country_data["date"].apply(lambda x: get_sunday(x))
    country_data = country_data.fillna(0)
    cases_vaccinations = country_data.groupby("week_date")['new_cases_per_million', 'new_vaccinations'].sum().reset_index()

    res = STL(abs(cases_vaccinations['new_cases_per_million']),seasonal=13,period=4).fit()
    x_trend = abs(res.trend)
    n_series = len(x_trend)

    start_wave = 0
    end_wave = 0
    in_wave = False
    wave_list = []
    peak_list = []
    peak_ind_list = []

    max_peak = 100
    peak = 100

    trend = "increase"
    break_before = False
    break_val = 0
    break_ind = 0
    peak_ind = 0

    for i in range(2, n_series):
        
        if not(in_wave):
            if x_trend[i]>= min(100,max_peak/ratio_start):
                start_wave = i
                end_wave = i
                in_wave = True
                peak = x_trend[i]
                trend = "increase"
                peak_ind = i
        else:
            
            if x_trend[i] >= peak:
                trend = "increase"
                if not(break_before):
                    peak = x_trend[i]
                    max_peak = max(peak, max_peak)
                    peak_ind = i
                if break_before: 
                    stage = "2.1.1"
                    peak = x_trend[i]
                    max_peak = max(peak, max_peak)
                    peak_ind = i
                    decrease_val = peak-break_val
                    increase_val = x_trend[i]-break_val
                    if min(decrease_val, increase_val) >= max_peak/ratio_between_waves:
                            stage = "2.1.1.1"
                            end_wave = break_ind
                            wave_list += [[start_wave, end_wave]]
                            peak_list += [peak]
                            peak_ind_list += [peak_ind]
                            start_wave = break_ind
                            break_before = False
                            peak = 100


            if x_trend[i] < peak:
                if x_trend[i] <= max_peak/10:
                    end_wave = i
                    wave_list += [[start_wave, end_wave]]
                    peak_list += [peak]
                    peak_ind_list += [peak_ind]
                    in_wave = False
                    peak = 100
                    break_before = False

                if x_trend[i] >= x_trend[i-1]:
                    trend = "increase"
                    if break_before: 
                        decrease_val = peak-break_val
                        increase_val = x_trend[i]-break_val
                        if min(decrease_val, increase_val) >= max_peak/ratio_end:
                                end_wave = break_ind
                                wave_list += [[start_wave, end_wave]]
                                peak_ind_list += [peak_ind]
                                peak_list += [peak]
                                start_wave = break_ind
                                break_before = False
                                peak = 100

                    if x_trend[i-1]<=x_trend[i-2]:
                            break_val = x_trend[i-1]
                            break_ind  = i-1
                            break_before = True

    if in_wave:
        wave_list += [[start_wave, i]]
        peak_list += [peak]
        peak_ind_list += [i]

    n_waves = len(wave_list)
    ### Eliminating noisy waves, which last less then 10 weeks
    indices = [i for i in range(n_waves) if (wave_list[i][1]-wave_list[i][0]>=10)]
    print(wave_list)
    final_wave_list = [wave_list[i] for i in indices]
    final_peak_list = [peak_list[i] for i in indices]
    final_peak_ind_list = [peak_ind_list[i] for i in indices]

    start_wave_list = [w[0] for w in final_wave_list]
    end_wave_list = [w[1] for w in final_wave_list]
    start_wave_dates_list = [cases_vaccinations["week_date"].iloc[s] for s in start_wave_list]
    end_wave_dates_list = [cases_vaccinations["week_date"].iloc[e] for e in end_wave_list]
    peak_date_list = [cases_vaccinations["week_date"].iloc[i] for i in final_peak_ind_list]
    peak_height_list = [cases_vaccinations["new_cases_per_million"].iloc[i] for i in final_peak_ind_list]
    
    
    waves_df = pd.DataFrame(start_wave_dates_list, columns = ["start_peak_date"])
    waves_df["end_peak_date"] = end_wave_dates_list
    waves_df["peak_date"] = peak_date_list
    waves_df["peak_heights"] = peak_height_list
    waves_df["width"] = (pd.to_datetime(waves_df["end_peak_date"]) - pd.to_datetime(waves_df["start_peak_date"])).dt.days
    waves_df["country"] = country

    n_dates = len(start_wave_dates_list)
    time_from_prev = [1000]+[(end_wave_dates_list[i]-start_wave_dates_list[i-1]).days for i in range(1,n_dates)]
    waves_df["time_from_prev"] = time_from_prev
    peak_number = [0]
    for time in time_from_prev[1:]:
        if time < distance_peaks:
            peak_number += [peak_number[-1]]
        else:
            peak_number += [peak_number[-1]+1]
    waves_df["peak_number"] = peak_number
    waves_df_adj = waves_df.groupby("peak_number").agg({'peak_heights': 'max', "start_peak_date":"min",
                                         "end_peak_date":"max", "width":"sum"}).reset_index()
    waves_df_adj["country"] = country
    
    waves_df_adj_2 = pd.merge(waves_df_adj, waves_df[['start_peak_date', 'peak_heights',"peak_date"]], on = ['start_peak_date','peak_heights'])

    return(waves_df_adj_2)

#This function computes heterogeneity over a period of time
def compute_heterogeneity(protein_col):
    
    heterogeneity_list_all = [] 
    country_list = list(data_peaks_global["country"].unique())
    
    K = 0
    
    for country in country_list:
        data_peaks_country = data_peaks_global[data_peaks_global["country"] == country]
        country_variant_data = biological_evolution_country[biological_evolution_country["country"] == country]

        variants_list = list(country_variant_data["variant"])

        n_peaks = len(data_peaks_country)
        heterogeneity_country_all = []

        for k in range(n_peaks):
            n = min(10, len(data_peaks_country["dominant_variants"].iloc[k]))
            
            ### Previously dominant variant     
            prev_dom = data_peaks_country["previously_dominant_1"].iloc[k]
            
            ### All variants
            dominant_var_1 = data_peaks_country["dominant_variants"].iloc[k][:n]
            ratio_var_1 = data_peaks_country["dominant_ratios"].iloc[k][:n]

            dominant_var_2 = []
            ratio_var_2 = []

            for i in range(n):
                if dominant_var_1[i] in variants_list:
                    dominant_var_2 += [dominant_var_1[i]]
                    ratio_var_2 += [ratio_var_1[i]]
            
            if prev_dom == "No_dominant":
                previously_dom_values_common = [ 1 for i in protein_col]
                
            else:
                previously_dom_values = country_variant_data[(country_variant_data["variant"] == prev_dom)][protein_col].values[0]
                
            heterogeneity_all = 0

            n_var = len(dominant_var_2)
            R = 0
            for i in range(n_var-1):
                for j in range(i, n_var):
                    var_1 = dominant_var_2[i]
                    r_1 = ratio_var_2[i]
                    var_2 = dominant_var_2[j]
                    r_2 = ratio_var_2[j]
                    
                    var_info_protein_1 = country_variant_data[(country_variant_data["variant"] == var_1)][protein_col].values[0]
                    var_info_protein_2 = country_variant_data[(country_variant_data["variant"] == var_2)][protein_col].values[0]
                    
                    j_d_all = jaccard_distance(var_info_protein_1, var_info_protein_2) 
                    heterogeneity_all += r_1*r_2*j_d_all
                    
                    R += r_1*r_2
                    
            if R == 0:
                    heterogeneity_country_all += [0]
                    
            else:
                    heterogeneity_country_all += [heterogeneity_all/R]
        
        heterogeneity_list_all += heterogeneity_country_all

    return(heterogeneity_list_all)

#Function to compute entropy
def entropy_ratios(prop):
    ent = sum([-p*np.log(p) for p in prop if p!=0])
    return(ent)

### Variants data and cases

In [409]:
### Our World in Data cases
owid_path = "our_world_in_data/owid-covid-data-2022.03.21.csv"
covid_data = pd.read_csv(owid_path)
covid_data['new_cases'] = abs(covid_data['new_cases'])
covid_data['new_cases_per_million'] = abs(covid_data['new_cases_per_million'])
covid_data["date"] = pd.to_datetime(covid_data["date"])
covid_data["week_date"] = covid_data["date"].apply(lambda x: get_sunday(x))
covid_data = covid_data.fillna(0)
covid_data["location"] = covid_data["location"].replace("United States","USA")
covid_data["location"] = covid_data["location"].replace('Czechia', "Czech Republic")

In [410]:
### Cases and vaccination data
global_cases_vaccinations = covid_data.groupby(["week_date", "location"])['new_cases_per_million', 'new_vaccinations'].sum().reset_index()
global_cases_vaccinations.to_csv("our_world_in_data/global_cases_vaccinations.csv")


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [411]:
### Biological evolution data
n_weeks = 2
biological_evolution_country = pd.read_csv("generated_data/biological_evolution_country_"+str(n_weeks)+"weeks.csv").drop("Unnamed: 0", axis = 1)

list_var_col = ['variant','country','cases_first_derivative_0','cases_first_derivative_1', 'cases_first_derivative_2',
       'cases_mean_first_derivative', 'cases_week_0', 'cases_week_1','cases_week_2', 'cases_week_3', 'country', 'dominant_1', 'dominant_2',
       'm_second_derivative_cases', 'mean_cases_change', 'mean_ratio_change',
       'mean_ratio_first_derivative', 'mean_second_ratio_derivative','prop_dominant_1', 'prop_dominant_2', 'r_ratio_first_derivative_0',
       'r_ratio_first_derivative_1', 'r_ratio_first_derivative_2','ratio_week_0', 'ratio_week_1', 'ratio_week_2', 'ratio_week_3','total_current_cases', 'variants_entropy',
       'number_of_mutation', 'number_of_mutations', 'absolute_distance','jaccard_distance','current_new_cases_per_million',
       'current_total_vaccinations_per_hundred', 'current_stringency_index',"max_week_cases", "n_variants_past",
        "cancel_public_events",'restriction_gatherings','Date - start', 'Date - end', '', 'Unnamed: 0.1', 'heterogeneity']
all_col = biological_evolution_country.columns
bio_col = [c for c in all_col if (c not in list_var_col)]

### Non-structural protein
NSP_list = []
### N protein
N_list = []
### Spike protein
Spike_list = []
### M protein
M_list = []
### E protein
E_list = []

other_list = []

for col in bio_col:
    if col[0:3] == 'NSP' or col[0:2] == 'NS':
        NSP_list+=[col]
    elif col[0:5] == 'Spike':
        Spike_list += [col]
    elif col[0] == 'N':
        N_list += [col]
    elif col[0] == 'M':
        M_list += [col]
    elif col[0] == 'E':
        E_list += [col]
SP_list = N_list + M_list + E_list

NSP_prot_list = [prot.split("_")[0] for prot in NSP_list]
NSP_prot_list_new = []
for p in NSP_prot_list:
    if p == "NS3":
        p = "NSP3"
    if p == "NS7b":
        p = "NSP7"
    if p == "NS7a":
        p = "NSP7"
    if p == "NS6":
        p = "NSP6"
    if p == "NS8":
        p = "NSP8"
    NSP_prot_list_new += [p]
NSP_df = pd.DataFrame(NSP_list, columns = ["NSP mutation"])
NSP_df["NSP Protein"] = NSP_prot_list_new

### Identification of infection waves in every country

In [412]:
countries_list = ['United Kingdom', 'USA', 'Germany', 'Denmark', 'Canada', 'Japan',
       'France', 'Sweden', 'Switzerland', 'India', 'Brazil', 'Italy',
       'Spain', 'Netherlands', 'Turkey', 'Austria', 'Belgium',
        'Australia', 'Ireland', 'Mexico', 'Slovenia', 'Norway', 'Poland',
        'Israel', 'South Africa', 'Lithuania', 'Portugal', 'Finland','South Korea', 'Luxembourg']

In [413]:
#Identifying waves in every country of the list
country = countries_list[0]
data_peaks_global = find_waves(country,10,10,10, 120)
for c in countries_list[1:]:
    data_peaks = find_waves(c,10,10,10, 120)
    if len(data_peaks) >= 1:
        data_peaks_global = pd.concat((data_peaks_global, data_peaks), axis = 0)
data_peaks_global.to_csv("generated_data/data_peaks_global.csv")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

[[5, 23], [23, 44], [44, 60], [61, 62], [63, 64], [65, 66], [67, 79], [79, 85], [85, 93], [93, 108], [108, 111]]
[[7, 18], [18, 33], [33, 70], [71, 72], [73, 92], [92, 110], [111, 112]]
[[5, 17], [24, 55], [55, 71], [72, 73], [77, 87], [87, 99], [99, 111]]
[[4, 22], [22, 62], [62, 73], [73, 86], [86, 111]]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

[[7, 29], [29, 57], [57, 74], [78, 93], [93, 112]]
[[10, 15], [23, 35], [35, 58], [58, 74], [74, 88], [101, 112]]
[[5, 21], [25, 46], [46, 73], [74, 89], [89, 112]]
[[4, 31], [31, 53], [53, 72], [73, 74], [75, 88], [89, 109], [110, 111]]
[[2, 11], [19, 52], [52, 66], [67, 68], [69, 85], [85, 104], [104, 107]]
[[13, 53], [53, 85], [86, 87], [88, 89], [100, 108]]
[[3, 35], [35, 50], [50, 93], [94, 107]]



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

[[3, 18], [28, 53], [53, 70], [71, 72], [73, 88], [89, 111]]
[[4, 17], [23, 45], [45, 71], [71, 86], [87, 88], [89, 111]]
[[2, 17], [20, 38], [38, 50], [50, 68], [68, 81], [81, 94], [94, 107]]
[[2, 18], [18, 29], [29, 46], [46, 68], [68, 76], [76, 91], [91, 105]]
[[2, 11], [17, 49], [49, 65], [66, 67], [68, 69], [71, 82], [82, 95], [95, 103], [103, 107]]
[[2, 3], [4, 18], [22, 29], [29, 54], [54, 70], [71, 72], [73, 98], [98, 110]]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

[[7, 13], [22, 34], [75, 95], [95, 108], [108, 112]]
[[2, 13], [23, 40], [40, 67], [68, 81], [81, 107]]
[[13, 37], [37, 43], [43, 72], [72, 101], [101, 115]]
[[4, 14], [20, 26], [26, 46], [46, 57], [57, 72], [73, 74], [75, 98], [98, 111]]
[[2, 13], [23, 42], [42, 50], [50, 72], [72, 85], [85, 108]]
[[2, 8], [8, 17], [17, 26], [26, 48], [48, 63], [64, 65], [80, 95], [95, 106]]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

[[2, 12], [14, 38], [38, 58], [59, 60], [61, 62], [70, 88], [89, 91], [92, 108]]
[[8, 33], [33, 55], [56, 57], [58, 59], [60, 61], [62, 86], [87, 88], [93, 109], [110, 110]]
[[2, 17], [18, 54], [54, 67], [68, 69], [70, 94], [94, 107]]
[[2, 12], [12, 23], [23, 42], [42, 53], [54, 55], [56, 59], [60, 61], [62, 63], [64, 81], [82, 83], [84, 107]]
[[5, 19], [26, 48], [48, 71], [71, 85], [85, 89], [89, 108], [108, 111]]
[[4, 10], [28, 37], [37, 56], [56, 72], [72, 91], [91, 102], [102, 112]]
[[2, 11], [16, 26], [26, 48], [48, 66], [67, 68], [69, 75], [76, 104], [104, 107]]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [414]:
country_list = data_peaks_global["country"].unique()
cases_list = []
cases_list_to_peak = []

for c in country_list:
    data_peaks_country = data_peaks_global[data_peaks_global["country"] == c]
    country_cases_vaccinations = global_cases_vaccinations[global_cases_vaccinations["location"] == country ].reset_index()
    n_peaks = len(data_peaks_country)
    country_cases = []
    country_cases_to_peak = []
    
    for i in range(n_peaks):
        data_country_wave = data_peaks_country.iloc[i]
        start_peak_date = data_country_wave["start_peak_date"]
        end_peak_date = data_country_wave["end_peak_date"]
        peak_date = data_country_wave["peak_date"]
        all_cases = country_cases_vaccinations[(country_cases_vaccinations["week_date"]>= start_peak_date) & (country_cases_vaccinations["week_date"]<= end_peak_date)]["new_cases_per_million"].sum()
        cases_to_peak = country_cases_vaccinations[(country_cases_vaccinations["week_date"]>= start_peak_date) & (country_cases_vaccinations["week_date"]<= peak_date)]["new_cases_per_million"].sum()
        country_cases += [all_cases]
        country_cases_to_peak += [cases_to_peak]
    
    cases_list += country_cases
    cases_list_to_peak += country_cases_to_peak
    
data_peaks_global["wave_cases"] = cases_list
data_peaks_global["wave_cases_to_peak"] = cases_list_to_peak

In [415]:
finished_waves = data_peaks_global[data_peaks_global["end_peak_date"] <= "2022-01-05"]

In [416]:
n_peaks = len(data_peaks_global)
print("There are "+str(n_peaks)+ " waves detected in 30 countries.")

There are 144 waves detected in 30 countries.


In [417]:
n_peaks = len(finished_waves)
print("There are "+str(n_peaks)+ " waves detected in 30 countries.")

There are 115 waves detected in 30 countries.


In [418]:
peaks_per_country = data_peaks_global.groupby("country")["peak_number"].count().reset_index()
median_country = peaks_per_country["peak_number"].median()
q1_country = np.quantile(peaks_per_country["peak_number"], 0.25)
q3_country = np.quantile(peaks_per_country["peak_number"], 0.75)
print("The number of waves per country has a median of "+str(median_country)+ " quantile 1 " + str(q1_country)+ " quantile 3 " + str(q3_country) )

The number of waves per country has a median of 5.0 quantile 1 4.0 quantile 3 5.0


In [419]:
finished_waves['wave_lenght'] = (finished_waves['end_peak_date']-finished_waves['start_peak_date']).dt.days
median_country = finished_waves["wave_lenght"].median()
q1_country = np.quantile(finished_waves["wave_lenght"], 0.25)
q3_country = np.quantile(finished_waves["wave_lenght"], 0.75)
print("The wave length has a median of "+str(median_country)+ " quantile 1 " + str(q1_country)+ " quantile 3 " + str(q3_country) )

The wave length has a median of 119.0 quantile 1 94.5 quantile 3 154.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Variants distribution in every infection wave

In [421]:
### GISAID data
path_to_gisaid_metadata = # ADD PATH TO GISAID METADATA
variants = pd.read_csv(path_to_gisaid_metadata, sep='\t')
variants['Location_separated'] = variants['Location'].apply(lambda x: x.split(" / "))
variants['continent'] = variants['Location_separated'].apply(lambda x: x[0])
variants['country'] = variants['Location_separated'].apply(lambda x: x[1])
variants['region'] = variants['Location_separated'].apply(lambda x: get_region(x))
variants["Collection date"] = pd.to_datetime(variants["Collection date"])
variants["week_date"] = variants["Collection date"].apply(lambda x: get_sunday(x))
variants_unique = variants.drop_duplicates(subset = ["country", "Pango lineage"])
seq_per_country = variants.groupby("country")['Accession ID'].count().reset_index()
seq_per_country = seq_per_country.sort_values(by = 'Accession ID', ascending = False)
list_countries_oi = seq_per_country.head(30)["country"]


Columns (16) have mixed types.Specify dtype option on import or set low_memory=False.



In [422]:
c_sequences = variants
c_sequences['Collection date'] = pd.to_datetime(variants['Collection date'])
c_sequences_ev = c_sequences.groupby(['country','Collection date', 'Pango lineage'])['Accession ID'].count().reset_index()
c_sequences_ev = c_sequences_ev.rename(columns = {"Accession ID":"seq_count"})
c_sequences_ev = c_sequences_ev[c_sequences_ev["Collection date"]!="2020"]
c_sequences_ev["week_date"] = c_sequences_ev["Collection date"].apply(lambda x: get_sunday(x))

seq_count_per_week = c_sequences_ev.groupby(['country',"week_date","Pango lineage"])["seq_count"].sum().reset_index()
seq_count_per_week = seq_count_per_week.sort_values(by = ['country',"week_date"])

seq_count_per_week["country-week_date"] = seq_count_per_week["country"]+"//"+seq_count_per_week["week_date"].astype("str")
seq_count_per_week_pivot = seq_count_per_week.pivot(index="country-week_date", columns="Pango lineage", values="seq_count").reset_index().fillna(0)
seq_count_per_week_pivot["country"] = seq_count_per_week_pivot["country-week_date"].apply(lambda x:x.split("//")[0])
seq_count_per_week_pivot["week_date"] = seq_count_per_week_pivot["country-week_date"].apply(lambda x:x.split("//")[1])
seq_count_per_week_pivot = seq_count_per_week_pivot.drop("country-week_date", axis = 1)
seq_count_per_week_pivot["week_date"] = pd.to_datetime(seq_count_per_week_pivot["week_date"])

var_col = seq_count_per_week_pivot.columns[:-2]
seq_count_per_week_pivot["total_seq"] = seq_count_per_week_pivot[var_col].sum(axis = 1)
top_30_countries = seq_per_country.head(30)

In [528]:
seq_count_per_week_pivot.to_csv("generated_data/seq_count_per_week_pivot_global_2022_01_05.csv")

In [529]:
l_columns = seq_count_per_week_pivot.columns
l_columns = [ c for c in l_columns if c != "None"]
seq_count_per_week_pivot = seq_count_per_week_pivot[l_columns]

In [530]:
data_peaks_global["country"] = data_peaks_global["country"].replace("United States","USA")
data_peaks_global["country"] = data_peaks_global["country"].replace('Czechia', "Czech Republic")

In [531]:
country_list = list(data_peaks_global["country"].unique())
dominant_variants_list = []
dominant_ratios_list = []
n_variants_l = []
n_country = len(country_list)
for i in range(n_country):
    country = country_list[i]
    data_peaks_country = data_peaks_global[data_peaks_global["country"] == country]
    seq_count_per_week_country = seq_count_per_week_pivot[seq_count_per_week_pivot["country"] == country]
    n_peaks = len(data_peaks_country)
    for j in range(n_peaks):
        peak_data = data_peaks_country.iloc[j]
        start_peak_date = peak_data["start_peak_date"]
        end_peak_date = peak_data["end_peak_date"]
        seq_count_per_week_period = seq_count_per_week_country[(seq_count_per_week_country["week_date"]>=start_peak_date) & (seq_count_per_week_country["week_date"]<=end_peak_date) ]
        var_list = list(seq_count_per_week_period.columns[:-3])
        var_seq_count = seq_count_per_week_period[var_list].sum(axis = 0)
        tot_seq = sum(var_seq_count)
        var_seq_ratio = var_seq_count/tot_seq
        var_seq_ratio = var_seq_ratio.sort_values(ascending = False).reset_index().rename(columns = {0:"ratio"})
        ratio_l = list(var_seq_ratio["ratio"])
        ratio_l = [round(r,3) for r in ratio_l]
        dominant_variants_list += [list(var_seq_ratio["Pango lineage"])]
        dominant_ratios_list += [ratio_l]
        var_seq_ratio = var_seq_ratio[(var_seq_ratio["ratio"]>0.00001) & (var_seq_ratio["Pango lineage"]!= "None")]
        n_variants_l += [len(var_seq_ratio)]

In [532]:
data_peaks_global["n_variants"] = n_variants_l

In [533]:
finished_waves = data_peaks_global[data_peaks_global["end_peak_date"] <= "2022-01-05"]

In [534]:
median = np.quantile(finished_waves["n_variants"], 0.5)
q_1 = np.quantile(finished_waves["n_variants"], 0.25)
q_2 = np.quantile(finished_waves["n_variants"], 0.75)
print("The median is "+str(median)+" 25th quantile "+ str(q_1)+" 75th quantile "+str(q_2))

The median is 102.0 25th quantile 59.0 75th quantile 165.5


In [535]:
data_peaks_global["dominant_variants"] = dominant_variants_list
data_peaks_global["dominant_ratios"] = dominant_ratios_list
data_peaks_global["n_variants"] = n_variants_l
data_peaks_global["1st_dominant_variant"] = data_peaks_global["dominant_variants"].apply(lambda x:x[0])

### Distribution of mutations

In [585]:
n_weeks = 2
country_variants = pd.read_csv("generated_data/data_indep_dep_"+str(n_weeks)+"_weeks.csv")

import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

fig = go.Figure()

fig.add_trace(go.Box(y=country_variants['Spike_mutations'], name = "Spike protein", showlegend=False))
fig.add_trace(go.Box(y=country_variants[['N_mutations']].sum(axis = 1), name = "N protein", showlegend=False))
fig.add_trace(go.Box(y=country_variants[['E_mutations']].sum(axis = 1), name = "E protein", showlegend=False))
fig.add_trace(go.Box(y=country_variants[['M_mutations']].sum(axis = 1), name = "M protein", showlegend=False))
fig.add_trace(go.Box(y=country_variants[['NSP_mutations']].sum(axis = 1), name = "NSP proteins", showlegend=False))

fig.update_layout(width=850,
                  height=650,
                  yaxis_title="Number of mutations",
title={
    'text': "Number of mutations per viral proteins in each variant-country",
    'x':0.4,
    'xanchor': 'center',
    'yanchor': 'top'})
    
fig.show()

### Identifying waves with similar variant in successive waves

In [538]:
selected_peaks = finished_waves 

In [539]:
dom_equal_list = []
dom_equal_second_list = []
country_list = data_peaks_global["country"].unique()

for country in country_list:
    data_peaks_country = data_peaks_global[data_peaks_global["country"] == country]
    dominant_variants_list = list(data_peaks_country["dominant_variants"])
    n_peaks = len(dominant_variants_list)
    dom_equal = [0]
    dom_equal_sec = [0]
    for i in range(n_peaks-1):
        if dominant_variants_list[i+1][0]==dominant_variants_list[i][0]:
            dom_equal+=[1]
        else:
            dom_equal+=[0]
        if (dominant_variants_list[i+1][0]==dominant_variants_list[i][1]):
            dom_equal_sec+=[1]
        else:
            dom_equal_sec+=[0]

    dom_equal_list +=dom_equal
    dom_equal_second_list += dom_equal_sec

In [540]:
data_peaks_global["dom_equal_prev"] = dom_equal_list
data_peaks_global["dom_equal_next"] = dom_equal_list[1:]+[0]
data_peaks_global["dom_equal_second"] = dom_equal_second_list

In [541]:
data_peaks_global["dom_equal_prev_bool"] = data_peaks_global["dom_equal_prev"] == 1
data_peaks_global["dom_equal_next_bool"] = data_peaks_global["dom_equal_next"] == 1

In [542]:
country_list = data_peaks_global["country"].unique()
prev_ratio_all = []
for country in country_list:
    data_peaks_country = data_peaks_global[data_peaks_global["country"] == country]
    prev_ratio_country = [0]
    dominant_variants_list = list(data_peaks_country["dominant_variants"])
    n_peaks = len(dominant_variants_list)

    for i in range(1,n_peaks):
        loc_data = data_peaks_country.iloc[i]
        dom = dominant_variants_list[i][0]
        j = 0 
        n_list = len(dominant_variants_list[i-1])
        while dominant_variants_list[i-1][j] != dom and j <= n_list:
                j+=1
        prev_ratio = data_peaks_country.iloc[i-1]["dominant_ratios"][j]
        prev_ratio_country += [prev_ratio]
    prev_ratio_all += prev_ratio_country

In [543]:
data_peaks_global["previous_ratio"] = prev_ratio_all
data_peaks_global["current_ratio_dom"] = data_peaks_global["dominant_ratios"].apply(lambda x:x[0])

In [544]:
finished_waves = data_peaks_global[data_peaks_global["end_peak_date"] <= "2022-01-05"]

In [546]:
similar_dominant_variant = finished_waves[ finished_waves["dom_equal_prev_bool"]][['country', 'start_peak_date', 'peak_date',"end_peak_date", "current_ratio_dom", "previous_ratio", "dominant_variants"]]

In [631]:
similar_table = similar_dominant_variant[similar_dominant_variant["end_peak_date"] <= "2022-03-05"]

Unnamed: 0,country,start_peak_date,peak_date,end_peak_date,current_ratio_dom,previous_ratio,dominant_variants
6,USA,2020-05-31,2020-07-26,2020-09-13,0.244,0.538,"[B.1, B.1.2, B.1.1, B.1.369, B.1.240, B.1.595,..."
12,Germany,2021-02-21,2021-04-25,2021-06-13,0.838,0.243,"[B.1.1.7, B.1.351, B.1.177.86, B.1.177.81, B.1..."
18,Denmark,2021-04-11,2021-05-30,2021-06-27,0.958,0.317,"[B.1.1.7, B.1.177.46, B.1.1.519, AY.4, AY.122,..."
38,Sweden,2021-02-07,2021-04-18,2021-06-20,0.824,0.184,"[B.1.1.7, B.1.351, B.1.177, B.1.177.82, B.1.16..."
42,Switzerland,2021-02-28,2021-04-18,2021-06-06,0.891,0.156,"[B.1.1.7, B.1.214.2, C.36.3, P.1, B.1.351, B.1..."
64,Netherlands,2021-02-14,2021-04-25,2021-06-20,0.862,0.215,"[B.1.1.7, B.1.351, B.1.221, B.1.177.81, B.1.17..."
69,Turkey,2020-07-19,2020-10-25,2020-10-04,0.429,0.42,"[B.1.1, B.1, B.1.1.161, B.1.218, B.1.9.5, B.1...."
80,Belgium,2021-02-21,2021-03-28,2021-06-13,0.758,0.308,"[B.1.1.7, P.1.16, B.1.351, B.1.214.2, B.1.160,..."
97,Slovenia,2020-12-20,2021-01-17,2021-03-07,0.759,0.404,"[B.1.258.17, B.1.1.7, B.1.258, B.1.160, B.1.1...."
125,Portugal,2020-05-24,2020-07-05,2020-08-09,0.43,0.422,"[B.1.1, B.1, B.1.1.421, B.1.1.410, B.1.1.269, ..."


In [643]:
data_peaks_global["1st_dominant_ratio"] = data_peaks_global["dominant_ratios"].apply(lambda x: x[0])
data_peaks_global["2nd_dominant_ratio"] = data_peaks_global["dominant_ratios"].apply(lambda x: x[1])
data_peaks_global["3rd_dominant_ratio"] = data_peaks_global["dominant_ratios"].apply(lambda x: x[2])

data_peaks_global["1st_dominant_count"] = data_peaks_global["1st_dominant_ratio"]*data_peaks_global["wave_cases"]
data_peaks_global["2nd_dominant_count"] = data_peaks_global["2nd_dominant_ratio"]*data_peaks_global["wave_cases"]
data_peaks_global["3rd_dominant_count"] = data_peaks_global["3rd_dominant_ratio"]*data_peaks_global["wave_cases"]

In [549]:
finished_waves = data_peaks_global[data_peaks_global["end_peak_date"] <= "2022-01-05"]

In [550]:
q1 = np.quantile(finished_waves["1st_dominant_count"].dropna(), 0.25)
med = np.quantile(finished_waves["1st_dominant_count"].dropna(), 0.5)
q3 = np.quantile(finished_waves["1st_dominant_count"].dropna(), 0.75)
print("1st median: "+str(med) + " q1: "+str(q1) + "q3: "+str(q3))

1st median: 9750.649992 q1: 3264.0977984999995q3: 19714.445753499996


In [551]:
q1 = np.quantile(finished_waves["2nd_dominant_count"].dropna(), 0.25)
med = np.quantile(finished_waves["2nd_dominant_count"].dropna(), 0.5)
q3 = np.quantile(finished_waves["2nd_dominant_count"].dropna(), 0.75)
print("2nd median: "+str(med) + " q1: "+str(q1) + "q3: "+str(q3))

2nd median: 3378.5334550000002 q1: 648.7157520000001q3: 6910.5404364999995


In [552]:
q1 = np.quantile(finished_waves["3rd_dominant_count"].dropna(), 0.25)
med = np.quantile(finished_waves["3rd_dominant_count"].dropna(), 0.5)
q3 = np.quantile(finished_waves["3rd_dominant_count"].dropna(), 0.75)
print("3RD median: "+str(med) + " q1: "+str(q1) + "q3: "+str(q3))

3RD median: 1838.7320300000001 q1: 353.154064q3: 4559.477078


### Distance between successive waves in every country

In [553]:
protein = bio_col

succ_jd_list = []
mutations_dom = []

country_list = list(biological_evolution_country["country"].unique())

for country in country_list:
    
    data_peaks_country = finished_waves[finished_waves["country"] == country]
    country_variant_data = biological_evolution_country[biological_evolution_country["country"] == country]
    variant_list = list(country_variant_data["variant"])
    
    n_peaks = len(data_peaks_country)

    succ_jd_list += [0]
    mutations_dom += [0]

    for i in range(1, n_peaks):
        dominant_variants_o = data_peaks_country.iloc[i-1]["dominant_variants"]
        dominant_ratio_o = data_peaks_country.iloc[i-1]['dominant_ratios']
        n_var = len(dominant_variants_o)
        dominant_variants = []
        dominant_ratios = []
        for k in range(0, min(11,n_var)) :
            var = dominant_variants_o[k]
            if var in variant_list:
                dominant_variants += [var]
                dominant_ratios += [dominant_ratio_o[k]]

        dom = data_peaks_country.iloc[i]["dominant_variants"][0]
        info_dom = country_variant_data[country_variant_data["variant"] == dom][bio_col]
        if len(info_dom) == 0:
            succ_jd_list +=[-1] #Dominant not found in the data base
        else:
            info_dom = info_dom.values[0]
        
            mutations_dom += [sum(info_dom)]

            succ_jd = 0
            int_jd = 0
            R = 0
            for k in range(len(dominant_variants)) :
                var = dominant_variants[k]
                info_var = country_variant_data[country_variant_data["variant"] == var][bio_col].values[0]
                r = dominant_ratios[k]
                succ_jd += r*jaccard_distance(info_dom,info_var)
                R += r
            if R == 0:
                succ_jd = 0
            else:
                succ_jd = succ_jd/R

            succ_jd_list +=[succ_jd]

In [554]:
info_dom = country_variant_data[country_variant_data["variant"] == dom]
len(info_dom)

1

In [555]:
succ_jd_list_no_zero = [j for j in succ_jd_list if j>0]
med = np.median(succ_jd_list_no_zero)
q1 = np.quantile(succ_jd_list_no_zero, 0.25)
q3 = np.quantile(succ_jd_list_no_zero, 0.75)
print("The jaccard distance between dom and variants in previous wave is, median: "+str(round(med,2))+" q1: "+str(round(q1,2))+" q3: "+str(round(q3,2)))

The jaccard distance between dom and variants in previous wave is, median: 0.9 q1: 0.78 q3: 0.95


### Distribution of top 3 dominant variants

In [556]:
import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

fig = go.Figure()

y_val = list(finished_waves["dominant_ratios"].apply(lambda x: x[0]+x[1]+x[2]))
fig.add_trace(go.Box(y=y_val, name = "Sum of top 3 dominating variants", showlegend=False,  marker_color='#ef553b'))


y_val = list(finished_waves["dominant_ratios"].apply(lambda x: x[0]))
fig.add_trace(go.Box(y=y_val, name = "1st dominating variant", showlegend=False,  marker_color='#636EFA'))

y_val = list(finished_waves["dominant_ratios"].apply(lambda x: x[1]))
fig.add_trace(go.Box(y=y_val, name = "2nd dominating variant", showlegend=False,  marker_color='#FF7F0E'))

y_val = list(finished_waves["dominant_ratios"].apply(lambda x: x[2]))
fig.add_trace(go.Box(y=y_val, name ="3rd dominating variant", showlegend=False,  marker_color='#00CC96'))

fig.update_layout(width=1100,
                  height=650,yaxis_title="Ratio of variant in wave",
title={
    'text': "Ratios distribution of the top 3 dominating variants per wave",
    'x':0.47,
    'xanchor': 'center',
    'yanchor': 'top'})
    
fig.show()

In [557]:
ratios_dom_1 = finished_waves["dominant_ratios"].apply(lambda x: x[0])
ratios_dom_2 = finished_waves["dominant_ratios"].apply(lambda x: x[1])
ratios_dom_3 = finished_waves["dominant_ratios"].apply(lambda x: x[2])
combination = ratios_dom_1+ratios_dom_2+ratios_dom_3
median = combination.median()
q1 = np.quantile(combination, 0.25)
q3 = np.quantile(combination, 0.75)
print("Median "+str(median)+" 25% quantile "+str(q1) +" 75% quantile "+ str(q3))

Median 0.713 25% quantile 0.5095000000000001 75% quantile 0.8645


In [558]:
mean_dom_1 = finished_waves["dominant_ratios"].apply(lambda x: x[0]).median()
mean_dom_2 = finished_waves["dominant_ratios"].apply(lambda x: x[1]).median()
mean_dom_3 = finished_waves["dominant_ratios"].apply(lambda x: x[2]).median()
print("The dominant variant 1 has a median proportion of "+str(round(mean_dom_1*100,2))+" %")
print("The dominant variant 2 has a median proportion of "+str(round(mean_dom_2*100,2))+" %")
print("The dominant variant 3 has a median proportion of "+str(round(mean_dom_3*100,2))+" %")

The dominant variant 1 has a median proportion of 42.9 %
The dominant variant 2 has a median proportion of 13.7 %
The dominant variant 3 has a median proportion of 7.6 %


In [559]:
dominant_ratios_1 = finished_waves["dominant_ratios"].apply(lambda x: x[0])
dominant_ratios_2 = finished_waves["dominant_ratios"].apply(lambda x: x[1])
dominant_ratios_3 = finished_waves["dominant_ratios"].apply(lambda x: x[2])

In [560]:
import plotly.express as px

fig = go.Figure()


fig.add_trace(go.Scatter(
    x=dominant_ratios_1,
    y=dominant_ratios_2,showlegend=False,
    marker=dict(
        color=dominant_ratios_3,
        colorbar=dict(
            title="3rd dominating variant"
        ),
        colorscale="inferno"
    ),
    mode="markers"))


x_1 = [0,1]
y_1 = [1,0]

fig.add_trace(go.Scatter(x=x_1, y=y_1,showlegend=False,
                       mode='lines', line=dict(color='red')))

x_1 = [0,0.6]
y_1 = [0.6,0]

fig.add_trace(go.Scatter(x=x_1, y=y_1,showlegend=False,
                       mode='lines', line=dict(color='red')))

fig.update_layout(yaxis_title="2nd dominating variant",xaxis_title="1st dominating variant",    
                  width=910,
                  height=650,
                  title={
                    'text': "Ratios joint distribution of the top 3 dominating variants per wave",
                    'x':0.47,
                    'xanchor': 'center',
                    'yanchor': 'top'})
fig.show()

In [561]:
finished_waves["top_3_ratio"] = finished_waves["dominant_ratios"].apply(lambda x: x[0]+x[1]+x[2])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [562]:
sum(finished_waves["top_3_ratio"]>=0.60)

69

In [563]:
sum(finished_waves["top_3_ratio"]>=0.6)/len(finished_waves["top_3_ratio"]>=0.6)

0.6

### Wave heterogeneity 

In [564]:
country_list = data_peaks_global["country"].unique()
previously_dominant_1 = []
previously_dominant_2 = []
previously_dominant_3 = []

for country in country_list:
    data_country = data_peaks_global[data_peaks_global["country"] == country]
    
    dominant_variants_1 = list(data_country["dominant_variants"].apply(lambda x:x[0]))
    previously_dominant_1 += ["No_dominant"] + dominant_variants_1[1:]

    dominant_variants_2 = list(data_country["dominant_variants"].apply(lambda x:x[0]))
    previously_dominant_2 += ["No_dominant"] + dominant_variants_2[1:]

    dominant_variants_3 = list(data_country["dominant_variants"].apply(lambda x:x[0]))
    previously_dominant_3 += ["No_dominant"] + dominant_variants_3[1:]
    
data_peaks_global["previously_dominant_1"] = previously_dominant_1
data_peaks_global["previously_dominant_2"] = previously_dominant_2
data_peaks_global["previously_dominant_3"] = previously_dominant_3

In [565]:
heterogeneity_all = compute_heterogeneity(bio_col)
data_peaks_global["heterogeneity_all_all"] = heterogeneity_all

In [502]:
heterogeneity_spike = compute_heterogeneity(Spike_list)
data_peaks_global["heterogeneity_spike_all"] = heterogeneity_spike

In [503]:
heterogeneity_N = compute_heterogeneity(N_list)
data_peaks_global["heterogeneity_N_all"] = heterogeneity_N

In [504]:
heterogeneity_nsp = compute_heterogeneity(NSP_list)
data_peaks_global["heterogeneity_nsp_all"] = heterogeneity_nsp

In [505]:
heterogeneity_M = compute_heterogeneity(M_list)
data_peaks_global["heterogeneity_M_all"] = heterogeneity_M

In [506]:
heterogeneity_E = compute_heterogeneity(E_list)
data_peaks_global["heterogeneity_E_all"] = heterogeneity_E

In [507]:
data_peaks_global.to_csv("generated_data/data_peaks_global_heterogeneity.csv")

### Heterogeneity evolution with respect to vaccination campaigns

In [566]:
data_peaks_global["start_peak_date"] = pd.to_datetime(data_peaks_global["start_peak_date"])
data_peaks_global["end_peak_date"] = pd.to_datetime(data_peaks_global["end_peak_date"])
finished_waves = data_peaks_global[data_peaks_global["end_peak_date"] <= "2022-01-05"]

In [567]:
level_vaccinations = covid_data.groupby(["week_date", "location"])['people_vaccinated_per_hundred', "people_fully_vaccinated_per_hundred"].mean().reset_index()
level_vaccinations["location"] = level_vaccinations["location"].replace("United States","USA")
level_vaccinations = level_vaccinations.rename(columns = {"week_date":"start_vaccination", "location":"country"})
level_vaccinations = level_vaccinations.sort_values(by = ["country", "start_vaccination"])
start_vaccinations = level_vaccinations[level_vaccinations["people_vaccinated_per_hundred"]>=0.01]
group_start_vaccinations = start_vaccinations.groupby(["country"]).first().reset_index()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [568]:
data_peaks_global_vacc_start = pd.merge(finished_waves, group_start_vaccinations, on = "country", how = "inner")

In [569]:
vac_val_list = []
country_list = list(data_peaks_global_vacc_start["country"].unique())
for country in country_list:
    data_peaks_country = data_peaks_global_vacc_start[data_peaks_global_vacc_start["country"] == country]
    level_vaccinations_country = level_vaccinations[level_vaccinations["country"] == country]
    n_peaks = len(data_peaks_country)
    for i in range(n_peaks):
        peak_info = data_peaks_country.iloc[i]
        start_date = peak_info['start_peak_date']
        end_date = peak_info['end_peak_date']
        level_vaccinations_peak = level_vaccinations_country[(level_vaccinations_country["start_vaccination"]>=start_date) & (level_vaccinations_country["start_vaccination"]<=end_date)]
        mean_vaccination = level_vaccinations_peak.groupby(["country"])["people_vaccinated_per_hundred"].mean().reset_index()
        vac_val = mean_vaccination["people_vaccinated_per_hundred"].values[0]
        vac_val_list += [vac_val]
data_peaks_global_vacc_start["mean_vaccination"] = vac_val_list

In [570]:
data_peaks_global_vacc_start["diff_from_start"] = (data_peaks_global_vacc_start["start_peak_date"]-data_peaks_global_vacc_start["start_vaccination"]).dt.days
data_peaks_global_vacc_start["wave_end_before_vax_start"] = (data_peaks_global_vacc_start["end_peak_date"]-data_peaks_global_vacc_start["start_vaccination"]).dt.days
data_peaks_global_vacc_start["before_vacc_start"] = (data_peaks_global_vacc_start["diff_from_start"]<=0)
data_peaks_global_vacc_start["after_vacc_start"] = (data_peaks_global_vacc_start["diff_from_start"]>=1)
data_peaks_global_vacc_start["end_before_vacc_start"] = (data_peaks_global_vacc_start["wave_end_before_vax_start"]<0)

In [571]:
level_vaccinations_week = level_vaccinations.rename(columns = {"start_vaccination":"week_date", "people_vaccinated_per_hundred":"people_at_least_one_dose_per_hundred"})

In [572]:
data_peaks_global_vacc_start["peak_date"] = pd.to_datetime(data_peaks_global_vacc_start["peak_date"])

In [573]:
data_peaks_global_vax = pd.merge(data_peaks_global_vacc_start, level_vaccinations_week[["week_date", "country","people_at_least_one_dose_per_hundred"]], left_on = ["peak_date", "country"], right_on = ["week_date", "country"])

In [623]:
finished_waves = data_peaks_global_vax[data_peaks_global_vax["end_peak_date"] <= "2022-01-05"]

before_1 = []
before_2 = []
before_3 = []

inside = []

after_2 = []
after_3 = []
after_1 = []

country_list = data_peaks_global_vacc_start["country"].unique()

protein = "all"  #Spike, N, NSP, N, M
subset = "all"

heterogeneity = "heterogeneity_"+protein+"_"+subset
vax_rate = []
time_to_vax = []

for country in country_list:
    data_country = finished_waves[finished_waves["country"] == country]

    list_end_before_vacc_start = list(data_country["end_before_vacc_start"])
    for i in range(len(list_end_before_vacc_start)):
        if list_end_before_vacc_start[i] == True:
            last_end = i


    if True in list(data_country["after_vacc_start"]):

        vax_index = list(data_country["after_vacc_start"]).index(True)
        list_values = list(data_country[heterogeneity])
        list_vax = list(data_country["people_at_least_one_dose_per_hundred"])
        time_to_vax += [list(data_country["diff_from_start"])[vax_index-1]]
        
        if vax_index == 1:
            after_1 += [list_values[vax_index]]
            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]]
            else:
                inside += [list_values[vax_index-1]]

            vax_rate += [list_values[vax_index]]

        if vax_index == 2:
            after_1 += [list_values[vax_index]] 

            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]]
                before_2 += [list_values[vax_index-2]]

            else:
                inside += [list_values[vax_index-1]]
                before_1 += [list_values[vax_index-2]] 

            vax_rate += [list_values[vax_index]]

        if vax_index == 3:
            after_1 += [list_values[vax_index]] 

            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]] 
                before_2 += [list_values[vax_index-2]]
                before_3 += [list_values[vax_index-3]]
            else:
                inside += [list_values[vax_index-1]] 
                before_1 += [list_values[vax_index-2]]
                before_2 += [list_values[vax_index-3]]

            vax_rate += [list_values[vax_index]]

        if vax_index == 4:
            after_1 += [list_values[vax_index]]
            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]] 
                before_2 += [list_values[vax_index-2]] 
                before_3 += [list_values[vax_index-3]]
            else:
                inside += [list_values[vax_index-1]] 
                before_1 += [list_values[vax_index-2]] 
                before_2 += [list_values[vax_index-3]] 
                before_3 += [list_values[vax_index-4]]

            vax_rate += [list_values[vax_index]]


        if len(list_values)>=vax_index+2:
            after_2 += [list_values[vax_index+1]]  

        if len(list_values)>=vax_index+3:
                    after_3 += [list_values[vax_index+2]]

before_waves = data_peaks_global_vacc_start[data_peaks_global_vacc_start['before_vacc_start']]
after_waves = data_peaks_global_vacc_start[data_peaks_global_vacc_start['after_vacc_start']]


import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

fig = go.Figure()

fig.add_trace(go.Box(y=before_2, name = "Before-2", showlegend = False))
fig.add_trace(go.Box(y=before_1, name = "Before-1", showlegend = False))
fig.add_trace(go.Box(y=inside, name = "During", showlegend = False))
fig.add_trace(go.Box(y=after_1, name = "After-1", showlegend = False))
fig.add_trace(go.Box(y=after_2, name = "After-2", showlegend = False))

fig.update_layout(yaxis_title="Heterogeneity",width=850,
                  height=650,
title={
    'text': "Heterogeneity evolution for "+ protein + " protein mutations",
    'x':0.4,
    'xanchor': 'center',
    'yanchor': 'top'})
    
fig.show()

In [629]:
median = np.quantile(time_to_vax, 0.5)
q1 = np.quantile(time_to_vax, 0.25)
q3 = np.quantile(time_to_vax, 0.75)
print("Time to vaccination in Transition waves: median = "+ str(median) +" q1 = "+ str(q1)+" q3 = "+ str(q3))

Time to vaccination in Transition waves: median = -108.5 q1 = -28.0 q3 = -154.0


In [599]:
stats.ttest_ind(after_2, after_1 )

Ttest_indResult(statistic=1.6782733144134891, pvalue=0.10090143019410674)

### Entropy distribution

In [600]:
# Entopy with the proportion of each variant
data_peaks_global_vax['wave_entropy'] = data_peaks_global_vax['dominant_ratios'].apply(lambda x:entropy_ratios(x))
data_peaks_global_vax.to_csv("generated_data/data_peaks_global_entropy.csv")

In [619]:
finished_waves = data_peaks_global_vax[data_peaks_global_vax["end_peak_date"] <= "2022-01-05"]

before_1 = []
before_2 = []
before_3 = []

inside = []

after_2 = []
after_3 = []
after_1 = []

timing_to_vax = []

country_list = data_peaks_global_vacc_start["country"].unique()

protein = "all"
subset = "all"

heterogeneity = "wave_entropy"
vax_rate = []

for country in country_list:
    data_country = finished_waves[finished_waves["country"] == country]

    list_end_before_vacc_start = list(data_country["end_before_vacc_start"])
    
    for i in range(len(list_end_before_vacc_start)):
        if list_end_before_vacc_start[i] == True:
            last_end = i


    if True in list(data_country["after_vacc_start"]):

        vax_index = list(data_country["after_vacc_start"]).index(True)
        list_values = list(data_country[heterogeneity])
        list_vax = list(data_country["people_at_least_one_dose_per_hundred"])

        
        if vax_index == 1:
            after_1 += [list_values[vax_index]]
            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]]
            else:
                inside += [list_values[vax_index-1]]
                
            vax_rate += [list_values[vax_index]]

        if vax_index == 2:
            after_1 += [list_values[vax_index]] 

            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]]
                before_2 += [list_values[vax_index-2]]

            else:
                inside += [list_values[vax_index-1]]
                before_1 += [list_values[vax_index-2]] 
                
            vax_rate += [list_values[vax_index]]

        if vax_index == 3:
            after_1 += [list_values[vax_index]] 

            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]] 
                before_2 += [list_values[vax_index-2]]
                before_3 += [list_values[vax_index-3]]
            else:
                inside += [list_values[vax_index-1]] 
                before_1 += [list_values[vax_index-2]]
                before_2 += [list_values[vax_index-3]]

            vax_rate += [list_values[vax_index]]

        if vax_index == 4:
            after_1 += [list_values[vax_index]]
            if vax_index-1 == last_end:
                before_1 += [list_values[vax_index-1]] 
                before_2 += [list_values[vax_index-2]] 
                before_3 += [list_values[vax_index-3]]
            else:
                inside += [list_values[vax_index-1]] 
                before_1 += [list_values[vax_index-2]] 
                before_2 += [list_values[vax_index-3]] 
                before_3 += [list_values[vax_index-4]]

            vax_rate += [list_values[vax_index]]


        if len(list_values)>=vax_index+2:
            after_2 += [list_values[vax_index+1]]  

        if len(list_values)>=vax_index+3:
                    after_3 += [list_values[vax_index+2]]

before_waves = data_peaks_global_vacc_start[data_peaks_global_vacc_start['before_vacc_start']]
after_waves = data_peaks_global_vacc_start[data_peaks_global_vacc_start['after_vacc_start']]

In [620]:
timing_to_vax

[]

In [602]:
import plotly.graph_objects as go
import numpy as np
np.random.seed(1)

fig = go.Figure()

fig.add_trace(go.Box(y=before_2, name = "Before-2", showlegend = False))
fig.add_trace(go.Box(y=before_1, name = "Before-1", showlegend = False))
fig.add_trace(go.Box(y=inside, name = "Transition", showlegend = False))
fig.add_trace(go.Box(y=after_1, name = "After-1", showlegend = False))
fig.add_trace(go.Box(y=after_2, name = "After-2", showlegend = False))


fig.update_layout(yaxis_title="Entropy",width=850,
                  height=650,
title={
    'text':"Entropy distribution evolution",
    'x':0.4,
    'xanchor': 'center',
    'yanchor': 'top'})
    
fig.show()