In [1]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd

In [2]:
# Number of weeks used to pick which generated dataset file is loaded.
n_weeks = 2
data_original = pd.read_csv("generated_data/data_indep_dep_"+str(n_weeks)+"_weeks.csv")

# Keep only rows with a non-missing "count_month_3" whose "Date - end" is
# before "2022-01-01".
# NOTE(review): the date filter is a string comparison, so it assumes the
# column holds ISO "YYYY-MM-DD" text — confirm.
# .copy() makes the filtered result an independent frame: a later cell adds
# columns to data_original, which on a filtered slice triggers pandas'
# SettingWithCopyWarning.
data_original = data_original[
    data_original["count_month_3"].notna()
    & (data_original["Date - end"] < "2022-01-01")
].copy()

### Sensitivity analysis of the proportion of variants above each count

In [3]:
# For every threshold k in 100, 200, ..., 2000: flag the rows whose
# "count_month_3" is at least k, keep the flag as a "more_than_<k>" column of
# data_original, and record the fraction of flagged rows.
ratio_list = []
for k in range(100, 2001, 100):
    flag_column = "more_than_" + str(k)
    data_original[flag_column] = data_original["count_month_3"] >= k
    # Summing booleans counts the True rows.
    ratio = sum(data_original[flag_column]) / len(data_original)
    ratio_list.append(ratio)

In [16]:
import plotly.graph_objects as go
import numpy as np

# Threshold values (100, 200, ..., 2000) rendered as strings for the x axis;
# they line up one-to-one with the entries of ratio_list.
dep = list(map(str, range(100, 2001, 100)))

sensitivity_trace = go.Scatter(x=dep, y=ratio_list, mode='lines+markers')
fig = go.Figure(data=sensitivity_trace)

# Horizontally centred title anchored at the top of the figure.
centered_title = {
    'text': "Sensitivity analysis",
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(
    title=centered_title,
    xaxis_title="Threshold number of infections per million in 3 months",
    yaxis_title="Proportion of variants",
    width=650,
    height=550,
)

# Fix the y range so the curve is read against a constant 0-0.4 scale.
fig.update_yaxes(range=[0, 0.4])

fig.show()

### Distribution of the Covid-19 cases per million for variants in every country

In [17]:
import plotly.express as px
import numpy as np

# Re-apply the row filters: non-missing "count_month_3" and "Date - end"
# before "2022-01-01" (string comparison).
has_count = data_original["count_month_3"].notna()
data_original = data_original[has_count]
ends_before_2022 = data_original["Date - end"] < "2022-01-01"
data_original = data_original[ends_before_2022]

# Histogram of "count_month_3" over 100-wide bins with edges 0, 100, ..., 2400;
# np.histogram does not count values outside the outermost edges.
bin_edges = range(0, 2500, 100)
counts, bins = np.histogram(data_original.count_month_3, bins=bin_edges)
# Normalise so the bars sum to 1 over the counted rows.
densities = counts / counts.sum()

# Replace the bin edges by the bin midpoints, used as bar positions.
bins = 0.5 * (bins[:-1] + bins[1:])
axis_labels = {'y': 'Density', 'x': 'Cases per million'}
fig = px.bar(x=bins, y=densities, labels=axis_labels)

fig.update_layout(
    width=850,
    height=650,
    title={
        'text': "Distribution of the Covid-19 cases per million",
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)

fig.show()


### Distribution of cases per million in the 2nd week for variants causing more than 500 cases after 3 months

In [19]:
# Rows whose "count_month_3" exceeds 500.
data_positive = data_original[data_original["count_month_3"]>500]

# Histogram of their "cases_week_2" values over 20-wide bins with edges
# 0, 20, ..., 480; np.histogram does not count values outside the outermost edges.
counts, bins = np.histogram(data_positive.cases_week_2, bins=range(0, 500, 20))
# Normalise so the bars sum to 1 over the counted rows.
densities = counts/sum(counts)

# Replace the bin edges by the bin midpoints, used as bar positions.
bins = 0.5 * (bins[:-1] + bins[1:])
# Label the x axis too (the previous distribution plot does); without it the
# axis is shown with the bare name "x".
fig = px.bar(x=bins, y=densities,
             labels={'y': 'Density', 'x': 'Cases per million in 2nd week'})

fig.update_layout( width=850,
                  height=650,
    title={
        # Wording fixed from "more + 500" to match the `> 500` filter above.
        'text': "Cases per million in 2nd week for variants causing more than 500 cases after 3 months",
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

fig.show()

### Number of cases in the second week of observation when predicting risky variants

In [20]:
import plotly.express as px

# Scatter of "cases_week_2" against "count_month_3" for every row of data_original.
# (The previous `df = px.data.iris()` line loaded an unrelated sample dataset
# that was never used, so it has been dropped.)
fig = px.scatter(data_original, x="cases_week_2", y="count_month_3")

# Single layout call. The original set a first title and then overwrote it in
# a second update_layout; only the title that was actually rendered is kept.
# NOTE(review): the title mentions the "top 12% infectious variants" while the
# scatter uses all rows of data_original — confirm which is intended.
fig.update_layout(
    width=850,
    height=650,
    xaxis_title="Number of cases per Million in the 2nd week of observation",
    yaxis_title="Number of cases per Million in the next 3 months",
    title={
        'text': "Number of cases in the second week of observation for top 12% infectious variants",
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)

fig.show()

In [22]:
# Median and quartiles of "cases_week_2" over all rows of data_original.
# Series.quantile skips missing values just like Series.median, whereas
# np.quantile returns NaN as soon as the column contains one NaN; using the
# pandas methods throughout keeps the three statistics consistent.
cases_week_2 = data_original["cases_week_2"]
med = cases_week_2.median()
q1 = cases_week_2.quantile(0.25)
q3 = cases_week_2.quantile(0.75)
print("Cases after 2 weeks observation, median "+str(med)+" quantile 1 "+str(q1) + " quantile 3 "+str(q3))

Cases after 2 weeks observation, Median 2.5203433266064064 quantile 1 0.5655753368204943 quantile 3 10.637946044410162


In [23]:
# Rows whose "count_month_3" exceeds 1000 (this replaces the earlier
# data_positive, which used a different threshold).
data_positive = data_original[data_original["count_month_3"]>1000]

# Median and quartiles of "cases_week_2" within that subset.
# Series.quantile skips missing values just like Series.median, whereas
# np.quantile returns NaN as soon as the column contains one NaN.
cases_week_2_positive = data_positive["cases_week_2"]
med = cases_week_2_positive.median()
q1 = cases_week_2_positive.quantile(0.25)
q3 = cases_week_2_positive.quantile(0.75)
# NOTE(review): the label says "Cases after 3 months" but the statistics are
# computed on "cases_week_2" for rows with count_month_3 > 1000 — consider rewording.
print("Cases after 3 months, median "+str(med)+" quantile 1 "+str(q1) + " quantile 3 "+str(q3))

Cases after 3 months, median 47.07193814580942 quantile 1 15.859767321785476 quantile 3 163.45439542682928


### Time between the detection of the dominant variant in a wave and its peak

In [24]:
# Load the per-wave peak table and keep the waves whose "end_peak_date" is on
# or before "2022-01-05" (string comparison).
data_peaks_global = pd.read_csv("generated_data/data_peaks_global_dominant_variant.csv")
wave_has_ended = data_peaks_global["end_peak_date"] <= "2022-01-05"
finished_waves = data_peaks_global[wave_has_ended]

In [25]:
# Attach the variant-level rows to each finished wave: a wave matches the rows
# of the same country whose "variant" equals the wave's "1st_dominant_variant".
finished_waves_var_data = finished_waves.merge(
    data_original,
    left_on=["country", "1st_dominant_variant"],
    right_on=["country", "variant"],
)

In [26]:
# Parse both date columns to datetimes so they can be subtracted.
for date_column in ("peak_date", "Date - start"):
    finished_waves_var_data[date_column] = pd.to_datetime(finished_waves_var_data[date_column])

In [27]:
# Whole days elapsed between "Date - start" and "peak_date".
elapsed = finished_waves_var_data["peak_date"] - finished_waves_var_data["Date - start"]
finished_waves_var_data["detection-to-peak"] = elapsed.dt.days

In [28]:
# Median and quartiles of the "detection-to-peak" day counts.
# Series.quantile skips missing values just like Series.median, whereas
# np.quantile returns NaN as soon as the column contains one NaN (which
# `.dt.days` produces for any missing date).
days_to_peak = finished_waves_var_data["detection-to-peak"]
med = days_to_peak.median()
q1 = days_to_peak.quantile(0.25)
q3 = days_to_peak.quantile(0.75)
print(" Time detection to peak, med: "+str(med)+" q1:"+str(q1)+" q3: "+str(q3))

 Time detection to peak, med: 63.0 q1:28.0 q3: 112.0


### Identification of the top 5 riskiest and top 5 least risky variants per country

In [39]:
# Load the test-set tables for the 2-week and 1-week settings.
test_df_2 = pd.read_csv("generated_data/test_df_2_weeks.csv")
test_df_1 = pd.read_csv("generated_data/test_df_1_weeks.csv")

# Columns identifying one row in both tables; also used as the join key.
merge_keys = ["country", "variant", "Date - start"]

results_2_weeks = test_df_2[
    merge_keys
    + ["Date - end", "cases_week_1", "cases_week_2", "count_month_3", "prediction_proba_2"]
]
results_1_week = test_df_1[merge_keys + ["prediction_proba_1"]]

# One row per key present in both tables, carrying both prediction columns.
results_test = results_1_week.merge(results_2_weeks, on=merge_keys)

In [40]:
# Country whose test-set rows are inspected.
selected_country = "United Kingdom"

# Rows of that country with a strictly positive "cases_week_1".
country_prediction = results_test[(results_test["country"] == selected_country) & (results_test["cases_week_1"] >0) ]

# The five rows with the largest "count_month_3". The original kept 10 rows,
# contradicting both the variable name and the companion top_5_less_risky
# selection, which keeps 5.
top_5_riskiest = country_prediction.sort_values(by = "count_month_3", ascending =False).head(5)

In [42]:
# The five rows of country_prediction with the smallest "count_month_3".
sorted_by_count = country_prediction.sort_values(by="count_month_3", ascending=True)
top_5_less_risky = sorted_by_count.head(5)

### Data profile per country

In [29]:
# Reload the generated dataset from disk; no row filter is applied here, and
# columns added to data_original by earlier cells are discarded.
n_weeks = 2
dataset_path = "generated_data/data_indep_dep_" + str(n_weeks) + "_weeks.csv"
data_original = pd.read_csv(dataset_path)

In [30]:
# Number of non-missing "variant" entries per country, largest first.
n_var_per_country = (
    data_original
    .groupby("country")["variant"]
    .count()
    .reset_index()
    .sort_values("variant", ascending=False)
)

In [None]:
# Location of the GISAID metadata file (read as tab-separated text).
# It must be filled in before running: the original line had nothing on the
# right-hand side of the assignment, which is a SyntaxError.
path_to_gisaid_metadata = None  # ADD PATH TO GISAID METADATA
if path_to_gisaid_metadata is None:
    raise ValueError(
        "Set path_to_gisaid_metadata to the GISAID metadata file before running this cell."
    )
variants = pd.read_csv(path_to_gisaid_metadata, sep='\t')

In [38]:
# Count the non-missing "Accession ID" entries per country and keep the 50
# countries with the highest count.
# NOTE(review): the name says "top_30" but 50 rows are kept — confirm which is intended.
top_30_variants = (
    variants
    .groupby("country")["Accession ID"]
    .count()
    .reset_index()
    .sort_values("Accession ID", ascending=False)
    .head(50)
)

In [32]:
# Boolean flags marking the rows whose "count_month_3" reaches 500 and 1000.
for risk_threshold in (500, 1000):
    data_original["more_than_" + str(risk_threshold)] = data_original["count_month_3"] >= risk_threshold

In [33]:
# Per country, the number of rows flagged by each threshold (summing booleans
# counts the True values).
risk_flag_columns = ["more_than_500", "more_than_1000"]
n_risky_per_country = data_original.groupby("country")[risk_flag_columns].sum().reset_index()

In [34]:
# Combine the per-country variant counts with the per-country flag counts;
# the right join keeps every country present in n_risky_per_country.
var_per_countr = n_var_per_country.merge(
    n_risky_per_country,
    on="country",
    how="right",
)

In [37]:
# Keep the countries present in both tables and order them by their
# "Accession ID" count, largest first.
seq_var_per_countr = (
    var_per_countr
    .merge(top_30_variants, on="country")
    .sort_values(by="Accession ID", ascending=False)
)

In [None]:
# Median and quartiles of the per-country "variant" counts.
variant_counts = seq_var_per_countr["variant"]
q1, q3 = np.quantile(variant_counts, [0.25, 0.75])
med = variant_counts.median()
print("There is a median of "+ str(med)+" variant per country, 25% quantile: "+ str(q1)+" 75% quantile:"+str(q3))