In [111]:
import pandas as pd
import numpy as np
import os
import sklearn
#import swifter

In [112]:
#assign always returns a copy of the data, leaving the original DataFrame untouched.
#df.columns = [x.lower() for x in df.columns]

In [113]:
# Load the raw records; each row is one (country, year_week, source, variant) entry.
df = pd.read_csv(filepath_or_buffer="COVID19WW.csv")

# Treat NaN values

In [114]:
# Count the missing values in every column.
df.isna().sum()

country                         0
country_code                    0
year_week                       0
source                          0
new_cases                      23
number_sequenced                0
percent_cases_sequenced        23
valid_denominator               0
variant                         0
number_detections_variant       0
percent_variant              4602
dtype: int64

### Fill the 'new_cases' with the mean of the previous and following week

In [115]:
# We do not fill the missing 'new_cases' with 0 or with the mean of the whole
# column: the column mixes every country and every week. Instead, each missing
# week gets the mean of the closest known weeks before and after it *within the
# same country*.

# Rows whose weekly case count is missing (kept for inspection).
df_null = df[df['new_cases'].isna()]

# Sort so that forward/backward filling really walks through consecutive weeks.
# Assumes 'year_week' is a zero-padded 'YYYY-WW' string, so that lexical order
# is chronological order -- TODO confirm for the full dataset.
weekly_cases = df[['country', 'year_week', 'new_cases']].sort_values(['country', 'year_week'])

# Fill per country: a plain frame-wide ffill()/bfill() would leak the last value
# of one country into a gap at the start of the next one (and vice versa).
cases_by_country = weekly_cases.groupby('country')['new_cases']
filled_from_previous = weekly_cases.assign(new_cases=cases_by_country.ffill())
filled_from_next = weekly_cases.assign(new_cases=cases_by_country.bfill())

# Averaging the two filled copies per (country, week) leaves known values
# unchanged and gives the previous/next mean for the gaps. mean() skips NaN, so
# a gap at the edge of a country's series falls back to its single known neighbour.
df2 = (
    pd.concat([filled_from_previous, filled_from_next])
    .groupby(['country', 'year_week'], as_index=False)['new_cases']
    .mean()
)

In [116]:
# Show the rows where 'percent_cases_sequenced' is missing.
df.loc[df['percent_cases_sequenced'].isna()]

Unnamed: 0,country,country_code,year_week,source,new_cases,number_sequenced,percent_cases_sequenced,valid_denominator,variant,number_detections_variant,percent_variant
24202,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.1.7,703,58.8
24203,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.1.7+E484K,0,0.0
24204,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.351,10,0.8
24205,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.427/B.1.429,1,0.1
24206,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.525,2,0.2
24207,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.526,0,0.0
24208,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.616,0,0.0
24209,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.617,0,0.0
24210,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.617.1,0,0.0
24211,Spain,ES,2021-08,GISAID,,1195,,Yes,B.1.617.2,0,0.0


#### Additional statistics using the NaN-treated data

##### Now that the NaN values are filled, we compute the cumulative number of cases for each country, week by week

In [142]:
# Running total of weekly cases within each country, in the current row order of df2.
# GroupBy.cumsum() keeps the original row index, so the result aligns with df2.
# The former groupby(...).apply(lambda x: x.cumsum()) returns a
# (country, index) MultiIndex on pandas >= 2.0, which cannot be assigned back
# as a column.
df2["sum_newcases"] = df2.groupby('country')['new_cases'].cumsum()
df2[df2['country'] == "Spain"]

Unnamed: 0,country,year_week,new_cases,sum_newcases
1176,Spain,2020-40,65146.0,65146.0
1177,Spain,2020-41,75556.0,140702.0
1178,Spain,2020-42,85481.0,226183.0
1179,Spain,2020-43,123871.0,350054.0
1180,Spain,2020-44,142377.0,492431.0
1181,Spain,2020-45,140521.0,632952.0
1182,Spain,2020-46,115646.0,748598.0
1183,Spain,2020-47,85752.0,834350.0
1184,Spain,2020-48,65571.0,899921.0
1185,Spain,2020-49,54141.0,954062.0


In [118]:
#[(df.country=="Spain") & (df.year_week == "2021-08")]

##### Create a temporary DataFrame to merge with df2. We also compute the cumulative number_sequenced for each country, week by week.

In [139]:
# Keep one row per (country, year_week) and only the week-level columns.
# NOTE(review): keep='first' retains whichever row comes first for that week; if
# several sources report the same week with different 'number_sequenced', only
# the first one is used -- confirm this is intended.
df_tmp = (
    df.drop_duplicates(subset=['country', 'year_week'], keep='first')
    .sort_index()
    .drop(columns=['valid_denominator', 'variant', 'source', 'number_detections_variant',
                   'percent_variant', 'percent_cases_sequenced'])
)

# Running total of sequenced samples within each country, in row order.
# GroupBy.cumsum() keeps the original index; the former
# groupby(...).apply(lambda x: x.cumsum()) yields a MultiIndex on pandas >= 2.0
# and the column assignment then fails.
df_tmp['sum_number_sequenced'] = df_tmp.groupby('country')['number_sequenced'].cumsum()

# Missing weekly case counts are set to 0 in this helper frame.
df_tmp['new_cases'] = df_tmp['new_cases'].fillna(value=0)
df_tmp[df_tmp['country'] == "Spain"]

Unnamed: 0,country,country_code,year_week,new_cases,number_sequenced,sum_number_sequenced
23845,Spain,ES,2020-40,65146.0,259,259
23862,Spain,ES,2020-41,75556.0,242,501
23879,Spain,ES,2020-42,85481.0,378,879
23896,Spain,ES,2020-43,123871.0,217,1096
23913,Spain,ES,2020-44,142377.0,153,1249
23930,Spain,ES,2020-45,140521.0,194,1443
23947,Spain,ES,2020-46,115646.0,196,1639
23964,Spain,ES,2020-47,85752.0,198,1837
23981,Spain,ES,2020-48,65571.0,141,1978
23998,Spain,ES,2020-49,54141.0,313,2291


###### Now we merge with df2 (we need sum_newcases to compute sum_pct)

In [147]:
# Attach the sequencing columns of df_tmp to the per-week case counts of df2.
# df_tmp's own 'new_cases' is dropped before merging, so only the df2 column
# of that name is kept.
df_sum = (
    df2.merge(df_tmp.drop(columns='new_cases'), on=['country', 'year_week'], how='left')
    .assign(
        # Cumulative share of cases sequenced so far, in percent.
        sum_pct=lambda frame: frame['sum_number_sequenced'].mul(100).div(frame['sum_newcases']),
        # Share of that week's cases that were sequenced, in percent.
        pct_pw=lambda frame: frame['number_sequenced'].mul(100).div(frame['new_cases']),
    )
)
df_sum.loc[df_sum['country'] == "Spain"]

Unnamed: 0,country,year_week,new_cases,sum_newcases,country_code,number_sequenced,sum_number_sequenced,sum_pct,pct_pw
1176,Spain,2020-40,65146.0,65146.0,ES,259,259,0.397569,0.397569
1177,Spain,2020-41,75556.0,140702.0,ES,242,501,0.356072,0.320292
1178,Spain,2020-42,85481.0,226183.0,ES,378,879,0.388623,0.442204
1179,Spain,2020-43,123871.0,350054.0,ES,217,1096,0.313095,0.175182
1180,Spain,2020-44,142377.0,492431.0,ES,153,1249,0.25364,0.107461
1181,Spain,2020-45,140521.0,632952.0,ES,194,1443,0.227979,0.138058
1182,Spain,2020-46,115646.0,748598.0,ES,196,1639,0.218943,0.169483
1183,Spain,2020-47,85752.0,834350.0,ES,198,1837,0.220171,0.230898
1184,Spain,2020-48,65571.0,899921.0,ES,141,1978,0.219797,0.215034
1185,Spain,2020-49,54141.0,954062.0,ES,313,2291,0.240131,0.57812


In [None]:
#df.fillna(df.mean().astype(int))
#df.new_cases.mean().astype(int)