In [1]:
# Utility program to check the train daily data files for major errors.
# The Johns Hopkins data we used in the first week had daily reliability issues, mainly bad country reporting,
# so it's worth checking the consistency of the data before passing it to the modelling sections.
# These are just some simple checks that flag suspicious entries; the thresholds can be modified as needed.
# The checks need to be followed by investigating the suspect dates and by specific cleaning.

import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/covid19-global-forecasting-week-2/train.csv
/kaggle/input/covid19-global-forecasting-week-2/submission.csv
/kaggle/input/covid19-global-forecasting-week-2/test.csv


In [2]:
# Load the week-2 training file into a DataFrame (default integer index, no date parsing).
TRAIN_CSV = '/kaggle/input/covid19-global-forecasting-week-2/train.csv'
df_train = pd.read_csv(TRAIN_CSV)

In [3]:
# Prevent rows with a missing Province_State from disappearing in groupby operations
# (groupby drops NaN keys by default) by giving them a blank placeholder.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# df_train['Province_State']: the in-place form mutates an intermediate object, which
# raises a FutureWarning on pandas 2.x and leaves df_train unchanged under Copy-on-Write.
df_train['Province_State'] = df_train['Province_State'].fillna(' ')

In [4]:
# Daily additions are differences between consecutive rows of the cumulative counts, so they
# are only meaningful when every Country_Region/Province_State series is in chronological
# order. Group a date-sorted frame instead of relying on the row order of the CSV.
# Date values such as '2020-02-13' sort chronologically as text; mergesort keeps the sort stable.
by_ctry_prov = (
    df_train.sort_values('Date', kind='mergesort')
    .groupby(['Country_Region','Province_State'])[['ConfirmedCases','Fatalities']]
)
#calculate daily additions to confirmed cases and fatalities in absolute values and percentages
# The first day of each series has no predecessor, so its change is reported as 0.
daily_abs = by_ctry_prov.transform(lambda x: x.diff().fillna(0))
# NOTE: pct_change yields inf when a count rises from 0, and NaN (filled with 0 here) for 0 -> 0.
daily_pct = by_ctry_prov.transform(lambda x: x.pct_change().fillna(0))
# reindex puts the results back into df_train's own row order before the columns are written.
df_train[['NewCases','NewFatalities']] = daily_abs.reindex(df_train.index)
df_train[['NewCasesPct','NewFatalitiesPct']] = daily_pct.reindex(df_train.index)

In [5]:
# Show the five rows with the largest single-day rise in confirmed cases.
df_train.sort_values(by='NewCases', ascending=False).head(5)

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct
4052,6223,Hubei,China,2020-02-13,48206.0,1310.0,14840.0,242.0,0.444764,0.226592
13648,20964,,Spain,2020-03-25,49515.0,3647.0,9630.0,839.0,0.241444,0.298789
13649,20965,,Spain,2020-03-26,57786.0,4365.0,8271.0,718.0,0.16704,0.196874
16769,25765,New York,US,2020-03-26,37877.0,385.0,7036.0,100.0,0.228138,0.350877
7929,12165,,Germany,2020-03-26,43938.0,267.0,6615.0,61.0,0.177237,0.296117


In [6]:
# A cumulative confirmed-case count should never decrease, so any negative
# daily change flags an inconsistent report.
df_train.loc[df_train['NewCases'] < 0]

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct
694,1045,Northern Territory,Australia,2020-03-06,0.0,0.0,-1.0,0.0,-1.0,0.0
724,1110,Queensland,Australia,2020-01-31,2.0,0.0,-1.0,0.0,-0.333333,0.0
726,1112,Queensland,Australia,2020-02-02,2.0,0.0,-1.0,0.0,-0.333333,0.0
2403,3664,Alberta,Canada,2020-03-25,358.0,2.0,-1.0,1.0,-0.002786,1.0
3696,5657,Guizhou,China,2020-03-18,146.0,2.0,-1.0,0.0,-0.006803,0.0
7522,11548,Saint Barthelemy,France,2020-03-09,1.0,0.0,-2.0,0.0,-0.666667,0.0
8382,12863,,Guyana,2020-03-24,5.0,1.0,-15.0,0.0,-0.75,0.0
14677,22553,Alaska,US,2020-03-14,0.0,0.0,-1.0,0.0,-1.0,0.0
16501,25357,Nevada,US,2020-03-18,55.0,1.0,-1.0,0.0,-0.017857,0.0
17608,27059,Utah,US,2020-03-20,78.0,0.0,-2.0,0.0,-0.025,0.0


In [7]:
# A cumulative fatality count should never decrease, so any negative
# daily change flags an inconsistent report.
df_train.loc[df_train['NewFatalities'] < 0]

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct
2920,4461,Quebec,Canada,2020-03-22,219.0,4.0,38.0,-1.0,0.209945,-0.2
8699,13355,,Iceland,2020-03-16,180.0,0.0,9.0,-5.0,0.052632,-1.0
8703,13359,,Iceland,2020-03-20,409.0,0.0,79.0,-1.0,0.239394,-1.0
8769,13460,,India,2020-03-21,330.0,4.0,86.0,-1.0,0.352459,-0.2
9419,14460,,Kazakhstan,2020-03-21,53.0,0.0,4.0,-3.0,0.081633,-1.0
12342,18958,,Philippines,2020-03-19,217.0,17.0,15.0,-2.0,0.074257,-0.105263
13194,20265,,Serbia,2020-03-26,384.0,1.0,0.0,-3.0,0.0,-0.75
13385,20561,,Slovakia,2020-03-22,185.0,0.0,7.0,-1.0,0.039326,-1.0
15403,23664,Hawaii,US,2020-03-25,91.0,0.0,1.0,-1.0,0.011111,-1.0


In [8]:
# Rows reporting more deaths than confirmed cases.
df_train.loc[df_train['ConfirmedCases'] < df_train['Fatalities']]

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct


In [9]:
# Days where ConfirmedCases grew by more than 40% AND by more than 1000 new cases
# (the Hubei 13 Feb jump is an example).
large_pct_rise = df_train['NewCasesPct'] > 0.4
large_abs_rise = df_train['NewCases'] > 1000
df_train.loc[large_pct_rise & large_abs_rise]

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct
4036,6207,Hubei,China,2020-01-28,3554.0,125.0,2131.0,49.0,1.49754,0.644737
4041,6212,Hubei,China,2020-02-02,11177.0,350.0,4024.0,101.0,0.562561,0.405622
4052,6223,Hubei,China,2020-02-13,48206.0,1310.0,14840.0,242.0,0.444764,0.226592
7656,11752,,France,2020-03-13,3661.0,79.0,1380.0,31.0,0.604998,0.645833
7659,11755,,France,2020-03-16,6633.0,148.0,2134.0,57.0,0.474328,0.626374
7916,12152,,Germany,2020-03-13,3675.0,7.0,1597.0,4.0,0.768527,1.333333
9098,13964,,Israel,2020-03-25,2369.0,5.0,1131.0,2.0,0.91357,0.666667
9151,14052,,Italy,2020-03-13,17660.0,1266.0,5198.0,439.0,0.417108,0.530834
13636,20952,,Spain,2020-03-13,5232.0,133.0,2955.0,78.0,1.29776,1.418182
14559,22365,,Turkey,2020-03-26,3629.0,75.0,1196.0,16.0,0.491574,0.271186


In [10]:
#more than 80% increase in Fatalities with more than 50 new fatalities
# (this check uses the fatality columns, not the confirmed-case columns)
df_train[(df_train.NewFatalitiesPct > 0.8) & (df_train.NewFatalities > 50)]

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct
7663,11759,,France,2020-03-20,12612.0,450.0,1741.0,207.0,0.160151,0.851852
13636,20952,,Spain,2020-03-13,5232.0,133.0,2955.0,78.0,1.29776,1.418182
16765,25761,New York,US,2020-03-22,15800.0,117.0,4090.0,57.0,0.349274,0.95
18712,28758,,United Kingdom,2020-03-19,2689.0,137.0,63.0,66.0,0.023991,0.929577


In [11]:
#example data cleaning for Hubei/China 13 Feb reporting
#replace the day with 14K new cases, caused by a measurement change in China, with the average of the adjacent dates
# The spike is located from the cumulative ConfirmedCases column rather than from NewCases:
# the smoothing step overwrites NewCases for that day, so taking idxmax of NewCases would
# select a different row if this cell were re-run after the fix.
hubei_rows = (df_train.Country_Region=='China') & (df_train.Province_State=='Hubei')
hubei_daily_rise = df_train.loc[hubei_rows].sort_values('Date', kind='mergesort').ConfirmedCases.diff()
maxindx = hubei_daily_rise.idxmax()
df_train.loc[maxindx-2:maxindx+2,:] #before fix of NewCases value for this day


Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct
4050,6221,Hubei,China,2020-02-11,33366.0,1068.0,1638.0,94.0,0.051626,0.096509
4051,6222,Hubei,China,2020-02-12,33366.0,1068.0,0.0,0.0,0.0,0.0
4052,6223,Hubei,China,2020-02-13,48206.0,1310.0,14840.0,242.0,0.444764,0.226592
4053,6224,Hubei,China,2020-02-14,54406.0,1457.0,6200.0,147.0,0.128615,0.112214
4054,6225,Hubei,China,2020-02-15,56249.0,1596.0,1843.0,139.0,0.033875,0.095402


In [12]:
# Smooth the spike by replacing it with the mean of the NewCases values on the rows
# labelled maxindx-1 and maxindx+1.
# Assumes the default integer index with rows in date order, so those labels are the
# previous and next day of the same series.
prev_day_cases = df_train.at[maxindx-1, 'NewCases']
next_day_cases = df_train.at[maxindx+1, 'NewCases']
avg_smooth = (prev_day_cases + next_day_cases) / 2
# Only NewCases is overwritten; NewCasesPct and ConfirmedCases keep their original values.
df_train.loc[maxindx, 'NewCases'] = avg_smooth
df_train.loc[maxindx-2:maxindx+2, :] #after fix of NewCases value for this day

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,NewCases,NewFatalities,NewCasesPct,NewFatalitiesPct
4050,6221,Hubei,China,2020-02-11,33366.0,1068.0,1638.0,94.0,0.051626,0.096509
4051,6222,Hubei,China,2020-02-12,33366.0,1068.0,0.0,0.0,0.0,0.0
4052,6223,Hubei,China,2020-02-13,48206.0,1310.0,3100.0,242.0,0.444764,0.226592
4053,6224,Hubei,China,2020-02-14,54406.0,1457.0,6200.0,147.0,0.128615,0.112214
4054,6225,Hubei,China,2020-02-15,56249.0,1596.0,1843.0,139.0,0.033875,0.095402
