In [54]:
from datetime import datetime
time_format = "%d%b%Y %H:%M"
datetime.now().strftime(time_format)

'13Dec2020 22:45'

In [55]:
import covsirphy as cs

In [56]:
import math
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
from matplotlib.ticker import ScalarFormatter
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sci

In [57]:
from collections import defaultdict
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import functools
from IPython.display import display, Markdown

In [58]:
import dask.dataframe as dd

pd.plotting.register_matplotlib_converters()

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import sympy as sym

In [60]:
np.random.seed(123)
os.environ["PYTHONHASHSEED"] = "123"
# Matplotlib
plt.style.use("seaborn-ticks")
plt.rcParams["xtick.direction"] = "in"
plt.rcParams["ytick.direction"] = "in"
plt.rcParams["font.size"] = 11.0
plt.rcParams["figure.figsize"] = (9, 6)
# Pandas
pd.set_option("display.max_colwidth", 1000)

# The number of cases
Yes, the main dataset is for the number of COVID-19 cases. This will be retrieved from COVID-19 Data Hub. To retrive this dataset, CovsirPhy uses covid19dh internally as discussed in GitHub issue#87 of COVID-19 Data Hub.

Citation:
Guidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open Source Software 5(51):2376, doi: 10.21105/joss.02376.

In [61]:
# Create instance of covsirphy.DataLoader class
data_loader = cs.DataLoader(directory="kaggle/input")
# Retrieve the dataset of the number of COVID-19 cases
# Kaggle platform: covid19dh.csv will be saved in /output/kaggle/working/input
# Local env: covid19dg.cv will be saved in /input
jhu_data = data_loader.jhu()

Retrieving datasets from COVID-19 Data Hub: https://covid19datahub.io/

Please set verbose=2 to see the detailed citation list.




# Cleaned dataset (of all countries) was saved in jhu_data and we can get the cleaned dataset as a pandas.DataFrame.

In [62]:
jhu_data.cleaned().tail()

Unnamed: 0,Date,Country,Province,Confirmed,Infected,Fatal,Recovered
358099,2020-12-08,Colombia,Vichada,978,43,12,923
358100,2020-12-09,Colombia,Vichada,1047,80,12,955
358101,2020-12-10,Colombia,Vichada,1049,13,12,1024
358102,2020-12-11,Colombia,Vichada,1058,21,12,1025
358103,2020-12-12,Colombia,Vichada,1058,21,12,1025


In [63]:
jhu_data.subset("Japan", province=None).tail()

Unnamed: 0,Date,Confirmed,Infected,Fatal,Recovered
303,2020-12-08,166525,25114,2444,138967
304,2020-12-09,169333,26258,2486,140589
305,2020-12-10,172302,26820,2512,142970
306,2020-12-11,175100,27573,2553,144974
307,2020-12-12,178140,28077,2581,147482


In [64]:
jhu_data.subset("India", province=None).tail()

Unnamed: 0,Date,Confirmed,Infected,Fatal,Recovered
299,2020-12-08,9735941,380401,140807,9214733
300,2020-12-09,9767578,373932,141220,9252426
301,2020-12-10,9796989,365225,141631,9290133
302,2020-12-11,9826950,361252,142073,9323625
303,2020-12-12,9826950,361252,142073,9323625


In [65]:
df = jhu_data.cleaned()
jhu_first_date, jhu_last_date = df["Date"].min(), df["Date"].max()
jhu_elapsed = (jhu_last_date - jhu_first_date).days
print(f"{jhu_elapsed} days have passed from the date of the first record.")

346 days have passed from the date of the first record.


In [66]:
ind_data=jhu_data.subset("India", province=None)

In [67]:
ind_data.head()

Unnamed: 0,Date,Confirmed,Infected,Fatal,Recovered
0,2020-02-13,3,2,0,1
1,2020-02-14,3,2,0,1
2,2020-02-15,3,2,0,1
3,2020-02-16,3,1,0,2
4,2020-02-17,3,1,0,2


In [68]:
ind_data.tail()

Unnamed: 0,Date,Confirmed,Infected,Fatal,Recovered
299,2020-12-08,9735941,380401,140807,9214733
300,2020-12-09,9767578,373932,141220,9252426
301,2020-12-10,9796989,365225,141631,9290133
302,2020-12-11,9826950,361252,142073,9323625
303,2020-12-12,9826950,361252,142073,9323625


# We will replace COVID-19 Data Hub records in India with India-specific dataset because the first one is sometimes inconsitant with India government annouced data.

In [69]:
#Replacing records
jhu_data=ind_data.copy()


# Show the subset for Japan
jhu_data.tail()

Unnamed: 0,Date,Confirmed,Infected,Fatal,Recovered
299,2020-12-08,9735941,380401,140807,9214733
300,2020-12-09,9767578,373932,141220,9252426
301,2020-12-10,9796989,365225,141631,9290133
302,2020-12-11,9826950,361252,142073,9323625
303,2020-12-12,9826950,361252,142073,9323625


Total population
We need population values to calculate the number of "susceptible" cases.

"Susceptible" = "Total population" - "Confirmed" in SIR-F model (explained later!)

In [70]:
# We can use a method of cs.DataLoader()
population_data = data_loader.population()
# Show cleaned dataset
population_data.cleaned().tail()

Unnamed: 0,ISO3,Country,Province,Date,Population
301885,COL,Colombia,Vichada,2020-12-08,107808
301886,COL,Colombia,Vichada,2020-12-09,107808
301887,COL,Colombia,Vichada,2020-12-10,107808
301888,COL,Colombia,Vichada,2020-12-11,107808
301889,COL,Colombia,Vichada,2020-12-12,107808


In [71]:
population_data.cleaned().head(8)

Unnamed: 0,ISO3,Country,Province,Date,Population
0,AFG,Afghanistan,-,2020-01-01,37172386
1,AFG,Afghanistan,-,2020-01-02,37172386
2,AFG,Afghanistan,-,2020-01-03,37172386
3,AFG,Afghanistan,-,2020-01-04,37172386
4,AFG,Afghanistan,-,2020-01-05,37172386
5,AFG,Afghanistan,-,2020-01-06,37172386
6,AFG,Afghanistan,-,2020-01-07,37172386
7,AFG,Afghanistan,-,2020-01-08,37172386


Population pyramid
We will use population pyramid to estimate the number of days go out in average.

In [72]:
pyramid_csv_list = list()
for dirname, _, filenames in os.walk("/kaggle/input/population-pyramid-2019/"):
    for filename in filenames:
        name = os.path.join(dirname, filename)
        df = pd.read_csv(name)
        df["Country"], df["Year"], _ = filename.replace(".", "-").split("-")
        pyramid_csv_list.append(df)
pyramid_raw = pd.concat(pyramid_csv_list, sort=True)
pyramid_raw.head()

ValueError: No objects to concatenate

In [50]:
pyramid_csv_list = list()
for dirname, _, filenames in os.walk("/kaggle/input/population-pyramid-2019/"):
    for filename in filenames:
        name = os.path.join(dirname, filename)
        df = pd.read_csv(name)
        df["Country"], df["Year"], _ = filename.replace(".", "-").split("-")
        pyramid_csv_list.append(df)
pyramid_raw = pyramid_csv_list
print(pyramid_raw)

[]


In [51]:
population_data.tail()

AttributeError: 'PopulationData' object has no attribute 'tail'

In [52]:
population_data.cleaned().tail(8)

Unnamed: 0,ISO3,Country,Province,Date,Population
301882,COL,Colombia,Vichada,2020-12-05,107808
301883,COL,Colombia,Vichada,2020-12-06,107808
301884,COL,Colombia,Vichada,2020-12-07,107808
301885,COL,Colombia,Vichada,2020-12-08,107808
301886,COL,Colombia,Vichada,2020-12-09,107808
301887,COL,Colombia,Vichada,2020-12-10,107808
301888,COL,Colombia,Vichada,2020-12-11,107808
301889,COL,Colombia,Vichada,2020-12-12,107808


In [53]:
df = pyramid_raw.copy()
df["Country"] = df["Country"].replace(
    {
        "United States of America": "US",
        "United Kingdom": "UK",
    }
)
# Global (WORLD)
_male = [
    349432556, 342927576, 331497486, 316642222, 308286775, 306059387, 309236984,
    276447037, 249389688, 241232876, 222609691, 192215395, 157180267, 128939392,
    87185982, 54754941, 33648953, 15756942, 5327866, 1077791, 124144
]
_female = [
    328509234, 321511867, 309769906, 295553758, 289100903, 288632766, 296293748,
    268371754, 244399176, 238133281, 223162982, 195633743, 164961323, 140704320,
    101491347, 69026831, 48281201, 26429329, 11352182, 3055845, 449279
]
_df = pd.DataFrame(
    {
        "Age": df["Age"].unique(),
        "Country": "Global",
        "F": _female,
        "M": _male,
        "Year": 2019
    }
)
df = pd.concat([df, _df], axis=0, ignore_index=True, sort=True)
# Sweden
_male = [
    307116, 304759, 296771, 270840, 291723, 376952, 343311, 315086,
    312017, 336452, 342117, 306949, 279609, 265511, 273061, 195029,
    113166, 61775, 26170, 6768, 415
]
_female = [
    290553, 288817, 280944, 257677, 274760, 361526, 330153, 300752,
    301288, 327453, 331458, 300084, 280009, 272149, 286879, 212480,
    143654, 97633, 52624, 18130, 1771
]
_df = pd.DataFrame(
    {
        "Age": df["Age"].unique(),
        "Country": "Sweden",
        "F": _female,
        "M": _male,
        "Year": 2019
    }
)
df = pd.concat([df, _df], axis=0, ignore_index=True, sort=True)
# Philippines
_male = [
    5534962, 5820604, 5538414, 5383822, 5149849, 4710777, 4061897, 3581091, 3237426,
    2832825, 2482953, 2015857, 1556935, 1082875, 668107, 364200, 199400, 73508,
    17327, 3035, 208
]
_female = [
    5240508, 5541514, 5273495, 5029137, 4896316, 4589506, 3982681,
    3544279, 3191565, 2825286, 2521463, 2112380, 1714689, 1285782,
    895866, 567282, 360751, 155294, 57969, 13376, 1411
]
_df = pd.DataFrame(
    {
        "Age": df["Age"].unique(),
        "Country": "Philippines",
        "F": _female,
        "M": _male,
        "Year": 2019
    }
)
df = pd.concat([df, _df], axis=0, ignore_index=True, sort=True)
# Arrange
df["Population"] = df["F"] + df["M"]
df = df.pivot_table(
    index="Age", columns=["Country"], values="Population", aggfunc="last"
)
df = df.astype(np.int64).reset_index().rename({"Age": "Age_bin"}, axis=1)
series = df["Age_bin"].str.replace("+", "-122")
df[["Age_first", "Age_last"]] = series.str.split("-", expand=True).astype(np.int64)
df = df.drop("Age_bin", axis=1)
series = df["Age_last"]
df = df.apply(lambda x: x[:-2] / (x[-1] - x[-2] + 1), axis=1)
df["Age"] = series
df = pd.merge(df, pd.DataFrame({"Age": np.arange(0, 123, 1)}), on="Age", how="right", sort=True)
df = df.fillna(method="bfill").astype(np.int64)
df = df.set_index("Age")
pyramid_df = df.copy()
pyramid_df.loc[24:32]

TypeError: list indices must be integers or slices, not str