In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from pmdarima import AutoARIMA
#from pmdarima.datasets import load_wineind
import pmdarima as pm
import plotly.express as px 



sns.set()
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
import warnings
warnings.simplefilter('ignore')

In [2]:
## Loading url for collecting data


base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"
confirmed_url = "time_series_covid19_confirmed_global.csv"
dead_url = "time_series_covid19_deaths_global.csv"
recovered_url = "time_series_covid19_recovered_global.csv"



In [3]:
## Creating database for patients

# Creating database for confirmed patients

df_confirmed = pd.read_csv(base_url + confirmed_url)
confirmed_copy = df_confirmed.copy()  # for time series file with provinces
#df_confirmed


# Creating database for dead patients

df_dead = pd.read_csv(base_url + dead_url)
dead_copy = df_dead.copy()  # for time series file with provinces
#df_dead

# Creating database for recovered patients

df_recovered = pd.read_csv(base_url + recovered_url)
recovered_copy = df_recovered.copy()  # for time series file with provinces
#df_recovered


In [4]:
to_concat = df_confirmed[df_confirmed["Province/State"].notna()]["Country/Region"].unique()
#df_confirmed = df_confirmed.drop(["Lat", "Long"], axis=1)


for country in to_concat:
    new_row = df_confirmed[df_confirmed["Country/Region"] == country].sum()
    new_row["Country/Region"] = country
    new_row["Province/State"] = np.NaN
    df_confirmed = df_confirmed.drop(df_confirmed[df_confirmed["Country/Region"] == country].index)
    df_confirmed = df_confirmed.append(new_row, ignore_index=True)

confirmed = df_confirmed[df_confirmed["Province/State"].isna()].drop("Province/State", axis=1)  # take only countries (no territories)
# print(to_concat)


In [5]:
to_concat = df_dead[df_dead["Province/State"].notna()]["Country/Region"].unique()
for country in to_concat:
    new_row = df_dead[df_dead["Country/Region"] == country].sum()
    new_row["Country/Region"] = country
    new_row["Province/State"] = np.NaN
    df_dead = df_dead.drop(df_dead[df_dead["Country/Region"] == country].index)
    df_dead = df_dead.append(new_row, ignore_index=True)

dead = df_dead[df_dead["Province/State"].isna()].drop("Province/State", axis=1)  # take only countries (no territories)
#dead

In [6]:
to_concat = df_recovered[df_recovered["Province/State"].notna()]["Country/Region"].unique()
for country in to_concat:
    # print(df[df['Country/Region']==country])
    new_row = df_recovered[df_recovered["Country/Region"] == country].sum()
    new_row["Country/Region"] = country
    new_row["Province/State"] = np.NaN
    df_recovered = df_recovered.drop(df_recovered[df_recovered["Country/Region"] == country].index)
    df_recovered = df_recovered.append(new_row, ignore_index=True)

recovered = df_recovered[df_recovered["Province/State"].isna()].drop("Province/State", axis=1)  # take only countries (no territories)
# print(to_concat)

In [7]:
confirmed = confirmed.melt(id_vars = ['Country/Region', 'Lat', 'Long'], value_name = 'confirmed' , var_name = 'Date')
confirmed['Date'] = pd.to_datetime(confirmed['Date'], format = '%m/%d/%y')
confirmed.sort_values(by=['Country/Region', 'Date'], inplace = True)

confirmed

Unnamed: 0,Country/Region,Lat,Long,Date,confirmed
0,Afghanistan,33.939110,67.709953,2020-01-22,0
190,Afghanistan,33.939110,67.709953,2020-01-23,0
380,Afghanistan,33.939110,67.709953,2020-01-24,0
570,Afghanistan,33.939110,67.709953,2020-01-25,0
760,Afghanistan,33.939110,67.709953,2020-01-26,0
...,...,...,...,...,...
54712,Zimbabwe,-19.015438,29.154857,2020-11-04,8427
54902,Zimbabwe,-19.015438,29.154857,2020-11-05,8444
55092,Zimbabwe,-19.015438,29.154857,2020-11-06,8471
55282,Zimbabwe,-19.015438,29.154857,2020-11-07,8498


In [8]:
confirmed.reset_index(inplace =True, drop=True)

confirmed

Unnamed: 0,Country/Region,Lat,Long,Date,confirmed
0,Afghanistan,33.939110,67.709953,2020-01-22,0
1,Afghanistan,33.939110,67.709953,2020-01-23,0
2,Afghanistan,33.939110,67.709953,2020-01-24,0
3,Afghanistan,33.939110,67.709953,2020-01-25,0
4,Afghanistan,33.939110,67.709953,2020-01-26,0
...,...,...,...,...,...
55475,Zimbabwe,-19.015438,29.154857,2020-11-04,8427
55476,Zimbabwe,-19.015438,29.154857,2020-11-05,8444
55477,Zimbabwe,-19.015438,29.154857,2020-11-06,8471
55478,Zimbabwe,-19.015438,29.154857,2020-11-07,8498


In [9]:
dead = dead.melt(id_vars = ['Country/Region', 'Lat', 'Long'], value_name = 'Deaths' , var_name = 'Date')
dead['Date'] = pd.to_datetime(dead['Date'], format = '%m/%d/%y')
dead.sort_values(by=['Country/Region', 'Date'], inplace = True)

dead

Unnamed: 0,Country/Region,Lat,Long,Date,Deaths
0,Afghanistan,33.939110,67.709953,2020-01-22,0
190,Afghanistan,33.939110,67.709953,2020-01-23,0
380,Afghanistan,33.939110,67.709953,2020-01-24,0
570,Afghanistan,33.939110,67.709953,2020-01-25,0
760,Afghanistan,33.939110,67.709953,2020-01-26,0
...,...,...,...,...,...
54712,Zimbabwe,-19.015438,29.154857,2020-11-04,248
54902,Zimbabwe,-19.015438,29.154857,2020-11-05,248
55092,Zimbabwe,-19.015438,29.154857,2020-11-06,250
55282,Zimbabwe,-19.015438,29.154857,2020-11-07,251


In [10]:
dead.reset_index(inplace =True, drop=True)

dead

Unnamed: 0,Country/Region,Lat,Long,Date,Deaths
0,Afghanistan,33.939110,67.709953,2020-01-22,0
1,Afghanistan,33.939110,67.709953,2020-01-23,0
2,Afghanistan,33.939110,67.709953,2020-01-24,0
3,Afghanistan,33.939110,67.709953,2020-01-25,0
4,Afghanistan,33.939110,67.709953,2020-01-26,0
...,...,...,...,...,...
55475,Zimbabwe,-19.015438,29.154857,2020-11-04,248
55476,Zimbabwe,-19.015438,29.154857,2020-11-05,248
55477,Zimbabwe,-19.015438,29.154857,2020-11-06,250
55478,Zimbabwe,-19.015438,29.154857,2020-11-07,251


In [11]:
recovered = recovered.melt(id_vars = ['Country/Region', 'Lat', 'Long'], value_name = 'recovered' , var_name = 'Date')
recovered['Date'] = pd.to_datetime(recovered['Date'], format = '%m/%d/%y')
recovered.sort_values(by=['Country/Region', 'Date'], inplace = True)

recovered

Unnamed: 0,Country/Region,Lat,Long,Date,recovered
0,Afghanistan,33.939110,67.709953,2020-01-22,0
190,Afghanistan,33.939110,67.709953,2020-01-23,0
380,Afghanistan,33.939110,67.709953,2020-01-24,0
570,Afghanistan,33.939110,67.709953,2020-01-25,0
760,Afghanistan,33.939110,67.709953,2020-01-26,0
...,...,...,...,...,...
54713,Zimbabwe,-19.015438,29.154857,2020-11-04,7967
54903,Zimbabwe,-19.015438,29.154857,2020-11-05,7975
55093,Zimbabwe,-19.015438,29.154857,2020-11-06,7983
55283,Zimbabwe,-19.015438,29.154857,2020-11-07,7995


In [12]:
recovered.reset_index(inplace =True, drop=True)

recovered

Unnamed: 0,Country/Region,Lat,Long,Date,recovered
0,Afghanistan,33.939110,67.709953,2020-01-22,0
1,Afghanistan,33.939110,67.709953,2020-01-23,0
2,Afghanistan,33.939110,67.709953,2020-01-24,0
3,Afghanistan,33.939110,67.709953,2020-01-25,0
4,Afghanistan,33.939110,67.709953,2020-01-26,0
...,...,...,...,...,...
55475,Zimbabwe,-19.015438,29.154857,2020-11-04,7967
55476,Zimbabwe,-19.015438,29.154857,2020-11-05,7975
55477,Zimbabwe,-19.015438,29.154857,2020-11-06,7983
55478,Zimbabwe,-19.015438,29.154857,2020-11-07,7995


In [13]:
Data_corona = confirmed
Data_corona['recovered']= recovered['recovered']
Data_corona['Deaths'] = dead['Deaths']
Data_corona=Data_corona.rename(columns={"Country/Region": "Country", "Lat":"Latitude", "Long":"Longitude"})

Data_corona

Unnamed: 0,Country,Latitude,Longitude,Date,confirmed,recovered,Deaths
0,Afghanistan,33.939110,67.709953,2020-01-22,0,0,0
1,Afghanistan,33.939110,67.709953,2020-01-23,0,0,0
2,Afghanistan,33.939110,67.709953,2020-01-24,0,0,0
3,Afghanistan,33.939110,67.709953,2020-01-25,0,0,0
4,Afghanistan,33.939110,67.709953,2020-01-26,0,0,0
...,...,...,...,...,...,...,...
55475,Zimbabwe,-19.015438,29.154857,2020-11-04,8427,7967,248
55476,Zimbabwe,-19.015438,29.154857,2020-11-05,8444,7975,248
55477,Zimbabwe,-19.015438,29.154857,2020-11-06,8471,7983,250
55478,Zimbabwe,-19.015438,29.154857,2020-11-07,8498,7995,251


In [14]:
Data_corona['active_cases']= Data_corona['confirmed']-Data_corona['recovered']
#Data_corona['new_deaths']= Data_corona['Deaths']-Data_corona['Deaths'].shift(1)
#Data_corona['new_cases']= Data_corona['confirmed']-Data_corona['confirmed'].shift(1)

Data_corona

Unnamed: 0,Country,Latitude,Longitude,Date,confirmed,recovered,Deaths,active_cases
0,Afghanistan,33.939110,67.709953,2020-01-22,0,0,0,0
1,Afghanistan,33.939110,67.709953,2020-01-23,0,0,0,0
2,Afghanistan,33.939110,67.709953,2020-01-24,0,0,0,0
3,Afghanistan,33.939110,67.709953,2020-01-25,0,0,0,0
4,Afghanistan,33.939110,67.709953,2020-01-26,0,0,0,0
...,...,...,...,...,...,...,...,...
55475,Zimbabwe,-19.015438,29.154857,2020-11-04,8427,7967,248,460
55476,Zimbabwe,-19.015438,29.154857,2020-11-05,8444,7975,248,469
55477,Zimbabwe,-19.015438,29.154857,2020-11-06,8471,7983,250,488
55478,Zimbabwe,-19.015438,29.154857,2020-11-07,8498,7995,251,503


In [15]:
Data_corona.to_csv("data/countries-aggregated.csv", index=False)

## Data for Germany

In [16]:
df_Germany = Data_corona[Data_corona['Country'] == 'Germany']
df_Germany


Unnamed: 0,Country,Latitude,Longitude,Date,confirmed,recovered,Deaths,active_cases
19272,Germany,51.165691,10.451526,2020-01-22,0,0,0,0
19273,Germany,51.165691,10.451526,2020-01-23,0,0,0,0
19274,Germany,51.165691,10.451526,2020-01-24,0,0,0,0
19275,Germany,51.165691,10.451526,2020-01-25,0,0,0,0
19276,Germany,51.165691,10.451526,2020-01-26,0,0,0,0
...,...,...,...,...,...,...,...,...
19559,Germany,51.165691,10.451526,2020-11-04,608611,384012,10949,224599
19560,Germany,51.165691,10.451526,2020-11-05,631172,394616,11110,236556
19561,Germany,51.165691,10.451526,2020-11-06,653992,405809,11240,248183
19562,Germany,51.165691,10.451526,2020-11-07,668114,413484,11306,254630


In [17]:
df_Germany['new_cases']= df_Germany['confirmed']-df_Germany['confirmed'].shift(1)
df_Germany['new_deaths']= df_Germany['Deaths']-df_Germany['Deaths'].shift(1)

df_Germany.dropna()

Unnamed: 0,Country,Latitude,Longitude,Date,confirmed,recovered,Deaths,active_cases,new_cases,new_deaths
19273,Germany,51.165691,10.451526,2020-01-23,0,0,0,0,0.0,0.0
19274,Germany,51.165691,10.451526,2020-01-24,0,0,0,0,0.0,0.0
19275,Germany,51.165691,10.451526,2020-01-25,0,0,0,0,0.0,0.0
19276,Germany,51.165691,10.451526,2020-01-26,0,0,0,0,0.0,0.0
19277,Germany,51.165691,10.451526,2020-01-27,1,0,0,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
19559,Germany,51.165691,10.451526,2020-11-04,608611,384012,10949,224599,31480.0,232.0
19560,Germany,51.165691,10.451526,2020-11-05,631172,394616,11110,236556,22561.0,161.0
19561,Germany,51.165691,10.451526,2020-11-06,653992,405809,11240,248183,22820.0,130.0
19562,Germany,51.165691,10.451526,2020-11-07,668114,413484,11306,254630,14122.0,66.0


In [20]:
df_Germany.to_csv("data/corona_Germany.csv", index=False)