In [1]:
  #importing libraries
from sklearn.datasets import load_boston
import pandas as pd
import io
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

  import pandas.util.testing as tm


In [0]:
# Load the competition CSVs chosen through Colab's upload dialog
from google.colab import files

uploaded = files.upload()


def _uploaded_csv(name):
  """Decode the uploaded bytes stored under `name` as UTF-8 and parse them as CSV."""
  return pd.read_csv(io.StringIO(uploaded[name].decode('utf-8')))


data = _uploaded_csv('train.csv')
median_age = _uploaded_csv('median_age.csv')
population_density = _uploaded_csv('population_density.csv')

Saving median_age.csv to median_age (1).csv
Saving population_density.csv to population_density (1).csv
Saving submission.csv to submission (1).csv
Saving test.csv to test (1).csv
Saving train.csv to train (1).csv


In [0]:
# Give the training frame short column names; later cells refer to 'Date',
# 'ConfirmedCases' and 'Fatalities' by these labels.
# NOTE(review): assumes train.csv has exactly these eight columns in this order — confirm.
data.columns = ['Id', 'State', 'Country', 'Lat', 'Long', 'Date', 'ConfirmedCases', 'Fatalities']
# NOTE(review): the statements below sit inside a string literal, so they do not
# execute; median_age and population_density keep the headers they were loaded with.
"""median_age.columns = ['Country', 'Median_Age']
population_density.columns = ['Country', 'PopDensity']
print(data.columns)
print(median_age.columns)
print(population_density.columns)"""

Index(['Id', 'State', 'Country', 'Lat', 'Long', 'Date', 'ConfirmedCases',
       'Fatalities'],
      dtype='object')
Index(['Country', 'Median_Age'], dtype='object')
Index(['Country', 'PopDensity'], dtype='object')


In [0]:
#Merging the dataframes
# NOTE(review): the whole merge is wrapped in a string literal, so nothing in this
# cell runs: country_data, _temp and country_data_median_density are not created here.
"""pd.set_option('display.max_rows', 1000000)
country_data = data.groupby(['Country', 'Date'], as_index=False).agg({'ConfirmedCases':'sum', 'Fatalities':'sum'})
_temp = pd.merge(left=country_data, right=median_age, on='Country')
country_data_median_density = pd.merge(left=_temp, right=population_density, on='Country')
country_data_median_density.head()"""

In [0]:
# Autocorrelation of the confirmed-case totals summed per date
from pandas.plotting import autocorrelation_plot

pd.set_option('display.max_rows', 1000000)

# One row per date: confirmed cases summed over every row reported for that date,
# with the date strings parsed into real timestamps.
date_data_confirmedcases = (
    data
    .groupby(['Date'], as_index=False)
    .agg({'ConfirmedCases': 'sum'})
    .assign(Date=lambda totals: pd.to_datetime(totals['Date'], format='%Y-%m-%d'))
)

autocorrelation_plot(date_data_confirmedcases.set_index('Date'))
plt.show()

In [0]:
# Fit an ARIMA(7,1,0) model to the per-date confirmed-case totals and inspect its residuals
from statsmodels.tsa.arima_model import ARIMA

model = ARIMA(date_data_confirmedcases.set_index('Date'), order=(7, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# Residual diagnostics: first the default line plot, then a kernel density estimate.
residuals = pd.DataFrame(model_fit.resid)
for plot_kwargs in ({}, {'kind': 'kde'}):
  residuals.plot(**plot_kwargs)
  plt.show()
print(residuals.describe())

In [0]:
# Walk-forward ARIMA forecast of the confirmed-case totals
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

# (n_dates, 1) array: the single ConfirmedCases column, in the frame's row order.
X = date_data_confirmedcases.set_index('Date').values
# The first 83% of the rows seed the model; the rest are forecast one step at a time.
size = int(len(X) * 0.83)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
predications = list()
for t in range(len(test)):
  # Refit on everything observed so far, then forecast only the next step.
  model = ARIMA(history, order=(10,1,0))
  model_fit = model.fit(disp=0)
  output = model_fit.forecast()
  # output[0] is array-valued; take the single point forecast as a plain float so
  # the '%f' formatting below does not rely on implicit array-to-scalar conversion
  # (deprecated by NumPy for arrays with ndim > 0).
  yhat = float(np.ravel(output[0])[0])
  predications.append(yhat)
  # The true value joins the history only after it has been predicted.
  obs = test[t]
  history.append(obs)
  print('predicted=%f, observation=%f' % (yhat, float(np.ravel(obs)[0])))
error = mean_squared_error(test, predications)
print('Test MSE: %.3f' % error)
# plot: held-out observations (default colour) against the forecasts (red)
plt.plot(test)
plt.plot(predications, color='red')
plt.show()



In [0]:
# Forecasting confirmed cases and fatalities jointly with a vector autoregression (VAR)
from statsmodels.tsa.vector_ar.var_model import VAR

# One row per date, indexed by Date, with both series summed over all rows for that date.
df = data.groupby(['Date'], as_index=True).agg({'ConfirmedCases':'sum', 'Fatalities':'sum'})

# The first 83% of the dates are used for fitting; the remainder is held out.
size = int(len(df) * 0.83)
train, test = df[0:size], df[size:len(df)]

model = VAR(endog=train)
model_fit = model.fit()

# Seed the forecast with the last k_ar training observations (k_ar is the fitted
# lag order) instead of reading the results object's `y` attribute, which
# statsmodels has deprecated in favour of `endog`.
lag_order = model_fit.k_ar
prediction = model_fit.forecast(train.values[-lag_order:], steps=len(test))
print(prediction)
print(test)


In [0]:
# Per-country daily totals, then a bar chart of confirmed cases for the US
import plotly.express as px

# The frame's columns were renamed earlier in the notebook, so the country column is
# 'Country'; grouping on the CSV's original 'Country/Region' label raises KeyError.
data_country = data.groupby(['Country', 'Date'], as_index=False).agg({'ConfirmedCases':'sum', 'Fatalities':'sum'})
fig = px.bar(data_country[data_country['Country'] == 'US'], x='Date', y='ConfirmedCases')
fig.show()

In [0]:
import plotly.graph_objects as go
import pandas as pd

# `data` is in long format (one row per region and date), so it has no 'date',
# 'China' or 'Italy' columns. Build each country's daily confirmed-case total here.
country_daily = data.groupby(['Country', 'Date'], as_index=False).agg({'ConfirmedCases': 'sum'})

fig = go.Figure()
for country, colour in (('China', 'deepskyblue'), ('Italy', 'cornflowerblue')):
  country_rows = country_daily[country_daily['Country'] == country]
  # go.Scatter with mode='lines' replaces the deprecated go.Line helper.
  fig.add_trace(go.Scatter(x=country_rows['Date'], y=country_rows['ConfirmedCases'],
                           mode='lines', name=country, line_color=colour))

# The title names both plotted countries (it previously mentioned only China).
fig.update_layout(title_text='Coronavirus cases in China and Italy', xaxis_rangeslider_visible=True)

fig.show()

In [0]:
# Day-over-day change in confirmed cases, one column per country.
# `data` is in long format and contains string columns (State, Country, Date), so
# differencing its columns directly fails. Reshape to a date x country table first.
# NOTE(review): the 'date' + per-country layout is inferred from how the following
# plotting cell reads data_diff (data_diff.date, data_diff['China']) — confirm.
cases_by_country = data.pivot_table(index='Date', columns='Country',
                                    values='ConfirmedCases', aggfunc='sum')

# diff() leaves NaN in the first row (and wherever a value is missing); treat those as 0.
daily_change = cases_by_country.diff().fillna(value=0)
daily_change.columns.name = None
daily_change.index.name = 'date'
data_diff = daily_change.reset_index()
data_diff.head()

In [0]:
import plotly.graph_objects as go
import pandas as pd

# Plot the differenced series held in data_diff for the two countries of interest.
fig = go.Figure()
for country, colour in (('China', 'deepskyblue'), ('Italy', 'cornflowerblue')):
  # go.Scatter with mode='lines' replaces the deprecated go.Line helper.
  fig.add_trace(go.Scatter(x=data_diff.date, y=data_diff[country],
                           mode='lines', name=country, line_color=colour))

# The title describes what is drawn: differenced values for both countries
# (it previously read as cumulative cases for China only).
fig.update_layout(title_text='Daily change in coronavirus cases: China and Italy', xaxis_rangeslider_visible=True)

fig.show()

In [5]:
#SIR model to predict the infected and the recovered over a time
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import matplotlib.pylab as plt
import plotly.graph_objects as go
import numpy as np
import inflect
# inflect engine; sir_interactive uses p.number_to_words to print the population size in words
p = inflect.engine()

def sir_model(N, I0, R0, S0, beta, gamma, days):
  """Simulate a discrete-time SIR model with one update per day.

  Each step moves ``beta * S * I / N`` people from susceptible to infected and
  ``gamma * I`` people from infected to recovered. Both flows are capped at the
  size of the compartment they leave, so S and I cannot go negative when
  ``beta * I / N`` or ``gamma`` exceeds 1 (the unit-step update would otherwise
  overshoot).

  Parameters:
    N: total population; must be non-zero when ``days > 0`` because it divides
      the infection term.
    I0: initial number of infected.
    R0: initial number of recovered.
    S0: accepted for interface compatibility only; the initial susceptible count
      is always recomputed as ``N - I0 - R0``.
    beta: infection rate per step.
    gamma: recovery rate per step.
    days: number of steps to simulate.

  Returns:
    A tuple ``(S, I, R)`` of lists, each of length ``days + 1``; index 0 holds
    the initial values.
  """
  susceptible = N - I0 - R0
  infected = I0
  recovered = R0
  S = [susceptible]
  I = [infected]
  R = [recovered]

  for _ in range(days):
    # Cap each flow at the compartment it drains so no count drops below zero.
    new_infections = min(beta * ((susceptible * infected) / N), susceptible)
    new_recoveries = min(gamma * infected, infected)
    susceptible = susceptible - new_infections
    infected = (infected + new_infections) - new_recoveries
    recovered = recovered + new_recoveries
    S.append(susceptible)
    I.append(infected)
    R.append(recovered)
  return (S, I, R)




def sir_interactive(beta, period, N, I0, R0):
  """Run sir_model for `period` steps and plot the three compartments.

  Prints the final susceptible / infected / recovered counts (truncated to int)
  and the population size spelled out in words, then draws one line per
  compartment against the step index.

  Parameters:
    beta: infection rate passed through to sir_model.
    period: number of steps to simulate; the x axis runs from 0 to `period`.
    N: total population.
    I0: initial number of infected.
    R0: initial number of recovered.
  """
  S0 = N - I0 - R0
  # Recovery rate is fixed here rather than exposed as an argument
  # (presumably a 14-day infectious period — confirm).
  gamma = 1/14
  S, I, R = sir_model(N, I0, R0, S0, beta, gamma, period)
  print("Susceptible = ", int(S[-1]), "Infected = ", int(I[-1]), "Recovered = ", int(R[-1]))
  print(p.number_to_words(N))
  fig = plt.figure(figsize=(12,5), facecolor='w')
  ax = fig.add_subplot(111, axisbelow=True)
  steps = range(period + 1)
  for series, label in ((R, 'Recovered'), (S, 'Susceptible'), (I, 'Infected')):
    ax.plot(steps, series, alpha=0.9, lw=2, label=label)
  ax.set_xlabel('Time /days')
  # The series are plotted as raw counts, so the axis must not claim thousands.
  ax.set_ylabel('Number of people')
  ax.yaxis.set_tick_params(length=0)
  ax.xaxis.set_tick_params(length=0)
  ax.legend()
  plt.grid()
  plt.show()

# (min, max, step) slider ranges, one entry per sir_interactive argument
slider_ranges = {
  'beta': (0.1, 1.5, 0.01),
  'period': (1, 360, 1),
  'N': (10000000, 1000000000, 10000000),
  'I0': (1, 100000, 10),
  'R0': (0, 100000, 10),
}
w = interactive(sir_interactive, **slider_ranges)
w

interactive(children=(FloatSlider(value=0.8, description='beta', max=1.5, min=0.1, step=0.01), IntSlider(value…