# Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae

from datetime import timedelta

# Data Loading

In [2]:
dataset = pd.read_csv('covid.csv')
dataset.head()

Unnamed: 0,harian/key_as_string,harian/key,harian/doc_count,harian/jumlah_meninggal/value,harian/jumlah_sembuh/value,harian/jumlah_positif/value,harian/jumlah_dirawat/value,harian/jumlah_positif_kum/value,harian/jumlah_sembuh_kum/value,harian/jumlah_meninggal_kum/value,harian/jumlah_dirawat_kum/value
0,2020-03-02T00:00:00.000Z,1583107200000,1,0,0,2,2,2,0,0,2
1,2020-03-03T00:00:00.000Z,1583193600000,1,0,0,0,0,2,0,0,2
2,2020-03-04T00:00:00.000Z,1583280000000,1,0,0,0,0,2,0,0,2
3,2020-03-05T00:00:00.000Z,1583366400000,1,0,0,0,0,2,0,0,2
4,2020-03-06T00:00:00.000Z,1583452800000,1,0,0,2,2,4,0,0,4


Rename the columns for easier time processing the data

In [3]:
columns_name = {
    'harian/key_as_string': 'date',
    'harian/jumlah_meninggal/value': 'deaths',
    'harian/jumlah_sembuh/value': 'recovered',
    'harian/jumlah_positif/value': 'positive',
    'harian/jumlah_dirawat/value': 'hospitalized',
    'harian/jumlah_positif_kum/value': 'positive_cum',
    'harian/jumlah_sembuh_kum/value': 'recovered_cum',
    'harian/jumlah_meninggal_kum/value': 'deaths_cum',
    'harian/jumlah_dirawat_kum/value': 'hospitalized_cum',
}

data = dataset.rename(columns=columns_name).drop(columns=['harian/key', 'harian/doc_count'])

data['date'] = pd.to_datetime(data['date'])
data['date'] = data['date'].dt.strftime('%Y-%m-%d')

#create month
data['day'] = pd.DatetimeIndex(data['date']).day
data['month'] = pd.DatetimeIndex(data['date']).month
data['year'] = pd.DatetimeIndex(data['date']).year

data.head()

Unnamed: 0,date,deaths,recovered,positive,hospitalized,positive_cum,recovered_cum,deaths_cum,hospitalized_cum,day,month,year
0,2020-03-02,0,0,2,2,2,0,0,2,2,3,2020
1,2020-03-03,0,0,0,0,2,0,0,2,3,3,2020
2,2020-03-04,0,0,0,0,2,0,0,2,4,3,2020
3,2020-03-05,0,0,0,0,2,0,0,2,5,3,2020
4,2020-03-06,0,0,2,2,4,0,0,4,6,3,2020


Check for total number of rows and columns and missing values

In [4]:
print(f'The dataset has {data.shape[0]} rows and {data.shape[1]} columns')
print(f'The dataset has {data.isnull().sum().sum()} null values')

The dataset has 1000 rows and 12 columns
The dataset has 0 null values


Check for the data types of the columns

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              1000 non-null   object
 1   deaths            1000 non-null   int64 
 2   recovered         1000 non-null   int64 
 3   positive          1000 non-null   int64 
 4   hospitalized      1000 non-null   int64 
 5   positive_cum      1000 non-null   int64 
 6   recovered_cum     1000 non-null   int64 
 7   deaths_cum        1000 non-null   int64 
 8   hospitalized_cum  1000 non-null   int64 
 9   day               1000 non-null   int64 
 10  month             1000 non-null   int64 
 11  year              1000 non-null   int64 
dtypes: int64(11), object(1)
memory usage: 93.9+ KB


In [6]:
data.describe()

Unnamed: 0,deaths,recovered,positive,hospitalized,positive_cum,recovered_cum,deaths_cum,hospitalized_cum,day,month,year
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,159.641,6424.332,6646.093,62.12,3000363.0,2824214.0,83792.88,92355.812,15.701,6.64,2021.025
std,304.088464,9834.764572,10670.505976,6153.400169,2454453.0,2386757.0,65553.527747,125435.065064,8.780244,3.236182,0.796875
min,0.0,0.0,0.0,-29938.0,2.0,0.0,0.0,2.0,1.0,1.0,2020.0
25%,16.0,788.0,900.75,-681.75,432770.5,363489.0,14515.5,16388.0,8.0,4.0,2020.0
50%,64.5,3494.5,3367.0,79.5,2698424.0,2166888.0,69701.0,48433.0,16.0,7.0,2021.0
75%,164.0,6351.75,6278.25,702.25,5969048.0,5617216.0,153934.5,109546.5,23.0,9.0,2022.0
max,2069.0,61361.0,64718.0,39165.0,6646093.0,6424332.0,159641.0,586113.0,31.0,12.0,2022.0


# Data preprocessing

In [7]:
data['hospitalized'] = data['hospitalized'].apply(lambda x: 0 if x < 0 else x)

# Exploratory Data Analysis

## Positive Cases

In [8]:
monthly = data.groupby('month').agg(np.mean).reset_index()
yearly = data.groupby('year').sum().reset_index()

fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='positive').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='positive').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='positive').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Positive Cases")

fig.show()

The third wave of COVID-19 positive cases was caused by the Omicron variant which is more infectious than the previous variants.

## Recovered Cases

In [9]:
fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='recovered').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='recovered').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='recovered').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Recovered Cases")

fig.show()

The third wave of COVID-19 recovered cases was higher because the Omicron variant was less lethal than the Delta variant ( the second wave ).

## Death Cases

In [10]:
fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='deaths').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='deaths').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='deaths').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Deaths Cases")

fig.show()

The third wave of COVID-19 death cases was lower because the Omicron variant was less lethal than the Delta variant ( the second wave ).

## Hospitalized Cases

In [11]:
fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='hospitalized').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='hospitalized').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='hospitalized').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Hospitalized Cases")

fig.show()

# Data Preparation

In [12]:
df = data.copy()
df = df[['date', 'positive']]
df.head()

Unnamed: 0,date,positive
0,2020-03-02,2
1,2020-03-03,0
2,2020-03-04,0
3,2020-03-05,0
4,2020-03-06,2


In [13]:
df['y'] = df['positive'].shift(-1)
df

Unnamed: 0,date,positive,y
0,2020-03-02,2,0.0
1,2020-03-03,0,0.0
2,2020-03-04,0,0.0
3,2020-03-05,0,2.0
4,2020-03-06,2,0.0
...,...,...,...
995,2022-11-22,7644,7221.0
996,2022-11-23,7221,7110.0
997,2022-11-24,7110,5976.0
998,2022-11-25,5976,5469.0


In [14]:
train, test = train_test_split(df, test_size=0.3, shuffle=False)

In [15]:
test = test.drop(test.tail(1).index)

In [16]:
test['baseline_pred'] = test['positive']
test

Unnamed: 0,date,positive,y,baseline_pred
700,2022-01-31,10185,16021.0,10185
701,2022-02-01,16021,17895.0,16021
702,2022-02-02,17895,27197.0,17895
703,2022-02-03,27197,32211.0,27197
704,2022-02-04,32211,33729.0,32211
...,...,...,...,...
994,2022-11-21,4306,7644.0,4306
995,2022-11-22,7644,7221.0,7644
996,2022-11-23,7221,7110.0,7221
997,2022-11-24,7110,5976.0,7110


# Fitting the models: DecisionTree, RandomForest, GradientBoost

## Helper methods

In [17]:
def evaluate(models, X_test, y_test):
    for model in models:
        y_pred = model.predict(X_test)
        print(f'{model.__class__.__name__} MAPE: {mape(y_test, y_pred) * 100}')
        print(f'{model.__class__.__name__} R2: {r2_score(y_test, y_pred)}')
        print(f'{model.__class__.__name__} MAE: {mae(y_test, y_pred)}')
        print()

models = ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'Baseline']

# Positive cases

## Predict the next time step using previous time step

In [18]:
from sklearn.tree import DecisionTreeRegressor
X_train = train['positive'].values.reshape(-1,1)
y_train = train['y'].values.reshape(-1,1)
X_test = test['positive'].values.reshape(-1,1)

dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X=X_train, y=y_train)

dt_pred = dt_reg.predict(X_test)
test['dt_pred'] = dt_pred

In [19]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X=X_train, y=y_train.ravel())
rf_pred = rf_reg.predict(X_test)
test['rf_pred'] = rf_pred

In [20]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=42, n_estimators=400)
gbr.fit(X_train, y=y_train.ravel())
gbr_pred = gbr.predict(X_test)
test['gbr_pred'] = gbr_pred

### Plot the predicted values

In [21]:
fig = make_subplots(rows=2, cols=3, subplot_titles=models)


fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=1)
fig.append_trace(px.line(test, x='date', y='dt_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=1)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=2)
fig.append_trace(px.line(test, x='date', y='rf_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=2)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=3)
fig.append_trace(px.line(test, x='date', y='gbr_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=3)

fig.append_trace(px.line(test, x='date', y='y', color_discrete_sequence=['#000000']).data[0], row=2, col=1)
fig.append_trace(px.line(test, x='date', y='baseline_pred', color_discrete_sequence=['#FF0000']).data[0], row=2, col=1)


fig.update_layout(height=600, width=1200, title_text="Prediction")
fig.show()

In [22]:
evaluate([dt_reg, rf_reg, gbr], X_test, test['y'])

DecisionTreeRegressor MAPE: 25.98863379685502
DecisionTreeRegressor R2: 0.9310649761731304
DecisionTreeRegressor MAE: 1637.7569676700111

RandomForestRegressor MAPE: 21.87570623282158
RandomForestRegressor R2: 0.9388463221377001
RandomForestRegressor MAE: 1518.1547754684927

GradientBoostingRegressor MAPE: 19.89411732163082
GradientBoostingRegressor R2: 0.9337136830963036
GradientBoostingRegressor MAE: 1512.7110150457265



In [23]:
fig = go.Figure(data=[
    go.Bar(name='Baseline', x=['MAPE'], y=[mape(test['y'], test['baseline_pred']) * 100]),
    go.Bar(name='Decision Tree', x=['MAPE'], y=[mape(test['y'], test['dt_pred']) * 100]),
    go.Bar(name='Random Forest', x=['MAPE'], y=[mape(test['y'], test['rf_pred']) * 100]),
    go.Bar(name='Gradient Boosting', x=['MAPE'], y=[mape(test['y'], test['gbr_pred']) * 100])
])

fig.update_layout(barmode='group', title_text='MAPE')
fig.show()

# Forecast

## Forecast based on previous time step

### Get last date of the dataset

In [24]:
last_data = df[-1:].drop(['y'], axis=1)
last_data['date'] = pd.to_datetime(last_data['date'])
last_data = last_data.reset_index(drop=True)
last_data

Unnamed: 0,date,positive
0,2022-11-26,5469


### Forecast the next 7 days

In [25]:
forecast_week_df = pd.DataFrame()

for i in range(7):
    pred = gbr.predict(last_data['positive'].values.reshape(-1, 1))
    date = last_data['date'] + timedelta(days=1)
    last_data['date'] = date
    last_data['positive'] = pred
    forecast_week_df = pd.concat([forecast_week_df, last_data], axis=0)

forecast_week_df

Unnamed: 0,date,positive
0,2022-11-27,5764.278237
0,2022-11-28,5833.875203
0,2022-11-29,5932.561822
0,2022-11-30,5674.544445
0,2022-12-01,5480.662258
0,2022-12-02,5764.278237
0,2022-12-03,5833.875203


### Plot the forecast

In [26]:
fig = px.line(df, x='date', y='positive', color_discrete_sequence=['#000000'])

fig.add_trace(px.line(forecast_week_df, x='date', y='positive', color_discrete_sequence=['#FF0000']).data[0])

fig.update_layout(height=400, width=800, title_text="Forecast")

fig.show()

In [27]:
last_data = df[-1:].drop(['y'], axis=1)
last_data['date'] = pd.to_datetime(last_data['date'])
last_data = last_data.reset_index(drop=True)
last_data

Unnamed: 0,date,positive
0,2022-11-26,5469


### Forecast the next 30 days

In [28]:
forecast_month_df = pd.DataFrame()

for i in range(30):
    pred = gbr.predict(last_data['positive'].values.reshape(-1, 1))
    date = last_data['date'] + timedelta(days=1)
    last_data['date'] = date
    last_data['positive'] = pred
    forecast_month_df = pd.concat([forecast_month_df, last_data], axis=0)

forecast_month_df

Unnamed: 0,date,positive
0,2022-11-27,5764.278237
0,2022-11-28,5833.875203
0,2022-11-29,5932.561822
0,2022-11-30,5674.544445
0,2022-12-01,5480.662258
0,2022-12-02,5764.278237
0,2022-12-03,5833.875203
0,2022-12-04,5932.561822
0,2022-12-05,5674.544445
0,2022-12-06,5480.662258


### Plot the forecast

In [29]:
fig = px.line(df, x='date', y='positive', color_discrete_sequence=['#000000'])

fig.add_trace(px.line(forecast_month_df, x='date', y='positive', color_discrete_sequence=['#FF0000']).data[0])

fig.update_layout(height=400, width=800, title_text="Forecast")

fig.show()

# Play around

In [77]:
data_vac = pd.read_csv('vaksinasi.csv').drop([0])
data_vac

Unnamed: 0,key_as_string,key,doc_count,jumlah_vaksinasi_2/value,jumlah_vaksinasi_1/value,jumlah_jumlah_vaksinasi_1_kum/value,jumlah_jumlah_vaksinasi_2_kum/value
1,2021-02-02,1612224000000,0,0,0,0,0
2,2021-02-03,1612310400000,0,0,0,0,0
3,2021-02-04,1612396800000,0,0,0,0,0
4,2021-02-05,1612483200000,0,0,0,0,0
5,2021-02-06,1612569600000,0,0,0,0,0
...,...,...,...,...,...,...,...
671,2022-12-04,1670112000000,1,32364,22532,203730045,174292461
672,2022-12-05,1670198400000,1,11137,5564,203735609,174303598
673,2022-12-06,1670284800000,1,19545,9595,203745204,174323143
674,2022-12-07,1670371200000,1,22743,14334,203759538,174345886


In [80]:
data_vac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 675 entries, 1 to 675
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         675 non-null    datetime64[ns]
 1   vaksinasi-1  675 non-null    int64         
 2   vaksinasi-2  675 non-null    int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 15.9 KB


In [81]:
data_vac.describe()

Unnamed: 0,vaksinasi-1,vaksinasi-2
count,675.0,675.0
mean,301921.9,258360.8
std,424872.2,312746.0
min,-1693353.0,-272304.0
25%,17791.5,23021.0
50%,123282.0,119410.0
75%,481385.0,447945.5
max,4166862.0,1735099.0


In [78]:
columns_name = {
    'key_as_string': 'date',
    'jumlah_vaksinasi_1/value': 'vaksinasi-1',
    'jumlah_vaksinasi_2/value': 'vaksinasi-2',
}

columns_to_drop = ['key', 'doc_count', 'jumlah_jumlah_vaksinasi_1_kum/value', 'jumlah_jumlah_vaksinasi_2_kum/value' ]

data_vac = data_vac.rename(columns=columns_name).drop(columns_to_drop, axis=1)

data_vac['date'] = pd.to_datetime(data_vac['date'])

# move vaksinasi-2 column to the right

cols = list(data_vac.columns.values)
cols.pop(cols.index('vaksinasi-2'))
data_vac = data_vac[cols+['vaksinasi-2']]

data_vac.tail()

Unnamed: 0,date,vaksinasi-1,vaksinasi-2
671,2022-12-04,22532,32364
672,2022-12-05,5564,11137
673,2022-12-06,9595,19545
674,2022-12-07,14334,22743
675,2022-12-08,37772,47652


In [85]:
fig = make_subplots(rows=1, cols=1, subplot_titles=('Vaccination'))

fig.append_trace(px.line(data_vac, x='date', y='vaksinasi-1', color_discrete_sequence=['#000000']).data[0], row=1, col=1)
fig.append_trace(px.line(data_vac, x='date', y='vaksinasi-2', color_discrete_sequence=['#FF0000']).data[0], row=1, col=1)

fig.update_layout(height=800, width=800, title_text="Vaksinasi")

fig.show()
