In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
dataset = pd.read_csv('covid.csv')
dataset.head()

Unnamed: 0,harian/key_as_string,harian/key,harian/doc_count,harian/jumlah_meninggal/value,harian/jumlah_sembuh/value,harian/jumlah_positif/value,harian/jumlah_dirawat/value,harian/jumlah_positif_kum/value,harian/jumlah_sembuh_kum/value,harian/jumlah_meninggal_kum/value,harian/jumlah_dirawat_kum/value
0,2020-03-02T00:00:00.000Z,1583107200000,1,0,0,2,2,2,0,0,2
1,2020-03-03T00:00:00.000Z,1583193600000,1,0,0,0,0,2,0,0,2
2,2020-03-04T00:00:00.000Z,1583280000000,1,0,0,0,0,2,0,0,2
3,2020-03-05T00:00:00.000Z,1583366400000,1,0,0,0,0,2,0,0,2
4,2020-03-06T00:00:00.000Z,1583452800000,1,0,0,2,2,4,0,0,4


In [3]:
columns_name = {
    'harian/key_as_string': 'date',
    'harian/jumlah_meninggal/value': 'deaths',
    'harian/jumlah_sembuh/value': 'recovered',
    'harian/jumlah_positif/value': 'positive',
    'harian/jumlah_dirawat/value': 'hospitalized',
    'harian/jumlah_positif_kum/value': 'positive_cum',
    'harian/jumlah_sembuh_kum/value': 'recovered_cum',
    'harian/jumlah_meninggal_kum/value': 'deaths_cum',
    'harian/jumlah_dirawat_kum/value': 'hospitalized_cum',
}

data = dataset.rename(columns=columns_name).drop(columns=['harian/key', 'harian/doc_count'])

data['date'] = pd.to_datetime(data['date'])
data['date'] = data['date'].dt.strftime('%Y-%m-%d')

#create month
data['day'] = pd.DatetimeIndex(data['date']).day
data['month'] = pd.DatetimeIndex(data['date']).month
data['year'] = pd.DatetimeIndex(data['date']).year

data.head()

Unnamed: 0,date,deaths,recovered,positive,hospitalized,positive_cum,recovered_cum,deaths_cum,hospitalized_cum,day,month,year
0,2020-03-02,0,0,2,2,2,0,0,2,2,3,2020
1,2020-03-03,0,0,0,0,2,0,0,2,3,3,2020
2,2020-03-04,0,0,0,0,2,0,0,2,4,3,2020
3,2020-03-05,0,0,0,0,2,0,0,2,5,3,2020
4,2020-03-06,0,0,2,2,4,0,0,4,6,3,2020


In [4]:
print(f'The dataset has {data.shape[0]} rows and {data.shape[1]} columns')
print(f'The dataset has {data.isnull().sum().sum()} null values')

The dataset has 1000 rows and 12 columns
The dataset has 0 null values


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date              1000 non-null   object
 1   deaths            1000 non-null   int64 
 2   recovered         1000 non-null   int64 
 3   positive          1000 non-null   int64 
 4   hospitalized      1000 non-null   int64 
 5   positive_cum      1000 non-null   int64 
 6   recovered_cum     1000 non-null   int64 
 7   deaths_cum        1000 non-null   int64 
 8   hospitalized_cum  1000 non-null   int64 
 9   day               1000 non-null   int64 
 10  month             1000 non-null   int64 
 11  year              1000 non-null   int64 
dtypes: int64(11), object(1)
memory usage: 93.9+ KB


In [6]:
data.describe()

Unnamed: 0,deaths,recovered,positive,hospitalized,positive_cum,recovered_cum,deaths_cum,hospitalized_cum,day,month,year
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,159.641,6424.332,6646.093,62.12,3000363.0,2824214.0,83792.88,92355.812,15.701,6.64,2021.025
std,304.088464,9834.764572,10670.505976,6153.400169,2454453.0,2386757.0,65553.527747,125435.065064,8.780244,3.236182,0.796875
min,0.0,0.0,0.0,-29938.0,2.0,0.0,0.0,2.0,1.0,1.0,2020.0
25%,16.0,788.0,900.75,-681.75,432770.5,363489.0,14515.5,16388.0,8.0,4.0,2020.0
50%,64.5,3494.5,3367.0,79.5,2698424.0,2166888.0,69701.0,48433.0,16.0,7.0,2021.0
75%,164.0,6351.75,6278.25,702.25,5969048.0,5617216.0,153934.5,109546.5,23.0,9.0,2022.0
max,2069.0,61361.0,64718.0,39165.0,6646093.0,6424332.0,159641.0,586113.0,31.0,12.0,2022.0


In [7]:
# turn all into positive hospitalization
data['hospitalized'] = data['hospitalized'].abs()

# Positive Cases

In [8]:
monthly = data.groupby('month').agg(np.mean).reset_index()
yearly = data.groupby('year').sum().reset_index()

fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='positive').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='positive').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='positive').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Positive Cases")

fig.show()

# Recovered Cases

In [9]:
fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='recovered').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='recovered').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='recovered').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Recovered Cases")

fig.show()

# Death Cases

In [10]:
fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='deaths').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='deaths').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='deaths').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Deaths Cases")

fig.show()

# Hospitalized Cases

In [11]:
fig = make_subplots(rows=3, cols=1, subplot_titles=('Daily', 'Average Monthly', 'Total Yearly'))

fig.append_trace(px.line(data, x='date', y='hospitalized').data[0], row=1, col=1)
fig.append_trace(px.line(monthly, x='month', y='hospitalized').data[0], row=2, col=1)
fig.append_trace(px.line(yearly, x='year', y='hospitalized').data[0], row=3, col=1)

fig.update_layout(height=800, width=800, title_text="Hospitalized Cases")

fig.show()

In [12]:
df = data.copy()
df = df[['date', 'positive']]
df.head()

Unnamed: 0,date,positive
0,2020-03-02,2
1,2020-03-03,0
2,2020-03-04,0
3,2020-03-05,0
4,2020-03-06,2


Predict next time step using previous time step

In [13]:
df['y'] = df['positive'].shift(-1)
df

Unnamed: 0,date,positive,y
0,2020-03-02,2,0.0
1,2020-03-03,0,0.0
2,2020-03-04,0,0.0
3,2020-03-05,0,2.0
4,2020-03-06,2,0.0
...,...,...,...
995,2022-11-22,7644,7221.0
996,2022-11-23,7221,7110.0
997,2022-11-24,7110,5976.0
998,2022-11-25,5976,5469.0


In [14]:
train, test = train_test_split(df, test_size=0.2, shuffle=False)

In [15]:
test = test.drop(test.tail(1).index)

In [16]:
test['baseline_pred'] = test['positive']
test

Unnamed: 0,date,positive,y,baseline_pred
800,2022-05-11,400,335.0,400
801,2022-05-12,335,335.0,335
802,2022-05-13,335,308.0,335
803,2022-05-14,308,257.0,308
804,2022-05-15,257,182.0,257
...,...,...,...,...
994,2022-11-21,4306,7644.0,4306
995,2022-11-22,7644,7221.0,7644
996,2022-11-23,7221,7110.0,7221
997,2022-11-24,7110,5976.0,7110


In [17]:
from sklearn.tree import DecisionTreeRegressor
X_train = train['positive'].values.reshape(-1,1)
y_train = train['y'].values.reshape(-1,1)
X_test = test['positive'].values.reshape(-1,1)

dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X=X_train, y=y_train)

dt_pred = dt_reg.predict(X_test)
test['dt_pred'] = dt_pred

In [18]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X=X_train, y=y_train)
rf_pred = rf_reg.predict(X_test)
test['rf_pred'] = rf_pred


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [19]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y=y_train.ravel())
gbr_pred = gbr.predict(X_test)
test['gbr_pred'] = gbr_pred

In [20]:
def mape(y_true, y_pred):
    return round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 2)

In [21]:
fig = make_subplots(rows=1, cols=4, subplot_titles=('Decision Tree', 'Random Forest', 'Gradient Boosting'))

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=1)
fig.append_trace(px.line(test, x='date', y='dt_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=1)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=2)
fig.append_trace(px.line(test, x='date', y='rf_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=2)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=3)
fig.append_trace(px.line(test, x='date', y='gbr_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=3)

fig.update_layout(height=400, width=1200, title_text="Prediction")
fig.show()

In [None]:
def evaluate(models, X_test, y_test):
    for model in models:
        y_pred = model.predict(X_test)
        print(f'{model.__class__.__name__} MAPE: {mape(y_test, y_pred)}')

In [None]:
evaluate([dt_reg, rf_reg, gbr], X_test, test['y'])

In [22]:
fig = go.Figure(data=[
    go.Bar(name='Baseline', x=['Baseline'], y=[mape(test['y'], test['baseline_pred'])]),
    go.Bar(name='Decision Tree', x=['MAPE'], y=[mape(test['y'], test['dt_pred'])]),
    go.Bar(name='Random Forest', x=['MAPE'], y=[mape(test['y'], test['rf_pred'])]),
    go.Bar(name='Gradient Boosting', x=['MAPE'], y=[mape(test['y'], test['gbr_pred'])])
])

fig.update_layout(barmode='group', title_text='MAPE')
fig.show()

DecisionTreeRegressor MAPE: 23.82
RandomForestRegressor MAPE: 19.86
GradientBoostingRegressor MAPE: 17.75


Predict next time step using sequence of previous time steps

In [23]:
def window_input(window_length: int, data: pd.DataFrame) -> pd.DataFrame:
    
    df = data.copy()
    
    i = 1
    while i < window_length:
        df[f'x_{i}'] = df['positive'].shift(-i)
        i = i + 1
        
    if i == window_length:
        df['y'] = df['positive'].shift(-i)
        
    df = df.dropna(axis=0)
        
    return df

In [24]:
new_df = window_input(5, df)

In [25]:
X = new_df[['positive', 'x_1', 'x_2', 'x_3', 'x_4']].values
y = new_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

In [26]:
baseline_pred = []

for row in X_test:
    baseline_pred.append(np.mean(row))

In [27]:
# decision tree with hyperparameter tuning
dt_reg_5 = DecisionTreeRegressor(random_state=42)

dt_reg_5.fit(X_train, y_train)

dt_reg_5_pred = dt_reg_5.predict(X_test)

test['dt_reg_5_pred'] = dt_reg_5_pred

In [28]:
rf_reg_5 = RandomForestRegressor(random_state=42)

rf_reg_5.fit(X_train, y_train)

rf_reg_5_pred = rf_reg_5.predict(X_test)

test['rf_reg_5_pred'] = rf_reg_5_pred

In [29]:
gbr_5 = GradientBoostingRegressor(random_state=42)

gbr_5.fit(X_train, y_train.ravel())

gbr_5_pred = gbr_5.predict(X_test)

test['gbr_5_pred'] = gbr_5_pred

In [30]:
# plot mape 
fig = make_subplots(rows=1, cols=4, subplot_titles=('Decision Tree', 'Random Forest', 'Gradient Boosting'))

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=1)
fig.append_trace(px.line(test, x='date', y='dt_reg_5_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=1)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=2)
fig.append_trace(px.line(test, x='date', y='rf_reg_5_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=2)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=3)
fig.append_trace(px.line(test, x='date', y='gbr_5_pred', color_discrete_sequence=['#FF0000']).data[0], row=1, col=3)

fig.update_layout(height=400, width=1200, title_text="Prediction")
fig.show()

In [31]:
evaluate([dt_reg_5, rf_reg_5, gbr_5], X_test, y_test)

DecisionTreeRegressor MAPE: 24.08
RandomForestRegressor MAPE: 18.13
GradientBoostingRegressor MAPE: 18.92


In [32]:
# barplot mape
fig = go.Figure(data=[
    go.Bar(name='Baseline', x=['Baseline'], y=[mape(y_test, baseline_pred)]),
    go.Bar(name='Decision Tree', x=['MAPE'], y=[mape(y_test, dt_reg_5_pred)]),
    go.Bar(name='Random Forest', x=['MAPE'], y=[mape(y_test, rf_reg_5_pred)]),
    go.Bar(name='Gradient Boosting', x=['MAPE'], y=[mape(y_test, gbr_5_pred)])
])
# Change the bar mode
fig.update_layout(barmode='group', title_text='MAPE')
fig.show()


In [33]:
def window_input_output(input_length: int, output_length: int, data: pd.DataFrame) -> pd.DataFrame:
    
    df = data.copy()
    
    i = 1
    while i < input_length:
        df[f'x_{i}'] = df['positive'].shift(-i)
        i = i + 1
        
    j = 0
    while j < output_length:
        df[f'y_{j}'] = df['positive'].shift(-output_length-j)
        j = j + 1
        
    df = df.dropna(axis=0)
    
    return df

In [34]:
seq_df = window_input_output(198, 198, df)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [35]:
X_cols = [col for col in seq_df.columns if col.startswith('x')]

X_cols.insert(0, 'positive')

y_cols = [col for col in seq_df.columns if col.startswith('y')]
X_train = seq_df[X_cols][:-2].values
y_train = seq_df[y_cols][:-2].values

X_test = seq_df[X_cols][-2:].values
y_test = seq_df[y_cols][-2:].values

In [36]:
dt_seq = DecisionTreeRegressor(random_state=42)

dt_seq.fit(X_train, y_train)

dt_seq_preds = dt_seq.predict(X_test)


In [37]:
rf_seq = RandomForestRegressor(random_state=42)

rf_seq.fit(X_train, y_train)

rf_seq_preds = rf_seq.predict(X_test)

In [38]:
from sklearn.multioutput import RegressorChain

gbr_seq = GradientBoostingRegressor(random_state=42)

chained_gbr = RegressorChain(gbr_seq)

chained_gbr.fit(X_train, y_train)

gbr_seq_preds = chained_gbr.predict(X_test)

KeyboardInterrupt: 

In [None]:
test['dt_seq_preds'] = dt_seq_preds[0]
test['rf_seq_preds'] = rf_seq_preds[0]
test['gbr_seq_preds'] = gbr_seq_preds[0]

In [None]:
# plot mape 

titles = ['Decision Tree']
fig = make_subplots(rows=1, cols=4, subplot_titles=titles)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=1)
fig.append_trace(px.line(test, x='date', y='dt_seq_preds', color_discrete_sequence=['#FF0000']).data[0], row=1, col=1)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=2)
fig.append_trace(px.line(test, x='date', y='rf_seq_preds', color_discrete_sequence=['#FF0000']).data[0], row=1, col=2)

fig.append_trace(px.line(test, x='date', y='positive', color_discrete_sequence=['#000000']).data[0], row=1, col=3)
fig.append_trace(px.line(test, x='date', y='gbr_seq_preds', color_discrete_sequence=['#FF0000']).data[0], row=1, col=3)

fig.update_layout(height=400, width=1200, title_text="Prediction")
fig.show()

In [None]:
evaluate([dt_seq, rf_seq, chained_gbr], X_test, y_test)

In [None]:
fig = go.Figure(data=[
    go.Bar(name='Decision Tree', x=['MAPE'], y=[mape(y_test, dt_seq_preds)]),
    go.Bar(name='Random Forest', x=['MAPE'], y=[mape(y_test, rf_seq_preds)]),
    go.Bar(name='Gradient Boosting', x=['MAPE'], y=[mape(y_test, gbr_seq_preds)])
])

fig.update_layout(barmode='group', title_text='MAPE')
fig.show()
