# Forecasting

In [57]:
from feature_generation import *

import pandas as pd

In [58]:
df = pd.read_csv("DATA_sample_atm.csv")
targets = ['CashIn', 'CashOut']
means = df.groupby("HistoryDate").mean()[targets]
means = means[:-135]
means = clean_data(means)

In [59]:
feature_set = get_feature_sets(means, targets)

## Calculating Error

Common way of calculating error in timeseries forecasting is [MAPE](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error)

In [108]:
def mape_error(y_actual, y_pred, mean=True):
    result = 100 * ((y_actual - y_pred).abs() / y_actual)
    if mean:
        return result.mean()
    else:
        return result

## Base Error

To evaluate our models and see if they are useful, we need a base error to compare them with. One way of doing this is to use shifted data as predictions.

In [109]:
base_errors = pd.DataFrame(columns=['%s_Error' % target for target in targets])
for target in targets:
    windows = get_windows(means[target], 21, drop_t=False)
    for column in windows.columns[1:]:
        base_errors.loc[column, target + '_Error'] = mape_error(windows['t'], windows[column])
    base_errors.loc['t-(7,14)', target + '_Error'] = mape_error(windows['t'], windows[['t-7', 't-14']].mean(axis=1))
    base_errors.loc['t-(7,14,21)', target + '_Error'] = mape_error(windows['t'], windows[['t-7', 't-14', 't-21']].mean(axis=1))

In [110]:
base_errors.sort_values(by='CashIn_Error')[:3]

Unnamed: 0,CashIn_Error,CashOut_Error
"t-(7,14,21)",12.471977,27.691305
"t-(7,14)",13.39542,29.81759
t-14,14.631056,35.001829


In [111]:
base_errors.sort_values(by='CashOut_Error')[:3]

Unnamed: 0,CashIn_Error,CashOut_Error
"t-(7,14,21)",12.471977,27.691305
t-1,30.247342,29.27443
"t-(7,14)",13.39542,29.81759


### Base errors:

Averages of t-7, t-14 and t-21 performed best for both features we want to predict.

* CashIn: 12.47
* CashOut: 27.69

## Forecasting

### Models to try:
* Random forest
* Lightgbm
* Catboost

### Random Forest

In [112]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [113]:
PREDICT_FEATURE_INDEX = 1
target = feature_set.columns[PREDICT_FEATURE_INDEX]

X = feature_set[feature_set.columns[2:]]
y = feature_set[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

forest = RandomForestRegressor(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)

print("Predicting: %s" % target)

Predicting: CashOut


In [114]:
mape_error(forest.predict(X_train), y_train)

5.209325080575227

In [115]:
mape_error(forest.predict(X_test), y_test)

19.798079481045793

Calculating average error by week to visualize with Plotly

In [130]:
predictions = pd.Series(forest.predict(X), index=X.index)
weekly_errors = mape_error(means[target], predictions, mean=False)
weekly_errors.dropna(inplace=True)
weekly_errors = weekly_errors.resample('w').mean()
weekly_errors

HistoryDate
2016-01-31     0.000000
2016-02-07     4.510660
2016-02-14     7.976832
2016-02-21     2.420828
2016-02-28     3.995240
                ...    
2019-12-29    17.097291
2020-01-05    16.528168
2020-01-12    11.103999
2020-01-19    13.787883
2020-01-26    17.230630
Freq: W-SUN, Length: 209, dtype: float64

Line graph with Plotly to visualize error over time, predictions and actual values.

In [137]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=means.index, y=means[target], name='%s Actual'%target, line=dict(color='rgba(255,0,0,0.6)')), secondary_y=False)
fig.add_trace(go.Scatter(x=X.index, y=forest.predict(X), name = '%s Predicted'%target, line=dict(color='rgba(30,30,200,0.5)')), secondary_y=False)

fig.add_trace(go.Scatter(x=weekly_errors.index, y=weekly_errors, name="Error", line=dict(color='rgba(34, 155, 0, 0.4)', width=4)), secondary_y=True)
# set layout title
fig.update_layout(title='%s Prediction and Actual Comparison'%target)
# set x axis titles
fig.update_xaxes(title_text="Date")
# set y axis titles
fig.update_yaxes(title_text="<b>primary</b> yaxis title", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> yaxis title", secondary_y=True)

fig.show()