# Forecasting

In [57]:
from feature_generation import *

import pandas as pd

In [58]:
df = pd.read_csv("DATA_sample_atm.csv")
targets = ['CashIn', 'CashOut']
means = df.groupby("HistoryDate").mean()[targets]
means = means[:-135]
means = clean_data(means)

In [59]:
feature_set = get_feature_sets(means, targets)

## Calculating Error

Common way of calculating error in timeseries forecasting is [MAPE](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error)

In [60]:
def mape_error(y_actual, y_pred):
    return 100 * ((y_actual - y_pred).abs() / y_actual).mean()

## Base Error

To evaluate our models and see if they are useful, we need a base error to compare them with. One way of doing this is to use shifted data as predictions.

In [61]:
base_errors = pd.DataFrame(columns=['%s_Error' % target for target in targets])
for target in targets:
    windows = get_windows(means[target], 21, drop_t=False)
    for column in windows.columns[1:]:
        base_errors.loc[column, target + '_Error'] = mape_error(windows['t'], windows[column])
    base_errors.loc['t-(7,14)', target + '_Error'] = mape_error(windows['t'], windows[['t-7', 't-14']].mean(axis=1))
    base_errors.loc['t-(7,14,21)', target + '_Error'] = mape_error(windows['t'], windows[['t-7', 't-14', 't-21']].mean(axis=1))

In [62]:
base_errors.sort_values(by='CashIn_Error')[:3]

Unnamed: 0,CashIn_Error,CashOut_Error
"t-(7,14,21)",12.471977,27.691305
"t-(7,14)",13.39542,29.81759
t-14,14.631056,35.001829


In [63]:
base_errors.sort_values(by='CashOut_Error')[:3]

Unnamed: 0,CashIn_Error,CashOut_Error
"t-(7,14,21)",12.471977,27.691305
t-1,30.247342,29.27443
"t-(7,14)",13.39542,29.81759


### Base errors:

Averages of t-7, t-14 and t-21 performed best for both features we want to predict.

* CashIn: 12.47
* CashOut: 27.69

## Forecasting

### Models to try:
* Random forest
* Lightgbm
* Catboost

### Random Forest

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [84]:
PREDICT_FEATURE_INDEX = 1
target = feature_set.columns[PREDICT_FEATURE_INDEX]

X = feature_set[feature_set.columns[2:]]
y = feature_set[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

forest = RandomForestRegressor(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)

print("Predicting: %s" % target)

Predicting: CashOut


In [85]:
mape_error(forest.predict(X_train), y_train)

5.915253094687992

In [86]:
mape_error(forest.predict(X_test), y_test)

12.50058401814717

In [88]:
import plotly.graph_objects as go

fig = go.Figure()
# Create and style traces
fig.add_trace(go.Scatter(x=means.index, y=means[target], name='%s Actual'%target, line=dict(color='rgba(255,0,0,0.6)')))
fig.add_trace(go.Scatter(x=X.index, y=forest.predict(X), name = '%s Predicted'%target, line=dict(color='rgba(30,30,200,0.5)')))
                         # Edit the layout
fig.update_layout(title='%s Prediction and Actual Comparison'%target, xaxis_title='Date', yaxis_title=target)

fig.show()