## Imports

In [12]:
import pandas as pd
import plotly.express as px

## Data Preparation
Reading & Cleaning aggregated data of 5 ATMs


In [13]:
aggregated_data = pd.read_csv('/content/aggregated_atm_data.csv')
aggregated_data.head(3)

Unnamed: 0,ATM Name,Transaction Date,No Of Withdrawals,No Of XYZ Card Withdrawals,No Of Other Card Withdrawals,Total amount Withdrawn,Amount withdrawn XYZ Card,Amount withdrawn Other Card,Weekday,Festival Religion,Working Day,Holiday Sequence
0,Big Street ATM,1/1/2011,50,20,30,123800,41700,82100,Saturday,H,H,WHH
1,Mount Road ATM,1/1/2011,253,67,186,767900,270900,497000,Saturday,C,H,WHH
2,Airport ATM,1/1/2011,98,56,42,503400,347700,155700,Saturday,C,H,WHH


In [14]:
aggregated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11589 entries, 0 to 11588
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   ATM Name                      11589 non-null  object
 1   Transaction Date              11589 non-null  object
 2   No Of Withdrawals             11589 non-null  int64 
 3   No Of XYZ Card Withdrawals    11589 non-null  int64 
 4   No Of Other Card Withdrawals  11589 non-null  int64 
 5   Total amount Withdrawn        11589 non-null  int64 
 6   Amount withdrawn XYZ Card     11589 non-null  int64 
 7   Amount withdrawn Other Card   11589 non-null  int64 
 8   Weekday                       11589 non-null  object
 9   Festival Religion             11589 non-null  object
 10  Working Day                   11589 non-null  object
 11  Holiday Sequence              11589 non-null  object
dtypes: int64(6), object(6)
memory usage: 1.1+ MB


#### Convert Transaction Date column to datetime
But before this we need to fix the Transaction Date column, because it contains dates in slash format as well as hyphen format

In [15]:
def dateformat_fixer(date):
    # If its part of the first format, convert it to the second one
    if '/' in date:
        date_parts = date.split('/')
        
        year = date_parts[2]
        month = date_parts[1]
        day = date_parts[0]
        
        if int(month) < 10:
            month = '0' + month
        if int(day) < 10:
            day = '0' + day
        
        return day + '-' + month + '-' + year
    # If its already in second format, keep it as it is
    else:
        return date

aggregated_data['Transaction Date'] = aggregated_data.apply(lambda row: dateformat_fixer(row['Transaction Date']), axis=1)

In [16]:
aggregated_data['Transaction Date'] = pd.to_datetime(aggregated_data['Transaction Date'], format='%d-%m-%Y')
aggregated_data.head(3)

Unnamed: 0,ATM Name,Transaction Date,No Of Withdrawals,No Of XYZ Card Withdrawals,No Of Other Card Withdrawals,Total amount Withdrawn,Amount withdrawn XYZ Card,Amount withdrawn Other Card,Weekday,Festival Religion,Working Day,Holiday Sequence
0,Big Street ATM,2011-01-01,50,20,30,123800,41700,82100,Saturday,H,H,WHH
1,Mount Road ATM,2011-01-01,253,67,186,767900,270900,497000,Saturday,C,H,WHH
2,Airport ATM,2011-01-01,98,56,42,503400,347700,155700,Saturday,C,H,WHH


#### Convert Weekday to all caps
Some weekday values are in the format 'Sunday' and others in 'SUNDAY`

In [17]:
aggregated_data['Weekday'] = aggregated_data['Weekday'].str.upper()

#### Keep only Big Street ATM

In [18]:
big_street = aggregated_data[aggregated_data['ATM Name'] == 'Big Street ATM']
big_street.head(2)

Unnamed: 0,ATM Name,Transaction Date,No Of Withdrawals,No Of XYZ Card Withdrawals,No Of Other Card Withdrawals,Total amount Withdrawn,Amount withdrawn XYZ Card,Amount withdrawn Other Card,Weekday,Festival Religion,Working Day,Holiday Sequence
0,Big Street ATM,2011-01-01,50,20,30,123800,41700,82100,SATURDAY,H,H,WHH
5,Big Street ATM,2011-01-02,17,5,12,52800,20900,31900,SUNDAY,NH,H,HHW


#### Plot Big Street Withdrawal (All years)

In [19]:
fig = px.line(aggregated_data, x='Transaction Date', y='Total amount Withdrawn', 
                      labels={'x':'Test Row Number', 'y':'Residual Error'}, title='Residual Error Plot')
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)
fig.show()

#### One Hot Encoding for Categorical Features

In [20]:
categorical_features_list = ['Weekday', 'Festival Religion', 'Working Day', 'Holiday Sequence']
big_street = pd.get_dummies(big_street, columns=categorical_features_list, drop_first=True)

#### Adding lookback of 5 days

In [21]:
for obs in range(1,6):
    big_street["Total amount Withdrawn_" + str(obs)] = big_street['Total amount Withdrawn'].shift(obs, fill_value=0)

big_street.tail(4)

Unnamed: 0,ATM Name,Transaction Date,No Of Withdrawals,No Of XYZ Card Withdrawals,No Of Other Card Withdrawals,Total amount Withdrawn,Amount withdrawn XYZ Card,Amount withdrawn Other Card,Weekday_MONDAY,Weekday_SATURDAY,Weekday_SUNDAY,Weekday_THURSDAY,Weekday_TUESDAY,Weekday_WEDNESDAY,Festival Religion_H,Festival Religion_M,Festival Religion_N,Festival Religion_NH,Working Day_W,Holiday Sequence_HHW,Holiday Sequence_HWH,Holiday Sequence_HWW,Holiday Sequence_WHH,Holiday Sequence_WHW,Holiday Sequence_WWH,Holiday Sequence_WWW,Total amount Withdrawn_1,Total amount Withdrawn_2,Total amount Withdrawn_3,Total amount Withdrawn_4,Total amount Withdrawn_5
11569,Big Street ATM,2017-09-26,131,36,95,437400,162700,274700,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,513800,401500,558900,341600,120400
11574,Big Street ATM,2017-09-27,130,40,90,511200,215000,296200,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,437400,513800,401500,558900,341600
11579,Big Street ATM,2017-09-28,137,41,96,468600,221800,246800,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,511200,437400,513800,401500,558900
11584,Big Street ATM,2017-09-29,137,34,103,468800,146200,322600,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,468600,511200,437400,513800,401500


## Train and Test Data Split
Consider January 2011 to December 2015 for training data, January 2015 to December 2015 for validation and January 2016 to October 2016 for test

In [22]:
big_street_train = big_street.loc[(big_street['Transaction Date'] >= pd.to_datetime("01/01/2011")) & \
                                  (big_street['Transaction Date'] < pd.to_datetime("01/01/2015"))]

big_street_val = big_street.loc[(big_street['Transaction Date'] >= pd.to_datetime("01/01/2015")) & \
                                (big_street['Transaction Date'] < pd.to_datetime("01/01/2016"))]
                                
big_street_test = big_street.loc[(big_street['Transaction Date'] >= pd.to_datetime("01/01/2016")) & \
                                 (big_street['Transaction Date'] < pd.to_datetime("31/10/2016"))]

big_street_train.tail(2)                                 

Unnamed: 0,ATM Name,Transaction Date,No Of Withdrawals,No Of XYZ Card Withdrawals,No Of Other Card Withdrawals,Total amount Withdrawn,Amount withdrawn XYZ Card,Amount withdrawn Other Card,Weekday_MONDAY,Weekday_SATURDAY,Weekday_SUNDAY,Weekday_THURSDAY,Weekday_TUESDAY,Weekday_WEDNESDAY,Festival Religion_H,Festival Religion_M,Festival Religion_N,Festival Religion_NH,Working Day_W,Holiday Sequence_HHW,Holiday Sequence_HWH,Holiday Sequence_HWW,Holiday Sequence_WHH,Holiday Sequence_WHW,Holiday Sequence_WWH,Holiday Sequence_WWW,Total amount Withdrawn_1,Total amount Withdrawn_2,Total amount Withdrawn_3,Total amount Withdrawn_4,Total amount Withdrawn_5
7214,Big Street ATM,2014-12-30,129,42,87,531100,234800,296300,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,367500,315000,396900,239800,351100
7219,Big Street ATM,2014-12-31,172,54,118,679200,317700,361500,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,531100,367500,315000,396900,239800


In [23]:
clean_train = big_street_train[['Total amount Withdrawn', "Weekday_MONDAY", "Weekday_SATURDAY", "Weekday_SUNDAY", "Weekday_THURSDAY", "Weekday_TUESDAY", "Weekday_WEDNESDAY", "Festival Religion_H", "Festival Religion_M","Festival Religion_N", "Festival Religion_NH", "Working Day_W", "Holiday Sequence_HHW", "Holiday Sequence_HWH" , "Holiday Sequence_HWW", "Holiday Sequence_WHH", "Holiday Sequence_WHW", "Holiday Sequence_WWH", "Holiday Sequence_WWW", "Total amount Withdrawn_1", "Total amount Withdrawn_2", "Total amount Withdrawn_3", "Total amount Withdrawn_4", "Total amount Withdrawn_5"]]
clean_val = big_street_val[['Total amount Withdrawn', "Weekday_MONDAY", "Weekday_SATURDAY", "Weekday_SUNDAY", "Weekday_THURSDAY", "Weekday_TUESDAY", "Weekday_WEDNESDAY", "Festival Religion_H", "Festival Religion_M","Festival Religion_N", "Festival Religion_NH", "Working Day_W", "Holiday Sequence_HHW", "Holiday Sequence_HWH" , "Holiday Sequence_HWW", "Holiday Sequence_WHH", "Holiday Sequence_WHW", "Holiday Sequence_WWH", "Holiday Sequence_WWW", "Total amount Withdrawn_1", "Total amount Withdrawn_2", "Total amount Withdrawn_3", "Total amount Withdrawn_4", "Total amount Withdrawn_5"]]
clean_test = big_street_test[['Total amount Withdrawn', "Weekday_MONDAY", "Weekday_SATURDAY", "Weekday_SUNDAY", "Weekday_THURSDAY", "Weekday_TUESDAY", "Weekday_WEDNESDAY", "Festival Religion_H", "Festival Religion_M","Festival Religion_N", "Festival Religion_NH", "Working Day_W", "Holiday Sequence_HHW", "Holiday Sequence_HWH" , "Holiday Sequence_HWW", "Holiday Sequence_WHH", "Holiday Sequence_WHW", "Holiday Sequence_WWH", "Holiday Sequence_WWW", "Total amount Withdrawn_1", "Total amount Withdrawn_2", "Total amount Withdrawn_3", "Total amount Withdrawn_4", "Total amount Withdrawn_5"]]
clean_train.head(2)

Unnamed: 0,Total amount Withdrawn,Weekday_MONDAY,Weekday_SATURDAY,Weekday_SUNDAY,Weekday_THURSDAY,Weekday_TUESDAY,Weekday_WEDNESDAY,Festival Religion_H,Festival Religion_M,Festival Religion_N,Festival Religion_NH,Working Day_W,Holiday Sequence_HHW,Holiday Sequence_HWH,Holiday Sequence_HWW,Holiday Sequence_WHH,Holiday Sequence_WHW,Holiday Sequence_WWH,Holiday Sequence_WWW,Total amount Withdrawn_1,Total amount Withdrawn_2,Total amount Withdrawn_3,Total amount Withdrawn_4,Total amount Withdrawn_5
0,123800,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,52800,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,123800,0,0,0,0


## Random Forest Regressor

### Fitting the model

In [24]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [25]:
X_train,y_train = clean_train.drop(["Total amount Withdrawn"],axis=1), clean_train['Total amount Withdrawn']
X_test,y_test = clean_test.drop(["Total amount Withdrawn"],axis=1), clean_test['Total amount Withdrawn']
X_val,y_val = clean_val.drop(["Total amount Withdrawn"],axis=1), clean_val['Total amount Withdrawn']

In [26]:
rfr = RandomForestRegressor(random_state=2017,verbose=0,n_jobs=5)
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=5, oob_score=False,
                      random_state=2017, verbose=0, warm_start=False)

### Results and Errors

In [27]:
big_street_test['Total amount Withdrawn_RFR'] = rfr.predict(X_test)

print("Root Mean Squared Errors")
print(f'Train R2 Score => {rfr.score(X_train, y_train)}, RMSE => {mean_squared_error(y_train, rfr.predict(X_train), squared=False)}')
print(f'Val R2 Score => {rfr.score(X_val, y_val)}, RMSE => {mean_squared_error(y_val, rfr.predict(X_val), squared=False)}')
print(f'Test R2 Score => {rfr.score(X_test, y_test)}, RMSE => {mean_squared_error(y_test, rfr.predict(X_test), squared=False)}')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Root Mean Squared Errors
Train R2 Score => 0.9409977601364572, RMSE => 33699.01356282539
Val R2 Score => -0.009614374595675601, RMSE => 122913.36231483902
Test R2 Score => -0.036848483600424364, RMSE => 158937.31775906132


## LSTM

### Modelling

In [28]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [29]:
model_k = Sequential()
model_k.add(LSTM(1, input_shape=(1,23)))
model_k.add(Dense(1))
model_k.compile(loss='mean_squared_error', optimizer='adam')

### Reshaping the data

In [30]:
X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val_reshaped = X_val.values.reshape((X_val.shape[0], 1, X_val.shape[1]))
X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

### Fitting the model

In [31]:
history = model_k.fit(X_train_reshaped, y_train, validation_data=(X_val_reshaped, y_val),\
epochs=10, batch_size=96, verbose=2)

Epoch 1/10
16/16 - 0s - loss: 86451863552.0000 - val_loss: 183335190528.0000
Epoch 2/10
16/16 - 0s - loss: 86451863552.0000 - val_loss: 183335174144.0000
Epoch 3/10
16/16 - 0s - loss: 86451838976.0000 - val_loss: 183335157760.0000
Epoch 4/10
16/16 - 0s - loss: 86451830784.0000 - val_loss: 183335157760.0000
Epoch 5/10
16/16 - 0s - loss: 86451822592.0000 - val_loss: 183335141376.0000
Epoch 6/10
16/16 - 0s - loss: 86451822592.0000 - val_loss: 183335141376.0000
Epoch 7/10
16/16 - 0s - loss: 86451822592.0000 - val_loss: 183335108608.0000
Epoch 8/10
16/16 - 0s - loss: 86451789824.0000 - val_loss: 183335108608.0000
Epoch 9/10
16/16 - 0s - loss: 86451789824.0000 - val_loss: 183335092224.0000
Epoch 10/10
16/16 - 0s - loss: 86451781632.0000 - val_loss: 183335092224.0000


### Results and Errors

In [32]:
y_train_pred = model_k.predict(X_train_reshaped)
y_val_pred = model_k.predict(X_val_reshaped)
y_test_pred = model_k.predict(X_test_reshaped)

print("Root Mean Squared Errors")
print(f'Train RMSE => {mean_squared_error(y_train, y_train_pred, squared=False)}')
print(f'Val RMSE => {mean_squared_error(y_val, y_val_pred, squared=False)}')
print(f'Test RMSE => {mean_squared_error(y_test, y_test_pred, squared=False)}')

Root Mean Squared Errors
Train RMSE => 294026.83542965923
Val RMSE => 428176.46830792056
Test RMSE => 440954.88838629425
