# Clustering ATMs

In this notebook, I pick the n ATM IDs with the most records in the sample data (n = 50) and generate a dataset for clustering them.

In [1]:
import sys

sys.path.append(r'../atm_demand')
from feature_generation import *

import numpy as np
import pandas as pd
from scipy.special import softmax
from sklearn.cluster import KMeans

In [2]:
# Raw sample of ATM records; every later cell reads from `df`.
SAMPLE_CSV_PATH = "../atm_demand/DATA_sample_atm.csv"
df = pd.read_csv(SAMPLE_CSV_PATH)

In [3]:
# Preview the 50 ATM ids that occur most often in the sample
# (value_counts() sorts by descending frequency).
df['AtmId'].value_counts().head(50).index

Int64Index([ 26637,  27687,  27663,  33817,  33821,  27651,  32785,  33793,
             33795,  33796, 119852,  33800,  27664,  27667,  33807,  33850,
             33774,  33852,  27711,  33827,  33828,  33830,  27707,  33831,
             33833,  27727,  27723,  33879,  27714,  33862,  33809,  33771,
             27729,  27570,  27537,  33677,  27539,  33679,  33713,  33714,
             27556,  33724,  33726,  27579,  33707,  27568,  33711, 119801,
             33766,  33750],
           dtype='int64')

## Generating Datasets with Data of All ATMs

In [4]:
# --- Configuration for this stage -------------------------------------------
CLUSTER_WITH = 'CashIn'          # column whose weekday profile drives the clustering
TARGETS = ['CashIn', 'CashOut']  # columns that are log-transformed and featurised
N_ATMS = 50                      # how many of the most frequent ATM ids to use
N_TRAILING_ROWS_DROPPED = 135    # rows cut from the end of each ATM's history
                                 # NOTE(review): presumably a hold-out period - confirm
LOG_EPSILON = 1e-6               # offset added before log() so a value of 0 stays finite
N_WEEKDAYS = 7

# ATM ids ordered by how often they appear in the raw data (most frequent first).
atmIds = df['AtmId'].value_counts().head(N_ATMS).index

weekday_rows = []   # one list of N_WEEKDAYS mean values per ATM, in atmIds order
feature_sets = []   # one feature frame per ATM, concatenated after the loop

for atmId in atmIds:
    atm_df = get_atm(df, atmId)
    atm_df = atm_df.iloc[:-N_TRAILING_ROWS_DROPPED]
    # Take an own copy so the column assignments below never write into
    # (or warn about writing into) a slice of another frame.
    atm_df = clean_data(atm_df, drop_zeros=True).copy()
    for target in TARGETS:
        atm_df[target] = np.log(atm_df[target] + LOG_EPSILON)

    day_of_the_week_index = get_day_of_the_week_index(atm_df.index)

    # Mean of the (log-scale) clustering column for each weekday code 0..6.
    # Selecting the column before .mean() avoids averaging every other column.
    weekday_rows.append([
        atm_df.loc[
            day_of_the_week_index[day_of_the_week_index == day].index,
            CLUSTER_WITH,
        ].mean()
        for day in range(N_WEEKDAYS)
    ])

    # Tag rows with their ATM so cluster labels can be mapped back later.
    atm_df['AtmId'] = atmId
    feature_set = get_feature_sets(atm_df, TARGETS)
    feature_sets.append(feature_set)

# Build the (ATM x weekday) matrix once instead of enlarging it cell by cell.
clustering_df = pd.DataFrame(
    weekday_rows, index=atmIds, columns=range(N_WEEKDAYS), dtype='float64'
)

all_atms_feature_set = pd.concat(feature_sets, axis=0)

## Applying K-Means and Updating the Dataset

In [5]:
N_CLUSTERS = 7

# Softmax along each row turns an ATM's seven weekday values into positive
# weights that sum to 1, so ATMs are compared on weekday shape, not volume.
weekday_profiles = softmax(clustering_df.to_numpy(), axis=1)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', which would silently change the resulting clusters across versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0).fit(weekday_profiles)

# kmeans.labels_ follows the row order of the matrix that was fitted, so key
# the mapping on clustering_df's own index.
d = dict(zip(clustering_df.index, kmeans.labels_))

# Rebuild from the per-ATM frames instead of mutating in place, so this cell
# can be re-run without failing on an already-dropped 'AtmId' column.
all_atms_feature_set = (
    pd.concat(feature_sets, axis=0)
    .assign(ClusterId=lambda frame: frame['AtmId'].map(d))
    .drop(columns='AtmId')
)
all_atms_feature_set

Unnamed: 0_level_0,CashIn,CashOut,CashIn_average_7,CashIn_average_30,CashOut_average_7,CashOut_average_30,CashIn_trend_7,CashOut_trend_7,CashIn_t-1,CashIn_t-2,...,ramazan_in_7_days,is_kurban,kurban_in_7_days,is_cocuk_bayrami,is_isci_bayrami,is_spor_bayrami,is_zafer_bayrami,is_cumhuriyet_bayrami,Special_Dates_Index,ClusterId
HistoryDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-02-10,10.920890,10.504519,10.822261,10.744681,10.767122,10.575975,11.754570,11.104149,10.746347,11.562477,...,0,0,0,0,0,0,0,0,0,0
2016-02-11,11.015838,10.706610,10.744636,10.740747,10.733717,10.569419,9.507388,12.315744,10.920890,10.746347,...,0,0,0,0,0,0,0,0,0,0
2016-02-12,11.035051,10.767432,10.837836,10.777842,10.670384,10.565401,10.429658,11.166327,11.015838,10.920890,...,0,0,0,0,0,0,0,0,0,0
2016-02-13,9.284520,10.609798,10.920457,10.781792,10.646156,10.569444,10.011447,10.642880,11.035051,11.015838,...,0,0,0,0,0,0,0,0,0,0
2016-02-14,9.109414,10.211340,10.740817,10.724673,10.638104,10.592623,12.270377,10.392338,9.284520,11.035051,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-18,10.739977,10.714196,10.952361,10.924747,11.079741,10.926192,11.060502,11.145162,11.274414,10.946058,...,0,0,0,0,0,0,0,0,0,6
2020-01-19,10.838404,10.557114,10.930976,10.924856,11.032711,10.931290,10.463739,10.440118,10.739977,11.274414,...,0,0,0,0,0,0,0,0,0,6
2020-01-20,11.208979,10.750428,10.993560,10.905532,11.006665,10.916725,10.802791,10.809237,10.838404,10.739977,...,0,0,0,0,0,0,0,0,0,6
2020-01-21,10.596385,10.544288,11.000106,10.911860,10.969905,10.911534,10.581003,10.324338,11.208979,10.838404,...,0,0,0,0,0,0,0,0,0,6


## Using the Dataset to Train Models

In [6]:
# Display the column labels of the combined feature set that model training will use.
all_atms_feature_set.columns

Index(['CashIn', 'CashOut', 'CashIn_average_7', 'CashIn_average_30',
       'CashOut_average_7', 'CashOut_average_30', 'CashIn_trend_7',
       'CashOut_trend_7', 'CashIn_t-1', 'CashIn_t-2', 'CashIn_t-3',
       'CashIn_t-4', 'CashIn_t-5', 'CashIn_t-6', 'CashIn_t-7', 'CashIn_t-8',
       'CashIn_t-9', 'CashIn_t-10', 'CashIn_t-11', 'CashIn_t-12',
       'CashIn_t-13', 'CashIn_t-14', 'CashOut_t-1', 'CashOut_t-2',
       'CashOut_t-3', 'CashOut_t-4', 'CashOut_t-5', 'CashOut_t-6',
       'CashOut_t-7', 'CashOut_t-8', 'CashOut_t-9', 'CashOut_t-10',
       'CashOut_t-11', 'CashOut_t-12', 'CashOut_t-13', 'CashOut_t-14',
       'CashOut_t-15', 'CashOut_t-16', 'CashOut_t-17', 'CashOut_t-18',
       'CashOut_t-19', 'CashOut_t-20', 'CashOut_t-21', 'CashOut_t-22',
       'CashOut_t-23', 'CashOut_t-24', 'CashOut_t-25', 'CashOut_t-26',
       'CashOut_t-27', 'CashOut_t-28', 'CashOut_t-29', 'CashOut_t-30',
       'CashOut_t-31', 'CashOut_t-32', 'CashOut_t-33', 'CashOut_t-34',
       'CashOut_t-35', '