In [7]:
import sys
sys.path.append("../")
from config import config
from feature_generation import get_all_atms_feature_set, format_dates
from preprocessing import get_input_sets, scaler_fit_transform, scaler_transform, scaler_inverse_transform
from tabTransformer import TabTransformer
from misc import nmae_error, load_pickle

import pandas as pd
import tensorflow as tf

## Load Data

In [8]:
load_config = config['load_config']

try:
    config = read_hyperparameters_from_file("../" + load_config['hyperparameter_path'])
except:
    print("WARNING: Hyperparameter file (%s) not found. Using the default config." % load_config['hyperparameter_path'])

clusters = load_config['clusters']

df = pd.read_csv("../" + load_config['data_path'])
all_atms_feature_set = get_all_atms_feature_set(df, first_n = load_config['n_atms'])
all_atms_feature_set.sort_index(inplace = True)

# Reading Pickles
for cluster_feature in clusters:
    d = load_pickle("../" + clusters[cluster_feature]['path'])
    all_atms_feature_set[cluster_feature] = all_atms_feature_set['AtmId'].map(d)

all_atms_feature_set['year']    = format_dates(all_atms_feature_set.index, '%Y', 'year')
all_atms_feature_set['year']    -= all_atms_feature_set['year'].min()
all_atms_feature_set['quarter'] = (all_atms_feature_set['Month_of_the_Year_Index'] / 3).astype('int8')

all_atms_feature_set['fold_index'] = 4 * all_atms_feature_set['year'] + all_atms_feature_set['quarter']



In [9]:
pivoted_feature_set = all_atms_feature_set.reset_index().pivot_table(index=['year', 'quarter', 'HistoryDate', 'AtmId'])
pivoted_feature_set

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CashIn,CashIn_average_14,CashIn_average_30,CashIn_average_7,CashIn_std_14,CashIn_std_30,CashIn_std_7,CashIn_t-1,CashIn_t-10,CashIn_t-11,...,is_cocuk_bayrami,is_cumhuriyet_bayrami,is_isci_bayrami,is_kurban,is_ramazan,is_spor_bayrami,is_zafer_bayrami,kurban_in_7_days,next_month_1_delta,ramazan_in_7_days
year,quarter,HistoryDate,AtmId,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
0,0,2016-01-31,26637,7870.0,59583.571429,47928.333333,48987.142857,33667.710760,30126.510846,26801.456872,64380.0,78850.0,31310.0,...,0,0,0,0,0,0,0,0,1,0
0,0,2016-01-31,27663,10190.0,27306.428571,34625.000000,29490.000000,14840.796881,19535.954705,16388.896648,24380.0,23760.0,48160.0,...,0,0,0,0,0,0,0,0,1,0
0,0,2016-01-31,27687,36670.0,70209.285714,70236.333333,66431.428571,25820.278395,33583.457184,15830.532133,46460.0,68310.0,98530.0,...,0,0,0,0,0,0,0,0,1,0
0,0,2016-01-31,33817,5110.0,6880.000000,7007.000000,7160.000000,3271.821417,4187.279711,3156.601548,5710.0,5360.0,13660.0,...,0,0,0,0,0,0,0,0,1,0
0,0,2016-01-31,33821,12160.0,20788.571429,21652.333333,18794.285714,8644.051629,8606.066298,10956.592927,16310.0,31930.0,22610.0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,2020-01-22,26637,71770.0,65142.142857,79250.833333,65969.285714,33459.439436,42845.657811,37481.744938,124405.0,25875.0,58240.0,...,0,0,0,0,0,0,0,0,10,0
4,0,2020-01-22,27663,32835.0,47832.321429,54744.666667,47324.642857,20096.138178,19667.045414,11542.809308,41217.5,24180.0,43165.0,...,0,0,0,0,0,0,0,0,10,0
4,0,2020-01-22,27687,81350.0,77103.571429,83164.333333,88330.714286,25364.759732,27693.662749,30284.135620,133390.0,38100.0,75585.0,...,0,0,0,0,0,0,0,0,10,0
4,0,2020-01-22,33817,31415.0,39002.142857,42678.666667,42351.428571,13359.509249,15651.384864,15404.408112,31170.0,33925.0,32240.0,...,0,0,0,0,0,0,0,0,10,0


## Setting Features

In [10]:
feature_config  = config['feature_config']

categorical_features = [cat for cat in
    all_atms_feature_set.select_dtypes(include=feature_config['categorical_column_types'])
    if cat not in feature_config['excluded_categorical']]
continuous_features = [cat for cat in
    all_atms_feature_set.select_dtypes(include=feature_config['continuous_column_types'])
    if cat not in feature_config['excluded_continuous']]

groups = [continuous_features]
groups.extend(categorical_features)

## Aranging train/test Data

In [11]:
def get_fold(df, fold_index_feature, current_index, fold_size):
    fold_series = df[fold_index_feature]
    return df[(fold_series >= current_index) & (current_index + fold_size - 1 > fold_series)], df[current_index + fold_size - 1 == fold_series]

In [12]:
fold_size = 4
for fold_index in range(all_atms_feature_set['fold_index'].max() - fold_size + 1):
    df_train, df_test = get_fold(all_atms_feature_set, 'fold_index', fold_index, fold_size)
    
    X_train = df_train[continuous_features + categorical_features]
    X_test  = df_test[continuous_features + categorical_features]
    y_train = df_train[feature_config['target']]
    y_test  = df_test[feature_config['target']]

    # MinMaxTransform
    X_train, y_train, scaler_X, scaler_y = scaler_fit_transform(X_train, y_train, continuous_features)
    X_test, y_test = scaler_transform(X_test, y_test, scaler_X, scaler_y, continuous_features)

    X_train = get_input_sets(X_train, groups)
    X_test  = get_input_sets(X_test, groups)   

    model_config = config['model_config']

    tabTransformer = TabTransformer(
        categories = [len(all_atms_feature_set[cat].unique())
            if cat not in clusters.keys() else
            clusters[cat]['n_clusters']
            for cat in categorical_features],
        num_continuous = len(continuous_features),
        dim = model_config['dim'],
        dim_out = model_config['dim_out'],
        depth = model_config['depth'],
        heads = model_config['heads'],
        attn_dropout = model_config['attn_dropout'],
        ff_dropout = model_config['ff_dropout'],
        mlp_hidden = model_config['mlp_hidden']
    )

    training_config = config['training_config']

    tabTransformer.compile(
        optimizer = tf.optimizers.Adam(learning_rate = training_config['learning_rate']),
        loss = training_config['loss']
    )

    tabTransformer.fit(X_train,
        y_train,
        batch_size = training_config['batch_size'],
        epochs = training_config['epochs'],
        validation_data = (X_test, y_test),
        verbose = training_config['verbose'])

    print("--> Completed fold %d" % fold_index)
    print("Train score: %.4f, test score: %.4f" % 
    (nmae_error(scaler_inverse_transform(y_train, scaler_y), scaler_y.inverse_transform(tabTransformer.predict(X_train))),
    nmae_error(scaler_inverse_transform(y_test, scaler_y), scaler_y.inverse_transform(tabTransformer.predict(X_test)))))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = scaler_X.transform(X[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = scaler_X.transform(X[numerical_features])
A value is try

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--> Completed fold 0
Train score: 0.7346, test score: 0.6662


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = scaler_X.transform(X[numerical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = scaler_X.transform(X[numerical_features])
A value is try

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: 