# 7  Modeling - selection of the best deep learning models

<b> Purpose of the action </b> - checking accuracy of prediction on test set using 2 different types of Neural Networks:
- ANN only with Dense layers
- RNN with LSTM layers

<b> </b>
<b> Action plan </b>:
- Select best hyperparameters for both type of Neural Network using ParameterSampler to generate different models with random hyperparameters. For fitting models use training set and for models evaluation use validation set
- Save both models.
- Compare prediction accuracy and other metrics on test set and save results for future purpose

## 7.1 Import nessesary libraries and modules

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from modeling import select_best_nn_classifier, build_ann_classifier, build_rnn_classifier

## 7.2 Create empty lists for future results

In [2]:
accuracy_score = []
precision_score = []
recall_score = []
f1_score = []
roc_auc_score = []

## 7.3 Make test prediction using Artificial Neural Network(ANN)

### 7.3.1 Import data dedicated for this model

In [8]:
train_set = pd.read_csv("./preprocessed_data/processed_base_train_set.csv", index_col=0)
validation_set = pd.read_csv("./preprocessed_data/processed_base_validation_set.csv", index_col=0)
test_set = pd.read_csv("./preprocessed_data/processed_base_test_set.csv", index_col=0)

### 7.3.2 Split datasets to feature set and labels set

In [9]:
X_train, y_train = np.array(train_set.drop(columns='FTR')), np.array(train_set['FTR'])
X_val, y_val = np.array(validation_set.drop(columns='FTR')), np.array(validation_set['FTR'])
X_test, y_test = np.array(test_set.drop(columns='FTR')), np.array(test_set['FTR'])

### 7.3.3 Select the best model using Keras wraper for the Scikit-Learn API and ParameterSampler to generate models with diffrent parameters

Check 10 combinations of neural network models to choose the best one

In [10]:
# define params for random grid search
params_grid = {
    'n_hiden_layers': [1, 2, 3, 4],
    'hidden_layer_size' : [128, 64, 32, 16],
    'batch_size' : [4, 8, 16]
}

# add early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=0,
    patience=10,
    mode='max',
    restore_best_weights=True)

# function to seleting best neural network model 
clf_params, clf = select_best_nn_classifier(build_func=build_ann_classifier, 
                                            params_grid=params_grid,
                                            n_iter=10,
                                            random_state=42,
                                            X_train=X_train,
                                            y_train=y_train, 
                                            X_val=X_val, 
                                            y_val=y_val,
                                            early_stopping=early_stopping,
                                            epochs=100,
                                            shuffle=False, 
                                            verbose=1)

# look on the best chosen params
print('Best params:', clf_params)

KerasClassifier{'n_hiden_layers': 4, 'hidden_layer_size': 32, 'batch_size': 8}
Accuracy score on training set: 0.5278 | Accuracy score on validation set: 0.5152
----------------------------------------------------------------------------------------------------
KerasClassifier{'n_hiden_layers': 1, 'hidden_layer_size': 32, 'batch_size': 16}
Accuracy score on training set: 0.6673 | Accuracy score on validation set: 0.6697
----------------------------------------------------------------------------------------------------
KerasClassifier{'n_hiden_layers': 3, 'hidden_layer_size': 32, 'batch_size': 8}
Accuracy score on training set: 0.6636 | Accuracy score on validation set: 0.6667
----------------------------------------------------------------------------------------------------
KerasClassifier{'n_hiden_layers': 4, 'hidden_layer_size': 32, 'batch_size': 16}
Accuracy score on training set: 0.5278 | Accuracy score on validation set: 0.5152
---------------------------------------------------

### 7.3.4 Calculate metrics of prediction and add results to the lists

In [11]:
# append metrics for single classifier to the list 
accuracy_score.append(metrics.accuracy_score(y_test , clf.predict(X_test)))  
precision_score.append(metrics.precision_score(y_test , clf.predict(X_test)))
recall_score.append(metrics.recall_score(y_test , clf.predict(X_test)))
f1_score.append( metrics.f1_score(y_test , clf.predict(X_test)))
roc_auc_score.append(metrics.roc_auc_score(y_test , clf.predict_proba(X_test)[:,1]))

### 7.3.5 Save the model for future use

In [12]:
# save the model
clf.model.save('./models/ANN.h5')

## 7.4 Make test prediction using Reccurent Neural Network with LSTM layers

### 7.4.1 Import data dedicated for this model

In [3]:
train_set = pd.read_csv("./preprocessed_data/processed_base_train_set.csv", index_col=0)
validation_set = pd.read_csv("./preprocessed_data/processed_base_validation_set.csv", index_col=0)
test_set = pd.read_csv("./preprocessed_data/processed_base_test_set.csv", index_col=0)

### 7.4.2 Split datasets to feature set and labels set

In [4]:
X_train, y_train = np.array(train_set.drop(columns='FTR')), np.array(train_set['FTR'])
X_val, y_val = np.array(validation_set.drop(columns='FTR')), np.array(validation_set['FTR'])
X_test, y_test = np.array(test_set.drop(columns='FTR')), np.array(test_set['FTR'])

# reshape feature sets to the desired shape by RNN
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_val = np.reshape(X_val, (X_val.shape[0], X_val.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

### 7.4.3 Select the best model using Keras wraper for the Scikit-Learn API and ParameterSampler to generate models with diffrent parameters

Check 10 combinations of neural network models to choose the best one

In [5]:
# define params for random grid search
params_grid = {
    'n_lstm_layers':[1, 2], 
    'lstm_layer_size': [64, 32],
    'n_hiden_layers': [0, 1],
    'hidden_layer_size' : [32, 16, 8],
    'batch_size' : [2, 4, 8, 16]
}

# add early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=0,
    patience=10,
    mode='max',
    restore_best_weights=True)

# function to seleting best neural network model 
clf_params, clf = select_best_nn_classifier(build_func=build_rnn_classifier, 
                                            params_grid=params_grid,
                                            n_iter=10,
                                            random_state=42,
                                            X_train=X_train,
                                            y_train=y_train, 
                                            X_val=X_val, 
                                            y_val=y_val,
                                            early_stopping=early_stopping,
                                            epochs=100,
                                            shuffle=False, 
                                            verbose=1)

# look on the best chosen params
print('Best params:', clf_params)

KerasClassifier{'n_lstm_layers': 1, 'n_hiden_layers': 0, 'lstm_layer_size': 64, 'hidden_layer_size': 16, 'batch_size': 16}
Accuracy score on training set: 0.6327 | Accuracy score on validation set: 0.6333
----------------------------------------------------------------------------------------------------
KerasClassifier{'n_lstm_layers': 2, 'n_hiden_layers': 0, 'lstm_layer_size': 32, 'hidden_layer_size': 32, 'batch_size': 16}
Accuracy score on training set: 0.648 | Accuracy score on validation set: 0.6333
----------------------------------------------------------------------------------------------------
KerasClassifier{'n_lstm_layers': 2, 'n_hiden_layers': 0, 'lstm_layer_size': 64, 'hidden_layer_size': 32, 'batch_size': 16}
Accuracy score on training set: 0.647 | Accuracy score on validation set: 0.6455
----------------------------------------------------------------------------------------------------
KerasClassifier{'n_lstm_layers': 1, 'n_hiden_layers': 1, 'lstm_layer_size': 32, 'hid

### 7.4.4 Calculate metrics of prediction and add results to the lists

In [6]:
# append metrics for single classifier to the list 
accuracy_score.append(metrics.accuracy_score(y_test , clf.predict(X_test)))  
precision_score.append(metrics.precision_score(y_test , clf.predict(X_test)))
recall_score.append(metrics.recall_score(y_test , clf.predict(X_test)))
f1_score.append( metrics.f1_score(y_test , clf.predict(X_test)))
roc_auc_score.append(metrics.roc_auc_score(y_test , clf.predict_proba(X_test)[:,1]))

### 7.4.5 Save the model for future use

In [7]:
# save the model
clf.model.save('./models/RNN.h5')

## 7.5 Show all result in one table and save it for future purpose

In [15]:
# create dictionary of results 
results_dict = {'precision_score': precision_score, 
               'recall_score': recall_score, 
               'f1_score': f1_score,
               'roc_auc_score' : roc_auc_score,
               'accuracy_score' : accuracy_score}

results_df = pd.DataFrame(data=results_dict)
results_df.insert(loc=0, column='Model', value=['Simple ANN', 'RNN with LSTM'])
results_df

Unnamed: 0,Model,precision_score,recall_score,f1_score,roc_auc_score,accuracy_score
0,RNN with LSTM,0.588889,0.630952,0.609195,0.644429,0.642105
1,Simple ANN,0.580247,0.559524,0.569697,0.662174,0.626316


In [16]:
# save results
results_df.to_csv("./results/neural_networks_results.csv")