# Monks 1

In [1]:
# Importing libraries
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report

**Defining a helper function to summarise grid-search results**

In [None]:
def report(results, n_top=5):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        (array-likes indexable by position) and 'params' (a sequence), as in
        the `cv_results_` attribute of a fitted GridSearchCV.
    n_top : int, default 5
        Highest rank to print. Every candidate sharing a rank is printed, so
        ties can produce more than `n_top` entries.

    Returns
    -------
    None. The summary is written to standard output.
    """
    ranks_to_show = range(1, n_top + 1)
    for rank in ranks_to_show:
        # Positions of all candidates tied at this rank
        for pos in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][pos]
            score_std = results['std_test_score'][pos]
            print("Mean validation score: {0:.4f} (std: {1:.4f})".format(
                mean_score, score_std))
            print("Parameters: {0}".format(results['params'][pos]))
            print("")

**Preprocessing of our data.** 
* We load the Monks-1 data set and transform its attributes with a one-hot encoder.

In [None]:
# Define encoders
# One-hot encoder applied to the attribute columns in a later cell.
encoder = OneHotEncoder()
# NOTE(review): `lencoder` is not used by any cell visible in this notebook — confirm it is still needed.
lencoder = LabelEncoder()

In [None]:
# Import data set
# Column layout of the space-separated Monks files: target, six attributes, sample id.
feature_names = ['class', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'id']
attribute_columns = feature_names[1:7]   # 'a1' .. 'a6'
target_column = feature_names[:1]        # ['class'], kept as a list so a DataFrame is returned

# Training split
df = pd.read_csv(
    'data/monks_1_train.txt',
    sep = ' ',
    header = None,
    names = feature_names,
    skipinitialspace = True,
)
x_train = df[attribute_columns]
y_train = df[target_column]

# Test split, read with the same settings
df_test = pd.read_csv(
    'data/monks_1_test.txt',
    sep = ' ',
    header = None,
    names = feature_names,
    skipinitialspace = True,
)
x_test = df_test[attribute_columns]
y_test = df_test[target_column]

In [None]:
# Convert to np array
# np.asarray accepts DataFrames and ndarrays alike, so re-running this cell is harmless
# (calling .to_numpy() on an already converted array raises AttributeError).
x_train = np.asarray(x_train)
y_train = np.asarray(y_train).squeeze()   # (n, 1) -> (n,)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test).squeeze()

In [None]:
# Learn the categories from the training attributes only, then expand both splits
# into dense integer indicator columns.
# NOTE(review): this cell overwrites x_train / x_test, so running it twice would encode
# the already encoded matrix — run it once per kernel session.
x_train = encoder.fit_transform(x_train).toarray().astype(int)
x_test = encoder.transform(x_test).toarray().astype(int)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB

In [None]:
# Base estimator for the Naive Bayes grid search (default hyper-parameters)
clf = CategoricalNB()

In [None]:
# 5-fold grid search over the smoothing parameter `alpha` of CategoricalNB.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during a long search (time.time() can be).
start = time.perf_counter()

# 10,000 candidates: alpha = 0.1, 0.2, ..., 1000.0
param_list = {
    'alpha': [x / 10 for x in range(1, 10001)]
}
grid_search = GridSearchCV(estimator = clf, param_grid = param_list,
                           n_jobs = -1, verbose = 5, scoring = 'accuracy', refit = True,
                           cv = KFold(n_splits = 5, shuffle = True, random_state = 32))

grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Print the candidates ranked 1 to 5 by mean cross-validation accuracy
report(res, n_top = 5)

In [None]:
# Full hyper-parameter set of the refitted best estimator
grid_search.best_estimator_.get_params()

In [None]:
# Training-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_train)
print('Accuracy %s' % accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Test-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# KNN (Sci-Kit Learn)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Base estimator for the k-nearest-neighbours grid search (default hyper-parameters)
clf = KNeighborsClassifier()

In [None]:
# 5-fold grid search for k-NN over neighbourhood size, vote weighting and Minkowski power.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments (time.time() can be).
start = time.perf_counter()

param_list = {'n_neighbors': list(range(1, 81)),
              'weights': ['uniform', 'distance'],
              'p': [1, 2]   # 1 = Manhattan, 2 = Euclidean distance
}

grid_search = GridSearchCV(estimator = clf, param_grid = param_list,
                           n_jobs = -1, verbose = 5, scoring = 'accuracy', refit = True,
                           cv = KFold(n_splits = 5, shuffle = True, random_state = 32))

grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Print the candidates ranked 1 to 5 by mean cross-validation accuracy
report(res, n_top = 5)

In [None]:
# Full hyper-parameter set of the refitted best estimator
grid_search.best_estimator_.get_params()

In [None]:
# Training-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_train)
print('Accuracy %s' % accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Test-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# SVM (Sci-Kit Learn)

In [None]:
from sklearn.svm import SVC

In [None]:
# Base estimator for the SVM grid search (default hyper-parameters)
clf = SVC()

In [None]:
# 5-fold grid search for the SVM over C, kernel, gamma and polynomial degree.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments (time.time() can be).
start = time.perf_counter()

# NOTE(review): `degree` only affects the 'poly' kernel and `gamma` is not used by 'linear',
# so this full product contains many functionally identical candidates. A list of per-kernel
# grids would shrink the search, but it would also change cv_results_ and tie-breaking, so
# the grid is deliberately left as it was.
param_list = {'C': list(range(1, 101)),
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['auto', 'scale'],
              'degree': list(range(1, 11))
}
grid_search = GridSearchCV(estimator = clf, param_grid = param_list,
                           n_jobs = -1, verbose = 2, scoring = 'accuracy', refit = True,
                           cv = KFold(n_splits = 5, shuffle = True, random_state = 32))

grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Print the candidates ranked 1 to 5 by mean cross-validation accuracy
report(res, n_top = 5)

In [None]:
# Full hyper-parameter set of the refitted best estimator
grid_search.best_estimator_.get_params()

In [None]:
# Training-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_train)
print('Accuracy %s' % accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Test-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Random Forest (Sci-Kit Learn)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base estimator for the Random Forest grid search.
# The forest is stochastic (bootstrap samples, feature sub-sampling); a fixed seed makes the
# search reproducible across runs. 32 matches the seed used for the KFold splitter.
clf = RandomForestClassifier(random_state = 32)

In [None]:
# 5-fold grid search for the Random Forest over forest size, tree depth, leaf size
# and split criterion.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments (time.time() can be).
start = time.perf_counter()

param_list = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None] + list(range(1, 21)),   # None = no depth limit
    'min_samples_leaf': list(range(1, 21)),
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator = clf, param_grid = param_list,
                           n_jobs = -1, verbose = 2, scoring = 'accuracy', refit = True,
                           cv = KFold(n_splits = 5, shuffle = True, random_state = 32))

grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Print the candidates ranked 1 to 5 by mean cross-validation accuracy
report(res, n_top = 5)

In [None]:
# Full hyper-parameter set of the refitted best estimator
grid_search.best_estimator_.get_params()

In [None]:
# Training-set accuracy and per-class report of the refitted best model.
# The accuracy line was missing here although every other model section prints it.
y_pred = grid_search.predict(x_train)
print('Accuracy %s' % accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Test-set accuracy and per-class report of the refitted best model.
# The accuracy line was missing here although every other model section prints it.
y_pred = grid_search.predict(x_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Multilayer Perceptron (Sci-Kit Learn)

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Base estimator for the multilayer-perceptron grid search.
# Weight initialisation and mini-batch shuffling are random; a fixed seed makes the search
# reproducible across runs. 32 matches the seed used for the KFold splitter.
clf = MLPClassifier(random_state = 32)

In [None]:
# 5-fold grid search for the scikit-learn MLP trained with plain SGD + classical momentum.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments (time.time() can be).
start = time.perf_counter()

param_list = {'hidden_layer_sizes': [(2,), (3,), (4,), (2, 2)],
              'solver': ['sgd'],
              'activation': ['relu'],
              'learning_rate_init': [0.0001, 0.001, 0.01] + [x / 10 for x in range(1, 11)], # eta in our notation
              'momentum': [0, 0.01] + [x / 10 for x in range(1, 10)], # alpha in our notation
              'nesterovs_momentum': [False],
              'max_iter': [10000],
              'n_iter_no_change': [200],
              'alpha': [0], # L2 regularisation / lambda in our notation
              'shuffle': [True]
}

grid_search = GridSearchCV(estimator = clf, param_grid = param_list,
                           n_jobs = -1, verbose = 5, scoring = 'accuracy', refit = True,
                           cv = KFold(n_splits = 5, shuffle = True, random_state = 32))

grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Print the candidates ranked 1 to 5 by mean cross-validation accuracy
report(res, n_top = 5)

In [None]:
# Full hyper-parameter set of the refitted best estimator
grid_search.best_estimator_.get_params()

In [None]:
# Training-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_train)
print('Accuracy %s' % accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Test-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Keras wrapper

In [None]:
from scikeras.wrappers import KerasClassifier

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import regularizers

def create_model(hidden_layers_sizes, lambda_reg, activation, n_input = 17):
    """Build an uncompiled feed-forward binary classifier.

    Parameters
    ----------
    hidden_layers_sizes : iterable of int
        Number of units of each hidden layer, in order.
    lambda_reg : float
        L2 penalty coefficient applied to the weights of every hidden layer.
    activation : str
        Activation function of the hidden layers.
    n_input : int, default 17
        Number of input features. The default keeps the previously hard-coded width.

    Returns
    -------
    A `Sequential` model ending in a single sigmoid unit. Compilation is left to the caller.
    """
    model = Sequential()
    model.add(Input(shape = (n_input, )))
    for hidden_layer_size in hidden_layers_sizes:
        # kernel_regularizer penalises the layer weights (weight decay). The previous
        # activity_regularizer penalised the layer outputs instead, which is not an
        # L2 weight penalty.
        model.add(Dense(hidden_layer_size,
                        kernel_regularizer = regularizers.L2(lambda_reg),
                        activation = activation))
    model.add(Dense(1, activation = 'sigmoid'))
    return model

In [None]:
# scikeras wrapper around `create_model`, so the Keras network can be tuned with GridSearchCV.
# `activation`, `hidden_layers_sizes` and `lambda_reg` are arguments of `create_model`;
# `optimizer__*` and `callbacks__*` set arguments of the optimizer and the callback.
clf = KerasClassifier(
    model = create_model,
    optimizer = 'sgd',
    activation = 'relu',
    loss = "mean_squared_error",
    # A stray pasted `grid_search.best_estimator_.` after this argument made the cell a
    # syntax error; it is removed. The metric is given as a list, the form Keras documents.
    metrics = ['binary_accuracy'],
    hidden_layers_sizes = (4,),
    lambda_reg = 0,
    optimizer__momentum = 0.5,
    optimizer__learning_rate = 0.7,
    epochs = 1000,
    # NOTE(review): early stopping monitors "val_loss", but no validation data
    # (e.g. `validation_split`) is configured in this cell — confirm how validation
    # data is meant to be supplied, otherwise the callback has nothing to monitor.
    callbacks = tf.keras.callbacks.EarlyStopping,
    callbacks__monitor = "val_loss",
    callbacks__patience = 50,
    callbacks__min_delta = 0.0001,
    callbacks__restore_best_weights = True
)

In [None]:
# Inspect the parameters exposed by the wrapped Keras classifier
clf.get_params()

In [None]:
# 5-fold grid search for the Keras network over architecture, learning rate and momentum.
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments (time.time() can be).
start = time.perf_counter()

param_list = {
    'hidden_layers_sizes': [(2,), (3,), (4,), (2, 2)],
    'optimizer': ['sgd'],
    'optimizer__learning_rate': [0.0001, 0.001, 0.01] + [x / 10 for x in range(1, 11)],
    'optimizer__momentum': [0, 0.01] + [x / 10 for x in range(1, 10)],
    'epochs': [1000],
    'lambda_reg': [0],
    'shuffle': [True],
    'loss': ['mean_squared_error']
}

grid_search = GridSearchCV(estimator = clf, param_grid = param_list,
                           n_jobs = -1, verbose = 5, scoring = 'accuracy', refit = True,
                           cv = KFold(n_splits = 5, shuffle = True, random_state = 32))

grid_search.fit(x_train, y_train)
res = grid_search.cv_results_

duration = time.perf_counter() - start
print(f'Executed in {duration // 3600:.0f} hours {(duration % 3600)//60:.0f} minutes {duration % 60:.6f} seconds.')

In [None]:
# Print the candidates ranked 1 to 5 by mean cross-validation accuracy
report(res, n_top = 5)

In [None]:
# Full hyper-parameter set of the refitted best estimator
grid_search.best_estimator_.get_params()

In [None]:
# Training-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_train)
print('Accuracy %s' % accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

In [None]:
# Test-set accuracy and per-class report of the refitted best model
y_pred = grid_search.predict(x_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Learning curves of the refitted best Keras model.
history = grid_search.best_estimator_.history_
# NOTE(review): the 'val_*' series describe whatever validation data Keras received during
# fit, and only hold values if such data was supplied — confirm that the 'test' legend
# labels below match the data actually used.

# summarize history for loss
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(history['loss'], label = 'training')
ax_loss.plot(history['val_loss'], linestyle = "dashed", color = 'red', label = 'test')
ax_loss.set_ylabel('MSE', fontsize = 20)
# The x label was set at 4 pt, which is unreadable; it now matches the y label.
ax_loss.set_xlabel('epoch', fontsize = 20)
ax_loss.set_xlim(0, 500)
ax_loss.legend(loc = 'upper right')
plt.show()

# summarize history for accuracy
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(history['binary_accuracy'], label = 'TRAINING')
ax_acc.plot(history['val_binary_accuracy'], label = 'TEST')
ax_acc.set_ylabel('Accuracy', fontsize = 15)
ax_acc.set_xlabel('Epoch', fontsize = 15)
ax_acc.set_xlim(0, 500)
ax_acc.tick_params(axis = 'both', labelsize = 14)
ax_acc.legend(loc = 'lower right')
# Save before show(), while the figure is still open
fig_acc.savefig("Acc_monks1")
plt.show()