In [1]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import os, sys # For filepaths
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import label_binarize
import pickle
import sklearn.preprocessing as pre



In [2]:
''' Add the datasets and libraries to the system path '''

# Find the path to our implementations
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
home_directory = os.path.dirname(parent_directory)
libraries_path = os.path.join(home_directory, 'Libraries')

# Find the path to the datasets
datasets_path = os.path.join(home_directory, 'Datasets')

# Add them both to the system path
sys.path.append(datasets_path)
sys.path.append(libraries_path)

In [3]:
''' Load the dataset '''

dataset = np.load(os.path.join(datasets_path, 'fashion_train.npy'))
print("Dataset shape: ", dataset.shape)
dataset_test = np.load(os.path.join(datasets_path, 'fashion_test.npy'))
print("Dataset shape: ", dataset_test.shape)

Dataset shape:  (10000, 785)
Dataset shape:  (5000, 785)


In [4]:
X = dataset[:, :-1]  # Define the images as all columns except the last one
y = dataset[:, -1] # Define the labels as the last column
X_test = dataset_test[:, :-1]
y_test = dataset_test[:, -1]

In [5]:
print(type(X))
print("X shape: ", X.shape)
print(type(y))
print("y shape: ", y.shape)
print(type(X_test))
print("X_test shape: ", X_test.shape)
print(type(y_test))
print("y_test shape: ", y_test.shape)

<class 'numpy.ndarray'>
X shape:  (10000, 784)
<class 'numpy.ndarray'>
y shape:  (10000,)
<class 'numpy.ndarray'>
X_test shape:  (5000, 784)
<class 'numpy.ndarray'>
y_test shape:  (5000,)


## Cross validation

In [6]:
# scaling the images
scaler = pre.StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

In [7]:
# import classifiers 
from sklearn import svm 
from CNN import CNN
from Bayes import Bayes_classifier
from sklearn.neighbors import KNeighborsClassifier as KNN

In [8]:
def accuracy(predictions, y_val): 
    correct_predictions = 0
    
    for yt, yp in zip(y_val, predictions):
        if yt == yp:
            correct_predictions += 1
    return correct_predictions/predictions.shape[0]


# CNN Hyperparameter Optimization

In [9]:
import optuna
from optuna.samplers import TPESampler

def objective(trial):
    # Invoke suggest methods of a Trial object to generate hyperparameters.
    lr = trial.suggest_float('lr', 1e-4, 1e-2)
    epochs = trial.suggest_int('epochs', 10, 40)
    step_size = trial.suggest_int('step_size', 1, 10)
    gamma = trial.suggest_float('gamma', 1e-4, 1e-2)

    # Create a model with the hyperparameters above.
    model = CNN(lr=lr, epochs=epochs, step_size=step_size, gamma=gamma)
    
    # Train the model and evaluate it.
    model.fit(X, y)
    model.evaluate(X_test, y_test)
    accuracy = model.accuracy

    return accuracy

# Create a study object and optimize the objective function. use sampler
study_name = 'CNN'
storage_name = 'sqlite:///{}.db'.format(study_name)
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True, sampler=sampler, direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)


# Print the optimization results.
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best accuracy: {}'.format(study.best_value))
print("Best hyperparameters: {}".format(study.best_params))

  0%|          | 0/100 [00:00<?, ?it/s]

Number of finished trials: 100
Best accuracy: 0.8928
Best hyperparameters: {'lr': 0.0021341285880589088, 'epochs': 36, 'step_size': 10, 'gamma': 0.007925633802563264}


In [10]:
study = optuna.load_study(study_name='CNN', storage='sqlite:///CNN.db')
optuna.visualization.plot_optimization_history(study)

In [11]:
optuna.visualization.plot_param_importances(study)

In [14]:
optuna.visualization.plot_slice(study)

In [17]:
optuna.visualization.plot_parallel_coordinate(study)

# CNN cross validation

In [12]:
best_params_from_optuna = study.best_params
lr = best_params_from_optuna['lr']
epochs = best_params_from_optuna['epochs']
step_size = best_params_from_optuna['step_size']
gamma = best_params_from_optuna['gamma']

print('Using hyperparameters: {}'.format(study.best_params))

model_acc = np.mean(cross_val_score(CNN(epochs=epochs,lr=lr,step_size=step_size,gamma=gamma), X, y, cv=5, scoring = 'accuracy', n_jobs=-1))
print('Accuracy:', model_acc)
model_prc = np.mean(cross_val_score(CNN(epochs=epochs,lr=lr,step_size=step_size,gamma=gamma), X, y, cv=5, scoring = 'precision_macro', n_jobs=-1))
print('Precision:', model_prc)
model_f1m = np.mean(cross_val_score(CNN(epochs=epochs,lr=lr,step_size=step_size,gamma=gamma), X, y, cv=5, scoring = 'f1_macro', n_jobs=-1))
print('F1:', model_f1m)
model_rec = np.mean(cross_val_score(CNN(epochs=epochs,lr=lr,step_size=step_size,gamma=gamma), X, y, cv=5, scoring = 'recall_macro', n_jobs=-1))
print('Recall:', model_rec)

Using hyperparameters: {'lr': 0.0021341285880589088, 'epochs': 36, 'step_size': 10, 'gamma': 0.007925633802563264}
Accuracy: 0.8945000000000001
Precision: 0.8941698797489501
F1: 0.8951886788837624
Recall: 0.8950572565362227


In [13]:
# Formatting the results with 3 decimals
results = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1'])
results.loc['CNN'] = [model_acc, model_prc, model_rec, model_f1m]
results = results.round(3)
print(results)

     Accuracy  Precision  Recall     F1
CNN     0.895      0.894   0.895  0.895


Saving info

Test 1
Using hyperparameters: {'lr': 0.0025474653904364957, 'epochs': 28, 'step_size': 10, 'gamma': 0.08156406080977326}
0.895	0.897	0.897	0.897

Test 2
Using hyperparameters: {'lr': 0.003666560508915495, 'epochs': 19, 'step_size': 9, 'gamma': 0.09538566057024735}
0.895	0.898	0.895	0.894

Test 3
Using hyperparameters: {'lr': 0.0024630158876938614, 'epochs': 28, 'step_size': 10, 'gamma': 0.009867616843669264}
0.892      0.894   0.895  0.895

Test 4
Using hyperparameters: {'lr': 0.0025854245650713636, 'epochs': 18, 'step_size': 8, 'gamma': 0.036209974185880886}
0.892      0.895   0.896  0.895

Test 5
Using hyperparameters: {'lr': 0.0021341285880589088, 'epochs': 36, 'step_size': 10, 'gamma': 0.007925633802563264}
0.895      0.894   0.895  0.895

# SVM cross validation

In [18]:
np.mean(cross_val_score(svm.SVC(C=10), X, y, cv=5, scoring = 'accuracy'))

0.8710000000000001

In [19]:
np.mean(cross_val_score(svm.SVC(C=10), X, y, cv=5, scoring = 'precision_macro'))

0.872764949986182

In [20]:
np.mean(cross_val_score(svm.SVC(C=10), X, y, cv=5, scoring = 'f1_macro'))

0.8720073335631569

In [21]:
np.mean(cross_val_score(svm.SVC(C=10), X, y, cv=5, scoring = 'recall_macro'))

0.8718040478477252

# Bayes cross validation

In [8]:
bandwidth = 1.06 * np.std(X) * np.power(X.shape[0], -1/5)

In [11]:
acc = np.mean(cross_val_score(Bayes_classifier(bandwidth), X, y, cv=5, scoring = 'accuracy', n_jobs = -1))
prec_macro = np.mean(cross_val_score(Bayes_classifier(bandwidth), X, y, cv=5, scoring = 'precision_macro',n_jobs = -1))
f1 = np.mean(cross_val_score(Bayes_classifier(bandwidth), X, y, cv=5, scoring = 'f1_macro',n_jobs = -1))
recall = np.mean(cross_val_score(Bayes_classifier(bandwidth), X, y, cv=5, scoring = 'recall_macro',n_jobs = -1))

print(acc)
print(prec_macro)
print(f1)
print(recall)

0.7266999999999999
0.733129978311079
0.7264291749776655
0.7282642246824323


In [8]:
# Define classifiers
classifiers = [Bayes_classifier()]
classifier_names = ["Bayes"]
num_classifiers = len(classifiers)
num_folds = 5
bandwidth = 1.06 * np.std(X_train) * np.power(X_train.shape[0], -1/5)
group_kfold = KFold(n_splits=num_folds)

# Define functions to avoid code repetition
def get_metrics(classifiers, x, y):
    acc_val = np.empty([num_folds, num_classifiers])
    f1_val = np.empty([num_folds, num_classifiers])
    precision = np.empty([num_folds, num_classifiers])
    recall = np.empty([num_folds, num_classifiers])
    # roc_auc = np.empty([num_folds, num_classifiers])
    for i, (train_index, val_index) in enumerate(group_kfold.split(x, y)):
        x_train, x_val = x[train_index, :], x[val_index, :]
        y_train, y_val = y[train_index], y[val_index]
        print(x_val.shape)
        for j, clf in enumerate(classifiers):
            print(f"Classifier {j+1} - {classifier_names[j]}")
            clf.fit(x_train, y_train)
            predictions = clf.predict(x_val)
            acc_val[i, j] = accuracy_score(y_val, predictions)
            f1_val[i, j] = f1_score(y_val, predictions, average='micro')
            precision[i, j] = precision_score(y_val, predictions, zero_division=0, average='micro')
            recall[i, j] = recall_score(y_val, predictions, average='micro')
    print_metrics(acc_val, f1_val, precision, recall)
    return x_train, y_train, x_val, y_val

def print_metrics(acc_val, f1_val, precision, recall):
    average_acc = np.mean(acc_val, axis=0)
    average_f1 = np.mean(f1_val, axis=0)
    average_precision = np.mean(precision, axis=0)
    average_recall = np.mean(recall, axis=0)
    for i, classifier_name in enumerate(classifier_names):
        print(f"############ Classifier {i+1} - {classifier_name}:")
        print(f'Average F1 score = {average_f1[i]:.3f}')
        print(f'Average Accuracy = {average_acc[i]:.3f}')
        print(f'Average Precision = {average_precision[i]:.3f}')
        print(f'Average Recall = {average_recall[i]:.3f}')

get_metrics(classifiers, X,y)

(2000, 784)
Classifier 1 - Bayes
(2000, 2)
(2000, 784)
Classifier 1 - Bayes
(2000, 2)
(2000, 784)
Classifier 1 - Bayes
(2000, 2)
(2000, 784)
Classifier 1 - Bayes
(2000, 2)
(2000, 784)
Classifier 1 - Bayes
(2000, 2)
############ Classifier 1 - Bayes:
Average F1 score = 0.228
Average Accuracy = 0.228
Average Precision = 0.228
Average Recall = 0.228


(array([[-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        [-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        [-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        ...,
        [-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        [-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        [-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531]]),
 array([0, 1, 0, ..., 4, 3, 1], dtype=uint8),
 array([[-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        [-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        [-0.0148165 , -0.03760239, -0.06245122, ..., -0.14119802,
         -0.06592056, -0.02908531],
        ...,
        

In [None]:
# Normal feature selection
print("Running without feature selection")
x_train, y_train, x_val, y_val = get_metrics(classifiers, x, y, patient_id)

# Feature selection with variance threshold
print("Running with variance threshold feature selection")
threshold_value = 0.1
selector = VarianceThreshold(threshold=threshold_value)
x_selected = selector.fit_transform(x)
get_metrics(classifiers, x_selected, y, patient_id, "With variance threshold")

# Feature selection with PCA
print("Running with PCA feature selection")
pca_transformer = PCA(n_components=5)
x_pca = pca_transformer.fit_transform(x)
get_metrics(classifiers, x_pca, y, patient_id, "With PCA")

# Define the path where to save the file
pickle_path = 'Project-2_github_repo/Fixed/PROJECT/fyp2023/model_group02'

# Chosen classifier
classifier = KNN(n_neighbors=5)
classifier = classifier.fit(x, y)
filename = 'group02_classifier.sav'

# Check if the path exists, if not, create it
if not os.path.exists(pickle_path):
    os.makedirs(pickle_path)

pickle.dump(classifier, open(os.path.join(pickle_path, filename), 'wb'))


# Plot ROC curve for best classifier
clf = classifier
clf.fit(x_train, y_train)
y_score = clf.predict_proba(x_val)
fpr, tpr, _ = roc_curve(y_val, y_score[:, 1])
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("Receiver Operating Characteristic")
plt.legend(loc="lower right")
plt.show()