In [None]:
# Importing required modules.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
keras = tf.keras
layers = tf.keras.layers
from sklearn import feature_selection, impute, preprocessing, model_selection, linear_model, ensemble, svm, metrics, neighbors
from IPython.display import Image

# MLP Classifier (Main Code)

## Data Preprocessing

In [None]:
# Reading the training CSV file into a Pandas dataframe.
# The path is relative, so 'training2.csv' must be located in the notebook's working directory.
data = pd.read_csv('training2.csv')

In [None]:
# Pulling the model inputs and targets out of the dataframe as NumPy arrays.

## CaffeNet CNN feature matrix: the first 2048 columns of every training sample.
features_cnn = data.iloc[:, :2048].to_numpy()

## class labels: the second-to-last column.
labels = data.iloc[:, -2].to_numpy()

## confidence labels: the last column.
conf = data.iloc[:, -1].to_numpy()

In [None]:
# Filling in missing CNN feature values with a k-nearest-neighbours imputer (default settings).
# The fitted imputer is kept so the same imputation can be applied to other data later.
imputer = impute.KNNImputer()
imputer.fit(features_cnn)
X_train = imputer.transform(features_cnn)

In [None]:
# Normalizing features by scaling them to the standard normal distribution (mean 0, variance 1).
scaler = preprocessing.StandardScaler()

fcnn_norm = scaler.fit_transform(features_cnn)

In [None]:
# Splitting the dataset into training and validation sets, with 80% of the samples being training data and 20% being validation data.
fcnn_train = X_train[:-500]
labels_train = labels[:-500]
conf_train = conf[:-500]

fcnn_val = X_train[-500:]
labels_val = labels[-500:]
conf_val = conf[-500:]

# Generating TensorFlow datasets for training and validation. 
train_dataset = tf.data.Dataset.from_tensor_slices((fcnn_train, labels_train, conf_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(64)

val_dataset = tf.data.Dataset.from_tensor_slices((fcnn_val, labels_val, conf_val))
val_dataset = val_dataset.batch(64)

## Building and Training the Model

In [None]:
# Building the MLP classifier using the Keras sequential API.
def build_model():
    """Assemble the (uncompiled) MLP classifier.

    Architecture, in order: a 128-unit ReLU dense layer reading 2048 input
    features, batch normalization, dropout with rate 0.4, and a single
    sigmoid output unit. Both dense layers carry an L2 (0.01) activity
    regularizer.

    Returns:
        A `keras.Sequential` model that still has to be compiled.
    """
    hidden = layers.Dense(128,
                          input_dim = 2048,
                          activation = 'relu',
                          activity_regularizer = tf.keras.regularizers.L2(0.01))
    batch_norm = layers.BatchNormalization()
    dropout = layers.Dropout(0.4)
    # NOTE(review): an activity regularizer on the sigmoid output penalizes the predicted
    # probabilities themselves - confirm this is intended rather than a kernel regularizer.
    output = layers.Dense(1,
                          activation = 'sigmoid',
                          activity_regularizer = tf.keras.regularizers.L2(0.01))

    return keras.Sequential([hidden, batch_norm, dropout, output])

mlp_model = build_model()

# The datasets yield (features, label, confidence) triples, so the third element is passed
# to Keras as per-sample weights; accuracy is therefore reported as a weighted metric.
mlp_model.compile(loss = 'binary_crossentropy',
                  optimizer = keras.optimizers.Adam(learning_rate=0.001),
                  weighted_metrics = ['accuracy'])

# EarlyStopping callback when validation loss stops decreasing, to prevent overfitting.
# restore_best_weights rolls the model back to the epoch with the lowest validation loss;
# without it the model keeps the weights from `patience` epochs after the best one.
callback = keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                         patience = 3,
                                         mode = 'min',
                                         restore_best_weights = True)

mlp_model.fit(train_dataset,
              epochs = 20,
              validation_data = val_dataset,
              callbacks = [callback])

In [None]:
# Rendering the model architecture (with layer shapes) to a PNG file and displaying it inline.
diagram_path = 'mlp_model.png'
keras.utils.plot_model(mlp_model, to_file=diagram_path, show_shapes=True)
Image(filename=diagram_path, retina=True)

## Predicting Test Data

In [None]:
# Loading the test dataset CSV into a Pandas dataframe.
test_data = pd.read_csv('test.csv')

# Keeping only the first 2048 columns (the CaffeNet CNN features) as a NumPy array.
X_test = test_data.iloc[:, :2048].to_numpy()

In [None]:
# Imputing missing test feature values and standardizing them with the imputer and scaler
# that were already fitted on the training data. Only transform() is called here: re-fitting
# on the test set would derive the imputation neighbours and the scaling statistics from the
# test data instead of the training data.
X_test_imputed = imputer.transform(X_test)
X_test_norm = scaler.transform(X_test_imputed)

In [None]:
# Running the trained model on the test features, then rounding each output to the
# nearest integer (np.rint) so that the predictions become 1s and 0s.
probabilities = mlp_model.predict(X_test_norm)
predictions = np.rint(probabilities)

In [None]:
# Wrapping the predicted labels in a Pandas dataframe.
labels_df = pd.DataFrame(data=predictions)

# Exporting the predictions to a CSV file for submission.
labels_df.to_csv(path_or_buf='predictions1.csv')

# Appendix (Auxiliary Code)

* The code in this appendix was not used to produce the final predictions.

# Data Preprocessing

In [None]:
# Pulling the GIST features, class labels, and confidence labels out of the training dataframe as NumPy arrays.

## GIST feature matrix: every column after the first 2048 (CNN) columns, excluding the last two.
features_gist = data.iloc[:, 2048:-2].to_numpy()

## class labels: the second-to-last column.
labels = data.iloc[:, -2].to_numpy()

## confidence labels: the last column.
conf = data.iloc[:, -1].to_numpy()

In [None]:
# Filling in missing GIST feature values with a k-nearest-neighbours imputer.
# Note: this rebinds the name `imputer` to a new imputer fitted on the GIST features.

imputer = impute.KNNImputer()
imputer.fit(features_gist)

gist_imputed = imputer.transform(features_gist)

In [None]:
# Standardizing the imputed GIST features to zero mean and unit variance.
# Note: this rebinds the name `scaler` to a new scaler fitted on the GIST features.

scaler = preprocessing.StandardScaler().fit(gist_imputed)

gist_norm = scaler.transform(gist_imputed)

# Univariate Feature Selection

In [None]:
# Scoring every GIST feature with the ANOVA F-test via SelectKBest.
# k = 'all' keeps every feature, so the selector is used here only to compute the scores.

selector = feature_selection.SelectKBest(score_func = feature_selection.f_classif, k = 'all')
selector.fit(gist_norm, labels)
selected_features = selector.transform(gist_norm)

# Ordering the F-scores from highest to lowest.

sorted_scores = np.sort(selector.scores_)
sorted_scores_descending = np.flip(sorted_scores)

In [None]:
# Charting the F-score of each feature, in original feature order.

fig, ax = plt.subplots()
ax.plot(selector.scores_)
ax.set_xlabel("Feature")
ax.set_ylabel("F score")
plt.show()

In [None]:
# Charting the F-scores from highest to lowest; the red vertical line marks the feature selection cutoff (80 features).

fig, ax = plt.subplots()
ax.plot(sorted_scores_descending)
ax.axvline(x = 80, color = 'r')
ax.set_xlabel("Feature")
ax.set_ylabel("F score")
plt.show()

In [None]:
# Obtaining the indexes of the 80 features with the highest F-scores
# (argsort on the negated scores ranks them in descending order).
f_score_indexes = (-selector.scores_).argsort()[:80]
f_score_indexes.sort() # Restoring the original column order of the selected features.

selected_gist = gist_norm[:,f_score_indexes] # Obtaining an array of selected features.

# Reporting the shape of the array that was actually reduced (selected_gist).
# `selected_features` comes from SelectKBest with k = 'all' and still holds every column.
print(f'Before feature selection, training data had shape: {gist_norm.shape}. After feature selection, training data has shape: {selected_gist.shape}.')

# Algorithm Selection

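The candidate algorithms are compared using nested cross-validation: 5 outer folds to estimate performance and 2 inner folds for hyperparameter tuning.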

In [None]:
# Candidate classifiers; a fixed random_state is set wherever the estimator accepts one.

clf_svm = svm.SVC(random_state = 0)
clf_rf = ensemble.RandomForestClassifier(random_state = 0)
clf_lr = linear_model.LogisticRegression(random_state = 0)
clf_knn = neighbors.KNeighborsClassifier()

# Hyperparameter grids to be searched for each classifier.

c_values = 10.0 ** np.arange(-4, 4)     # 1e-4 ... 1e3, one value per decade
gamma_values = 10.0 ** np.arange(-4, 0) # 1e-4 ... 1e-1, one value per decade

param_grid_svm = {
    'kernel' : ['rbf', 'poly', 'sigmoid'],
    'C' : c_values,
    'gamma' : gamma_values,
}

param_grid_rf = {
    'criterion' : ['gini', 'log_loss'],
    'n_estimators' : [10, 100, 200, 500, 1000, 5000, 10000],
}

param_grid_lr = {
    'solver' : ['lbfgs', 'saga', 'liblinear'],
    'C' : c_values,
}

param_grid_knn = {
    'n_neighbors' : [1, 2, 5],
}

In [None]:
# Setting up one grid search per classifier; these form the inner loop of the nested CV
# and are only configured here, not fitted.

# Inner folds: stratified 2-fold split with a fixed seed.
inner_fold = model_selection.StratifiedKFold(n_splits = 2,
                                             shuffle = True,
                                             random_state = 0)

# (display name, classifier, hyperparameter grid) for every candidate algorithm.
search_specs = (
    ('Support Vector Machine', clf_svm, param_grid_svm),
    ('Random Forest', clf_rf, param_grid_rf),
    ('Logistic Regression', clf_lr, param_grid_lr),
    ('K Nearest Neighbors', clf_knn, param_grid_knn),
)

# Maps each model name to its accuracy-scored GridSearchCV object.
gridcv = {
    label: model_selection.GridSearchCV(estimator = classifier,
                                        param_grid = candidate_grid,
                                        scoring = 'accuracy',
                                        n_jobs = -1,
                                        cv = inner_fold,
                                        verbose = 0,
                                        refit = True)
    for label, classifier, candidate_grid in search_specs
}

# Outer folds: stratified 5-fold split with a fixed seed.
outer_fold = model_selection.StratifiedKFold(n_splits = 5,
                                             shuffle = True,
                                             random_state = 0)

In [None]:
# Performing the outer 5-fold cross-validation of the algorithm selection process.
# Each grid search is re-fitted on every outer training fold (tuning its hyperparameters on
# the inner folds) and then scored on the corresponding outer test fold.

for name, model in gridcv.items():

    scores = model_selection.cross_validate(model,
                                            X = selected_gist,
                                            y = labels,
                                            cv = outer_fold,
                                            return_estimator = True, # Keeps the fitted grid search of every outer fold.
                                            n_jobs = -1)

    print('------------------------------------------------\n')
    print(f'Model Algorithm: {name}')
    print('Inner Fold:')

    # One fitted grid search and one test score per outer fold.
    for fitted_search, outer_score in zip(scores['estimator'], scores['test_score']):

        print('\n        Best ACC (avg. of inner test folds) %.2f%%' % (fitted_search.best_score_ * 100))
        # Printing the winning hyperparameter combination rather than the estimator object.
        print('        Best parameters:', fitted_search.best_params_)
        print('        ACC (on outer test fold) %.2f%%' % (outer_score * 100))

    print('\n%s | outer ACC %.2f%% +/- %.2f' %
          (name, scores['test_score'].mean() * 100,
           scores['test_score'].std() * 100))