# Workflow for Applying and Comparing Classifiers

The code below demonstrates a practical workflow for:
1. Applying and comparing multiple classifiers.
2. Analyzing feature importance.
3. Using the best-performing model to classify unlabeled data.

## Classifiers to Implement
The following classifiers will be tested:
1. Logistic Regression
2. Support Vector Machine (SVM)
3. Random Forest
4. Gradient Boosting
5. Multilayer Perceptron (MLP)

## Steps to Implement
- Preprocess the data (handle missing values, scale features, encode labels).
- Train and evaluate each classifier.
- Use visualizations to show performance changes with increasing complexity.
- Analyze feature importance using the impurity-based importances of the tuned Random Forest.
- Choose the most suitable classifier for the task and justify the choice.


In [None]:
#Import necessary modules and packages

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import seaborn as sb

In [None]:
# Load the labelled dataset.
tr_performance = pd.read_csv("data-performance.csv")

# Right off the bat, weight (and age?) seems to be the defining factor.

# Names of the first 11 columns, used as feature labels in later plots.
var_names = tr_performance.columns.to_numpy()[0:11]

# Encode gender numerically (F -> 0.0, M -> 1.0) and assign the result back to
# the frame. The previous `df['gender'].replace(..., inplace=True)` form is
# chained assignment: it is deprecated in pandas 2.x and silently leaves the
# frame unchanged under Copy-on-Write. It also stored the *strings*
# '0.0'/'1.0', which kept the column (and the feature matrix) as object dtype.
# NOTE: any value other than 'F'/'M' becomes NaN here.
tr_performance['gender'] = tr_performance['gender'].map({'F': 0.0, 'M': 1.0})

In [None]:
#Train models and evaluate performance (Optimized parameters below have been fed into this cell)

# Split data: the first 11 columns are the features, column 11 is the target.
# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
X = tr_performance.iloc[:,0:11].to_numpy()
y = tr_performance.iloc[:,11].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y, random_state=42)

# Scale: fit the scaler on the training split ONLY and reuse it for the test
# split. Fitting a second scaler on the test data (as was done before) scales
# the two splits with different means/variances and leaks test-set statistics
# into the evaluation.
scalerX_train = preprocessing.StandardScaler().fit(X_train)
X_train_s = scalerX_train.transform(X_train)
X_test_s = scalerX_train.transform(X_test)

# Kept so that any code still referring to `scalerX_test` keeps working; it is
# deliberately the training-set scaler.
scalerX_test = scalerX_train

# Logistic Regression
logreg = LogisticRegression(max_iter=100000, C=100).fit(X_train_s, y_train)
print("Logreg Training set score: {:.2f}".format(logreg.score(X_train_s, y_train)))
print(" Logreg Test set score: {:.2f}".format(logreg.score(X_test_s, y_test)))

lg_pred = logreg.predict(X_test_s)
print(classification_report(y_test, lg_pred))

# Random Forest
forest = RandomForestClassifier(n_estimators=500, random_state=0, max_depth = 15, max_features= None, min_samples_leaf =  1, min_samples_split = 10)
forest.fit(X_train_s, y_train)
print(" RF Accuracy on training set: {:.2f}".format(forest.score(X_train_s, y_train)))
print(" RF Accuracy on test set: {:.2f}".format(forest.score(X_test_s, y_test)))

rf_pred = forest.predict(X_test_s)
print(classification_report(y_test, rf_pred))


# Gradient Boosting
gbrt = GradientBoostingClassifier(random_state=0, learning_rate = 1)
gbrt.fit(X_train_s, y_train)
print("GB Accuracy on training set: {:.2f}".format(gbrt.score(X_train_s, y_train)))
print(" GB Accuracy on test set: {:.2f}".format(gbrt.score(X_test_s, y_test)))
gb_pred = gbrt.predict(X_test_s)
print(classification_report(y_test, gb_pred))


# Support Vector Classifier
# NOTE: `degree` only affects the 'poly' kernel, so it is ignored with 'rbf';
# it is left in place to mirror the grid-search result.
svc = SVC(kernel='rbf', C=10, gamma = 'auto', degree = 2)
svc.fit(X_train_s, y_train)
print("SVC Accuracy on training set: {:.2f}".format(svc.score(X_train_s, y_train)))
print("SVC Accuracy on test set: {:.2f}".format(svc.score(X_test_s, y_test)))
svc_pred = svc.predict(X_test_s)
print(classification_report(y_test, svc_pred))


# Multi-layer Perceptron
mlp = MLPClassifier(random_state=42, max_iter=1000, hidden_layer_sizes=[50, 50], alpha = 0.01, learning_rate_init = 0.001)
mlp.fit(X_train_s, y_train)
print("MLP Accuracy on training set: {:.2f}".format(mlp.score(X_train_s, y_train)))
print("MLP Accuracy on test set: {:.2f}".format(mlp.score(X_test_s, y_test)))
mlp_pred = mlp.predict(X_test_s)
print(classification_report(y_test, mlp_pred))

In [None]:
#Cross-validation
from sklearn.pipeline import make_pipeline

# NOTE(review): this cell reads the module-level X and y from the training
# cell; a later cell rebinds both names, so run this one before it.


def _cross_validate_scaled(model):
    """Run 5-fold cross-validation of `model` on (X, y) with in-fold scaling.

    The estimator is wrapped in a pipeline with a StandardScaler so the
    scaler is refitted on the training part of every fold. The models were
    tuned on standardised features, so cross-validating them on raw `X`
    (as was done before) gives scores that are not comparable with the
    hold-out results. `cross_validate` clones the estimator, so the fitted
    model passed in is left untouched.

    Returns the dict produced by `cross_validate` (fit/score times plus
    per-fold train and test scores).
    """
    scaled_model = make_pipeline(preprocessing.StandardScaler(), model)
    return cross_validate(scaled_model, X, y, cv=5, return_train_score=True)


# One cross-validation run per model. The per-fold test scores are read from
# the same result instead of repeating the whole procedure with
# cross_val_score, which halves the runtime of this cell.
res_lg = _cross_validate_scaled(logreg)
res_rf = _cross_validate_scaled(forest)
res_gb = _cross_validate_scaled(gbrt)
res_svc = _cross_validate_scaled(svc)
res_mlp = _cross_validate_scaled(mlp)

scores_lg = res_lg['test_score']
scores_rf = res_rf['test_score']
scores_gb = res_gb['test_score']
scores_svc = res_svc['test_score']
scores_mlp = res_mlp['test_score']

# Output order: logistic regression, random forest, gradient boosting, SVC, MLP.
for fold_scores in (scores_lg, scores_rf, scores_gb, scores_svc, scores_mlp):
    print("Cross-validation scores: {}".format(fold_scores))

for fold_scores in (scores_lg, scores_rf, scores_gb, scores_svc, scores_mlp):
    print("Average cross-validation score: {:.2f}".format(fold_scores.mean()))

# Full per-fold tables (timings, train and test scores) for the report.
res_lg_df = pd.DataFrame(res_lg)
res_rf_df = pd.DataFrame(res_rf)
res_gb_df = pd.DataFrame(res_gb)
res_svc_df = pd.DataFrame(res_svc)
res_mlp_df = pd.DataFrame(res_mlp)

for result_table in (res_lg_df, res_rf_df, res_gb_df, res_svc_df, res_mlp_df):
    display(result_table)


In [None]:
# #Hyperparameter optimization
# #The code below is commented out because the grid searches take hours to run. Un-comment it to re-run the search if required.

# #Logistic Regression
# param_grid_lg = {
#     'C': [0.01, 0.1, 1, 10, 100, 1000]
# }

# grid_search_lg = GridSearchCV(LogisticRegression(random_state=42, max_iter=10000), param_grid_lg, cv=5)
# grid_search_lg.fit(X_train_s, y_train)
# print("Best parameters:", grid_search_lg.best_params_)
# print("Best cross-validation score:", grid_search_lg.best_score_)

# #Gradient Boosting
# param_grid_gb = {
#     'learning_rate': [0.01, 0.1, 1, 10, 100, 1000],
#     'n_estimators': [100, 500, 1000],
#      'max_depth': [1,3, 5, 7],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 5],
#      'max_features': ['sqrt', 'log2', None]
# }

# grid_search_gb = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid_gb, cv=5)
# grid_search_gb.fit(X_train_s, y_train)
# print("Best parameters:", grid_search_gb.best_params_)
# print("Best cross-validation score:", grid_search_gb.best_score_)

# #SVM
# param_grid_svm = {
#     'C': [0.1, 1, 10, 100],  
#     'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  
#     'gamma': ['scale', 'auto'],  
#     'degree': [2, 3, 4]  
# }

# grid_search_svm = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=5)
# grid_search_svm.fit(X_train_s, y_train)
# print("Best parameters:", grid_search_svm.best_params_)
# print("Best cross-validation score:", grid_search_svm.best_score_)


# #MLP
# param_grid_mlp = {
#     'hidden_layer_sizes': [(50,50), (80, 80), (100, 100), (50,50,50), (80, 80,80), (100, 100,100)],
#     'alpha': [0.0001, 0.001, 0.01],
#     'learning_rate_init': [0.001, 0.01],
# }

# grid_search_mlp = GridSearchCV(MLPClassifier(random_state=42, max_iter=1000), param_grid_mlp, cv=5)
# grid_search_mlp.fit(X_train_s, y_train)
# print("Best parameters:", grid_search_mlp.best_params_)
# print("Best cross-validation score:", grid_search_mlp.best_score_)

# #Random Forest
# param_grid_rf = {
#     'n_estimators': [100, 500, 1000],
#     'max_depth': [5, 10, 15],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 5],
#     'max_features': ['sqrt', 'log2', None]
# }

# grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=0), param_grid_rf, cv=5)
# grid_search_rf.fit(X_train_s, y_train)
# print("Best parameters:", grid_search_rf.best_params_)
# print("Best cross-validation score:", grid_search_rf.best_score_)

In [None]:
#Feature importance

def plot_feature_importances_rf(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    The number of bars is taken from the model itself rather than from the
    global training matrix, so the function also works for models trained on
    a different set of columns.

    Parameters
    ----------
    model
        Fitted estimator exposing a ``feature_importances_`` attribute.
    feature_names : sequence of str, optional
        One label per feature, in the order the model was trained on.
        Defaults to the module-level ``var_names``.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    importances = model.feature_importances_
    n_features = len(importances)
    if feature_names is None:
        feature_names = var_names
    if len(feature_names) != n_features:
        raise ValueError(
            "Got {} feature names for a model with {} features".format(
                len(feature_names), n_features
            )
        )
    positions = np.arange(n_features)
    plt.barh(positions, importances, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
# Chart the tuned random forest's importances, then print the raw values.
plot_feature_importances_rf(model=forest)
print(forest.feature_importances_)

In [None]:
#Feature Selection

# Retrain the tuned random forest with the 'age' column removed and compare
# its accuracy with the full-feature model.
new_performance = tr_performance.drop(['age'], axis=1)
print(new_performance.head())

# After the drop, the first 10 columns are features and column 10 is the target.
X_new = new_performance.iloc[:,0:10].to_numpy()
y_new = new_performance.iloc[:,10].to_numpy()

print(X_new.shape)
print(y_new)

# NOTE(review): this split holds out 20% while the baseline models were
# evaluated on a 30% hold-out, so the scores are not strictly like-for-like.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size = 0.2, stratify=y_new, random_state=42)

# Fit the scaler on the training split only and reuse it for the test split;
# a scaler fitted on the test data would standardise the two splits
# differently and leak test-set statistics into the evaluation.
scalerX_train_new = preprocessing.StandardScaler().fit(X_train_new)
X_train_s_new = scalerX_train_new.transform(X_train_new)
X_test_s_new = scalerX_train_new.transform(X_test_new)

# Kept so that any code still referring to `scalerX_test_new` keeps working;
# it is deliberately the training-set scaler.
scalerX_test_new = scalerX_train_new

# Same hyperparameters as the full-feature random forest.
forest_new = RandomForestClassifier(n_estimators=500, random_state=0, max_depth = 15, max_features= None, min_samples_leaf =  1, min_samples_split = 10)
forest_new.fit(X_train_s_new, y_train_new)
print("Accuracy on training set: {:.3f}".format(forest_new.score(X_train_s_new, y_train_new)))
print("Accuracy on test set: {:.3f}".format(forest_new.score(X_test_s_new, y_test_new)))

#Dropping different features made no improvements to test scores, it lowered them.

In [None]:
#Graphs and Charts for the report

# Bar chart of how many rows carry each class label (column at position 11),
# with the bars drawn in a fixed A-D order.
custom_order = ['A', 'B', 'C', 'D']
x_bar = tr_performance.iloc[:, 11]
df_classes = x_bar.to_frame(name='Class')

count_ax = sb.countplot(x='Class', data=df_classes, order=custom_order, palette='PuBuGn')
count_ax.set_title("Number of Individuals in Each Class", fontsize=12)
count_ax.set_xlabel("Class", fontsize=10)
count_ax.set_ylabel("Number of Individuals", fontsize=10);

In [None]:
# Share of rows per class, expressed as a percentage of all rows.
class_counts = df_classes['Class'].value_counts(normalize=True) * 100

# Two-column table (Class, Percentages) built straight from the Series. This
# replaces the earlier detour through two one-column frames and an index
# merge, and drops a `.head()` call whose result was never shown.
class_percentages = class_counts.rename_axis('Class').reset_index(name='Percentages')

sb.barplot(data=class_percentages, x='Class', y='Percentages', palette='PuBuGn', order=custom_order)
plt.title("Percentage of Individuals in Each Class", fontsize=12)
plt.xlabel("Class", fontsize=10)
plt.ylabel("Percentage (%)", fontsize=10);


In [None]:
# Grid of pairwise plots over the dataset's columns, coloured by the 'class' column.
sb.pairplot(data=tr_performance, hue="class");

In [None]:
#PCA analysis visualization

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Work on a fresh copy of the labelled data, read straight from disk.
data = pd.read_csv('data-performance.csv')
print(data.head())

# Integer-encode the two categorical columns; the class codes go into a new
# column so the original labels stay available.
le_gender, le_class = LabelEncoder(), LabelEncoder()
data['gender'] = le_gender.fit_transform(data['gender'])
data['class_encoded'] = le_class.fit_transform(data['class'])

# Target = encoded class; features = every remaining column.
y = data['class_encoded']
X = data.drop(columns=['class', 'class_encoded'])
print(y.head())

# Standardise the features before projecting them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Project onto the first two principal components.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Scatter of the two components, coloured by encoded class.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=y,
    alpha=0.6,
    cmap='viridis',
)
plt.colorbar(scatter, label='Class', ticks=range(len(le_class.classes_)))
plt.gca().set(
    xlabel='PCA Component 1',
    ylabel='PCA Component 2',
    title='PCA Visualization of Class Clusters',
)
plt.grid(alpha=0.3)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Compare unsupervised k-means clusters with the true class labels on a 2-D
# t-SNE embedding of the scaled training features.

# Integer-encode the training labels so they can drive a colour map
# (e.g. ['A', 'B', 'C', 'D'] -> [0, 1, 2, 3]).
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Perform k-Means clustering with one cluster per distinct class.
n_clusters = len(np.unique(y_train_encoded))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_labels = kmeans.fit_predict(X_train_s)

# Initialize t-SNE.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# switch the keyword if this line warns or fails on your installed version.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=300)
X_tsne = tsne.fit_transform(X_train_s)

# Plot the t-SNE embedding coloured by k-Means cluster. The colorbar ticks
# come from this cell's own `n_clusters` instead of an encoder that is only
# defined in another cell.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=kmeans_labels, cmap='viridis', s=20)
plt.colorbar(scatter, ticks=range(n_clusters), label='k-Means Cluster')
plt.title("t-SNE Visualization of k-Means Clusters", fontsize=14)
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

# Same embedding coloured by the true (encoded) labels, for comparison.
plt.figure(figsize=(10, 6))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_train_encoded, cmap='viridis', s=20)
plt.colorbar(scatter, ticks=range(len(label_encoder.classes_)), label='True Labels')
plt.title("t-SNE Visualization of True Labels", fontsize=14)
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()




In [None]:
#Predicting on unlabelled data

unlabelled_data = pd.read_csv('data-performance-test-unlabelled.csv')

# Names of the first 11 columns.
# NOTE(review): assumes the unlabelled file lists the same 11 feature columns,
# in the same order, as the training data - confirm against the CSV.
var_names = unlabelled_data.columns.to_numpy()[0:11]

# Encode gender numerically (F -> 0.0, M -> 1.0) and assign the result back;
# the chained `replace(..., inplace=True)` form is deprecated in pandas 2.x
# and has no effect under Copy-on-Write. Other values become NaN.
unlabelled_data['gender'] = unlabelled_data['gender'].map({'F': 0.0, 'M': 1.0})

# Take an independent copy of the feature columns, so that adding the
# prediction column to `unlabelled_data` below cannot alter the model input.
X_unlabelled = unlabelled_data.iloc[:, 0:11].copy()

# Scale with the scaler fitted on the training split: that is the
# transformation `forest` was trained on. The scaler named `scaler` belongs to
# the PCA cell and was fitted on the full labelled dataset, so using it here
# would feed the model differently-scaled inputs. A plain array is passed
# because `scalerX_train` was fitted on an array without column names.
X_unlabelled_scaled = scalerX_train.transform(X_unlabelled.to_numpy())

predictions = forest.predict(X_unlabelled_scaled)

# Attach the predictions as a new last column and save the result.
unlabelled_data['Predicted Class'] = predictions

print(unlabelled_data.head())

unlabelled_data.to_csv('predicted_data.csv', index=False)



In [None]:
#Graph for report

# Bar chart of how many rows carry each value of the column at position 11,
# with the bars drawn in a fixed A-D order.
custom_order = ['A', 'B', 'C', 'D']
x_bar_pred = unlabelled_data.iloc[:, 11]
df_classes_pred = x_bar_pred.to_frame(name='Class')

pred_ax = sb.countplot(x='Class', data=df_classes_pred, order=custom_order, palette='PuBuGn')
pred_ax.set_title("Number of Individuals in Each Class", fontsize=12)
pred_ax.set_xlabel("Class", fontsize=10)
pred_ax.set_ylabel("Number of Individuals", fontsize=10);