# Machine Learning | Multi-Class Classification Project

In [None]:
# Imports
import sqlite3 as db

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Open the SQLite database that holds both the labelled and the unlabelled table
cnx = db.connect('../data/data.sqlite')

# Load both tables into pandas DataFrames. The connection is not used after this
# cell, so it is closed here even if one of the queries fails.
try:
    df_train = pd.read_sql_query("SELECT * FROM train", cnx)
    df_unseen = pd.read_sql_query("SELECT * FROM test", cnx)
finally:
    cnx.close()

### Data Exploration

In [None]:
# Preview the first rows of the training data (head() shows 5 rows by default)
df_train.head()

In [None]:
# Preview the first rows of the unseen data (head() shows 5 rows by default)
df_unseen.head()

In [None]:
# Column names, non-null counts and dtypes of the training data
df_train.info()

In [None]:
# Column names, non-null counts and dtypes of the unseen data
df_unseen.info()

The unseen dataset already has a `class` column, but all of its rows are empty: these are the values we need to predict. We should therefore drop this column before running the data through our classifier.

We will now focus on the train dataset

In [None]:
# Summary statistics of the training data
df_train.describe()

In [None]:
# Number of missing entries in every column of df_train
missing_values = df_train.isnull().sum()

# The same counts expressed as a percentage of all rows
missing_percentage = (missing_values / len(df_train)) * 100

# Put count and percentage side by side, one row per column of df_train
missing_data = pd.concat(
    [missing_values, missing_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage'],
)

print(missing_data)


In [None]:
# Number of missing entries in every column of df_unseen
missing_values = df_unseen.isnull().sum()

# The same counts expressed as a percentage of all rows
missing_percentage = (missing_values / len(df_unseen)) * 100

# Put count and percentage side by side, one row per column of df_unseen
missing_data = pd.concat(
    [missing_values, missing_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage'],
)

print(missing_data)


In [None]:
# Count rows of df_train that are exact repeats of an earlier row
print(f"Duplicate Rows: {df_train.duplicated().sum()}")

In [None]:
# Count duplicated columns: transposing turns columns into rows so duplicated() can compare them
print(f"Duplicate Features: {df_train.T.duplicated().sum()}")

In [None]:
## Value frequencies of every object-typed (non-numeric) column, separated by a rule
object_columns = df_train.select_dtypes(include="object").columns
for column_name in object_columns:
    frequencies = df_train[column_name].value_counts()
    print(frequencies)
    print("***"*10)

#### Exploration of Numeric values

In [None]:
# Distribution of every numeric column

# numeric_cols is reused by later cells
numeric_cols = df_train.select_dtypes(include='number').columns

# One histogram per numeric column, 15 bins each
numeric_frame = df_train[numeric_cols]
numeric_frame.hist(figsize=(15, 10), bins=15)
plt.tight_layout()
plt.show()

In [None]:
## One box plot per numeric column to spot outliers
# Three rows; enough columns in the grid to hold every plot
grid_rows = 3
grid_cols = (len(numeric_cols) + 2) // 3

plt.figure(figsize=(15, 10))
for position, col in enumerate(numeric_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.boxplot(y=df_train[col])
    plt.title(col)
plt.tight_layout()
plt.show()


In [None]:
# Object-typed columns are treated as the categorical variables
categorical_cols = df_train.select_dtypes(include='object').columns

# One count plot per categorical column, laid out on a three-row grid
grid_rows = 3
grid_cols = (len(categorical_cols) + 2) // 3

plt.figure(figsize=(15, 10))
for position, col in enumerate(categorical_cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.countplot(y=df_train[col])
    plt.title(col)
plt.tight_layout()
plt.show()

In [None]:
## Heatmap of the null mask: every highlighted cell is a missing value
null_mask = df_train.isnull()

plt.figure(figsize=(12, 6))
sns.heatmap(null_mask, cmap='viridis', cbar=False)
plt.title('Missing Values Heatmap')
plt.show()


In [None]:

## Feature correlation | Pearson correlation
# The three string-valued columns are removed before correlating
df_train_numeric = df_train.drop(columns=['Storage', 'Music', 'Guitar'])
correlation_matrix = df_train_numeric.corr()

plt.figure(figsize=(25, 20))
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu")

### Observations

We have **31 columns**, several of which have many missing values, and some of which are non-numeric.

* *Music*, *Storage*,  *Guitar* columns need to be converted into numeric form
* Columns with missing values:
  * *Tennis* = 50 rows. Can be guessed
  * *Oven* = 1118 rows.  22.36% values missing. Can be guessed
  * *Office* = 3008 rows. 60.16% values missing. Better to discard column. 
* Big spread of data on some features. | Features are skewed
  * Needs to be normalised
* Some features have high correlation
  * Feature selection required
* Some features have high number of outliers
  * If accuracy is hampered, we may try using outlier analysis

### Data Preprocessing

#### Data Cleaning

In [None]:
# Remove 'Office' (mostly missing, see Observations) and 'index' from the training data
df_train_cleaned = df_train.drop(columns=['Office', 'index'])

# The unseen data loses the same columns plus its empty 'class' column
df_unseen_cleaned = df_unseen.drop(columns=['Office', 'index', 'class'])

In [None]:
# Target vector: the class column on its own
y = df_train_cleaned['class']
# Feature matrix: every remaining column
X = df_train_cleaned.drop('class', axis=1)

print("Features and labels separated.")

In [None]:
# Hold out 20% of the rows for testing; stratified on the label, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#### Encoding

In [None]:

# The three string-valued columns that are expanded into indicator columns
dummy_source_cols = ['Music', 'Storage', 'Guitar']

# One-hot encode the training split; its column layout is the reference for the others
X_train_encoded = pd.get_dummies(X_train, columns=dummy_source_cols)
reference_columns = X_train_encoded.columns

# Encode the test split, then force it onto the training layout
# (columns missing from the split are added and filled with 0s; this is a precaution)
X_test_encoded = pd.get_dummies(X_test, columns=dummy_source_cols).reindex(
    columns=reference_columns, fill_value=0
)

# Encode the unseen data and align it with the training layout in the same way
df_unseen_encoded = pd.get_dummies(df_unseen_cleaned, columns=dummy_source_cols).reindex(
    columns=reference_columns, fill_value=0
)

#### Imputation

In [None]:
# Median imputation: the medians are learned from the training split only
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train_encoded)

# Fill the gaps in all three data sets with the training medians
X_train_imputed = imputer.transform(X_train_encoded)
X_test_imputed = imputer.transform(X_test_encoded)
df_unseen_imputed = imputer.transform(df_unseen_encoded)

In [None]:
# How many training samples fall into each class
class_counts = np.unique(y_train, return_counts=True)
class_labels, class_sizes = class_counts
class_distribution = {label: size for label, size in zip(class_labels, class_sizes)}
print(class_distribution)

In [None]:
# How many test samples fall into each class
class_counts = np.unique(y_test, return_counts=True)
class_labels, class_sizes = class_counts
class_distribution = {label: size for label, size in zip(class_labels, class_sizes)}
print(class_distribution)

### Handle class imbalance on train set | Oversampling with SMOTE

In [None]:
# Applying SMOTE to handle class imbalance (only to the training set);
# the test split is left untouched.
# NOTE(review): the input already contains the one-hot indicator columns and has not
# been scaled yet - confirm that plain SMOTE (rather than a categorical-aware variant)
# is intended here.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed, y_train)

In [None]:
# Class sizes of the training set after SMOTE
class_counts = np.unique(y_train_resampled, return_counts=True)
class_labels, class_sizes = class_counts
class_distribution = {label: size for label, size in zip(class_labels, class_sizes)}
print(class_distribution)

In [None]:
# Class sizes of the test set once more,
# to confirm that resampling the training set did not touch it
class_counts = np.unique(y_test, return_counts=True)
class_labels, class_sizes = class_counts
class_distribution = {label: size for label, size in zip(class_labels, class_sizes)}
print(class_distribution)

### Feature Scaling

In [None]:

# Learn the scaling statistics from the resampled training data only
scaler = StandardScaler().fit(X_train_resampled)

# Apply that one fitted scaler to the training, test and unseen data
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test_imputed)
df_unseen_scaled = scaler.transform(df_unseen_imputed)


### Feature Selection

In [None]:
# Fit a random forest (default settings, fixed seed) on the scaled, resampled training
# data; it is only used to read feature importances from, not for prediction
rf_model_importance = RandomForestClassifier(random_state=42)
rf_model_importance.fit(X_train_scaled, y_train_resampled)


In [None]:
# ---- Feature Importance Ranking and Plotting with Original Names ---- #
def plot_feature_importances(feature_importances, feature_names, num_features=None):
    """Draw a horizontal bar chart of the most important features.

    Parameters
    ----------
    feature_importances : array-like of shape (n_features,)
        Importance score of every feature.
    feature_names : sequence of str
        Feature names, in the same order as ``feature_importances``.
    num_features : int, optional
        How many of the top features to show. ``None`` (the default) shows all
        of them; values outside ``[0, n_features]`` are clamped to that range.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Accept plain lists as well as arrays: the fancy indexing below needs an ndarray
    feature_importances = np.asarray(feature_importances)
    total_features = len(feature_importances)

    if num_features is None:
        num_features = total_features
    # Asking for more bars than there are features would give the bar positions and
    # the bar lengths different sizes, which matplotlib rejects
    num_features = max(0, min(int(num_features), total_features))

    # Indices of the features, most important first
    sorted_indices = np.argsort(feature_importances)[::-1]
    top_indices = sorted_indices[:num_features]

    plt.figure(figsize=(10, 8))
    plt.title(f"Top {num_features} Feature Importances")
    plt.barh(range(num_features), feature_importances[top_indices], align='center')
    plt.yticks(range(num_features), [feature_names[i] for i in top_indices])
    # barh draws the first entry at the bottom; flip so the most important is on top
    plt.gca().invert_yaxis()
    plt.xlabel('Importance')
    plt.show()

In [None]:
# Importance score of every feature, read from the fitted random forest
feature_importances = rf_model_importance.feature_importances_

# Column names after one-hot encoding; the imputed/scaled arrays carry no names,
# so these are taken from the encoded training frame
feature_names = X_train_encoded.columns

In [None]:
# Plot the importance of every feature (num_features equals the total feature count)
plot_feature_importances(feature_importances, feature_names, num_features=len(feature_importances))

In [None]:
# Selects the top N features based on feature importances and prints the selected features.

def select_top_n_features(X_train, X_test, feature_importances, feature_names, n):
    """Keep the ``n`` most important feature columns of a train/test pair.

    Features are ranked by descending importance and the names of the selected
    features are printed.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices whose column order matches ``feature_importances``
        and ``feature_names``.
    feature_importances : array-like of shape (n_features,)
        Importance score of every feature.
    feature_names : sequence of str
        Feature names; a list, NumPy array or pandas Index all work.
    n : int
        Number of top features to keep. Must be at least 1; a value larger than
        the number of features keeps all of them.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_top_n, X_test_top_n)``, each reduced to the selected columns,
        ordered from most to least important.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    # A non-positive n would slice from the wrong end (or select nothing at all)
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Coerce to arrays so that lists and DataFrames can be indexed positionally below
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    feature_names = np.asarray(feature_names)

    # Getting the top N feature indices, most important first
    sorted_indices = np.argsort(feature_importances)[::-1]
    top_n_indices = sorted_indices[:n]

    # Printing the selected feature names
    selected_features = feature_names[top_n_indices]
    print(f"Top {n} selected features:")
    for feature in selected_features:
        print(feature)

    # Select only the top N features for training and test sets
    X_train_top_n = X_train[:, top_n_indices]
    X_test_top_n = X_test[:, top_n_indices]

    return X_train_top_n, X_test_top_n


In [None]:
# Using a KNN classifier to check how the classifier behaves with an increasing
# number of features. This helps to decide how many features to keep.
# NOTE(review): the feature count is later chosen from the test-accuracy curve drawn
# here, so the test split influences model selection - consider a validation split.

# Test every feature count from 1 up to all features. The upper bound is derived
# from the data instead of being hard-coded, so it stays correct if the encoded
# feature set changes.
n_features_range = range(1, len(feature_importances) + 1)

# The hyperparameter grid is the same for every feature count, so define it once
param_grid_knn = {
    'n_neighbors': range(10, 51, 5),
    'weights': ['uniform'],
    'metric': ['euclidean', 'manhattan']
}

train_accuracies = []
test_accuracies = []

# Iterate over the range of feature counts
for n_features_to_select in n_features_range:
    # Select top N features for both train and test sets
    X_train_top_n, X_test_top_n = select_top_n_features(X_train_scaled, X_test_scaled, feature_importances, feature_names, n_features_to_select)

    # Initialize the KNN model
    knn = KNeighborsClassifier()

    # Perform grid search with 5-fold cross-validation
    grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=5, n_jobs=-1, verbose=1, return_train_score=True)
    grid_search_knn.fit(X_train_top_n, y_train_resampled)

    # Get the best KNN model from the grid search
    best_knn_feature_plotting = grid_search_knn.best_estimator_

    # Evaluate the model on the training and test sets and store the accuracies
    train_accuracy_knn = best_knn_feature_plotting.score(X_train_top_n, y_train_resampled)
    test_accuracy_knn = best_knn_feature_plotting.score(X_test_top_n, y_test)
    train_accuracies.append(train_accuracy_knn)
    test_accuracies.append(test_accuracy_knn)

# Plot the accuracy curves
plt.figure(figsize=(10, 6))
plt.plot(n_features_range, train_accuracies, label='Training Accuracy', marker='o', linestyle='-', color='blue')
plt.plot(n_features_range, test_accuracies, label='Test Accuracy', marker='o', linestyle='-', color='green')

plt.title('KNN Accuracy vs. Number of Features')
plt.xlabel('Number of Features')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.xticks(n_features_range)
plt.show()


It appears that when we select the top 12 features, we get the maximum test accuracy with minimal distance from the train accuracy. There is a big jump in test accuracy from 10 to 11. However, there is also a small jump from 11 to 12, while the train accuracy only increases slightly.

In [None]:
# Keep only the 12 most important features (the count picked from the accuracy curve);
# select_top_n_features also prints the names of the chosen features
n_features_to_select = 12
X_train_top_n, X_test_top_n = select_top_n_features(X_train_scaled, X_test_scaled, feature_importances, feature_names, n_features_to_select)

# X_train_top_n and X_test_top_n now hold only those 12 columns


### Model training and comparison

#### 1. KNN

In [None]:
# Search space for KNN: neighbourhood size 10..50 and two distance metrics
param_grid_knn = dict(
    n_neighbors=range(10, 51),
    weights=['uniform'],
    metric=['euclidean', 'manhattan'],
)

# 10-fold stratified cross-validation
stratified_kfold = StratifiedKFold(n_splits=10)

# Accuracy-scored exhaustive grid search over a fresh KNN estimator
knn_model = KNeighborsClassifier()
grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1,
)
grid_search_knn.fit(X_train_top_n, y_train_resampled)

# The estimator refitted with the winning parameter combination
best_knn_model = grid_search_knn.best_estimator_

# Predictions for the resampled training set and for the hold-out test set
y_train_pred_knn, y_test_pred_knn = (
    best_knn_model.predict(features) for features in (X_train_top_n, X_test_top_n)
)

# Report accuracy, per-class metrics and the confusion matrix
print("KNN Training Accuracy:", accuracy_score(y_train_resampled, y_train_pred_knn))
print("KNN Test Accuracy:", accuracy_score(y_test, y_test_pred_knn))
print("\nKNN Classification Report on Test Data:\n", classification_report(y_test, y_test_pred_knn))
print("\nKNN Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_knn))

# The hyperparameters the grid search settled on
print("Best hyperparameters for KNN:", grid_search_knn.best_params_)


#### 2. Naive Bayes

In [None]:
# Gaussian Naive Bayes has no hyperparameters tuned here: create and fit in one step
nb_model = GaussianNB().fit(X_train_top_n, y_train_resampled)

# Predictions for the resampled training set and for the hold-out test set
y_train_pred_nb, y_test_pred_nb = (
    nb_model.predict(features) for features in (X_train_top_n, X_test_top_n)
)

# Report accuracy, per-class metrics and the confusion matrix
print("Naive Bayes Training Accuracy:", accuracy_score(y_train_resampled, y_train_pred_nb))
print("Naive Bayes Test Accuracy:", accuracy_score(y_test, y_test_pred_nb))
print("\nNaive Bayes Classification Report on Test Data:\n", classification_report(y_test, y_test_pred_nb))
print("\nNaive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_nb))


#### 3. Decision Tree

In [None]:
# Search space for the decision tree; the second class_weight option
# puts a weight of 1.2 on class 0
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[10, 12, 14],
    min_samples_split=[10, 20],
    min_samples_leaf=[5, 6],
    class_weight=[None, {0: 1.2, 1: 1, 2: 1}],
)

# 10-fold stratified cross-validation
stratified_kfold = StratifiedKFold(n_splits=10)

# Accuracy-scored exhaustive grid search over a seeded decision tree
dt_model = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid_dt,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1,
)
grid_search_dt.fit(X_train_top_n, y_train_resampled)

# The estimator refitted with the winning parameter combination
best_dt_model = grid_search_dt.best_estimator_

# Predictions for the resampled training set and for the hold-out test set
y_train_pred_dt, y_test_pred_dt = (
    best_dt_model.predict(features) for features in (X_train_top_n, X_test_top_n)
)

# Report accuracy, per-class metrics and the confusion matrix
print("Decision Tree Training Accuracy:", accuracy_score(y_train_resampled, y_train_pred_dt))
print("Decision Tree Test Accuracy:", accuracy_score(y_test, y_test_pred_dt))
print("\nDecision Tree Classification Report on Test Data:\n", classification_report(y_test, y_test_pred_dt))
print("\nDecision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_dt))

# The hyperparameters the grid search settled on
print("Best hyperparameters for Decision Tree:", grid_search_dt.best_params_)


#### 4. Random Forest

In [None]:
# Search space for the random forest; class 0 carries a weight of 1.2
param_grid_rf = dict(
    n_estimators=[700, 800],
    criterion=['gini'],
    max_depth=[13, 14],
    min_samples_split=[20],
    min_samples_leaf=[5, 6],
    class_weight=[{0: 1.2, 1: 1, 2: 1}],
)

# 10-fold stratified cross-validation
stratified_kfold = StratifiedKFold(n_splits=10)

# Accuracy-scored exhaustive grid search over a seeded random forest
rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=stratified_kfold,
    n_jobs=-1,
)
grid_search_rf.fit(X_train_top_n, y_train_resampled)

# The estimator refitted with the winning parameter combination
best_rf_model = grid_search_rf.best_estimator_

# Predictions for the resampled training set and for the hold-out test set
y_train_pred_rf, y_test_pred_rf = (
    best_rf_model.predict(features) for features in (X_train_top_n, X_test_top_n)
)

# Report accuracy, per-class metrics and the confusion matrix
print("Random Forest Training Accuracy:", accuracy_score(y_train_resampled, y_train_pred_rf))
print("Random Forest Test Accuracy:", accuracy_score(y_test, y_test_pred_rf))
print("\nRandom Forest Classification Report on Test Data:\n", classification_report(y_test, y_test_pred_rf))
print("\nRandom Forest Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_rf))

# The hyperparameters the grid search settled on
print("Best hyperparameters for Random Forest:", grid_search_rf.best_params_)


#### 5. MLP Neural Network

In [None]:
# Define the MLP parameter grid.
# `momentum` and `learning_rate` are only used by MLPClassifier when solver='sgd'.
# Crossing them with 'adam' would fit every adam configuration four times with
# identical settings, so the grid is split per solver
# (180 adam + 720 sgd candidates instead of 1440).
mlp_shared_grid = {
    'hidden_layer_sizes': [(50,), (30, 30), (30, 15), (50, 25)],
    'activation': ['tanh', 'relu', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
    'learning_rate_init': [0.001, 0.0005, 0.0001]
}
param_grid_mlp = [
    {**mlp_shared_grid, 'solver': ['adam']},
    {
        **mlp_shared_grid,
        'solver': ['sgd'],
        'momentum': [0.9, 0.95],
        'learning_rate': ['constant', 'adaptive']
    }
]

# Create an MLP model with early stopping
mlp_model = MLPClassifier(max_iter=1500, random_state=42, early_stopping=True)

# 5 folds here (the other models use 10)
stratified_kfold = StratifiedKFold(n_splits=5)

# Perform grid search with SKF
grid_search_mlp = GridSearchCV(mlp_model, param_grid_mlp, cv=stratified_kfold, scoring='accuracy', n_jobs=-1)
grid_search_mlp.fit(X_train_top_n, y_train_resampled)

# Get the best model (refitted with the winning parameter combination)
best_mlp_model = grid_search_mlp.best_estimator_

# Predict on training and test data
y_train_pred_mlp = best_mlp_model.predict(X_train_top_n)
y_test_pred_mlp = best_mlp_model.predict(X_test_top_n)

# Evaluate performance
print("MLP Training Accuracy:", accuracy_score(y_train_resampled, y_train_pred_mlp))
print("MLP Test Accuracy:", accuracy_score(y_test, y_test_pred_mlp))
print("\nMLP Classification Report on Test Data:\n", classification_report(y_test, y_test_pred_mlp))
print("\nMLP Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_mlp))

# Output the chosen hyperparameters
print("Best hyperparameters for MLP:", grid_search_mlp.best_params_)


#### 6. SVM

In [None]:
# SVC, GridSearchCV and StratifiedKFold come from the import cell at the top of the notebook

# Defining the SVM parameter grid.
# `gamma` is a kernel coefficient that the linear kernel does not use, so it is only
# varied for the rbf kernel; this avoids fitting identical linear models twice.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto']}
]

# Creating an SVM model; probability=True because predict_proba is needed for the ROC curves
svm_model = SVC(probability=True, random_state=42)

# Defining Stratified K-Fold cross-validator
stratified_kfold = StratifiedKFold(n_splits=10)

# Performing grid search with SKF
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, cv=stratified_kfold, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train_top_n, y_train_resampled)

# Get the best model (refitted with the winning parameter combination)
best_svm_model = grid_search_svm.best_estimator_

# Predict on training and test data
y_train_pred_svm = best_svm_model.predict(X_train_top_n)
y_test_pred_svm = best_svm_model.predict(X_test_top_n)

# Evaluate performance
print("SVM Training Accuracy:", accuracy_score(y_train_resampled, y_train_pred_svm))
print("SVM Test Accuracy:", accuracy_score(y_test, y_test_pred_svm))
print("\nSVM Classification Report on Test Data:\n", classification_report(y_test, y_test_pred_svm))
print("\nSVM Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_svm))

# Output the chosen hyperparameters
print("Best hyperparameters for SVM:", grid_search_svm.best_params_)


### Model Comparison

In [None]:
# Train and test accuracy of every fitted model, in the order given by `models`
models = ['KNN', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'MLP', 'SVM']
train_accuracies = [
    accuracy_score(y_train_resampled, y_train_pred_knn),
    accuracy_score(y_train_resampled, y_train_pred_nb),
    accuracy_score(y_train_resampled, y_train_pred_dt),
    accuracy_score(y_train_resampled, y_train_pred_rf),
    accuracy_score(y_train_resampled, y_train_pred_mlp),
    accuracy_score(y_train_resampled, y_train_pred_svm)
]
test_accuracies = [
    accuracy_score(y_test, y_test_pred_knn),
    accuracy_score(y_test, y_test_pred_nb),
    accuracy_score(y_test, y_test_pred_dt),
    accuracy_score(y_test, y_test_pred_rf),
    accuracy_score(y_test, y_test_pred_mlp),
    accuracy_score(y_test, y_test_pred_svm)
]

# Plot train vs test accuracies for each model.
# One tick per model; an array is used so the bar offsets below can be applied.
x = np.arange(len(models))
bar_width = 0.4

plt.figure(figsize=(10, 6))
# Shift the two bars half a width to either side of the tick so they sit side by side.
# (Drawing one bar centred on x and the other from x onwards makes them overlap.)
plt.bar(x - bar_width / 2, train_accuracies, width=bar_width, label='Train Accuracy')
plt.bar(x + bar_width / 2, test_accuracies, width=bar_width, label='Test Accuracy')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.xticks(x, models, rotation=30, ha='right')
plt.legend()
plt.title('Train vs Test Accuracy for Each Model')
plt.tight_layout()
plt.show()

In [None]:
# Test-set predictions of every model, in the same order as the model names
models = ['KNN', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'MLP', 'SVM']
y_preds = [y_test_pred_knn, y_test_pred_nb, y_test_pred_dt, y_test_pred_rf, y_test_pred_mlp, y_test_pred_svm]

# A 2 x 3 grid of panels, flattened so it can be walked alongside the models
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# One annotated confusion-matrix heatmap per model
for ax, model_name, y_pred in zip(axes, models, y_preds):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax, cbar=False)
    ax.set_title(model_name)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')

plt.tight_layout()
plt.show()


In [None]:
models = ['KNN', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'MLP', 'SVM']

# Classification report of every model on the test set, as nested dictionaries
knn_report, nb_report, dt_report, rf_report, mlp_report, svm_report = (
    classification_report(y_test, y_pred, output_dict=True)
    for y_pred in (
        y_test_pred_knn,
        y_test_pred_nb,
        y_test_pred_dt,
        y_test_pred_rf,
        y_test_pred_mlp,
        y_test_pred_svm,
    )
)
all_reports = [knn_report, nb_report, dt_report, rf_report, mlp_report, svm_report]

# Macro-averaged metrics, one value per model (same order as `models`)
f1_scores = [report['macro avg']['f1-score'] for report in all_reports]
precision_scores = [report['macro avg']['precision'] for report in all_reports]
recall_scores = [report['macro avg']['recall'] for report in all_reports]

# Three bars per model: F1 left of the tick, precision on it, recall right of it
x = np.arange(len(models))
width = 0.25

plt.figure(figsize=(10, 6))
metric_series = (
    (x - width, f1_scores, 'F1 Score'),
    (x, precision_scores, 'Precision'),
    (x + width, recall_scores, 'Recall'),
)
for positions, scores, metric_label in metric_series:
    plt.bar(positions, scores, width, label=metric_label)

plt.xticks(x, models, rotation=30, ha='right')
plt.xlabel('Models')
plt.ylabel('Scores')
plt.legend()
plt.title('F1 Score, Precision, and Recall for Each Model')
plt.tight_layout()
plt.show()


In [None]:
# One indicator column per class (0, 1, 2) for the one-vs-rest ROC curves
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test_binarized.shape[1]

# Class-probability estimates of every model on the test set
y_score_knn, y_score_nb, y_score_dt, y_score_rf, y_score_mlp, y_score_svm = (
    fitted_model.predict_proba(X_test_top_n)
    for fitted_model in (
        best_knn_model,
        nb_model,
        best_dt_model,
        best_rf_model,
        best_mlp_model,
        best_svm_model,
    )
)

y_scores = [y_score_knn, y_score_nb, y_score_dt, y_score_rf, y_score_mlp, y_score_svm]
models = ['KNN', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'MLP', 'SVM']

# A 2 x 3 grid of panels, one per model
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, model_name, y_score in zip(axes, models, y_scores):
    # One ROC curve per class, labelled with its area under the curve
    for class_idx in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_binarized[:, class_idx], y_score[:, class_idx])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f'Class {class_idx} (AUC = {roc_auc:.2f})')

    # Diagonal reference line
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve for {model_name}')
    ax.legend(loc="lower right")

plt.tight_layout()
plt.show()


The best two models are: KNN and Neural Network

### Prediction on Unseen Dataset

In [None]:
# Rank the features by importance (highest first) and keep as many as were used for training
ranked_indices = np.argsort(feature_importances)[::-1]
top_n_indices = ranked_indices[:n_features_to_select]

# Reduce the scaled unseen data to those columns
df_unseen_top_n = df_unseen_scaled[:, top_n_indices]

# Names that belong to the selected columns
top_n_feature_names = feature_names[top_n_indices]

# List the selected feature names, one per line
print("Top 12 features selected from df_unseen:")
for selected_name in top_n_feature_names:
    print(selected_name)

# Wrap the reduced array in a DataFrame so the columns carry their names
df_unseen_top_n_df = pd.DataFrame(data=df_unseen_top_n, columns=top_n_feature_names)

# Show the first rows for a visual check of the values
print("Top 12 features in df_unseen (first 5 rows):")
print(df_unseen_top_n_df.head())


In [None]:
# Predict class labels for the unseen data with the two chosen models (KNN and MLP).
# Both receive the same scaled, top-feature subset they were trained on.

# Predictions of the tuned KNN model
y_unseen_pred_knn = best_knn_model.predict(df_unseen_top_n)

# Predictions of the tuned MLP (neural network) model
y_unseen_pred_mlp = best_mlp_model.predict(df_unseen_top_n)

In [None]:
# Assemble the result table: the 'index' column of the original unseen data
# next to the KNN and the neural-network predictions
output_df = pd.DataFrame({
    'index': df_unseen['index'],  
    'KNN': y_unseen_pred_knn,
    'NeuralNetwork': y_unseen_pred_mlp
})

In [None]:
# Write the predictions to the 'predictions' table of Answers.sqlite,
# replacing the table if it already exists
output_cnx = db.connect('Answers.sqlite')

# Close the connection even if writing fails, so the file handle is not leaked
try:
    output_df.to_sql('predictions', output_cnx, if_exists='replace', index=False)
finally:
    output_cnx.close()

print("Predictions for KNN and Neural Network saved to 'Answers.sqlite' successfully!")

In [None]:
## Display the prediction table (index, KNN and NeuralNetwork columns)
output_df

In [None]:
## How often do the two models disagree on the unseen data?

# Rows on which the KNN prediction differs from the neural-network prediction
disagreement_mask = output_df['KNN'].ne(output_df['NeuralNetwork'])
difference_df = output_df[disagreement_mask]

# Number of disagreements and number of predictions overall
count_differences = len(difference_df)
total_count = len(output_df)

# Share of disagreements in percent; an empty table counts as 0
if total_count > 0:
    percentage_difference = (count_differences / total_count) * 100
else:
    percentage_difference = 0

# Display the results
print("Count of differences:", count_differences)
print("Percentage of differences: {:.2f}%".format(percentage_difference))

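Therefore, the two classifiers disagree on 10.40% of the unseen rows, so at least one of them gives a wrong prediction for those rows, whereas they agree on the remaining 89.6%. Note that agreement makes a prediction more plausible but does not by itself guarantee that it is correct.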

KNN Accuracy from grid search: 0.89 == 89%
Neural Network Accuracy from grid search: 0.904 == 90.4%