In [1]:
# Required Libraries
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
# Required Libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load the dataset
mnist_df = pd.read_csv('mnist_dataset.csv')


In [3]:
# The first column is the label (response), and the rest are pixel values
labels = mnist_df.iloc[:, 0]    # First column (digit labels)
pixels = mnist_df.iloc[:, 1:]   # Next 784 columns (grayscale pixel values)


In [4]:
# Filter columns that have at least 30% non-zero values
threshold_value = 0.30  # 30% non-zero threshold
non_zero_threshold = int(len(pixels) * threshold_value)
# Identify columns where non-zero values appear in at least 30% of rows
valid_columns = [col for col in pixels.columns if (pixels[col] != 0).sum() >= non_zero_threshold]
# Randomly select 50 of these valid columns
selected_columns = random.sample(valid_columns, 50)
# Reduce the dataset to the selected columns
selected_pixels = pixels[selected_columns]

In [5]:
# Create a new DataFrame with the selected columns and the labels
selected_data = pd.concat([labels, selected_pixels], axis=1)

In [6]:
# Summary for the new dataframe
selected_data.describe()

Unnamed: 0,5,27,9,0.368,0.299,0.300,0.253,0.433,0.387,0.252,...,70,0.227,0.251,154,78,0.229,0.189,148,253.38,244
count,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,...,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0,59999.0
mean,4.453924,78.380123,107.187503,59.608327,108.882148,81.158986,105.511192,50.801097,80.738046,107.484375,...,89.339022,93.870148,97.294422,70.911432,83.876481,106.282955,77.319322,101.40554,118.214004,92.029667
std,2.889294,104.162611,111.296858,96.825931,111.550644,104.694352,110.418729,91.469712,105.462353,109.932493,...,108.905771,107.409692,107.95271,101.478548,106.532087,110.46534,104.599403,110.313137,111.237785,109.149522
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,60.0,0.0,64.0,0.0,56.0,0.0,0.0,64.0,...,1.0,25.0,35.0,0.0,0.0,59.0,0.0,39.0,103.0,0.0
75%,7.0,189.5,249.0,107.0,251.0,195.0,245.0,50.0,197.0,245.0,...,227.0,224.0,231.0,159.0,209.0,246.0,189.0,243.0,252.0,230.0
max,9.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [7]:
# Split the dataset into training, validation, and test sets
train_size = 0.70  # 70% for training
val_size = 0.15    # 15% for validation
test_size = 0.15   # 15% for testing

In [8]:
# First split to get the training and remaining data
train_data, remaining_data = train_test_split(selected_data, train_size=train_size, random_state=42, stratify=labels)


In [9]:
# Second split to get validation and test data
val_data, test_data = train_test_split(remaining_data, test_size=test_size/(test_size + val_size), random_state=42, stratify=remaining_data.iloc[:, 0])


In [10]:
# Display the shape of each set to confirm the splits
print(f"Training set size: {train_data.shape}")
print(f"Validation set size: {val_data.shape}")
print(f"Test set size: {test_data.shape}")

Training set size: (41999, 51)
Validation set size: (9000, 51)
Test set size: (9000, 51)


In [11]:
# Separate the features and labels for training, validation, and test sets
X_train = train_data.iloc[:, 1:]  # Features (pixel values)
y_train = train_data.iloc[:, 0]   # Labels (digits)

X_val = val_data.iloc[:, 1:]      # Validation features
y_val = val_data.iloc[:, 0]       # Validation labels

X_test = test_data.iloc[:, 1:]    # Test features
y_test = test_data.iloc[:, 0]     # Test labels

In [12]:
# Baseline KNN Model (without hyperparameter tuning)
knn = KNeighborsClassifier(n_neighbors=3)  # K = 3 for starters
knn.fit(X_train, y_train)

# Predict on the validation set
val_pred = knn.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred)

# Display accuracy on the validation set
print(f"Validation Accuracy with K=3: {val_accuracy:.4f}")
# Generate classification report for validation set for baseline  model
classification_report_val = classification_report(y_val, val_pred)
print(classification_report(y_val, val_pred))

Validation Accuracy with K=3: 0.9237
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       889
           1       0.92      0.99      0.95      1012
           2       0.93      0.89      0.91       894
           3       0.91      0.92      0.91       919
           4       0.91      0.92      0.92       876
           5       0.94      0.93      0.94       813
           6       0.93      0.96      0.95       887
           7       0.90      0.94      0.92       940
           8       0.95      0.83      0.89       878
           9       0.90      0.87      0.88       892

    accuracy                           0.92      9000
   macro avg       0.92      0.92      0.92      9000
weighted avg       0.92      0.92      0.92      9000



In [13]:
#Test the KNN model on the test set
test_pred = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
# Display the test set accuracy and classification report
print(f"Test Accuracy with Baseline KNN: {test_accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, test_pred))

Test Accuracy with Baseline KNN: 0.9278

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       888
           1       0.92      0.99      0.96      1011
           2       0.91      0.91      0.91       893
           3       0.91      0.92      0.92       920
           4       0.92      0.93      0.92       877
           5       0.93      0.91      0.92       813
           6       0.95      0.97      0.96       888
           7       0.92      0.94      0.93       940
           8       0.96      0.84      0.90       877
           9       0.90      0.87      0.89       893

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000



In [14]:
# KNN Model with Hyperparameter Tuning (using GridSearchCV)
param_grid = {
    'n_neighbors': [3, 5, 7],  # Values for K
    'weights': ['uniform', 'distance'],  # Weighting function
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Different distance metrics
}

# GridSearch to find the best parameters
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
print(f"Best Hyperparameters: {grid_search.best_params_}")

Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}


In [15]:
# Validate the best model
best_knn = grid_search.best_estimator_
val_pred_best = best_knn.predict(X_val)
val_accuracy_best = accuracy_score(y_val, val_pred_best)

# Display the accuracy of the best model on the validation set
print(f"Validation Accuracy with Best KNN: {val_accuracy_best:.4f}")
# Generate classification report for validation set
print("KNN Classification Report (Validation Set):")
print(classification_report(y_val, val_pred_best))

Validation Accuracy with Best KNN: 0.9271
KNN Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       889
           1       0.93      0.99      0.96      1012
           2       0.95      0.89      0.92       894
           3       0.92      0.91      0.91       919
           4       0.92      0.92      0.92       876
           5       0.94      0.94      0.94       813
           6       0.93      0.97      0.95       887
           7       0.90      0.94      0.92       940
           8       0.94      0.85      0.89       878
           9       0.89      0.88      0.88       892

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000



In [16]:
#Test the best model on the test set
test_pred_best = best_knn.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred_best)

# Display the test set accuracy and classification report
print(f"Test Accuracy with Best KNN: {test_accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, test_pred_best))

Test Accuracy with Best KNN: 0.9301

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97       888
           1       0.92      0.99      0.96      1011
           2       0.94      0.91      0.92       893
           3       0.93      0.92      0.92       920
           4       0.93      0.92      0.93       877
           5       0.93      0.91      0.92       813
           6       0.95      0.98      0.96       888
           7       0.91      0.94      0.92       940
           8       0.96      0.85      0.90       877
           9       0.89      0.88      0.89       893

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000



In [17]:
# Baseline Neural Network Model
mlp = MLPClassifier(random_state=42, max_iter=200)
mlp.fit(X_train, y_train)

# Predict on the validation set
val_pred_NN = mlp.predict(X_val)
val_accuracy = accuracy_score(y_val, val_pred_NN)

# Display accuracy on the validation set
print(f"Baseline Validation Accuracy: {val_accuracy:.4f}")
# Generate and print classification report for the validation set
print("Classification Report of Neural Network Model (Validation Set):")
print(classification_report(y_val, val_pred_NN))

Baseline Validation Accuracy: 0.9257
Classification Report of Neural Network Model (Validation Set):
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       889
           1       0.96      0.98      0.97      1012
           2       0.90      0.89      0.90       894
           3       0.92      0.92      0.92       919
           4       0.93      0.90      0.92       876
           5       0.94      0.92      0.93       813
           6       0.95      0.94      0.95       887
           7       0.92      0.92      0.92       940
           8       0.88      0.91      0.90       878
           9       0.88      0.89      0.89       892

    accuracy                           0.93      9000
   macro avg       0.93      0.92      0.93      9000
weighted avg       0.93      0.93      0.93      9000



In [18]:
#Test the neural network model on the test set
test_pred_NN = mlp.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred_NN)
# Display the test set accuracy and classification report
print(f"Test Accuracy with Neural Network Model: {test_accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, test_pred_NN))

Test Accuracy with Neural Network Model: 0.9276

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95       888
           1       0.97      0.98      0.98      1011
           2       0.93      0.90      0.91       893
           3       0.92      0.90      0.91       920
           4       0.94      0.90      0.92       877
           5       0.92      0.91      0.92       813
           6       0.95      0.95      0.95       888
           7       0.92      0.94      0.93       940
           8       0.88      0.93      0.90       877
           9       0.90      0.90      0.90       893

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000



In [19]:
# Neural Network Model with Hyperparameter Tuning (using GridSearchCV)
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  # Number of neurons in each layer
    'activation': ['relu', 'tanh'],  # Activation functions
    'solver': ['adam', 'sgd'],  # Solvers for weight optimization
    'alpha': [0.0001, 0.001],  # Regularization parameter
    'learning_rate': ['constant', 'adaptive']  # Learning rate
}

# GridSearch to find the best parameters
grid_search = GridSearchCV(MLPClassifier(random_state=42, max_iter=200), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Display the best hyperparameters
print(f"Best Hyperparameters: {grid_search.best_params_}")







Best Hyperparameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant', 'solver': 'adam'}


In [20]:
# Validate the best model
best_mlp = grid_search.best_estimator_
val_pred_best_NN = best_mlp.predict(X_val)
val_accuracy_best = accuracy_score(y_val, val_pred_best_NN)

# Display the accuracy of the best model on the validation set
print(f"Validation Accuracy with Best Neural Network: {val_accuracy_best:.4f}")
print("\nClassification Report:\n", classification_report(y_val, val_pred_best_NN))

Validation Accuracy with Best Neural Network: 0.9188

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       889
           1       0.95      0.98      0.96      1012
           2       0.92      0.88      0.90       894
           3       0.92      0.90      0.91       919
           4       0.90      0.93      0.92       876
           5       0.94      0.92      0.93       813
           6       0.95      0.93      0.94       887
           7       0.94      0.90      0.92       940
           8       0.87      0.89      0.88       878
           9       0.85      0.89      0.87       892

    accuracy                           0.92      9000
   macro avg       0.92      0.92      0.92      9000
weighted avg       0.92      0.92      0.92      9000



In [21]:
# Step 4: Test the best model on the test set
test_pred_best_NN = best_mlp.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred_best_NN)

# Display the test set accuracy and classification report
print(f"Test Accuracy with Best Neural Network: {test_accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, test_pred_best_NN))


Test Accuracy with Best Neural Network: 0.9277

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       888
           1       0.96      0.99      0.98      1011
           2       0.93      0.91      0.92       893
           3       0.92      0.91      0.91       920
           4       0.92      0.94      0.93       877
           5       0.93      0.89      0.91       813
           6       0.95      0.94      0.95       888
           7       0.95      0.92      0.93       940
           8       0.89      0.92      0.91       877
           9       0.88      0.88      0.88       893

    accuracy                           0.93      9000
   macro avg       0.93      0.93      0.93      9000
weighted avg       0.93      0.93      0.93      9000

