# Implementation of Machine Learning Models on EMNIST Dataset

Importing Libraries

In [1]:
# General imports
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Plotting, models, and metrics
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.utils import to_categorical




### Load and Preprocess the EMNIST Dataset

In [None]:
# Load the EMNIST digits train/test splits from CSV files in the working directory.
# NOTE(review): read_csv treats the first row as a header by default. If these
# CSVs ship without a header row, pass header=None so the first sample is not
# consumed as column names -- confirm against the actual files.
train_data = pd.read_csv('emnist-digits-train.csv')
test_data = pd.read_csv('emnist-digits-test.csv')

# Separate features and labels: column 0 holds the label, the rest are pixels.
X_train = train_data.iloc[:, 1:].values
y_train = train_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values
y_test = test_data.iloc[:, 0].values

# Scale pixel values by 1/255 (presumably raw intensities are 0-255 -- confirm).
X_train = X_train / 255.0
X_test = X_test / 255.0

# Reshape each flat row into a 28x28 single-channel image for the CNN.
X_train_cnn = X_train.reshape(-1, 28, 28, 1)
X_test_cnn = X_test.reshape(-1, 28, 28, 1)

# One-hot encode the labels for the CNN. num_classes is pinned to 10 so the
# train and test encodings always have the same width and match the 10-unit
# softmax output layer; without it the width is inferred from the largest
# label present in each array.
y_train_cnn = to_categorical(y_train, num_classes=10)
y_test_cnn = to_categorical(y_test, num_classes=10)

print("Data preprocessing completed!")


In [None]:
# Report the combined dataset size followed by the size of each split
print("Size of the dataset:", X_train.shape[0] + X_test.shape[0])
print("Number of elements in the training set:", X_train.shape[0])
print("Number of elements in the test set:", X_test.shape[0])

# Visualize a sample image
def plot_sample(index, data_set='train'):
    """Display one 28x28 sample image together with its label.

    Args:
        index: Row position of the sample within the selected set.
        data_set: 'train' reads from X_train/y_train; any other value reads
            from X_test/y_test.
    """
    if data_set == 'train':
        sample, label = X_train[index], y_train[index]
    else:
        # The test split carries labels as well (y_test), so show the real
        # label instead of a placeholder.
        sample, label = X_test[index], y_test[index]

    # Each sample is stored as a flat row; restore the 28x28 image grid.
    sample_image = sample.reshape(28, 28)
    plt.imshow(sample_image, cmap="binary")
    plt.axis("off")
    plt.title(f"Label: {label}")
    plt.show()

# Show training sample 810 as a quick visual check of the loaded data
plot_sample(810, data_set='train')


### 1. K-Nearest Neighbors (K-NN)

In [None]:
# Fit a 3-nearest-neighbour classifier on the flattened training images
knn = KNeighborsClassifier(n_neighbors=3).fit(
    X_train.reshape(X_train.shape[0], -1),
    y_train,
)

# Predict a label for every flattened test image
y_pred_knn = knn.predict(X_test.reshape(X_test.shape[0], -1))

# Per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_knn))

In [None]:
# Evaluate accuracy and confusion matrix for K-NN.
# Accuracy is computed from the predictions already stored in y_pred_knn;
# knn.score(X_test, y_test) would repeat the whole prediction pass only to
# arrive at the same number.
knnaccuracy = accuracy_score(y_test, y_pred_knn)
print(f"Test accuracy: {knnaccuracy:.4f}")
cm = confusion_matrix(y_test, y_pred_knn)

# Heatmap of the confusion matrix with integer cell annotations
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix(KNN)")
plt.show()

### 2. Support Vector Machine (SVM)

In [None]:
# Train a linear-kernel support vector classifier on the flattened images.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples, so this cell may be slow on the
# full training set.
svm = SVC(kernel='linear', C=1.0).fit(
    X_train.reshape(X_train.shape[0], -1),
    y_train,
)

# Predict a label for every flattened test image
y_pred_svm = svm.predict(X_test.reshape(X_test.shape[0], -1))

# Per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_svm))

In [None]:
# Overall test accuracy of the SVM, from its stored predictions
svmaccuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f"Test accuracy: {svmaccuracy:.4f}")

# Confusion matrix as passed to confusion_matrix: (y_test, y_pred_svm)
cm = confusion_matrix(y_test, y_pred_svm)

# Draw the matrix as an annotated heatmap and label it through the returned Axes
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues").set(
    xlabel="Predicted Label",
    ylabel="True Label",
    title="Confusion Matrix(SVM)",
)
plt.show()

### 3. Logistic Regression

In [None]:
# Fit logistic regression (solver capped at 500 iterations) on the flattened images
log_reg = LogisticRegression(max_iter=500).fit(
    X_train.reshape(X_train.shape[0], -1),
    y_train,
)

# Predict a label for every flattened test image
y_pred_log_reg = log_reg.predict(X_test.reshape(X_test.shape[0], -1))

# Per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_log_reg))

In [None]:
# Evaluate accuracy and confusion matrix for logistic regression.
# The fitted model is named log_reg (the original referenced an undefined
# `logreg`); accuracy is taken from the stored predictions, matching how the
# SVM cell computes it.
logregaccuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Test Accuracy: {logregaccuracy:.4f}")

cm_logreg = confusion_matrix(y_test, y_pred_log_reg)

# Heatmap of the confusion matrix with integer cell annotations
plt.figure(figsize=(10, 8))
sns.heatmap(cm_logreg, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix (Logistic Regression)")
plt.show()

### 4. Decision Tree

In [None]:
# Grow a decision tree limited to depth 10 on the flattened training images
decision_tree = DecisionTreeClassifier(max_depth=10).fit(
    X_train.reshape(X_train.shape[0], -1),
    y_train,
)

# Predict a label for every flattened test image
y_pred_tree = decision_tree.predict(X_test.reshape(X_test.shape[0], -1))

# Per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=y_pred_tree))

In [None]:
# Evaluate accuracy and confusion matrix for the decision tree.
# Accuracy compares the stored predictions with the true labels (y_test).
# The original referenced undefined names (dt_classifier, x_test) and scored
# against y_pred_tree itself, which would always report 1.0.
dtaccuracy = accuracy_score(y_test, y_pred_tree)
print(f"Test accuracy: {dtaccuracy:.4f}")

cm = confusion_matrix(y_test, y_pred_tree)

# Heatmap of the confusion matrix with integer cell annotations
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix for Decision Tree")
plt.show()

### 5. Convolutional Neural Network (CNN)

In [None]:
# Build the CNN layer by layer: one conv/pool stage with dropout, then a
# dense classifier head ending in a 10-way softmax
cnn_model = Sequential()
cnn_model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))
cnn_model.add(MaxPooling2D((2, 2)))
cnn_model.add(Dropout(0.25))
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(10, activation='softmax'))

# Categorical cross-entropy pairs with the one-hot encoded labels
cnn_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs; the test split is passed as validation data here
cnn_model.fit(
    X_train_cnn,
    y_train_cnn,
    validation_data=(X_test_cnn, y_test_cnn),
    batch_size=128,
    epochs=10,
)

# Final loss and accuracy on the test split (progress output suppressed)
loss, cnnaccuracy = cnn_model.evaluate(X_test_cnn, y_test_cnn, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {cnnaccuracy:.4f}")

# Predict class probabilities on the CNN-shaped test images and take the
# most probable class per sample. The original passed an undefined `x_test`;
# the model was trained on (28, 28, 1) inputs, so X_test_cnn is the matching
# test array.
y_pred = np.argmax(cnn_model.predict(X_test_cnn), axis=1)

# Compare against the integer labels (y_test), not the one-hot y_test_cnn
cm = confusion_matrix(y_test, y_pred)

# Heatmap of the confusion matrix with integer cell annotations
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix(CNN)")
plt.show()

In [None]:
# Pair each model name with its test accuracy, in plotting order
models = ["KNN", "SVM", "Logistic Regression", "Decision Tree", "CNN"]
accuracies = [knnaccuracy, svmaccuracy, logregaccuracy, dtaccuracy, cnnaccuracy]
evaluation_metrics = dict(zip(models, accuracies))

# One bar per model, each in its own colour
plt.figure(figsize=(10, 6))
plt.bar(models, accuracies, color=['blue', 'orange', 'green', 'red', 'purple'])

# Titles, axis labels and tick sizes; the y-axis spans the full 0-1 accuracy range
plt.title("Model Evaluation Metrics", fontsize=16)
plt.xlabel("Models", fontsize=14)
plt.ylabel("Accuracy", fontsize=14)
plt.ylim(0, 1)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Write each accuracy just above the top of its bar
for position, score in enumerate(accuracies):
    plt.text(position, score + 0.01, f"{score:.4f}", ha='center', fontsize=12)

# Render the finished chart
plt.show()
