In [1]:
import pandas as pd
import numpy as np

# Read the digit table from disk; the last column is used as the label and
# every column before it as a feature.
df_digits = pd.read_csv('mnist.csv')

X = df_digits.iloc[:, :-1].to_numpy()
y = df_digits.iloc[:, -1].to_numpy()

# Positional split: the first n rows form the training set, everything from
# row n onward forms the test set (10,000 rows if the file holds 70,000 images
# -- TODO confirm the row count of mnist.csv).
n = 60000
X_train, y_train = X[:n], y[:n]
X_test, y_test = X[n:], y[n:]

In [2]:
#Support Vector Classifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# RBF-kernel support vector classifier.
#   kernel='rbf'  : radial basis function kernel.
#   gamma='scale' : kernel coefficient derived from the training data by
#                   scikit-learn instead of being fixed by hand.
# `degree` is intentionally not passed: it only applies to kernel='poly' and is
# ignored by every other kernel, so the former `degree=4` had no effect here.
svm_model = SVC(kernel='rbf', gamma='scale')

# Mean accuracy over 10-fold cross-validation on the training split.
# cross_val_score fits clones of the estimator, so svm_model itself is still
# unfitted after this call.
svm_score = cross_val_score(
    svm_model, X_train, y_train, cv=10, scoring="accuracy"
).mean()

print('SVM Accuracy Score on the training set:')
print(svm_score)

SVM Accuracy Score on the training set:
0.9779666666666665


In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fit the SVM on the training split, then predict labels for the test split.
# (fit returns the estimator itself, so the two calls can be chained.)
svm_y_pred = svm_model.fit(X_train, y_train).predict(X_test)

# Fraction of test-set labels predicted correctly.
# Note: this rebinds svm_score, which an earlier cell used for the
# cross-validation mean.
svm_score = accuracy_score(y_test, svm_y_pred)

print('SVM Accuracy Score on the test set:', svm_score, sep='\n')

SVM Accuracy Score on the test set:
0.9792


In [3]:
#KNeighborsClassifier: Classifier implementing the k-nearest neighbors vote
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Search space for the k-nearest-neighbours classifier.
#   n_neighbors : integer drawn from 1..14 (randint's upper bound is exclusive).
#   weights     : 'uniform'  -> every neighbour's vote counts equally;
#                 'distance' -> votes are weighted by inverse distance, so
#                               closer neighbours influence the result more.
knn_params = dict(
    n_neighbors=randint(1, 15),
    weights=['uniform', 'distance'],
)

# Randomised search: 10 sampled parameter settings, each scored by 10-fold
# cross-validated accuracy on the training split. random_state pins the
# sampled settings so the search is repeatable.
knn_random_search = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=knn_params,
    n_iter=10,
    cv=10,
    scoring='accuracy',
    random_state=42,
).fit(X_train, y_train)

# Winning parameter setting and its mean cross-validated accuracy.
print("Best parameters for KNN:", knn_random_search.best_params_)
print("Best cross-validation score:", knn_random_search.best_score_)

Best parameters for KNN: {'n_neighbors': 4, 'weights': 'distance'}
Best cross-validation score: 0.9728666666666668


In [8]:
from sklearn.metrics import accuracy_score

# Build the KNN classifier from the parameters selected by the randomised
# search rather than hand-copying them from its printed output, so this cell
# stays in sync if the search space or data changes. (The recorded search
# result was {'n_neighbors': 4, 'weights': 'distance'}.)
# A fresh estimator is created, instead of reusing the search's
# best_estimator_, so that later refits of knn_model do not mutate the
# search object.
knn_model = KNeighborsClassifier(**knn_random_search.best_params_)

# Train the model on the training set.
knn_model.fit(X_train, y_train)
# Predict labels for the test set.
knn_y_pred = knn_model.predict(X_test)
# Fraction of test-set labels predicted correctly.
knn_score = accuracy_score(y_test, knn_y_pred)

print('KNeighbours Accuracy Score on the test set:')
print(knn_score)

KNeighbours Accuracy Score on the test set:
0.9714


In [4]:
# load the class data
import numpy as np
import pandas as pd

# Read the class-collected digit table.
# NOTE(review): the path has no file extension -- confirm the file name.
df_digits_class = pd.read_csv('class_digits')

# Same layout as the main data set is assumed: last column = label, all
# preceding columns = features (TODO confirm against the file).
X_class = df_digits_class.iloc[:, :-1].to_numpy()
y_class = df_digits_class.iloc[:, -1].to_numpy()

In [13]:
# Refit each model on the complete data set (training + test rows of X, y),
# then predict and score on the class-collected digits.
# NOTE(review): fit() refits svm_model / knn_model in place, so after this
# cell they are no longer the models that were scored on the test split.

# --- SVM ---
svm_model.fit(X, y)
svm_y_pred_class = svm_model.predict(X_class)
svm_score_class = accuracy_score(y_class, svm_y_pred_class)

# --- KNN ---
knn_model.fit(X, y)
knn_y_pred_class = knn_model.predict(X_class)
knn_score_class = accuracy_score(y_class, knn_y_pred_class)

# NOTE(review): the recorded run scored about 0.39 (SVM) and 0.36 (KNN) here,
# far below the ~0.97-0.98 obtained on the test split. That gap presumably
# points to a mismatch between the class data and the training data (pixel
# range, polarity, column order, ...) -- verify before trusting these numbers.
print('SVM Accuracy Score on the class set:', svm_score_class, sep='\n')
print('KNN Accuracy Score on the class set:', knn_score_class, sep='\n')

SVM Accuracy Score on the class set:
0.3904761904761905
KNN Accuracy Score on the class set:
0.35714285714285715
