# Part III. Classification on MNIST Data

In [37]:
import pandas as pd
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score

def CVError(model,X,Y,cv=5):
    return 1-cross_val_score(model,X,Y,cv=cv).mean()

class Timer:
    def __init__(self,):
        self.time = None

    def __enter__(self):
        self.time = time.time()

    def __exit__(self,*args,**kwargs):
        used = time.time() - self.time
        print(f"Used time:{used}s")

timer = Timer()

In [4]:
traindata = pd.read_csv('../data/MNIST/train_resized.csv')
testdata = pd.read_csv('../data/MNIST/test_resized.csv')

## 1. Use only the digit images of 3 and 6 from train_resized.csv and test_resized.csv to build an SVM 
classifier for binary classification. More specifically, use a linear kernel and choose the best cost 
(the data size is large so a large cost value is suitable) parameter (called budget in our course) by 5 
fold cross validation. Apply your model on the test data and report the misclassification error, 
confusion matrix. Also report the time cost of training your model.

In [10]:
traindata36 = traindata[traindata.label.isin((3,6))]
testdata36 = testdata[testdata.label.isin((3,6))]

In [39]:
with timer:
    params = {'C':np.logspace(-4,4,num=9)}
    svc = GridSearchCV(SVC(kernel='linear',degree=1),param_grid=params,cv=5)
    svc.fit(traindata36.loc[:,"pixel1":],traindata36.label)
    result_df = pd.DataFrame({'C':svc.cv_results_['params'],'5-fold CV Error':svc.cv_results_['mean_test_score']})
result_df

Used time:7.691442012786865s


Unnamed: 0,C,5-fold CV Error
0,{'C': 0.0001},0.99469
1,{'C': 0.001},0.992532
2,{'C': 0.01},0.992034
3,{'C': 0.1},0.992034
4,{'C': 1.0},0.992034
5,{'C': 10.0},0.992034
6,{'C': 100.0},0.992034
7,{'C': 1000.0},0.992034
8,{'C': 10000.0},0.992034


In [41]:
svc_best = svc.best_estimator_
pred = svc_best.predict(testdata36.loc[:,"pixel1":])
print("misclassifcation error:",1-accuracy_score(pred,testdata36.label))
print("confusion matrix:\n",confusion_matrix(svc_best.predict(testdata36.loc[:,"pixel1":]),testdata36.label))

misclassifcation error: 0.006092607636068226
confusion matrix:
 [[1251    4]
 [  11 1196]]


## 2. Use only the digit images of 3 and 6 from train_resized.csv and test_resized.csv to build an SVM 
classifier for binary classification. More specifically, use a radial kernel and choose the best cost 
parameter, gamma parameter by 5 fold cross validation. Apply your model on the test data and 
report the misclassification error, confusion matrix. Also report the time cost of training your 
model. 

In [None]:
with timer:
    params = {'C':np.logspace(-4,4,num=9),'gamma':np.logspace(-4,0,num=5)}
    svc = GridSearchCV(SVC(kernel='rbf'),param_grid=params,cv=5)
    svc.fit(traindata36.loc[:,"pixel1":],traindata36.label)
    result_df = pd.DataFrame({'params':svc.cv_results_['params'],'5-fold CV Error':svc.cv_results_['mean_test_score']})
result_df

In [None]:
svc_best = svc.best_estimator_
pred = svc_best.predict(testdata36.loc[:,"pixel1":])
print("misclassifcation error:",1-accuracy_score(pred,testdata36.label))
print("confusion matrix:\n",confusion_matrix(svc_best.predict(testdata36.loc[:,"pixel1":]),testdata36.label))