# Part III. Classification on MNIST Data

In [30]:
import pandas as pd
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score

def CVError(model,X,Y,cv=5):
    return 1-cross_val_score(model,X,Y,cv=cv).mean()

class Timer:
    def __init__(self,):
        self.time = None

    def __enter__(self):
        self.time = time.time()

    def __exit__(self,*args,**kwargs):
        used = time.time() - self.time
        print(f"Used time:{used}s")

timer = Timer()

In [31]:
traindata = pd.read_csv('../data/MNIST/train_resized.csv')
testdata = pd.read_csv('../data/MNIST/test_resized.csv')

## 1. Use only the digit images of 3 and 6 from train_resized.csv and test_resized.csv to build an SVM 
classifier for binary classification. More specifically, use a linear kernel and choose the best cost 
(the data size is large so a large cost value is suitable) parameter (called budget in our course) by 5 
fold cross validation. Apply your model on the test data and report the misclassification error, 
confusion matrix. Also report the time cost of training your model.

In [56]:
traindata36 = traindata[traindata.label.isin((3,6))]
testdata36 = testdata[testdata.label.isin((3,6))]
scaler = StandardScaler()
Xtrain = scaler.fit_transform(traindata36.loc[:,"pixel1":])
Xtest = scaler.transform(testdata36.loc[:,"pixel1":])

In [57]:
with timer:
    params = {'C':np.logspace(-4,4,num=9)}
    svclin = GridSearchCV(SVC(kernel='linear',degree=1),param_grid=params,cv=5)
    svclin.fit(Xtrain,traindata36.label)
    result_df = pd.DataFrame({'params':svclin.cv_results_['params'],'5-fold CV Error':svclin.cv_results_['mean_test_score']})
result_df

Used time:5.108419895172119s


Unnamed: 0,params,5-fold CV Error
0,{'C': 0.0001},0.989877
1,{'C': 0.001},0.994689
2,{'C': 0.01},0.995187
3,{'C': 0.1},0.994192
4,{'C': 1.0},0.992698
5,{'C': 10.0},0.992864
6,{'C': 100.0},0.992864
7,{'C': 1000.0},0.992864
8,{'C': 10000.0},0.992864


In [58]:
svclin_best = svclin.best_estimator_
svclin_best

In [35]:
svclin_best = svclin.best_estimator_
pred = svclin_best.predict(Xtest)
print("misclassifcation error:",1-accuracy_score(pred,testdata36.label))
print("confusion matrix:\n",confusion_matrix(svclin_best.predict(Xtest),testdata36.label))

misclassifcation error: 0.006904955320877315
confusion matrix:
 [[1251    6]
 [  11 1194]]


## 2. Use only the digit images of 3 and 6 from train_resized.csv and test_resized.csv to build an SVM 
classifier for binary classification. More specifically, use a radial kernel and choose the best cost 
parameter, gamma parameter by 5 fold cross validation. Apply your model on the test data and 
report the misclassification error, confusion matrix. Also report the time cost of training your 
model. 

In [59]:
with timer:
    params = {'C': (0.1, 1,5,10,15,20,100),'gamma': (0.001,0.007,0.01)}
    svcrbf = GridSearchCV(SVC(kernel='rbf'), param_grid=params,
                                cv=5, n_jobs=-1)
    svcrbf.fit(Xtrain, traindata36.label)
    result_df = pd.DataFrame({'params': svcrbf.cv_results_[
                             'params'], '5-fold CV Error': svcrbf.cv_results_['mean_test_score']})
result_df

Used time:20.494226694107056s


Unnamed: 0,params,5-fold CV Error
0,"{'C': 0.1, 'gamma': 0.001}",0.990374
1,"{'C': 0.1, 'gamma': 0.007}",0.983239
2,"{'C': 0.1, 'gamma': 0.01}",0.973116
3,"{'C': 1, 'gamma': 0.001}",0.995353
4,"{'C': 1, 'gamma': 0.007}",0.992532
5,"{'C': 1, 'gamma': 0.01}",0.990541
6,"{'C': 5, 'gamma': 0.001}",0.996017
7,"{'C': 5, 'gamma': 0.007}",0.99303
8,"{'C': 5, 'gamma': 0.01}",0.990872
9,"{'C': 10, 'gamma': 0.001}",0.996349


In [60]:
svcrbf_best = svcrbf.best_estimator_
svcrbf_best

In [41]:
pred = svcrbf_best.predict(Xtest)
print("misclassifcation error:",1-accuracy_score(pred,testdata36.label))
print("confusion matrix:\n",confusion_matrix(svcrbf_best.predict(Xtest),testdata36.label))

misclassifcation error: 0.0036555645816409577
confusion matrix:
 [[1257    4]
 [   5 1196]]


## 4. Use only the digit images of 1,2,5 and 8 from train_resized.csv and test_resized.csv to build an 
SVM classifier for multi-class classification. More specifically, use a linear kernel and choose the 
best cost parameter (called budget in our course) by 5 fold cross validation. Apply your model on 
the test data and report the misclassification error, confusion matrix. Also report the time cost of 
training your model.

In [46]:
traindata1258 = traindata[traindata.label.isin((1,2,5,8))]
testdata1258 = testdata[testdata.label.isin((1,2,5,8))]
scaler = StandardScaler()
Xtrain = scaler.fit_transform(traindata1258.loc[:,"pixel1":])
Xtest = scaler.transform(testdata1258.loc[:,"pixel1":])

In [47]:
with timer:
    params = {'C':(0.001, 0.1 ,0.5,1 ,5,10 ,15,100)}
    svclin = GridSearchCV(SVC(kernel='linear',degree=1),param_grid=params,cv=5,n_jobs=-1)
    svclin.fit(Xtrain,traindata1258.label)
    result_df = pd.DataFrame({'params':svclin.cv_results_['params'],'5-fold CV Error':svclin.cv_results_['mean_test_score']})
result_df

Used time:240.56118321418762s


Unnamed: 0,params,5-fold CV Error
0,{'C': 0.001},0.952154
1,{'C': 0.1},0.96038
2,{'C': 0.5},0.958785
3,{'C': 1},0.956435
4,{'C': 5},0.953077
5,{'C': 10},0.953161
6,{'C': 15},0.952069
7,{'C': 100},0.949803


In [55]:
svclin_best = svclin.best_estimator_
svclin_best

In [48]:
pred = svclin_best.predict(Xtest)
print("misclassifcation error:",1-accuracy_score(pred,testdata1258.label))
print("confusion matrix:\n",confusion_matrix(svclin_best.predict(Xtest),testdata1258.label))

misclassifcation error: 0.046608406158968
confusion matrix:
 [[1344   10   16   23]
 [  11 1135   15   22]
 [   2   16 1062   46]
 [   6   24   33 1041]]


## 5. Use the complete dataset of train_resized.csv and test_resized.csv to build an SVM classifier for 
classifying all 10 classes. You can use any SVM model and tune the parameters by yourself. 
Report the best test performance (misclassification error) you can get, the model you used and the 
time cost of training your model.

In [49]:
scaler = StandardScaler()
Xtrain = scaler.fit_transform(traindata.loc[:,"pixel1":])
Xtest = scaler.transform(testdata.loc[:,"pixel1":])

In [51]:
with timer:
    params = {'C': (10,20),'gamma': (0.001,0.007,0.01)}
    svcrbf = GridSearchCV(SVC(kernel='rbf'), param_grid=params,
                                cv=5, n_jobs=-1)
    svcrbf.fit(Xtrain, traindata.label)
    result_df = pd.DataFrame({'params': svcrbf.cv_results_[
                             'params'], '5-fold CV Error': svcrbf.cv_results_['mean_test_score']})
result_df

Used time:402.82375860214233s


Unnamed: 0,params,5-fold CV Error
0,"{'C': 10, 'gamma': 0.001}",0.953
1,"{'C': 10, 'gamma': 0.007}",0.9648
2,"{'C': 10, 'gamma': 0.01}",0.9625
3,"{'C': 20, 'gamma': 0.001}",0.956367
4,"{'C': 20, 'gamma': 0.007}",0.965133
5,"{'C': 20, 'gamma': 0.01}",0.962433


In [52]:
svcrbf_best = svcrbf.best_estimator_
svcrbf_best

In [54]:
pred = svcrbf_best.predict(Xtest)
print("misclassifcation error:",1-accuracy_score(pred,testdata.label))
print("confusion matrix:\n",confusion_matrix(svcrbf_best.predict(Xtest),testdata.label))

misclassifcation error: 0.032749999999999946
confusion matrix:
 [[1125    1    3    0    2    4    3    1    3    3]
 [   0 1343    1    2    2    2    0    1    3    2]
 [   3   13 1150   29   13    7   12   12   10    8]
 [   1    1    6 1206    0   16    0    2   13    4]
 [   0    1    7    0 1141    1    3    7    5   15]
 [   3    0    3    9    3 1069    7    1    8    2]
 [   7    0    1    1    3   11 1172    0    1    0]
 [   0    1    7    5    2    1    0 1224    5   15]
 [   1    2    5    3    0   12    3    0 1077    4]
 [   0    1    2    7    9    3    0   16    7 1100]]
