In [2]:
import h5py
import scipy as sp
import pandas as pd
import timeit
import matplotlib.pyplot as pl
from numpy import *
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [6]:
with h5py.File('images_training.h5','r') as H:
    data_train = copy(H['data'])
with h5py.File('labels_training.h5','r') as H:
    label_train = copy(H['label'])
with h5py.File('images_testing.h5 ','r') as H:
    data_test = copy(H['data'])
with h5py.File('labels_testing_2000.h5','r') as H:
    label_test = copy(H['label'])

In [None]:
def pcaProcess(n=2):
    #preprocess the data
    pca = PCA(n_components=n)
    data_PCA = empty(((30000,28,n)))
    data_PCA_test = empty(((5000,28,n)))
    #convert the test data by PCA
    for i in range(5000):
        pca.fit(data_test[i])
        data_PCA_test[i] = pca.transform(data_test[i])
    #convert the trainning data using PCA
    for i in range(30000):
        pca.fit(data_train[i])
        data_PCA[i] = pca.transform(data_train[i])

In [None]:
#reshape the trainning data
def load(data,label): #categories defalt by 1
    m = shape(data)[0]
    data_2D = data.reshape(m,-1)    
    return data_2D,label
#load the test sample
def loadTest(data):
    m = shape(data)[0]
    data_2D = data.reshape(m,-1)
    return data_2D
#load and scale the data
data_x, data_y = load(data_train,label_train)
data_t_x = loadTest(data_test[0:2000])
#scale the data
data_xs = scale(data_x)
data_t_xs = scale(data_t_x)

In [14]:
#polynomial kernel with scaled data
poly_C = [1.0,5.0,10.0,20.0,25.0,30.0,40.0,45.0,50.0,55.0,60.0,100.0]
print("Polynomial kernel: ")
for i in poly_C:
    start_time = timeit.default_timer()     
    #predict the result
    clf = SVC(C=i, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.0001, verbose=False)
    clf.fit(data_xs,data_y)
    socre = clf.score(data_t_xs,label_test)

    print("C = ",i,"  Accuracy: ",socre, "Total time :", timeit.default_timer() - start_time , 's')

Polynomial kernel: 
C =  1.0   Accuracy:  0.8635 Total time : 268.04132268499234 s
Polynomial kernel: 
C =  5.0   Accuracy:  0.8815 Total time : 207.3765171559935 s
Polynomial kernel: 
C =  10.0   Accuracy:  0.8815 Total time : 202.9561198619922 s
Polynomial kernel: 
C =  20.0   Accuracy:  0.8855 Total time : 197.463260094999 s
Polynomial kernel: 
C =  25.0   Accuracy:  0.8855 Total time : 199.11837420800293 s
Polynomial kernel: 
C =  30.0   Accuracy:  0.886 Total time : 197.06365760100016 s
Polynomial kernel: 
C =  40.0   Accuracy:  0.8875 Total time : 198.78315202899103 s
Polynomial kernel: 
C =  45.0   Accuracy:  0.889 Total time : 198.31418123900949 s
Polynomial kernel: 
C =  50.0   Accuracy:  0.8865 Total time : 197.35751045899815 s
Polynomial kernel: 
C =  55.0   Accuracy:  0.887 Total time : 198.02893001399934 s
Polynomial kernel: 
C =  60.0   Accuracy:  0.887 Total time : 198.02219055399473 s
Polynomial kernel: 
C =  100.0   Accuracy:  0.886 Total time : 197.26070878900646 s


In [7]:
# rbf kernel with different C
rbf_C = [1.0,5.0,10.0,20.0,25.0,30.0,40.0,45.0,50.0,55.0,60.0,100.0]
print("RBF kernel: ")
for i in rbf_C:
    start_time = timeit.default_timer()     
    #predict the result
    clf = SVC(C=i, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.0001, verbose=False)
    clf.fit(data_xs,data_y)
    print("C = ",i,"  Accuracy: ",clf.score(data_t_xs,label_test), "Total time :", timeit.default_timer() - start_time , 's')

RBF kernel: 
C =  1.0   Accuracy:  0.874 Total time : 214.47822553300648 s
C =  5.0   Accuracy:  0.886 Total time : 202.9001193419972 s
C =  10.0   Accuracy:  0.889 Total time : 207.3910313109809 s
C =  20.0   Accuracy:  0.891 Total time : 206.89166151802056 s
C =  25.0   Accuracy:  0.889 Total time : 207.39672010601498 s
C =  30.0   Accuracy:  0.8875 Total time : 205.33519326199894 s
C =  40.0   Accuracy:  0.8855 Total time : 206.40442605901626 s
C =  45.0   Accuracy:  0.8855 Total time : 207.9861193280085 s
C =  50.0   Accuracy:  0.887 Total time : 207.1897000800236 s
C =  55.0   Accuracy:  0.8865 Total time : 207.8966277760046 s
C =  60.0   Accuracy:  0.888 Total time : 207.28499795598327 s
C =  100.0   Accuracy:  0.889 Total time : 206.9957133360149 s


In [7]:
# Linear kernel with different C
lin_C = [1.0,5.0,10.0]
print("linear kernel")
for i in lin_C:
    start_time = timeit.default_timer()     
    #predict the result
    clf = SVC(C=i, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.0001, verbose=False)
    clf.fit(data_xs,data_y)
    print("C = ",i,"  Accuracy: ",clf.score(data_t_xs,label_test), "Total time :", timeit.default_timer() - start_time , 's')
    

In [12]:
# rbf kernel with different tolerance
tol = [0.1,0.001,0.0001]
for i in tol:
    start_time = timeit.default_timer()     
    #predict the result
    clf = SVC(C=20, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=i, verbose=False)
    clf.fit(data_xs,data_y)
    clf.score(data_xs,data_y)
    print("tol = ",i,"  Accuracy: ",clf.score(data_t_xs,label_test), "Total time :", timeit.default_timer() - start_time , 's')

tol =  0.1   Accuracy:  0.891 Total time : 541.5213292080152 s
tol =  0.001   Accuracy:  0.891 Total time : 548.540484208992 s
tol =  0.0001   Accuracy:  0.891 Total time : 564.2143920719973 s
