In [None]:
import numpy as np 
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE

In [None]:
from sklearn import svm
from time import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Load the digit dataset; later cells expect a 'label' column next to the feature columns.
mnist = pd.read_csv("resources/data.csv")


In [None]:
# Work on the first 20,000 rows only.

data = mnist.iloc[:20000]

In [None]:
# Sanity check: (rows, columns) of the selected subset.
data.shape

In [None]:
# Preprocessing: separate the target column from the feature columns.

y = data.loc[:, 'label']
X = data.drop('label', axis=1)

In [None]:
# Normalizing the data: divide every feature by 255.0
# (presumably 8-bit pixel intensities -- confirm) and store the labels as int64.

y = y.astype(np.int64)
X = X.div(255.0)

In [None]:
# Inspect the feature frame after dividing by 255.0.
X

In [None]:
# Helper for visualizing a single image from our dataset

def viz(n, data=None):
    """Show the n-th sample as a 28x28 grayscale image.

    Parameters
    ----------
    n : int
        Positional row index of the sample to draw.
    data : DataFrame or array-like, optional
        Container with one flattened 28x28 image per row.
        Defaults to the notebook-level ``X``.

    Returns
    -------
    None
        The image is drawn on the current matplotlib axes.
    """
    source = X if data is None else data
    # ``X`` is a DataFrame at first, but the MinMaxScaler cell rebinds it to the
    # ndarray returned by ``fit_transform``; support both so re-running works.
    row = source.iloc[n, :] if hasattr(source, "iloc") else source[n]
    number = np.asarray(row).reshape(28, 28)
    plt.imshow(number, cmap='gray')
    plt.title("Digit")
    return

In [None]:
# Preview the sample at row position 34.
viz(34)

In [None]:
# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split.

from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler().fit(X)
X = mm.transform(X)


In [None]:
# Divide the data into a training set and a test set (40% of the rows) for the models;
# the fixed seed keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Dimensionality reduction with PCA: fit on the training set with a 90%
# explained-variance target and show how many components that requires.

pca = PCA(n_components=0.9).fit(X_train)
pca.n_components_

In [None]:
# Reducing the 784 input dimensions to the principal components that retain 90% of
# the variance lets the models train much faster. The variance ratio is passed
# directly (rather than a hard-coded component count copied from a previous run)
# so the projection stays consistent if the data subset or the split changes.

pca = PCA(n_components=0.9)

# Fit on the training set only, then apply the same projection to the test set.
X_trainpca = pca.fit_transform(X_train)
X_testpca = pca.transform(X_test)

In [None]:
# Embed the PCA-reduced training set in 2-D with t-SNE and colour each point by its label.
view = TSNE(random_state=123, n_components=2).fit_transform(X_trainpca)
plt.figure(figsize=(20, 10))
plt.scatter(
    view[:, 0],
    view[:, 1],
    c=y_train,
    alpha=0.5,
)
plt.xlabel('t-SNE-1')
plt.ylabel('t-SNE-2')

In [None]:
# Hyperparameter grids for the SVM models.
paramC = [
    0.001, 0.01, 0.1,
    1, 5, 10,
]  # candidate values for C
gamm = [
    0.01, 0.1, 1,
]  # candidate values for gamma



In [None]:
# Start building our svm with linear kernel:
Cl=[]
Al=[]
Act1=[]

for i in paramC:
    start = time()
    clf = svm.SVC(kernel = 'linear', C=i)
    clf.fit(X_trainpca, y_train)
    accu = round(accuracy_score(y_train, clf.predict(X_trainpca)),4)*100
    accutest = round(accuracy_score(y_test, clf.predict(X_testpca)),4)*100
    end = time()
    Cl.append(i)
    Al.append((accu))
    Act1.append(accutest)
    print("Done in=", int((end-start)//1),"   Accuracy Train=",accu,"Accuracy Test",accutest,"    for C=",i )


In [None]:
#Making the results in a DataFrame format in order to be able to extrack informations.
tl=[0.55,0.27,0.18,0.17,0.26,0.37]
resultslin = pd.DataFrame({'time(s) ' : tl,
                         'C' : Cl,
                        "Accuracy Score Train" : Al,
                          "Accuracy Score Test":Act1},
                        columns=['time(s) ','C',"Accuracy Score Train","Accuracy Score Test"])

In [None]:
# Ascending sort on test accuracy: the best-scoring C ends up in the last row.
resultslin.sort_values("Accuracy Score Test",ascending=True)

In [None]:
# Linear kernel: train vs. test accuracy as a function of C (log-scaled x axis).

plt.figure(figsize=(10, 6))
for column in ("Accuracy Score Train", "Accuracy Score Test"):
    plt.plot(resultslin["C"], resultslin[column])
plt.title("Kernel Linear")
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.legend(['Accuracy Train', 'Accuracy Test'])
plt.ylim(60, 100)
plt.xscale("log")

In [None]:
# Start building our svm with polynomial kernel:
Cp=[]
Gp=[]
Ap=[]
Act2=[]
for i in paramC:
    for k in gamm:
        start = time()
        clf = svm.SVC(kernel = 'poly', C=i, gamma=k)
        clf.fit(X_trainpca, y_train)
        acc = round(accuracy_score(y_train, clf.predict(X_trainpca)),4)*100
        accutest = round(accuracy_score(y_test, clf.predict(X_testpca)),4)*100
        end = time()
        Cp.append(i)
        Gp.append(k)
        Ap.append(acc)
        Act2.append(accutest)
        print("Done in=", (end-start)//1,"   Accuracy score=",acc,"Accuracy Test",accutest,"for C=",i,"for gamma=",k)


In [None]:
#Resylts for the polynomial Kernel

tp=[2.27,1.04,0.36,0.36,0.35,2.22,0.38,0.35,0.35,0.35,2.02,0.35,0.35,0.35,0.35,1.18,0.35,0.35]
resultspoly = pd.DataFrame({'time' : tp,
                           'C' : Cp,
                           'Gamma' : Gp,
                           "Accuracy Score Train" : Ap,
                           "Accuracy Score Test":Act2},
                        columns=['time','C','Gamma',"Accuracy Score Train","Accuracy Score Test"])

In [None]:
# Ascending sort on test accuracy: the best-scoring (C, gamma) pair ends up in the last row.
resultspoly.sort_values("Accuracy Score Test",ascending=True)

In [None]:
# Polynomial kernel: train vs. test accuracy against C, one panel per gamma value.
plt.figure(figsize=(16, 8))

# One slice of the results table per gamma value (0.01, 0.1 and 1).
gamma1, gamma2, gamma3 = (
    resultspoly[resultspoly['Gamma'] == value] for value in (0.01, 0.1, 1)
)

panels = zip(
    (131, 132, 133),
    (gamma1, gamma2, gamma3),
    ("Gamma 0.01", "Gamma 0.1", "Gamma 1"),
)
for position, subset, title in panels:
    plt.subplot(position)
    plt.plot(subset["C"], subset["Accuracy Score Train"])
    plt.plot(subset["C"], subset["Accuracy Score Test"])
    plt.xlabel('C')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.ylim(10, 100)
    plt.legend(['Accuracy Train', 'Accuracy Test'])
    plt.xscale("log")


In [None]:
# Start building our svm with RBF  kernel:

Cr=[]
Gr=[]
Ar=[]
Acr=[]
for i in paramC:
    for k in gamm:
        start1 = time()
        clf = svm.SVC(kernel = 'rbf', C=i, gamma=k)
        clf.fit(X_trainpca, y_train)
        accutest = round(accuracy_score(y_test, clf.predict(X_testpca)),4)*100
        acc = round(accuracy_score(y_train, clf.predict(X_trainpca)),4)*100
        end1 = time()
        Cr.append(i)
        Gr.append(k)
        Ar.append(acc)
        Acr.append(accutest)        
        print("Done in=", int((end1-start1)//1),"Accuracy score=",acc,"Accuracy Test",accutest,"for C=",i,"for gamma=",k)


In [None]:
#Results for RBF Kernel
tr=[3.15,3.08,3.00,2.57,3.25,2.30,2.45,2.50,2.48,3.27,0.57,2.02,2.50,3.00,3.42,0.27,2.37,3.27]
resultsrbf = pd.DataFrame({'time' : tr,
                         'C' : Cr,
                        'Gamma' : Gr ,
                       "Accuracy Score Train" : Ar,
                       "Accuracy Score Test":Acr},
                        columns=['time','C', 'Gamma',"Accuracy Score Train","Accuracy Score Test"])
resultsrbf.sort_values("Accuracy Score Test",ascending=True)

In [None]:
# RBF kernel: train vs. test accuracy against C, one panel per gamma value.
plt.figure(figsize=(16, 8))

# One slice of the results table per gamma value (0.01, 0.1 and 1).
gam1, gam2, gam3 = (
    resultsrbf[resultsrbf['Gamma'] == value] for value in (0.01, 0.1, 1)
)

panels = zip(
    (131, 132, 133),
    (gam1, gam2, gam3),
    ("Gamma 0.01", "Gamma 0.1", "Gamma 1"),
)
for position, subset, title in panels:
    plt.subplot(position)
    plt.plot(subset["C"], subset["Accuracy Score Train"])
    plt.plot(subset["C"], subset["Accuracy Score Test"])
    plt.xlabel('C')
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.ylim(10, 100)
    plt.legend(['Accuracy Train', 'Accuracy Test'])
    plt.xscale("log")

In [None]:
# Overall results for the SVM models: one row per kernel.
# NOTE(review): these figures are typed in by hand rather than read from the result tables.

Kernel = ["Linear", "Polynomial", "RBF"]
timeer = [0.27, 0.35, 3.27]
C = [0.01, 5, 1]
G = ["-", "0.01", "0.01"]
As = [93.64, 98.82, 97.56]
At = [92.59, 96.80, 95.91]

summary_columns = ['Kernel', 'Time(s) ', 'C', 'Gamma', "Accuracy Score Train", "Accuracy Score Test"]
results12 = pd.DataFrame(
    dict(zip(summary_columns, (Kernel, timeer, C, G, As, At))),
    columns=summary_columns,
)

In [None]:
# Show the per-kernel SVM summary table.
results12

In [None]:
# Refit the selected SVM configuration (polynomial kernel, C=5, gamma=0.01)
# and score it on both the training and the test set.
clf = svm.SVC(C=5, gamma=0.01, kernel="poly").fit(X_trainpca, y_train)
# Predictions
y_train_pred, y_test_pred = (
    clf.predict(features) for features in (X_trainpca, X_testpca)
)
# Accuracy score
train_accuracy = accuracy_score(y_train, y_train_pred) * 100
test_accuracy = accuracy_score(y_test, y_test_pred) * 100
print('Accuracy Score on Train Set: {:.2f}%'.format(train_accuracy))
print('Accuracy Score on Test Set: {:.2f}%'.format(test_accuracy))

In [None]:
# Confusion matrix of the SVM predictions on the training set.

train_confusion = confusion_matrix(y_pred=y_train_pred, y_true=y_train)
CMTrain = pd.DataFrame(train_confusion)
                  

In [None]:
# Show the training-set confusion matrix.
CMTrain

In [None]:
# Confusion matrix of the SVM predictions on the test set.
CMTest = pd.DataFrame(
    confusion_matrix(y_true=y_test, y_pred=y_test_pred)
)
CMTest

In [None]:
# Text report for the test-set predictions stored in y_test_pred.
print(classification_report(y_test,y_test_pred))
             

In [None]:
# Hold out 10% of the PCA-reduced training data as a validation set for tuning the
# k-NN classifier (a single hold-out split, not k-fold cross-validation).
X_traind, X_val, y_traind, y_val = train_test_split(
    X_trainpca,
    y_train,
    test_size=0.1,
    random_state=84,
)

In [None]:
#k-NN classification 

kn = np.arange(1,30,2)
AcValid=[]
start=time()
for k in kn:
    
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_traind,y_traind)
    score = round(knn.score(X_val, y_val),4)
    AcValid.append(score*100)
    end = time()
    print("Done in:",(end-start)//1,"k=%d,  accuracy=%.2f%%" % (k, score * 100))
                          

In [None]:
Tr = [0.04,0.10,0.15,0.21,0.28,0.34,0.40,0.46,0.53,0.60,1.06,1.13,1.19,1.26,1.33]
resultsknn = pd.DataFrame({'time(s) ' : Tr,
                        'n_Neighbors' : kn ,
                       "Accuracy score" : AcValid},
                        columns=['time(s) ', 'n_Neighbors',"Accuracy score"])

resultsknn.sort_values("Accuracy score",ascending=True)

In [None]:
# Final k-NN classifier (k=3) fitted on the whole PCA-reduced training set.

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_trainpca,y_train)
# Predict the test set once and reuse the result for both the accuracy and the report.
y_testpred = knn.predict(X_testpca)
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train,knn.predict(X_trainpca))*100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test,y_testpred)*100))
print(classification_report(y_test,y_testpred))

In [None]:
# Nearest Centroid classification: try both metrics with several shrink
# thresholds and score each configuration on the validation split.
metrics=["euclidean","manhattan"]
n_param = (0.001,0.01,0.1,1)
AcNc=[]
for i in metrics:
    for k in n_param:
        # Restart the clock per configuration so "Done in" reports this fit only,
        # not the time elapsed since the first configuration.
        start = time()
        nc = NearestCentroid(metric=i,shrink_threshold=k)
        nc.fit(X_traind,y_traind)
        score = accuracy_score(y_val,nc.predict(X_val))
        end = time()
        # Round the percentage itself to avoid floating-point noise in the output.
        accuracy = round(score * 100, 2)
        AcNc.append(accuracy)
        print("Done in:",(end-start)//1,"metric =",i,"thresholds=",k ,"   Accuracy:",accuracy,"%")

In [None]:
# Nearest Centroid with default settings, fitted on the whole PCA-reduced training set.
nc = NearestCentroid()
nc.fit(X_trainpca,y_train)
# Predict the test set once and reuse the result for both the accuracy and the report.
nc_test_pred = nc.predict(X_testpca)
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train,nc.predict(X_trainpca))*100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test,nc_test_pred)*100))
print(classification_report(y_test,nc_test_pred))

In [None]:
# KPCA+ LDA method using k-NN classifier.

co = np.arange(1,750,50)
Acc=[]
Acct=[]
for i in co:
    start = time()
    # KPCA with rbf kernel 
    kp = KernelPCA(n_components=i,kernel='rbf')
    X_trainkpca = kp.fit_transform(X_train)
    X_testkpca = kp.transform(X_test)
    
    # LDA using n_components as 
    ld = LinearDiscriminantAnalysis()
    X_trainl = ld.fit_transform(X_trainkpca,y_train)
    X_testl = ld.transform(X_testkpca)

         # fiting our model using k-NN classifier 
    knn = KNeighborsClassifier(n_neighbors=15)
    knn.fit(X_trainl,y_train)
    #Calculating accuracy
    accTr = round(accuracy_score(y_train,knn.predict(X_trainl)),4)*100
    accT = round(accuracy_score(y_test,knn.predict(X_testl)),4)*100
    Acc.append(accTr)
    Acct.append(accT)
    end = time()
    print("Done in: ",(end-start)//1,"Accuracy train : ",accTr,"Accuracy test : ",accT,"neighbors ",i,)
        


In [None]:
#Results for KPCA-LDA for k-NN classification 
tkp=[1.13,11.15,11.18,11.02,11.08,12.23,16.25,15.35,9.27,15.02,13.22,15.42,15.10,15.17,13.28]
resultskn = pd.DataFrame({'time(s) ' : tkp,
                           "components":co,
                       "Accuracy Score Train" : Acc,
                          "Accuracy Score Test":Acct},
                        columns=['time(s) ','components',"Accuracy Score Train","Accuracy Score Test"])
resultskn.sort_values("Accuracy Score Test",ascending=True)

In [None]:
# KPCA+ LDA method using Nearest Centroid.

co = np.arange(1,750,50)
Anc=[]
Anct=[]
for i in co:
    start = time()
    
    # KPCA with rbf kernel 
    kp = KernelPCA(n_components=i,kernel='rbf')
    X_trainkpca = kp.fit_transform(X_train)
    X_testkpca = kp.transform(X_test)
    
    # LDA using undefined n_components 
    ld = LinearDiscriminantAnalysis()
    X_trainl = ld.fit_transform(X_trainkpca,y_train)
    X_testl = ld.transform(X_testkpca)
    
    # fiting our model using Nearest Centriod classifier 
    nc = NearestCentroid()
    nc.fit(X_trainl,y_train)
    #calculating accuracy 
    accTr = round(accuracy_score(y_train,nc.predict(X_trainl)),4)*100
    accT = round(accuracy_score(y_test,nc.predict(X_testl)),4)*100
    Anc.append(accTr)
    Anct.append(accT)
    end = time()
    print("Done in: ",(end-start)//1,"Accuracy train : ",accTr,"Accuracy test : ",accT)

In [None]:
#Results for KPCA-LDA for Nearest centroid classification 
trn=[0.48,13.05,15.53,8.12,8.13,14.50,17.23,14.02,11.02,10.25,9.22,10.22,10.52,13.55]
resultsnc = pd.DataFrame({'time(s) ' : trn,
                           "components":co,
                       "Accuracy Score Train" : Anc,
                          "Accuracy Score Test":Anct},
                        columns=['time(s) ',"components","Accuracy Score Train","Accuracy Score Test"])
resultsnc.sort_values("Accuracy Score Test",ascending=True)

In [None]:
# KPCA+LDA: train vs. test accuracy against the number of components,
# k-NN on the left and Nearest Centroid on the right.

legend_labels = ['Accuracy Train', 'Accuracy Test']
accuracy_columns = ("Accuracy Score Train", "Accuracy Score Test")
plt.figure(figsize=(10, 6))

# Left panel: k-NN classifier
plt.subplot(121)
for column in accuracy_columns:
    plt.plot(resultskn["components"], resultskn[column])
plt.title("KPCA+LDA k-NN")
plt.xlabel('components')
plt.ylabel('Accuracy')
plt.ylim(60, 100)
plt.legend(legend_labels)

# Right panel: Nearest Centroid classifier
plt.subplot(122)
for column in accuracy_columns:
    plt.plot(resultsnc["components"], resultsnc[column])
plt.title("KPCA+LDA Nearest Centroid")
plt.xlabel('components')
plt.ylabel('Accuracy')
plt.ylim(60, 100)
plt.legend(legend_labels)

In [None]:
    # KPCA + LDA k-NN classifier 
kp = KernelPCA(n_components=701,kernel='rbf')
X_trainkpca = kp.fit_transform(X_train)
X_testkpca = kp.transform(X_test)

# LDA on top of the kernel-PCA features (default number of components)
ld = LinearDiscriminantAnalysis()
X_trainl = ld.fit_transform(X_trainkpca,y_train)
X_testl = ld.transform(X_testkpca)

# Fitting our model
# NOTE(review): n_neighbors=5 here, while the component sweep used n_neighbors=15 -- confirm which is intended.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_trainl,y_train)
# Predict the test set once and reuse the result for both the accuracy and the report.
y_tpred = knn.predict(X_testl)
# Calculating accuracy
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train,knn.predict(X_trainl))*100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test,y_tpred)*100))
print(classification_report(y_test,y_tpred))

In [None]:
    # KPCA + LDA Nearest Centroid 
kp = KernelPCA(n_components=651,kernel='rbf')
X_trainkpca = kp.fit_transform(X_train)
X_testkpca = kp.transform(X_test)

# LDA on top of the kernel-PCA features (default number of components)
ld = LinearDiscriminantAnalysis()
X_trainl = ld.fit_transform(X_trainkpca,y_train)
X_testl = ld.transform(X_testkpca)

# Nearest Centroid on the LDA projection
nc = NearestCentroid()
nc.fit(X_trainl,y_train)
# Predict the test set once and reuse the result for both the accuracy and the report.
nc_test_pred = nc.predict(X_testl)
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train,nc.predict(X_trainl))*100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test,nc_test_pred)*100))
print(classification_report(y_test,nc_test_pred))

In [None]:
# Final comparison of all the methods: one row per method.
# NOTE(review): these figures are typed in by hand rather than read from the earlier results.

Method = ["SVM", "k-NN", "Nearest Centroid", "KPCA+LDA k-NN", "KPCA+LDA Nearest Centroid"]
timeer = [0.35, 0.10, 0.18, 15.37, 13.12]
At = [98.82, 98.02, 80.94, 97.66, 96.29]
Atest = [96.80, 96.12, 81.00, 95.41, 94.79]

final_columns = ['Method', 'Time(s) ', 'Accuracy Train Set', "Accuracy Test Set"]
resultsfinal = pd.DataFrame(
    dict(zip(final_columns, (Method, timeer, At, Atest))),
    columns=final_columns,
)
resultsfinal