In [None]:
import numpy as np 
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import TSNE

In [None]:
from sklearn import svm
from time import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Read the digit dataset from the project-relative CSV file.
mnist = pd.read_csv(filepath_or_buffer="resources/data.csv")


In [None]:
# Work on the first 10,000 rows of the dataset only.

data = mnist.iloc[:10000]

In [None]:
# Display the (rows, columns) shape of the selected subset.
data.shape

In [None]:
# Preprocessing: separate the target column ('label') from the feature columns.

y = data['label']
X = data.drop('label', axis=1)

In [None]:
# Normalizing the data: divide every feature by 255 (assumes 0-255 pixel
# intensities -- TODO confirm) and store the labels as 64-bit integers.

y = y.astype(np.int64)
X = X.div(255.0)

In [None]:
# Creating a binary problem: the target becomes the parity of the original label
# (0 = even number, 1 = odd number).
#
# The modulo is applied to the whole Series at once instead of assigning element by
# element in a Python loop; the result is the same 0/1 encoding, it does not depend on
# the Series index being 0..n-1, and re-running the cell leaves the labels unchanged.

y = y % 2


In [None]:
# Helper for visualizing a single image from our dataset

def viz(n, data=None, side=28):
    """Show one sample as a grayscale image titled "Digit".

    Parameters
    ----------
    n : int
        Positional row index of the sample to draw.
    data : DataFrame or array-like, optional
        Table of flattened images, one per row. Defaults to the notebook-level
        ``X``, whether it is currently a DataFrame or a NumPy array.
    side : int, optional
        Edge length of the square image; the selected row must hold exactly
        ``side * side`` values (28 x 28 by default).

    Returns
    -------
    None
        The image is drawn on the current matplotlib figure.
    """
    source = X if data is None else data
    # np.asarray gives positional row access for DataFrames and ndarrays alike.
    pixels = np.asarray(source)[n].reshape(side, side)
    plt.imshow(pixels, cmap='gray')
    plt.title("Digit")

In [None]:
# Draw the sample at row position 87 as a visual check of the data.
viz(87)

In [None]:
# We scaling our data into 0 and 1 

from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
X = mm.fit_transform(X)


In [None]:
# Deviting our data into train and test set to perform our SVM model.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
# Fit an exploratory PCA on the training data that keeps 90% of the variance,
# then display how many components that requires.

pca = PCA(n_components=0.9).fit(X_train)
pca.n_components_

In [None]:
# Reduce the 784 input features to the smallest number of principal components that
# still explains at least 90% of the training variance.  The variance ratio is passed
# directly instead of a hand-copied component count, so the projection stays correct
# if the data or the preprocessing changes.

pca = PCA(n_components=0.9)

# Fit on the training set only, then apply the same projection to the test set.
X_trainpca = pca.fit_transform(X_train)
X_testpca = pca.transform(X_test)

In [None]:
# Embed the PCA-reduced training set in 2-D with t-SNE and colour each point by its class.
tsne = TSNE(n_components=2, random_state=123)
view = tsne.fit_transform(X_trainpca)
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(view[:, 0], view[:, 1], c=y_train, alpha=0.5)
ax.set_xlabel('t-SNE-1')
ax.set_ylabel('t-SNE-2')

In [None]:
# Hyperparameter grids shared by the SVM experiments below.
deg = [3, 6, 8]                       # degrees tried with the polynomial kernel
gamm = ['auto', 'scale']              # gamma settings tried with the poly and RBF kernels
paramC = [1e-3, 1e-2, 1e-1, 5, 10]    # values of the regularisation parameter C


In [None]:
# SVM with a linear kernel: fit one model per value of C and record the train/test
# accuracy (in percent) for the results table.
Cl = []
Al = []
Atl = []
for i in paramC:
    start = time()
    clf = svm.SVC(kernel='linear', C=i)
    clf.fit(X_trainpca, y_train)
    accu = round(accuracy_score(y_train, clf.predict(X_trainpca)), 4)
    acct = round(accuracy_score(y_test, clf.predict(X_testpca)), 4)
    end = time()
    # Round after scaling so the stored percentages carry no floating-point noise.
    train_pct = round(accu * 100, 2)
    test_pct = round(acct * 100, 2)
    Cl.append(i)
    Al.append(train_pct)
    Atl.append(test_pct)
    # Elapsed time keeps two decimals (floor division by 1 hid every sub-second fit),
    # and the test accuracy is printed once instead of twice.
    print("Done in=", round(end - start, 2), "   Accuracy score=", train_pct, "Αccuracy Test", test_pct, "for C=", i)

In [None]:
# Results table for the linear kernel.
# NOTE(review): `tl` holds hand-entered timings, not values measured by the loop above.
tl = [0.43, 0.30, 0.23, 1.23, 1.54]
lin_columns = ['time(s) ', 'C', "Accuracy Score Train", "Accuracy Score Test"]
resultslin = pd.DataFrame(dict(zip(lin_columns, (tl, Cl, Al, Atl))), columns=lin_columns)

In [None]:
# Show the linear-kernel runs ordered from lowest to highest test accuracy.
resultslin.sort_values(by="Accuracy Score Test")

In [None]:
# Train vs. test accuracy of the linear-kernel SVM as a function of C (log-scaled x axis).
fig, ax = plt.subplots(figsize=(8, 6))
for score_column in ("Accuracy Score Train", "Accuracy Score Test"):
    ax.plot(resultslin["C"], resultslin[score_column])
ax.set_xlabel('C')
ax.set_ylabel('Accuracy')
ax.set_title("Kernel Linear")
ax.set_ylim(60, 100)
ax.legend(['Accuracy Train', 'Accuracy Test'])
ax.set_xscale("log")

In [None]:
# Start building our svm with polynomial kernel:
Cp=[]
Gp=[]
Dp=[]
Ap=[]
Apt=[]
for i in paramC:
    for k in gamm:
        for j in deg:
            start = time()
            clf = svm.SVC(kernel = 'poly', C=i, gamma=k,degree=j)
            clf.fit(X_trainpca, y_train)
            acc = round( accuracy_score(y_train, clf.predict(X_trainpca)),4)
            acct = round( accuracy_score(y_test, clf.predict(X_testpca)),4)
            end = time()
            Cp.append(i)
            Gp.append(k)
            Dp.append(j)
            Ap.append((acc*100))
            Apt.append(acct*100)
            print("Done in=", (end-start)//1,"Accuracy score=",acc*100,"Accuracy Test",acct*100,"for C=",i,"for gamma=",k," for degree=",j)

In [None]:
# Results for Polynomial Kernel
tpo=[1.20,1.14,0.35,0.36,0.46,1.19,1.17,1.19,1.19,1.12,1.17,1.12,0.33,0.40,0.43,1.04,1.12,0.25,1.02,1.12,0.22,0.45,1.02,0.25,0.50,0.40,0.08,0.24,0.23]
resultspoly = pd.DataFrame({'time(s) ':tpo,
                           'C' : Cp,
                           'Gamma' : Gp,
                           "Degree" :Dp,
                           "Accuracy Score Train" : Ap,
                           "Accuracy Score Test":Apt},
                        columns=['time(s) ','C','Gamma',"Degree","Accuracy Score Train","Accuracy Score Test"])

In [None]:
# Show the polynomial-kernel runs ordered from lowest to highest test accuracy.
resultspoly.sort_values(by="Accuracy Score Test")

In [None]:
# Accuracy vs. C for the polynomial kernel: one panel per degree.
# Every degree was run with each gamma setting, so each gamma gets its own pair of
# curves (solid = train, dashed = test).  Drawing all rows of one degree as a single
# line would visit every C value once per gamma and produce a zig-zag.
degrees = sorted(resultspoly['Degree'].unique())
fig, axes = plt.subplots(1, len(degrees), figsize=(19, 6), squeeze=False)

for ax, degree in zip(axes[0], degrees):
    per_degree = resultspoly[resultspoly['Degree'] == degree]
    for gamma_name, runs in per_degree.groupby('Gamma', sort=False):
        ax.plot(runs["C"], runs["Accuracy Score Train"],
                label='Accuracy Train (gamma=%s)' % gamma_name)
        ax.plot(runs["C"], runs["Accuracy Score Test"], linestyle='--',
                label='Accuracy Test (gamma=%s)' % gamma_name)
    ax.set_xlabel('C')
    ax.set_ylabel('Accuracy')
    ax.set_title("Degree = %d" % degree)
    ax.set_ylim(10, 100)
    ax.set_xscale("log")
    ax.legend()


In [None]:
# SVM with an RBF kernel: one fit per (C, gamma) pair, recording the train/test
# accuracy in percent for the results table.

Cr, Gr, Ar, Atr = [], [], [], []
for c_value in paramC:
    for gamma_value in gamm:
        start1 = time()
        clf = svm.SVC(kernel='rbf', C=c_value, gamma=gamma_value)
        clf.fit(X_trainpca, y_train)
        train_predictions = clf.predict(X_trainpca)
        accu = round(accuracy_score(y_train, train_predictions), 4)
        test_predictions = clf.predict(X_testpca)
        acct = round(accuracy_score(y_test, test_predictions), 4)
        end1 = time()
        Cr.append(c_value)
        Gr.append(gamma_value)
        Ar.append(accu * 100)
        Atr.append(acct * 100)
        whole_seconds = (end1 - start1) // 1
        print("Done in=", whole_seconds, "Accuracy score Train=", accu * 100,
              "Accuracy Score Test", acct * 100, "for C=", c_value, "for gamma=", gamma_value)

In [None]:
# Results table for the RBF kernel.
# NOTE(review): `tr` holds hand-entered timings, not values measured by the loop above.

tr = [1.09, 1.13, 1.05, 1.06, 0.37, 0.37, 0.13, 0.22, 0.18, 0.18]
rbf_columns = ['time(s) ', 'C', 'Gamma', "Accuracy Score Train", "Accuracy Score Test"]
resultsrbf = pd.DataFrame(dict(zip(rbf_columns, (tr, Cr, Gr, Ar, Atr))), columns=rbf_columns)

In [None]:
# Show the RBF-kernel runs ordered from lowest to highest test accuracy.
resultsrbf.sort_values(by="Accuracy Score Test")


In [None]:
# Accuracy vs. C for the RBF kernel: one panel per gamma setting, drawn by a single
# loop instead of two copies of the same plotting code.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, gamma_name in zip(axes, ("auto", "scale")):
    runs = resultsrbf[resultsrbf['Gamma'] == gamma_name]
    ax.plot(runs["C"], runs["Accuracy Score Train"])
    ax.plot(runs["C"], runs["Accuracy Score Test"])
    ax.set_xlabel('C')
    ax.set_ylabel('Accuracy')
    ax.set_title(gamma_name.capitalize())   # "Auto" / "Scale"
    ax.set_ylim(10, 100)
    ax.legend(['Accuracy Train', 'Accuracy Test'])
    ax.set_xscale("log")


In [None]:
# Summary of the best configuration per kernel.
# NOTE(review): every number below is hand-entered rather than read from the result
# tables built earlier -- confirm the values after re-running the searches.

Kernel = ["Linear", "Polynomial", "RBF"]
timeer = [1.10, 1.38, 1.32]
C = [0.01, 0.1, 10]
G = ["-", "scale", "scale"]
D = ["-", "3", "-"]
As = [88.32, 96.43, 99.97]
At = [88.38, 94.75, 97.85]

best_columns = ['Kernel', 'Time', 'C', 'Gamma', 'Degree', "Accuracy Score Train", "Accuracy Score Test"]
results12 = pd.DataFrame(dict(zip(best_columns, (Kernel, timeer, C, G, D, As, At))),
                         columns=best_columns)

In [None]:
# Display the per-kernel summary table.
results12 

In [None]:
# Refit the configuration chosen as best (RBF kernel, C=10, gamma="scale") and
# predict both the training and the test set with it.
clf = svm.SVC(C=10, gamma="scale", kernel="rbf").fit(X_trainpca, y_train)
y_test_pred = clf.predict(X_testpca)
y_train_pred = clf.predict(X_trainpca)

In [None]:
# Training accuracy of the refitted SVM in percent.  Formatting to two decimals
# avoids output such as 99.97000000000001 that round(x, 4) * 100 can produce.
print('{:.2f}'.format(accuracy_score(y_train, y_train_pred) * 100))

In [None]:
# Test accuracy of the refitted SVM in percent.  Formatting to two decimals
# avoids output such as 97.85000000000001 that round(x, 4) * 100 can produce.
print('{:.2f}'.format(accuracy_score(y_test, y_test_pred) * 100))

In [None]:
# Confusion matrix of the SVM on the training set, with "Actual" rows and
# "Prediction" columns.

train_cm_labels = ["Negative", "Positive"]
CMTrain = pd.DataFrame(
    confusion_matrix(y_train, y_train_pred),
    index=pd.MultiIndex.from_product([["Actual"], train_cm_labels]),
    columns=pd.MultiIndex.from_product([["Prediction"], train_cm_labels]),
)

In [None]:
# Display the training-set confusion matrix.
CMTrain

In [None]:
# Confusion matrix of the SVM on the test set, with "Actual" rows and
# "Prediction" columns.
test_cm_labels = ["Negative", "Positive"]
CMTest = pd.DataFrame(
    confusion_matrix(y_true=y_test, y_pred=y_test_pred),
    index=pd.MultiIndex.from_product([["Actual"], test_cm_labels]),
    columns=pd.MultiIndex.from_product([["Prediction"], test_cm_labels]),
)

In [None]:
# Display the test-set confusion matrix.
CMTest

In [None]:
# Per-class report of the SVM predictions on the test set.
svm_report = classification_report(y_test, y_test_pred)
print(svm_report)
             

In [None]:
# Hold out 10% of the PCA-reduced training data as a validation set for tuning the
# k-NN and Nearest Centroid classifiers (a single split, not k-fold cross validation).
X_traind, X_val, y_traind, y_val = train_test_split(
    X_trainpca, y_train, random_state=84, test_size=0.1
)

In [None]:
# k-NN classification: score every odd k from 1 to 29 on the validation set.

kn = np.arange(1, 30, 2)
AcValid = []
for k in kn:
    # The timer restarts for every k so the printed value is the cost of this fit and
    # score only, like the SVM search loops, instead of a running total.
    start = time()
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_traind, y_traind)
    score = round(knn.score(X_val, y_val), 4)
    end = time()
    # Round after scaling so the stored percentage carries no floating-point noise.
    AcValid.append(round(score * 100, 2))
    print("Done in:", round(end - start, 2), "k=%d, accuracy=%.2f%%" % (k, score * 100))

In [None]:
# Nearest Centroid classification: score every (metric, shrink threshold) pair on
# the validation set.
metrics = ["euclidean", "manhattan"]
n_param = (0.001, 0.01, 0.1, 1)
AcNc = []
for i in metrics:
    for k in n_param:
        # The timer restarts for every combination so the printed value is the cost of
        # this fit and prediction only, like the SVM search loops, not a running total.
        start = time()
        nc = NearestCentroid(metric=i, shrink_threshold=k)
        nc.fit(X_traind, y_traind)
        score = round(accuracy_score(y_val, nc.predict(X_val)), 4)
        end = time()
        # Round after scaling so the stored percentage carries no floating-point noise.
        score_pct = round(score * 100, 2)
        AcNc.append(score_pct)
        print("Done in:", round(end - start, 2), "metric =", i, "thresholds=", k, "   Accuracy:", score_pct, "%")

In [None]:
# Final k-NN model (k=5) trained on the full PCA-reduced training set.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_trainpca, y_train)
# Predict each set once and reuse the result for both the accuracy and the report,
# instead of running the neighbour search on the test set a second time.
y_trainpred = knn.predict(X_trainpca)
y_testpred = knn.predict(X_testpca)
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train, y_trainpred) * 100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test, y_testpred) * 100))
print(classification_report(y_test, y_testpred))

In [None]:
# Final Nearest Centroid model with default settings, trained on the full
# PCA-reduced training set.
nc = NearestCentroid()
nc.fit(X_trainpca, y_train)
# Predict the test set once and reuse it for both the accuracy and the report.
nc_test_pred = nc.predict(X_testpca)
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train, nc.predict(X_trainpca)) * 100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test, nc_test_pred) * 100))
print(classification_report(y_test, nc_test_pred))

In [None]:
# KPCA + LDA method using the k-NN classifier.
# For every number of kernel-PCA components the data is projected with an RBF
# KernelPCA, reduced to one discriminant axis with LDA, and classified with k-NN.
#
# The number of neighbours is chosen on a hold-out slice of the *training* projection.
# Picking k by its test accuracy would tune the model on the test set and make the
# reported test score optimistic.
kn = np.arange(1, 20, 2)
co = np.arange(1, 700, 50)
Acc = []
Acct = []
knp = []
for i in co:

    # KPCA with rbf kernel
    start = time()
    kp = KernelPCA(n_components=i, kernel='rbf')
    X_trainkpca = kp.fit_transform(X_train)
    X_testkpca = kp.transform(X_test)

    # LDA down to a single discriminant component
    ld = LinearDiscriminantAnalysis(n_components=1)
    X_trainl = ld.fit_transform(X_trainkpca, y_train)
    X_testl = ld.transform(X_testkpca)

    # Model selection: score every candidate k on 10% of the training projection.
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X_trainl, y_train, test_size=0.1, random_state=84)
    hold_scores = [
        KNeighborsClassifier(n_neighbors=k).fit(X_fit, y_fit).score(X_hold, y_hold)
        for k in kn
    ]
    # argmax returns the first maximum, i.e. the smallest k among ties.
    knnp = int(kn[int(np.argmax(hold_scores))])

    # Refit the selected k on the whole training projection and evaluate once.
    knn = KNeighborsClassifier(n_neighbors=knnp)
    knn.fit(X_trainl, y_train)
    bestTr = round(accuracy_score(y_train, knn.predict(X_trainl)) * 100, 2)
    bestT = round(accuracy_score(y_test, knn.predict(X_testl)) * 100, 2)

    Acc.append(bestTr)
    Acct.append(bestT)
    knp.append(knnp)
    end = time()
    print("Done in: ", round(end - start, 2), "Accuracy train : ", bestTr, "Accuracy test : ", bestT, "neighbors:", knnp)

In [None]:
# Results table for KPCA + LDA with k-NN, shown ordered by test accuracy.
# NOTE(review): `tkp` holds hand-entered timings, not values measured by the loop above.

tkp = [0.44, 3.38, 3.03, 3.22, 2.35, 1.90, 2.18, 3.45, 3.53, 3.38, 2.22, 2.17, 4.37, 4.20]
kn_columns = ['time(s) ', 'components', "neighbors", "Accuracy Score Train", "Accuracy Score Test"]
resultskn = pd.DataFrame(dict(zip(kn_columns, (tkp, co, knp, Acc, Acct))), columns=kn_columns)
resultskn.sort_values(by="Accuracy Score Test")

In [None]:
# KPCA + LDA method using the Nearest Centroid classifier: for every number of
# kernel-PCA components, project with an RBF KernelPCA, reduce to one LDA component
# and record the train/test accuracy in percent.

co = np.arange(1, 700, 50)
Anc, Anct = [], []
for n_comp in co:
    start = time()

    # KPCA with rbf kernel
    kp = KernelPCA(kernel='rbf', n_components=n_comp)
    X_trainkpca = kp.fit_transform(X_train)
    X_testkpca = kp.transform(X_test)

    # LDA down to a single discriminant component
    ld = LinearDiscriminantAnalysis(n_components=1)
    X_trainl = ld.fit_transform(X_trainkpca, y_train)
    X_testl = ld.transform(X_testkpca)

    # Fit the Nearest Centroid classifier on the projected training data
    nc = NearestCentroid().fit(X_trainl, y_train)

    # Accuracy on both sets, as a percentage
    train_fraction = accuracy_score(y_train, nc.predict(X_trainl))
    test_fraction = accuracy_score(y_test, nc.predict(X_testl))
    accTr = round(train_fraction, 4) * 100
    accT = round(test_fraction, 4) * 100
    Anc.append(accTr)
    Anct.append(accT)

    end = time()
    whole_seconds = (end - start) // 1
    print("Done in: ", whole_seconds, "Accuracy train : ", accTr, "Accuracy test : ", accT)

In [None]:
# Results table for KPCA + LDA with Nearest Centroid, shown ordered by test accuracy.
# NOTE(review): `trn` holds hand-entered timings, not values measured by the loop above.

trn = [0.29, 2.88, 3.83, 3.07, 2.18, 2.22, 3.10, 3.12, 3.55, 3.43, 2.13, 2.52, 4.37, 4.42]
nc_columns = ['time(s) ', "components", "Accuracy Score Train", "Accuracy Score Test"]
resultsnc = pd.DataFrame(dict(zip(nc_columns, (trn, co, Anc, Anct))), columns=nc_columns)
resultsnc.sort_values(by="Accuracy Score Test")

In [None]:
# Resulting plots: train/test accuracy vs. number of KPCA components for the two
# KPCA + LDA pipelines.  Both panels are drawn by one loop, and the stray argument-less
# plt.plot() call that drew nothing has been dropped.

fig, axes = plt.subplots(1, 2, figsize=(10, 6))
panels = (
    ("KPCA+LDA k-NN", resultskn),
    ("KPCA+LDA Nearest Centroid", resultsnc),
)
for ax, (panel_title, frame) in zip(axes, panels):
    ax.plot(frame["components"], frame["Accuracy Score Train"])
    ax.plot(frame["components"], frame["Accuracy Score Test"])
    ax.set_xlabel('components')
    ax.set_ylabel('Accuracy')
    ax.set_title(panel_title)
    ax.set_ylim(40, 100)
    ax.legend(['Accuracy Train', 'Accuracy Test'])

In [None]:
    # KPCA + LDA k-NN classifier 
# Project with an RBF KernelPCA (651 components), then reduce to one LDA component.
kp = KernelPCA(n_components=651, kernel='rbf')
X_trainkpca = kp.fit_transform(X_train)
X_testkpca = kp.transform(X_test)

ld = LinearDiscriminantAnalysis(n_components=1)
X_trainl = ld.fit_transform(X_trainkpca, y_train)
X_testl = ld.transform(X_testkpca)

# k-NN with 11 neighbours on the one-dimensional projection.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_trainl, y_train)

# Predict the test set once and reuse it for both the accuracy and the report.
knn_test_pred = knn.predict(X_testl)
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train, knn.predict(X_trainl)) * 100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test, knn_test_pred) * 100))
print(classification_report(y_test, knn_test_pred))

In [None]:
    # KPCA + LDA Nearest Centroid 
# Project with an RBF KernelPCA (651 components), then reduce to one LDA component.
kp = KernelPCA(n_components=651, kernel='rbf')
X_trainkpca = kp.fit_transform(X_train)
X_testkpca = kp.transform(X_test)

ld = LinearDiscriminantAnalysis(n_components=1)
X_trainl = ld.fit_transform(X_trainkpca, y_train)
X_testl = ld.transform(X_testkpca)

# Nearest Centroid with default settings on the one-dimensional projection.
nc = NearestCentroid()
nc.fit(X_trainl, y_train)

# Predict the test set once and reuse it for both the accuracy and the report.
nc_test_pred = nc.predict(X_testl)
print('Accuracy Score on Train Set: {:.2f}%'.format(accuracy_score(y_train, nc.predict(X_trainl)) * 100))
print('Accuracy Score on Test Set: {:.2f}%'.format(accuracy_score(y_test, nc_test_pred) * 100))
print(classification_report(y_test, nc_test_pred))

In [None]:
# Final comparison of all methods.
# NOTE(review): every number below is hand-entered, and the SVM accuracies differ from
# the ones in the per-kernel summary table earlier in the notebook -- confirm which
# run they come from.

Method = ["SVM", "k-NN", "Nearest Centroid", "KPCA+LDA k-NN", "KPCA+LDA Nearest Centroid"]
timeer = [1.32, 0.54, 0.18, 4.20, 4.42]
At = [99.98, 98.85, 80.83, 97.67, 97.43]
Atest = [98.81, 98.28, 82.08, 95.67, 95.80]

final_columns = ['Method', 'Time(s) ', 'Accuracy Train Set', "Accuracy Test Set"]
resultsfinal = pd.DataFrame(dict(zip(final_columns, (Method, timeer, At, Atest))),
                            columns=final_columns)

In [None]:
# Display the final comparison table.
resultsfinal