In [210]:
import pandas as pd
from scipy import sparse
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import time 
import sys
import math
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import average_precision
from scipy.io import loadmat
from matplotlib.colors import ListedColormap
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
import random
import sys
# from sklearn.metrics import multilabel_confusion_matrix

In [164]:
def take_subsample(x,y,subsample_size):
    #subsample_size is the percentage of each class that we would sample
    #this way we make sure we have a balanced sample dataset from each class
    
    class_xs = []
    min_elems = None

    for yi in np.unique(y):
        elems = x[(y == yi)]
#         print ('elems',elems)
#         print ('yi',yi)
#         sys.exit(1)
        class_xs.append((yi, elems))
        if min_elems == None or elems.shape[0] < min_elems:
            min_elems = elems.shape[0]

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems*subsample_size)

    xs = []
    ys = []

    for ci,this_xs in class_xs:
        if len(this_xs) > use_elems:
            np.random.shuffle(this_xs)

        x_ = this_xs[:use_elems]
        y_ = np.empty(use_elems)
        y_.fill(ci)

        xs.append(x_)
        ys.append(y_)

    xs = np.concatenate(xs)
    ys = np.concatenate(ys)

    return xs,ys

In [35]:
Xtest = loadmat('mnist_10digits.mat')['xtest'][:]
Xtrain = loadmat('mnist_10digits.mat')['xtrain'][:]
Ytest = loadmat('mnist_10digits.mat')['ytest'][0]
Ytrain = loadmat('mnist_10digits.mat')['ytrain'][0]

In [82]:
#standardize features between 0 - 1
std_Xtrain = Xtrain/255.0
std_Xtest = Xtest/255.0
std_Ytrain = Ytrain
std_Ytest = Ytest

In [218]:
labels = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [165]:
#resample data for SVM
x_train, y_train = take_subsample(std_Xtrain,std_Ytrain,0.3)

In [192]:
# determine the optimum number of neighbors in KNN and get the best classification

clf = KNeighborsClassifier()

# Create the parameters list you wish to tune
parameters = {'n_neighbors' : [2,4,8,10,20]}

# Make an fbeta_score scoring object
scorer = make_scorer(f1_score,average='weighted')

# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(std_Xtrain, std_Ytrain)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(std_Xtrain, std_Ytrain)).predict(std_Xtest)
best_predictions = best_clf.predict(std_Xtest)

# Report the before-and-afterscores
print ("Unoptimized model\n------")
print ('f1  score on testing data: ' + str(f1_score(std_Ytest, predictions,average = None)))
print ("\nOptimized Model\n------")
print ("Final f1 score on the testing data: "+ str(f1_score(std_Ytest, best_predictions,average = None)))
print ("Final precision score on the testing data: "+str(precision_score(std_Ytest, best_predictions,average = None)))
print ("Final recall score on the testing data: "+str(recall_score(std_Ytest, best_predictions,average = None)))
print ("confusion matrix: ")
print (confusion_matrix(std_Ytest, svm_clf.predict(std_Xtest)))

print "parameters", best_clf.get_params

Unoptimized model
------
f1  score on testing data: [0.97840281 0.97588286 0.9710926  0.96538081 0.9687019  0.96582633
 0.98386257 0.96108949 0.96206533 0.95483871]

Optimized Model
------
Final f1 score on the testing data: [0.979428   0.97546276 0.97067449 0.96500739 0.96846389 0.96259073
 0.98329854 0.95782841 0.95906433 0.95883534]
Final precision score on the testing data: [0.96347483 0.9537037  0.97928994 0.96074583 0.96747967 0.95884316
 0.98329854 0.95458937 0.99448732 0.97151577]
Final recall score on the testing data: [0.99591837 0.99823789 0.9622093  0.96930693 0.9694501  0.96636771
 0.98329854 0.96108949 0.92607803 0.94648167]
confusion matrix: 
[[ 969    0    2    2    0    4    1    1    1    0]
 [   0 1122    3    2    0    1    3    1    3    0]
 [   6    0  991    5    6    0    5    8   11    0]
 [   0    0    5  968    0   12    0   10   12    3]
 [   1    1    4    0  951    0    7    2    1   15]
 [   5    1    2   14    2  851    6    1    7    3]
 [   8    3    1

In [219]:
#calculate SVM results with Gamma based on Median Trick

KNN = KNeighborsClassifier(n_neighbors=4)
KNN_clf = KNN.fit(x_train, y_train)

KNN_mat = confusion_matrix(std_Ytest, KNN_clf.predict(std_Xtest))
KNN_precision = precision_score(std_Ytest, KNN_clf.predict(std_Xtest),average='weighted')
KNN_recall = recall_score(std_Ytest, KNN_clf.predict(std_Xtest),average='weighted')
KNN_f1 = f1_score(std_Ytest, KNN_clf.predict(std_Xtest),average='weighted')

print "confusion matrix for K nearest neighbor is: ", KNN_mat
print "precision for K nearest neighbor is: ", KNN_precision, " recall: ", KNN_recall, " f1 score: ", KNN_f1
print (classification_report(Ytest, KNN_clf.predict(std_Xtest), labels=labels))

confusion matrix for K nearest neighbor is:  [[ 975    1    1    0    0    1    1    1    0    0]
 [   0 1132    2    0    0    0    1    0    0    0]
 [  24   20  955    3    2    0    3   19    6    0]
 [   1    4    4  970    1   14    0    6    6    4]
 [   0   17    1    0  936    0    3    0    1   24]
 [   6    2    0   28    3  838    7    1    2    5]
 [  10    4    0    0    3    5  936    0    0    0]
 [   0   32    3    0    6    1    0  975    0   11]
 [  10    3    4   25    7   34    5   10  870    6]
 [   9    7    3    6   12    6    1   19    1  945]]
precision for K nearest neighbor is:  0.9539728846464691  recall:  0.9532  f1 score:  0.9530565279426535
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       980
           1       0.99      0.97      0.98      1135
           2       0.95      0.96      0.96      1032
           3       0.93      0.95      0.94      1010
           4       0.96      0.94      0.95     

In [141]:
#calculate Gamma for SVM using Median trick
temp_distance= np.zeros(1000)

for k in range(1000):
    i  = random.randint(1, (x_train.shape[0]-1))
    
    j = random.randint(1, (x_train.shape[0]-1))
    if i==j:
        while i==j:
            j = random.ranint(1, x_train.shape[0]) 
    temp_distance[k] = sum(np.power((x_train[i,:] - x_train[j,:] ),2))

m = np.median(temp_distance)
sigma = (m/2.0)**0.5
Gamma = 0.5/(sigma**2)

In [220]:
#calculate SVM results with Gamma based on Median Trick

SVM = svm.SVC(kernel='rbf',gamma=Gamma)
svm_clf = SVM.fit(x_train, y_train)

SVM_mat = confusion_matrix(std_Ytest, svm_clf.predict(std_Xtest))
SVM_precision = precision_score(std_Ytest, svm_clf.predict(std_Xtest),average='weighted')
SVM_recall = recall_score(std_Ytest, svm_clf.predict(std_Xtest),average='weighted')
SVM_f1 = f1_score(std_Ytest, svm_clf.predict(std_Xtest),average='weighted')

print "confusion matrix for kernel SVM is: ", SVM_mat
print "precision for kernel SVM is: ", SVM_precision, " recall: ", SVM_recall, " f1 score: ", SVM_f1
print (classification_report(Ytest, svm_clf.predict(std_Xtest), labels=labels))

confusion matrix for kernel SVM is:  [[ 969    0    2    2    0    4    1    1    1    0]
 [   0 1122    3    2    0    1    3    1    3    0]
 [   6    0  991    5    6    0    5    8   11    0]
 [   0    0    5  968    0   12    0   10   12    3]
 [   1    1    4    0  951    0    7    2    1   15]
 [   5    1    2   14    2  851    6    1    7    3]
 [   8    3    1    1    2    7  933    0    3    0]
 [   0   12   21    2    4    0    0  967    3   19]
 [   5    0    6    8    7   10    7    6  923    2]
 [   6    5    3    8   23    6    0    8    3  947]]
precision for kernel SVM is:  0.9621663000557678  recall:  0.9622  f1 score:  0.9621418323782506
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.98      0.99      0.98      1135
           2       0.95      0.96      0.96      1032
           3       0.96      0.96      0.96      1010
           4       0.96      0.97      0.96       982
          

In [221]:
#linear SVM
SVM = svm.SVC(kernel='linear')
svm_clf = SVM.fit(x_train, y_train)

SVM_mat = confusion_matrix(Ytest, svm_clf.predict(std_Xtest))
SVM_precision = precision_score(Ytest, svm_clf.predict(std_Xtest),average='weighted')
SVM_recall = recall_score(Ytest, svm_clf.predict(std_Xtest),average='weighted')
SVM_f1 = f1_score(Ytest, svm_clf.predict(std_Xtest),average='weighted')

print "confusion matrix for SVM is: ", SVM_mat
print "precision for SVM is: ", SVM_precision, " recall: ", SVM_recall, " f1 score: ", SVM_f1
print (classification_report(Ytest, svm_clf.predict(std_Xtest), labels=labels))

confusion matrix for SVM is:  [[ 959    0    0    3    1    9    7    0    1    0]
 [   0 1115    3    2    0    2    2    1   10    0]
 [  12    2  952   17    8    5    7   11   18    0]
 [   4    2   18  938    1   22    0    5   14    6]
 [   2    1   11    0  927    0   11    5    1   24]
 [  14    3    5   50    6  772   12    3   21    6]
 [   7    4   12    2    9   13  906    2    3    0]
 [   0   11   20    6    7    0    0  950    0   34]
 [  14    2   16   24   13   25   10    8  854    8]
 [   5    6    8    9   45   10    1   25    5  895]]
precision for SVM is:  0.9267901208852755  recall:  0.9268  f1 score:  0.9265712338684554
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       980
           1       0.97      0.98      0.98      1135
           2       0.91      0.92      0.92      1032
           3       0.89      0.93      0.91      1010
           4       0.91      0.94      0.93       982
           5       0.90 

In [162]:
#determine the best hyperparameters for Logistics regression

clf = LogisticRegression(random_state=0)

# Create the parameters list you wish to tune
parameters = {'C':[100, 10, 1.0, 0.1, 0.01],
              'solver':['newton-cg', 'lbfgs', 'sag', 'saga']}


# Make an fbeta_score scoring object
scorer = make_scorer(f1_score,average='weighted')

# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(std_Xtrain, std_Ytrain)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(std_Xtrain, std_Ytrain)).predict(std_Xtest)
best_predictions = best_clf.predict(std_Xtest)

# Report the before-and-afterscores
print "Unoptimized model\n------"
print "f1  score on testing data: {:.4f}".format(f1_score(std_Ytest, predictions,average='weighted'))
print "\nOptimized Model\n------"
print "Final f1 score on the testing data: {:.4f}".format(f1_score(std_Ytest, best_predictions,average='weighted'))
print "Final precision score on the testing data: {:.4f}".format(precision_score(std_Ytest, best_predictions,average='weighted'))
print "Final recall score on the testing data: {:.4f}".format(recall_score(std_Ytest, best_predictions,average='weighted'))
print "confusion matrix: "
print (confusion_matrix(y_test, svm_clf.predict(x_test)))

print "parameters", best_clf.get_params



Unoptimized model
------
f1  score on testing data: 0.9199

Optimized Model
------
Final f1 score on the testing data: 0.9196
Final precision score on the testing data: 0.9196
Final recall score on the testing data: 0.9198
confusion matrix: 
[[265   0   0   0   0   1   0   0   1   0]
 [  0 264   1   1   0   0   0   1   0   0]
 [  1   0 256   3   2   0   1   1   3   0]
 [  0   0   1 257   0   4   0   1   2   2]
 [  0   0   0   0 258   0   3   0   0   6]
 [  0   0   1   2   0 261   1   0   2   0]
 [  4   1   1   0   3   3 255   0   0   0]
 [  1   0   6   1   1   0   0 258   0   0]
 [  0   1   2   3   0   4   2   0 254   1]
 [  1   2   0   5   5   2   0   2   3 247]]
parameters <bound method LogisticRegression.get_params of LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)>


In [222]:
#logistics regression with default values

logReg = LogisticRegression(random_state=0).fit(std_Xtrain, Ytrain)
logReg_mat = confusion_matrix(Ytest, logReg.predict(std_Xtest))
logReg_precision = precision_score(Ytest, logReg.predict(std_Xtest),average='weighted')
logReg_recall = recall_score(Ytest, logReg.predict(std_Xtest),average='weighted')
logReg_f1 = f1_score(Ytest, logReg.predict(std_Xtest),average='weighted')

print "confusion matrix for logistics regression is: ", logReg_mat
print "precision for logistics regression is: ", logReg_precision, " recall: ", logReg_recall, " f1 score: ", logReg_f1
print (classification_report(Ytest, logReg.predict(std_Xtest), labels=labels))

confusion matrix for logistics regression is:  [[ 960    0    1    2    0    5    6    3    1    2]
 [   0 1112    3    1    0    1    5    1   12    0]
 [   8    8  920   20    9    5   10   11   37    4]
 [   4    0   17  919    2   22    4   12   21    9]
 [   1    2    5    3  914    0   10    2    7   38]
 [  10    2    0   42   10  769   17    7   28    7]
 [   9    3    7    2    6   20  907    1    3    0]
 [   2    7   22    5    8    1    1  950    5   27]
 [  10   14    5   21   14   27    7   11  853   12]
 [   8    8    2   13   31   14    0   24   12  897]]
precision for logistics regression is:  0.919956947795481  recall:  0.9201  f1 score:  0.919899218277874
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       980
           1       0.96      0.98      0.97      1135
           2       0.94      0.89      0.91      1032
           3       0.89      0.91      0.90      1010
           4       0.92      0.93      0.93   

In [159]:
#determine the best hyperparameters for MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(20,10))

# Create the parameters list you wish to tune
parameters = {'solver' :[ 'sgd', 'adam'],
              'activation': ['tanh', 'relu'],
              'alpha': [0.0001, 0.05]}


# Make an fbeta_score scoring object
scorer = make_scorer(f1_score,average='weighted')

# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(std_Xtrain, std_Ytrain)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(std_Xtrain, std_Ytrain)).predict(std_Xtest)
best_predictions = best_clf.predict(std_Xtest)

# Report the before-and-afterscores
print "Unoptimized model\n------"
print "f1  score on testing data: {:.4f}".format(f1_score(std_Ytest, predictions,average='weighted'))
print "\nOptimized Model\n------"
print "Final f1 score on the testing data: {:.4f}".format(f1_score(std_Ytest, best_predictions,average='weighted'))
print "Final precision score on the testing data: {:.4f}".format(precision_score(std_Ytest, best_predictions,average='weighted'))
print "Final recall score on the testing data: {:.4f}".format(recall_score(std_Ytest, best_predictions,average='weighted'))
print "confusion matrix: "
print (confusion_matrix(y_test, svm_clf.predict(x_test)))

print "parameters", best_clf.get_params

Unoptimized model
------
f1  score on testing data: 0.9513

Optimized Model
------
Final f1 score on the testing data: 0.9587
Final precision score on the testing data: 0.9589
Final recall score on the testing data: 0.9587
confusion matrix: 
[[265   0   0   0   0   1   0   0   1   0]
 [  0 264   1   1   0   0   0   1   0   0]
 [  1   0 256   3   2   0   1   1   3   0]
 [  0   0   1 257   0   4   0   1   2   2]
 [  0   0   0   0 258   0   3   0   0   6]
 [  0   0   1   2   0 261   1   0   2   0]
 [  4   1   1   0   3   3 255   0   0   0]
 [  1   0   6   1   1   0   0 258   0   0]
 [  0   1   2   3   0   4   2   0 254   1]
 [  1   2   0   5   5   2   0   2   3 247]]
parameters <bound method MLPClassifier.get_params of MLPClassifier(activation='relu', alpha=0.05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_

In [224]:
# Neural networks with default values
mlp = MLPClassifier(hidden_layer_sizes=(20,10))
M = mlp.fit(std_Xtrain,Ytrain)

M_mat = confusion_matrix(Ytest, M.predict(std_Xtest))
M_precision = precision_score(Ytest, M.predict(std_Xtest),average='weighted')
M_recall = recall_score(Ytest, M.predict(std_Xtest),average='weighted')
M_f1 = f1_score(Ytest, M.predict(std_Xtest),average='weighted')

print "confusion matrix for Neural networks is: ", M_mat
print "precision for Neural networks is: ", M_precision, " recall: ", M_recall, " f1 score: ", M_f1

print (classification_report(Ytest, M.predict(std_Xtest), labels=labels))

confusion matrix for Neural networks is:  [[ 957    1    2    0    4    3    5    1    1    6]
 [   0 1114    4    2    1    2    3    3    6    0]
 [   3    6  976   13    6    2    2   10   14    0]
 [   5    4    8  948    0   12    0   14   12    7]
 [   2    0    5    1  911    3   23    9    3   25]
 [   5    2    0   18    6  828   11    1   11   10]
 [   8    3    8    0    7   11  911    1    9    0]
 [   3    4   12    4    6    0    0  975   10   14]
 [   5    1    5   11    7   10   11    6  909    9]
 [   5    3    0    6   19    7    1   15    4  949]]
precision for Neural networks is:  0.9478017330604122  recall:  0.9478  f1 score:  0.9477749087903768
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       980
           1       0.98      0.98      0.98      1135
           2       0.96      0.95      0.95      1032
           3       0.95      0.94      0.94      1010
           4       0.94      0.93      0.93       982
