In [2]:
#import all needed packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import scale
from sklearn import metrics

# Load the labelled training data from CSV into a DataFrame
train_dataFrame = pd.read_csv(filepath_or_buffer="train.csv")

# Kept as a list so that an additional labelled set could be appended later;
# the provided test set does not come with labels, so it is not included here
frames = [train_dataFrame]
combinedDataFrame = pd.concat(objs=frames)

In [102]:
# Remove rows containing missing values. DataFrame.dropna() returns a new
# frame instead of modifying the caller, so the result has to be assigned
# back for the rows to actually be dropped.
combinedDataFrame = combinedDataFrame.dropna()
combinedDataFrame.shape

(42000, 785)

In [103]:
## Feature matrix X: every column except 'label'
X = combinedDataFrame.drop('label', axis=1)

## Target vector y: the 'label' column
y = combinedDataFrame['label']

In [104]:
# Divide every feature by 255 (presumably pixel intensities in 0-255, which
# would map them into [0, 1] -- confirm against the data)
X = X.div(255.0)
# Standardize each column with sklearn's scale().
# NOTE(review): the scaling statistics are computed over all rows, before the
# train/test split performed in the next cell, so held-out rows influence them.
X_scaled = scale(X)

In [105]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffled split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=10,
)

# Report the shape of each partition, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(33600, 784)
(8400, 784)
(33600,)
(8400,)


In [106]:
# Linear-kernel SVM baseline.
# decision_function_shape='ovr' (the sklearn default) only controls the shape
# of the decision_function output; per the sklearn documentation, SVC always
# trains its multi-class model one-vs-one internally.

linearModel = SVC(kernel='linear', decision_function_shape='ovr')
linearModel.fit(X_train, y_train)

# Predict labels for the held-out features
labelPrediction = linearModel.predict(X_test)

# Print the accuracy followed by the confusion matrix
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=labelPrediction), "\n")
print(metrics.confusion_matrix(y_true=y_test, y_pred=labelPrediction))

accuracy: 0.9188095238095239 

[[774   0   0   0   5   7   7   1   2   0]
 [  0 930   2   2   1   1   0   3   7   0]
 [ 10  14 778  16   6   6   6   6  11   3]
 [  6   2  25 776   0  29   1   6  10   1]
 [  2   1  11   3 783   2   2   2   0  20]
 [  5   8  11  30   4 675  11   1  19   9]
 [ 10   1  14   0   8  11 743   0   0   0]
 [  3   3  12   4  13   0   0 818   1  15]
 [  5  21  20  40   4  36   8   2 692   8]
 [  3   3   5  14  32   6   1  35   7 749]]


In [107]:
# RBF kernel, C=10, gamma=0.001 (both set explicitly, not the sklearn defaults)

# Create the model
rbfKernelModel = SVC(C=10, gamma=0.001, kernel='rbf')
# Train the model on the training partition
rbfKernelModel.fit(X_train, y_train)
# Predict labels for the held-out features
# (this rebinds labelPrediction, replacing the linear model's predictions)
labelPrediction = rbfKernelModel.predict(X_test)

# Print the accuracy followed by the confusion matrix
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=labelPrediction), "\n")
print(metrics.confusion_matrix(y_true=y_test, y_pred=labelPrediction))

accuracy: 0.9678571428571429 

[[781   0   3   1   0   1   4   3   3   0]
 [  0 934   4   2   2   0   1   3   0   0]
 [  4   3 828   4   3   1   0  10   3   0]
 [  1   0   8 821   0  11   0  11   3   1]
 [  1   1   2   0 807   1   4   3   1   6]
 [  0   0   1  10   1 745   4   5   6   1]
 [  5   1   2   0   2   2 771   4   0   0]
 [  1   2   6   2   4   0   1 845   0   8]
 [  3   5   9   7   5   7   5   3 787   5]
 [  1   0   1  11   7   2   1  19   2 811]]
