# SVM on the Digits Dataset SOL

### Imports and Prep

In [None]:
from __future__ import print_function

In [None]:
import numpy as np
import pandas as pd

from IPython.display import Image

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall
# back to the old location only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import svm

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load scikit-learn's bundled digits dataset and print the shape of its
# feature matrix (`digits.data`).
digits = datasets.load_digits()
print(digits.data.shape) 

In [None]:
# Render one sample image (index 166) from `digits.images`.
# NOTE: plt.gray() changes matplotlib's default colormap to grayscale, so it
# also affects any later colormapped plot in this session.
plt.gray()
plt.matshow(digits.images[166])
plt.show()

In [None]:
# Display the image file 'digits.png'. The path is relative, so the file has
# to be present in the notebook's working directory.
Image(filename='digits.png') 

In [None]:
# Center every pixel feature on its own mean (axis=0). Calling `.mean()`
# without an axis returns a single scalar over the whole matrix, which shifts
# all columns by the same amount and leaves the individual columns un-centered.
X_centered = digits.data - digits.data.mean(axis=0)
# Class labels aligned row-for-row with `digits.data`.
y = digits.target

In [None]:
# Hold out half of the samples for testing; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_centered,
    y,
    test_size=0.5,
    random_state=42
)

In [None]:
# Count how many training samples carry each label, to check class balance
# after the split.
pd.Series(y_train).value_counts()

# Principal Component Analysis

In [None]:
# Fit a two-component PCA using the training split only.
pca = PCA(n_components=2)
pca.fit(X_train)

In [None]:
# Project the training samples onto the two fitted components.
pcafeatures_train = pca.transform(X_train)

In [None]:
from itertools import cycle

def plot_PCA_2D(data, target, target_names):
    """Scatter-plot 2-D projected samples, one colour per class.

    Parameters
    ----------
    data : array-like
        Projected samples; column 0 is used as the x coordinate and column 1
        as the y coordinate.
    target : array-like
        Class id of each row of ``data``. Ids are matched against the
        positions ``0 .. len(target_names) - 1``.
    target_names : sequence
        Legend label for each class id, in id order.

    A new figure is created and a legend is added; nothing is returned.
    """
    # Every colour has to be visible on a white background and distinguishable
    # from the others: the former 'w' entry drew one class invisibly, and
    # 'yellow' was nearly identical to 'y'.
    palette = cycle(['r', 'g', 'b', 'c', 'm', 'y', 'orange', 'k', 'aqua', 'purple'])
    # Coerce to arrays so the boolean mask below also works for plain lists.
    data = np.asarray(data)
    target = np.asarray(target)
    plt.figure()
    for (class_id, label), colour in zip(enumerate(target_names), palette):
        members = target == class_id
        plt.scatter(data[members, 0], data[members, 1],
                    c=colour, label=label)
    plt.legend()

In [None]:
# Plot the 2-D PCA projection of the training samples, coloured by label.
plot_PCA_2D(pcafeatures_train, target=y_train, target_names=digits.target_names)

# Fitting Linear and RBF SVM Models

In [None]:
# Fit a linear-kernel SVM classifier on the full training feature matrix
# (X_train, not the 2-D PCA projection).
model_svm = svm.SVC(kernel='linear')
model_svm.fit(X_train, y_train)

In [None]:
# Predict out of sample: label the held-out test split with the linear SVM.
y_pred = model_svm.predict(X_test)

In [None]:
# Check accuracy: fraction of test samples the linear SVM labelled correctly.
accuracy_score(y_test,y_pred)
# pretty good accuracy!

In [None]:
# Confusion matrix for the linear SVM; with (y_true, y_pred) argument order,
# rows are the true labels and columns are the predicted labels.
confusion_matrix(y_test,y_pred)

In [None]:
# Fit an RBF-kernel SVM with a fixed kernel coefficient (gamma = 0.001) on the
# same training features used for the linear model.
model_svm2 = svm.SVC(kernel='rbf', gamma = 0.001)
model_svm2.fit(X_train, y_train)

In [None]:
# Predict out of sample: label the held-out test split with the RBF SVM.
y_pred2 = model_svm2.predict(X_test)

In [None]:
# Check accuracy: fraction of test samples the RBF SVM labelled correctly.
accuracy_score(y_test,y_pred2)
# our accuracy improved!

In [None]:
# Confusion matrix for the RBF SVM; with (y_true, y_pred) argument order,
# rows are the true labels and columns are the predicted labels.
confusion_matrix(y_test,y_pred2)