# MNIST

The objective here is to correctly identify digits from a dataset of tens of thousands of handwritten images. 

In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Loading Data

In [3]:
mnist = pd.read_csv('datasets/mnist/train.csv')
mnist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [4]:
mnist.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Decision Tree

In [6]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(mnist, mnist['label'], test_size=0.2, random_state=42)
train_X = train_X.drop(["label"],axis=1)
test_X = test_X.drop(["label"],axis=1)

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(train_X, train_Y) 
y_pred = tree_classifier.predict(test_X)
print ("Accuracy on Training: ",sum(y_pred==test_Y)/len(test_Y))

Accuracy on Training:  0.851785714286


# Logistic Regression

In [7]:
from sklearn.model_selection import train_test_split

sample = mnist.head(500)
train_X, test_X, train_Y, test_Y = train_test_split(sample, sample['label'], test_size=0.2, random_state=42)
train_X = train_X.drop(["label"],axis=1)
test_X = test_X.drop(["label"],axis=1)

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(train_X, train_Y)
y_pred = log_reg.predict(test_X)
print("Accuracy on Training: ", sum(y_pred==test_Y)/len(test_Y))

Accuracy on Training:  0.88


# Support Vector Machines

In [8]:
from sklearn import svm
from sklearn.cross_validation import train_test_split

sample = mnist.head(700)
X_train, X_test, Y_train, Y_test = train_test_split(sample,sample['label'],random_state = 42,
                                                    test_size = 0.1)

classifier = svm.SVC(C=2.0)
classifier.fit(X_train, Y_train)
score = classifier.score(X_test, Y_test)
print("SVM score: ", score)



SVM score:  0.114285714286


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

X_train, X_test, Y_train, Y_test = train_test_split(mnist,mnist['label'],random_state = 42, test_size = 0.1)

svm_clf = Pipeline((
("scaler", StandardScaler()),
("linear_svc", LinearSVC(C=1, loss="hinge")),
))
X_train = X_train.drop(["label"], axis=1)
svm_clf.fit(X_train, Y_train)
X_test = X_test.drop(["label"], axis=1)
y_pred = svm_clf.predict(X_test)
print("Accuracy on Training: ", sum(y_pred==Y_test)/len(Y_test))

Accuracy on Training:  0.89880952381


In [13]:
X_test.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
5457,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38509,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25536,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31803,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39863,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
test = pd.read_csv('datasets/mnist/test.csv')
test.head()
predictions = svm_clf.predict(test)
image_id = np.arange(1,predictions.shape[0]+1)
pd.DataFrame({"ImageId": image_id, "Label": predictions}).to_csv('datasets/mnist/submission.csv', 
                                                                      index=False, header=True)

In [9]:
from sklearn import linear_model
sgd_clf = linear_model.SGDClassifier(loss="hinge")
sgd_clf.fit(X_train, Y_train)
y_pred = svm_clf.predict(X_test)
print("Accuracy on Training: ", sum(y_pred==Y_test)/len(Y_test))



Accuracy on Training:  0.951904761905
