# MNIST

The objective here is to correctly identify digits from a dataset of tens of thousands of handwritten images. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Loading Data

In [2]:
mnist = pd.read_csv('datasets/mnist/train.csv')
mnist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


# Decision Tree

In [3]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(mnist, mnist['label'], test_size=0.2, random_state=42)
train_X = train_X.drop(["label"],axis=1)
test_X = test_X.drop(["label"],axis=1)

tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(train_X, train_Y) 
y_pred = tree_classifier.predict(test_X)
print ("Accuracy on Training: ",sum(y_pred==test_Y)/len(test_Y))

Accuracy on Training:  0.852261904762


# Logistic Regression

In [13]:
from sklearn.model_selection import train_test_split

sample = mnist.head(500)
train_X, test_X, train_Y, test_Y = train_test_split(sample, sample['label'], test_size=0.2, random_state=42)
train_X = train_X.drop(["label"],axis=1)
test_X = test_X.drop(["label"],axis=1)

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(train_X, train_Y)
y_pred = log_reg.predict(test_X)
print("Accuracy on Training: ", sum(y_pred==test_Y)/len(test_Y))

Accuracy on Training:  0.88


# Support Vector Machines

In [17]:
from sklearn import svm
from sklearn.cross_validation import train_test_split

sample = mnist.head(700)
X_train, X_test, Y_train, Y_test = train_test_split(sample,sample['label'],random_state = 42,
                                                    test_size = 0.1)

classifier = svm.SVC(C=2.0)
classifier.fit(X_train, Y_train)
score = classifier.score(X_test, Y_test)
print("SVM score: ", score)

SVM score:  0.114285714286


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

X_train, X_test, Y_train, Y_test = train_test_split(mnist,mnist['label'],random_state = 42, test_size = 0.1)

svm_clf = Pipeline((
("scaler", StandardScaler()),
("linear_svc", LinearSVC(C=1, loss="hinge")),
))
svm_clf.fit(X_train, Y_train)
y_pred = svm_clf.predict(X_test)
print("Accuracy on Training: ", sum(y_pred==Y_test)/len(Y_test))

Accuracy on Training:  0.951904761905


In [9]:
from sklearn import linear_model
sgd_clf = linear_model.SGDClassifier(loss="hinge")
sgd_clf.fit(X_train, Y_train)
y_pred = svm_clf.predict(X_test)
print("Accuracy on Training: ", sum(y_pred==Y_test)/len(Y_test))



Accuracy on Training:  0.951904761905
