# <span style = "color:red"> MNIST </span>

In [1]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784')



In [2]:
X, y = mnist["data"], mnist["target"]
X.shape

(70000, 784)

In [3]:
import matplotlib
import matplotlib.pyplot as plt

some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)

plt.imshow(some_digit_image, cmap = matplotlib.cm.binary, interpolation = "nearest")
plt.axis("off")
plt.show()

print(y[36000])

<Figure size 640x480 with 1 Axes>

9


In [4]:
import numpy as np

X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]
y_train = y_train.astype(np.int8) 

In [5]:
#Shuffle the dataset

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

## <span style = "color:blue"> Training a Binary Classifier</span>

In [6]:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [13]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42)
sgd_clf.fit(X_train, y_train_5)


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=-inf, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [14]:
sgd_clf.predict([some_digit])

array([False])

## <span style = "color:blue"> Measuring Accuracy Using Cross-Validation </span>

In [15]:
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv = 3, scoring = "accuracy")

#This happened because just 10% of images are 5's, so even if you guess that all images were not a 5, you will be right almost 90%
#Some classes are much more frequent than others

array([0.95825, 0.9437 , 0.955  ])

In [18]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3)

In [19]:
confusion_matrix(y_train_5, y_train_pred)


array([[54307,   272],
       [ 2589,  2832]], dtype=int64)

## <span style = "color:blue"> Confusion Matrix </span>

In [20]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_5, y_train_pred) #  When a model claims an image of 5, it is correct only: 2832 / (2832 + 272)

0.9123711340206185

In [21]:
recall_score(y_train_5, y_train_pred) # It only detects: 2832 / (2832 + 2589) of 5'sb

0.5224128389596016

In [23]:
from sklearn.metrics import f1_score

f1_score(y_train_5, y_train_pred)

0.6643988269794722