In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.datasets import fetch_openml

# as_frame=False makes fetch_openml return plain NumPy arrays. Recent
# scikit-learn releases otherwise hand back a pandas DataFrame for this
# dataset, which breaks the positional indexing (X[0]) and reshape used in
# the cells below.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
print(mnist.keys())

In [None]:
# Features come from the "data" entry; the "target" labels arrive as strings,
# so cast them to unsigned 8-bit integers.
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

In [None]:
# Dimensions of X (the dataset's "data" entry)
X.shape

In [None]:
# Dimensions of y (the dataset's "target" entry, cast to integers)
y.shape

#### Let's plot an instance's feature vector

In [None]:
# Take the first sample and fold its flat pixel vector into a 28x28 grid
digit = X[0]
digit_image = np.reshape(digit, (28, 28))
y[0]  # label of that same sample

In [None]:
# Render the 28x28 digit with the binary colormap and hide the axes
plt.imshow(digit_image, cmap=mpl.cm.binary, interpolation="nearest")
plt.axis("off")
# End on show() so the cell does not echo the tuple returned by plt.axis()
plt.show()

In [None]:
# The first 60,000 samples form the training set; everything after is held out for testing
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

### 5-detector binary classifier

In [None]:
# Boolean targets: True where the label is 5, False for every other digit
y_train_5 = y_train == 5
y_test_5 = y_test == 5

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# random_state is pinned so the classifier gives the same result on every run
sgd_clf = SGDClassifier(random_state=42)

In [None]:
# Train on the training split against the boolean "is this a 5?" labels
sgd_clf.fit(X_train, y_train_5)

In [None]:
# Classify the sample inspected earlier; it is wrapped in a list to form a one-row batch
sgd_clf.predict([digit])

#### Performance measures: Accuracy (not a great measure for classifiers, especially on skewed datasets)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Accuracy on each of 3 cross-validation folds of the training set
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

#### Performance measures: Confusion Matrix - Precision, Recall and F1 score

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# 3-fold cross-validated predictions for the training set
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Tabulate the true 5 / not-5 labels against the cross-validated predictions
confusion_matrix(y_train_5, y_train_pred)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Precision, recall and F1 of the cross-validated predictions, one value per line
print(
    *(metric(y_train_5, y_train_pred) for metric in (precision_score, recall_score, f1_score)),
    sep="\n",
)

#### How to decide which threshold to use?

In [None]:
# 1) Get the scores of all instances in the training set
#    (method="decision_function" requests decision scores instead of class predictions)
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")

In [None]:
# 2) Compute precision and recall for all possible thresholds
from sklearn.metrics import precision_recall_curve
# precisions and recalls each hold one more entry than thresholds, which is
# why the plotting helper below slices them with [:-1].
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, threshold=8000, ax=None):
    """Plot precision and recall against the decision threshold.

    Both curves are drawn over ``thresholds``; the last element of
    ``precisions`` and ``recalls`` is dropped so the lengths line up. The
    first entry of ``thresholds`` that is at least ``threshold`` is
    highlighted with red markers and dashed guide lines.

    Parameters
    ----------
    precisions, recalls : sequence of float
        Curve values, each one element longer than ``thresholds``.
    thresholds : sequence of float
        Decision thresholds the curves are plotted against.
    threshold : float, default 8000
        Value to highlight. If no entry of ``thresholds`` reaches it, the
        curves are drawn without a highlight instead of raising IndexError.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current pyplot axes.

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()

    # Accept plain sequences as well as arrays
    precisions = np.asarray(precisions)
    recalls = np.asarray(recalls)
    thresholds = np.asarray(thresholds)

    ax.plot(thresholds, precisions[:-1], "b--", label="Precision")
    ax.plot(thresholds, recalls[:-1], "g-", label="Recall")

    # Indices of every threshold that reaches the requested value; may be empty
    reached = np.where(thresholds >= threshold)[0]
    if reached.size:
        idx = reached[0]
        x_mark, p_mark, r_mark = thresholds[idx], precisions[idx], recalls[idx]
        ax.plot(x_mark, p_mark, ".r", markersize=10)
        ax.plot(x_mark, r_mark, ".r", markersize=10)
        ax.hlines(y=p_mark, xmin=thresholds[0], xmax=x_mark, colors="red", linestyles="dashed")
        ax.hlines(y=r_mark, xmin=thresholds[0], xmax=x_mark, colors="red", linestyles="dashed")
        # Run the vertical guide up to the higher marker so it touches both points
        ax.vlines(x=x_mark, ymin=0, ymax=max(p_mark, r_mark), colors="red", linestyles="dashed")

    ax.grid(True)
    ax.set_xlabel("Threshold")
    # Only the two curves carry labels, so the legend lists exactly those
    ax.legend()
# Draw the curves computed from the cross-validated decision scores
plot_precision_recall_vs_threshold(
    precisions=precisions,
    recalls=recalls,
    thresholds=thresholds,
)