In [None]:
# Load the MNIST dataset.
#
# `fetch_mldata` was deprecated in scikit-learn 0.20 and removed in 0.22,
# so the data is fetched from OpenML instead.

from sklearn.datasets import fetch_openml

# as_frame=False keeps "data" and "target" as NumPy arrays, so the positional
# slicing and integer-array indexing used further down keep working.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

# OpenML delivers the labels as strings; convert them to integers so that
# comparisons such as `y == 5` select the intended samples.
mnist["target"] = mnist["target"].astype("uint8")

mnist

In [None]:
# Separate features from labels, then split by position:
# the first 60000 rows form the training set, the remainder the test set.

X = mnist["data"]
y = mnist["target"]
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [None]:
# Shuffle the training set (features and labels with the same permutation).

import numpy as np

# A seeded, local generator yields the same permutation on every
# "Restart & Run All" without touching NumPy's global random state; the
# length is read from the data instead of being hard-coded.
shuffle_index = np.random.RandomState(42).permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [None]:
# Binary targets for a "5 vs. not-5" classifier: True where the label is 5.

y_train_5, y_test_5 = y_train == 5, y_test == 5

In [None]:
from sklearn.linear_model import SGDClassifier

# Fit a linear SGD classifier on the binary "is it a 5?" targets.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

# scikit-learn estimators require a 2-D (n_samples, n_features) input.
# `X[84]` is a single 1-D row and is rejected, so slice instead of index
# to keep it as a one-sample batch.
sgd_clf.predict(X[84:85])

In [None]:
# 3-fold cross-validated accuracy of the binary SGD classifier

from sklearn.model_selection import cross_val_score

sgd_scores = cross_val_score(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_5,
    scoring="accuracy",
    cv=3,
)

In [None]:
# Because accuracy is not a good performance measure for classifiers,
# we look at the confusion matrix of the cross-validated predictions.

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train_5, cv=3)

confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision of the cross-validated "is 5" predictions.
precision_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
# Recall of the cross-validated "is 5" predictions.
recall_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the cross-validated "is 5" predictions.
f1_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
# changing the threshold to achieve greater precision at recall expense
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv =3, method="decision_function")

# let;s show the curve
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall as functions of the decision threshold.

    Both curves are drawn on the current matplotlib figure. The last element
    of `precisions` and `recalls` is dropped before plotting, so each series
    is plotted against `thresholds` with one fewer point than it was given.

    Parameters
    ----------
    precisions, recalls : sequences, plotted without their final element.
    thresholds : sequence used as the x-axis for both curves.
    """
    curves = (
        (precisions[:-1], "b--", "Precision"),
        (recalls[:-1], "g-", "Recall"),
    )
    for values, line_format, curve_name in curves:
        plt.plot(thresholds, values, line_format, label=curve_name, linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="center left", fontsize=16)
    plt.ylim([0, 1])

# Render precision and recall against the threshold on one 8x4 figure,
# limiting the x-axis to the [-700000, 700000] score range.
plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(
    precisions=precisions, recalls=recalls, thresholds=thresholds
)
plt.xlim(-700000, 700000)

plt.show()

In [None]:
# roc curve
def plot_roc_curve(fpr, tpr, **options):
    """Plot a ROC curve on the current matplotlib figure.

    Parameters
    ----------
    fpr : sequence of false positive rates (x-axis).
    tpr : sequence of true positive rates (y-axis).
    **options : extra keyword arguments forwarded to `plt.plot` for the
        curve (e.g. `label`). `linewidth` defaults to 2 but may be
        overridden by the caller.
    """
    # Treat linewidth as an overridable default: passing it both explicitly
    # and through **options would raise "got multiple values for keyword
    # argument 'linewidth'".
    options.setdefault("linewidth", 2)
    plt.plot(fpr, tpr, **options)
    # Dashed diagonal reference line where TPR equals FPR.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

In [None]:
# ROC comparison: SGD classifier vs. random forest classifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# A bare expression in the middle of a cell is never displayed, so print the
# SGD area under the ROC curve explicitly.
print("SGD ROC AUC:", roc_auc_score(y_train_5, y_scores))
# NOTE(review): this rebinds `thresholds`, replacing the value returned by
# precision_recall_curve earlier in the notebook; re-running the
# precision/recall plot after this cell would use the ROC thresholds.
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

# Draw both ROC curves on one figure; plot_roc_curve also adds the diagonal
# reference line and the axis labels.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, label="Random Forest")
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
# Train the Stochastic Gradient Descent classifier on all digit labels

sgd_clf.fit(X_train, y_train)
# decision_function requires a 2-D (n_samples, n_features) input; `X[1]` is a
# single 1-D row and is rejected, so slice to keep it as a one-sample batch.
some_digit_scores = sgd_clf.decision_function(X[1:2])
some_digit_scores

In [None]:
# Fit the random forest on the full set of digit labels.
forest_clf.fit(X=X_train, y=y_train)

In [None]:
# 3-fold cross-validated accuracy of the multiclass SGD classifier

sgd_scores = cross_val_score(
    estimator=sgd_clf, X=X_train, y=y_train, scoring="accuracy", cv=3
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the training features (cast to float64 first), then repeat the
# 3-fold accuracy evaluation of the SGD classifier on the scaled data.
# The stray mid-cell `X_train_scaled` expression was removed: only the last
# expression of a cell is displayed, so it had no effect.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")