In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
_sklearn_major_minor = tuple(
    int(part)
    for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_major_minor >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
# %matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id (String): File name without extension; also echoed on stdout.
        tight_layout (bool, optional): Call plt.tight_layout() before saving.
        fig_extension (String, optional): File extension, also used as the
            savefig format.
        resolution (int, optional): Passed to savefig as dpi.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)



In [None]:
import os
import joblib

modelPath = "cache\\Chap3"


def fit_load(nameOfModel, path=modelPath, **kwargs):
    """
    fit_load : Return a fitted model, reusing a cached .pkl file if present.

    The cache file is <path>/<nameOfModel>.pkl. It is keyed only by the
    name: if that file exists it is loaded as-is and **kwargs are ignored,
    so changing hyperparameters or training data requires deleting the file.

    Args:
        nameOfModel (String): Name of the model variable as the string.
            It is resolved with eval(), so it must name an object visible
            from this function. Only pass trusted names.
        path (String, optional): Directory of the .pkl file.
            Defaults to modelPath.
        **kwargs: Forwarded to model.fit() when no cache file exists.

    Returns:
        Model: the model loaded from the cache, or the freshly fitted one.
    """
    os.makedirs(path, exist_ok=True)

    # Resolve the variable name first, exactly as before: an unknown name
    # fails here even when a cache file is available.
    estimator = eval(nameOfModel)
    cacheFile = os.path.join(path, nameOfModel + ".pkl")

    if not os.path.isfile(cacheFile):
        # Cache miss: train now and persist the fitted model for next time.
        estimator.fit(**kwargs)
        joblib.dump(estimator, cacheFile)
        return estimator

    # Cache hit. joblib files are pickle-based: only load files you created.
    return joblib.load(cacheFile)


In [None]:
import os
import joblib

modelPath = "cache\\Chap3"


def cross_val_function(nameOfFile, crossValFunction, path=modelPath, **kwargs):
    """
    cross_val_function : Run a cross-validation helper once and cache its result.

    The cache file is <path>/<nameOfFile>.pkl. If it already exists its
    content is returned and crossValFunction is not called, whatever
    **kwargs are given.

    Args:
        nameOfFile (String): Name of file to save (without the .pkl suffix)
        crossValFunction (callable): Function to run on a cache miss; it is
            called as crossValFunction(**kwargs)
        path (String, optional): Directory of the .pkl file.
            Defaults to modelPath.
        **kwargs: Keyword arguments forwarded to crossValFunction.

    Returns:
        Whatever crossValFunction returned (now, or when the cache file
        was written).
    """
    os.makedirs(path, exist_ok=True)
    cacheFile = os.path.join(path, nameOfFile + ".pkl")

    if not os.path.isfile(cacheFile):
        # Cache miss: compute and persist the result.
        result = crossValFunction(**kwargs)
        joblib.dump(result, cacheFile)
        return result

    # Cache hit. joblib files are pickle-based: only load files you created.
    return joblib.load(cacheFile)


# MNIST

In [None]:

from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()


In [None]:
X, y = mnist["data"], mnist["target"]
X.shape, y.shape


In [None]:
28 * 28

In [None]:

import matplotlib as mpl
import matplotlib.pyplot as plt

some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")

save_fig("some_digit_plot")
plt.show()

In [None]:

y[0]

In [None]:
y = y.astype(np.uint8)

In [None]:
def plot_digit(data):
    """Draw one digit, given as an array reshapeable to 28x28, without axes."""
    imshow_options = {"cmap": mpl.cm.binary, "interpolation": "nearest"}
    plt.imshow(data.reshape(28, 28), **imshow_options)
    plt.axis("off")

In [None]:

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Training a Binary Classifier

In [None]:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
y_train_5

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)


sgd_clf = fit_load("sgd_clf", X=X_train, y= y_train_5)
sgd_clf


In [None]:
sgd_clf.predict([some_digit])


# Performance Measures

## Measuring Accuracy Using Cross-Validation

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Hand-rolled equivalent of cross_val_score(..., scoring="accuracy"):
# fit a fresh clone on each training fold and print its accuracy on the
# held-out fold. (No plotting call here: this cell only prints numbers.)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # clone() copies the hyperparameters only, so every fold starts unfitted
    clone_clf = clone(sgd_clf)

    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]

    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))


In [None]:
from sklearn.model_selection import cross_val_score

cross_val_sgd = cross_val_function("cross_val_sgd",
                                   cross_val_score,
                                   estimator=sgd_clf,
                                   X=X_train,
                                   y=y_train_5,
                                   cv=3,
                                   scoring="accuracy")


In [None]:
from sklearn.base import BaseEstimator


class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts False ("not 5") for every sample."""

    def fit(self, X, y=None):
        """Learn nothing; return self as the scikit-learn estimator API expects."""
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        return np.zeros((len(X), 1), dtype=bool)


never_5_clf = Never5Classifier()
cross_val_function("cross_val_never_5",
                   cross_val_score,
                   estimator=never_5_clf,
                   X=X_train,
                   y=y_train_5,
                   cv=3,
                   scoring="accuracy")


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Same folds as the accuracy loop, but scored only on the real 5s: the
# printed ratio is the share of true 5s that the classifier detected.
# (No plotting call here: this cell only prints numbers.)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # clone() copies the hyperparameters only, so every fold starts unfitted
    clone_clf = clone(sgd_clf)

    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]

    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)

    # Evaluation with true 5s: y_test_fold is already a boolean mask, so it
    # can index the predictions directly.
    prediction_for_5s = y_pred[y_test_fold]
    correct_5s = prediction_for_5s[prediction_for_5s]

    print(len(correct_5s)/len(prediction_for_5s))


## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_function("y_train_pred",
                                  cross_val_predict,
                                  estimator=sgd_clf,
                                  X=X_train,
                                  y=y_train_5,
                                  cv=3)

confusion_matrix(y_train_5, y_train_pred)


In [None]:
y_train_perfect_predictions = y_train_5  # pretend we reached perfection
confusion_matrix(y_train_5, y_train_perfect_predictions)


## Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_5, y_train_pred), recall_score(y_train_5, y_train_pred)


In [None]:
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

## Precision/Recall Trade-off

In [None]:
y_scores = sgd_clf.decision_function([some_digit])
y_scores

In [None]:
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

In [None]:
threshold = 8000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred


In [None]:
y_scores = cross_val_function("y_scores",
                              cross_val_predict,
                              estimator=sgd_clf,
                              X=X_train,
                              y=y_train_5,
                              cv=3,
                              method="decision_function")
y_scores


In [None]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)


In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot the precision and recall curves against the decision threshold."""
    # precisions and recalls hold one more entry than thresholds, so the
    # last value of each is dropped to line the arrays up.
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_style,
                 label=curve_label, linewidth=2)

    # Cosmetics (not shown in the book)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    plt.axis([-50000, 50000, 0, 1])


# Recall at the first point where precision reaches 90 percent.
# `precisions >= 0.90` is a boolean array; since True > False, np.argmax
# returns the index of its first True (or 0 when there is no True at all).
recall_90_precision = recalls[np.argmax(precisions >= 0.90)]

# Decision threshold at that same index
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]


plt.figure(figsize=(8, 4))
# Plotting the precision and recall curves against the threshold
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

# Vertical dotted red line at the selected threshold, from 0 up to 0.9
plt.plot([threshold_90_precision, threshold_90_precision],
         [0., 0.9], "r:")

# Horizontal dotted red line at 90 percent precision
plt.plot([-50000, threshold_90_precision], [0.9, 0.9],
         "r:")

# Horizontal dotted red line at the recall obtained for 90 percent precision
plt.plot([-50000, threshold_90_precision],
         [recall_90_precision, recall_90_precision], "r:")

# Red dot on the precision curve at 90 percent precision
plt.plot([threshold_90_precision], [0.9], "ro")

# Red dot on the recall curve at the same threshold
plt.plot([threshold_90_precision], [recall_90_precision],
         "ro")

save_fig("precision_recall_vs_threshold_plot")
plt.show()


In [None]:

def plot_precision_vs_recall(precisions, recalls):
    """Draw precision (y axis) against recall (x axis) on the unit square."""
    plt.plot(recalls, precisions, "b-", linewidth=2)
    axis_labels = ((plt.xlabel, "Recall"), (plt.ylabel, "Precision"))
    for set_label, text in axis_labels:
        set_label(text, fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.grid(True)


plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)

# Drawing a vertical red line
plt.plot([recall_90_precision, recall_90_precision], [0., 0.9], "r:")

# Drawing a horizontal red line
plt.plot([0.0, recall_90_precision], [0.9, 0.9], "r:")

# Pointing a red dot at 90 percent precision
plt.plot([recall_90_precision], [0.9], "ro")

save_fig("precision_vs_recall_plot")
plt.show()


In [None]:
recalls, precisions


In [None]:
np.max(recalls * precisions)

In [None]:
thresholds

In [None]:
precisions >= 0.90

In [None]:
np.argmax(precisions >= 0.90)

In [None]:
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]
threshold_90_precision


In [None]:
y_scores

In [None]:
y_train_pred_90 = (y_scores >= threshold_90_precision)

In [None]:
precision_score(y_train_5, y_train_pred_90)

In [None]:
recall_score(y_train_5, y_train_pred_90)

## The ROC Curves

In [None]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)


In [None]:
import mpld3
mpld3.enable_notebook()


def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (tpr against fpr) with the dashed diagonal for reference.

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        label (String, optional): Legend label for the curve.
    """
    unit_interval = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal from (0, 0) to (1, 1)
    plt.plot(unit_interval, unit_interval, 'k--')

    # Cosmetics (not shown in the book)
    plt.axis(unit_interval + unit_interval)
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)
    plt.grid(True)


plt.figure(figsize=(8, 6))                                    # Not shown
plot_roc_curve(fpr, tpr)

# Finding the fpr that the tpr has at least 90 percent precision.
fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)]           # Not shown

# Drawing vertical line
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")   # Not shown

# Drawing horizontal line
plt.plot([0.0, fpr_90], [recall_90_precision,
         recall_90_precision], "r:")  # Not shown

# Pointing the dot
plt.plot([fpr_90], [recall_90_precision], "ro")               # Not shown

save_fig("roc_curve_plot")                                    # Not shown
plt.show()


In [None]:
len(tpr)

In [None]:
len(recalls)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)


In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_function("y_probas_forest",
                                     cross_val_predict,
                                     estimator=forest_clf,
                                     X=X_train,
                                     y=y_train_5,
                                     cv=3,
                                     method="predict_proba")

y_probas_forest


In [None]:
y_scores_forest = y_probas_forest[:, 1]  # score = proba of positive class
y_scores_forest


In [None]:
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)

In [None]:
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right")
plt.show()


In [None]:
roc_auc_score(y_train_5, y_scores_forest)

In [None]:
from sklearn.metrics import precision_score, recall_score


y_train_pred_forest = cross_val_function("y_train_pred_forest",
                                         cross_val_predict,
                                         estimator=forest_clf,
                                         X=X_train,
                                         y=y_train_5,
                                         cv=3)

y_train_pred_forest


In [None]:
print(precision_score(y_train_5, y_train_pred_forest))
print(recall_score(y_train_5, y_train_pred_forest))

#

# Multiclass Classification

In [None]:
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf = fit_load("svm_clf", X=X_train, y=y_train)
# svm_clf.fit(X_train, y_train)  # y_train, not y_train_5
svm_clf.predict([some_digit])


In [None]:
some_digit_scores = svm_clf.decision_function([some_digit])
some_digit_scores


In [None]:
np.argmax(some_digit_scores)


In [None]:
svm_clf.classes_

In [None]:
svm_clf.classes_[5]

In [None]:
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC())
ovr_clf = fit_load("ovr_clf", X=X_train, y=y_train)
ovr_clf.predict([some_digit])


In [None]:
len(ovr_clf.estimators_)


In [None]:
sgd_clf_multiclass = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf_multiclass = fit_load("sgd_clf_multiclass", X=X_train, y=y_train)
sgd_clf_multiclass.predict([some_digit])


In [None]:
sgd_clf_multiclass.decision_function([some_digit])


In [None]:
sgd_clf_multiclass = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
cross_val_function("sgd_clf_mul_cross_val",
                   cross_val_score,
                   estimator=sgd_clf_multiclass,
                   X=X_train,
                   y=y_train,
                   cv=3,
                   scoring="accuracy")


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the pixel features; the cast to float64 happens before scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

# Score the multiclass SGD estimator on the scaled inputs. It is passed as
# sgd_clf_multiclass, not the binary 5-detector sgd_clf, to match the
# neighbouring multiclass cells; both are configured identically and
# cross_val_score clones the estimator, so the scores are unaffected.
cross_val_function("sgd_clf_mult_cross_val_scaled",
                   cross_val_score,
                   estimator=sgd_clf_multiclass,
                   X=X_train_scaled,
                   y=y_train,
                   cv=3,
                   scoring="accuracy")


# Error Analysis

In [None]:
# Out-of-fold multiclass predictions on the scaled inputs. The estimator is
# passed as sgd_clf_multiclass, not the binary 5-detector sgd_clf, to match
# the neighbouring multiclass cells; both are configured identically and
# cross_val_predict clones the estimator, so the predictions are unaffected.
# Note: this rebinds y_train_pred, which held the binary predictions before.
y_train_pred = cross_val_function("sgd_clf_mult_scaled_cross_pred",
                                  cross_val_predict,
                                  estimator=sgd_clf_multiclass,
                                  X=X_train_scaled,
                                  y=y_train,
                                  cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx


In [None]:
def plot_matrix(matrix):
    """Plot a matrix as a colour-coded image with a colorbar.

    Returns:
        The newly created 8x8 inch matplotlib figure.
    """
    figure = plt.figure(figsize=(8, 8))
    axes = figure.add_subplot(111)
    figure.colorbar(axes.matshow(matrix))
    return figure


In [None]:
plot_matrix(conf_mx)
# Set the ticks before saving so the saved file matches what is displayed.
plt.xticks(np.arange(0, 10, 1))
plt.yticks(np.arange(0, 10, 1))
save_fig("confusion_matrix_plot", tight_layout=False)
plt.show()


In [None]:
row_sums = conf_mx.sum(axis=1, keepdims=True)
row_sums

In [None]:
norm_conf_mx = conf_mx / row_sums
norm_conf_mx

In [None]:
# Zero the diagonal (correct predictions) in place so only errors show up.
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.jet)
# Add the colorbar before saving so the saved file includes it.
plt.colorbar()
save_fig("confusion_matrix_errors_plot", tight_layout=False)
plt.show()


In [None]:
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    """Plot a collection of 28x28 digits tiled into a single image.

    Digits fill the grid row by row; unused cells of the last row are left
    blank (zeros).

    Args:
        instances: Sequence or array of digits, each holding 28*28 values.
        images_per_row (int, optional): Maximum number of digits per row.
        **options: Extra keyword arguments forwarded to plt.imshow.
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        # Nothing to tile: leave an empty panel instead of dividing by zero.
        plt.axis("off")
        return

    images_per_row = min(n_instances, images_per_row)
    n_rows = (n_instances - 1) // images_per_row + 1

    # Pad with blank digits so the grid is completely filled.
    n_empty = n_rows * images_per_row - n_instances
    flat_digits = np.asarray(instances).reshape(n_instances, size * size)
    padded = np.concatenate(
        [flat_digits, np.zeros((n_empty, size * size))], axis=0)

    # (grid row, grid col, pixel row, pixel col) ->
    # (grid row, pixel row, grid col, pixel col), then merge each pair of
    # axes to obtain one big 2-D picture.
    grid = padded.reshape(n_rows, images_per_row, size, size)
    image = grid.transpose(0, 2, 1, 3).reshape(n_rows * size,
                                               images_per_row * size)
    plt.imshow(image, cmap = mpl.cm.binary, **options)
    plt.axis("off")

In [None]:

cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
save_fig("error_analysis_digits_plot")
plt.show()

# Multilabel Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
"""
numpy.c_ :
    numpy.c_ = <numpy.lib.index_tricks.CClass object>
    Translates slice objects to concatenation along the second axis.
"""
y_multilabel = np.c_[y_train_large, y_train_odd]
y_multilabel


In [None]:
knn_clf = KNeighborsClassifier()
knn_clf = fit_load("knn_clf", X=X_train, y=y_multilabel)


In [None]:
knn_clf.predict([some_digit])

In [None]:
y_train_knn_pred = cross_val_function("y_train_knn_pred",
                                      cross_val_predict,
                                      estimator=knn_clf,
                                      X=X_train,
                                      y=y_multilabel,
                                      cv=3)


In [None]:
f1_score(y_multilabel, y_train_knn_pred, average="macro")

# Multioutput Classification

In [None]:
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test


In [None]:
some_index = 0
plt.subplot(121)
plot_digit(X_test_mod[some_index])
plt.subplot(122)
plot_digit(y_test_mod[some_index])
save_fig("noisy_digit_example_plot")
plt.show()


In [None]:
knn_clf_noise_cancelation = KNeighborsClassifier()
knn_clf_noise_cancelation = fit_load("knn_clf_noise_cancelation",
                                     X=X_train_mod,
                                     y=y_train_mod)
clean_digit = knn_clf_noise_cancelation.predict([X_test_mod[some_index]])
plot_digit(clean_digit)


# KNN Classifier Optimization

In [None]:
# scipy.ndimage.interpolation is a deprecated namespace; shift is available
# directly from scipy.ndimage.
from scipy.ndimage import shift


def shift_digit(digit_array, dx, dy, new=0):
    """Shift a flattened 28x28 digit by dx columns and dy rows.

    Args:
        digit_array: Array holding 28*28 pixel values.
        dx: Horizontal shift in pixels (positive moves the digit right).
        dy: Vertical shift in pixels (positive moves the digit down).
        new: Fill value for the pixels uncovered by the shift (cval).

    Returns:
        The shifted image flattened back to a 1-D array of 784 values.
    """
    return shift(digit_array.reshape(28, 28), [dy, dx], cval=new).reshape(-1)


plot_digit(shift_digit(some_digit, 5, 1, new=100))


In [None]:
import os
import joblib

modelPath = "cache\\Chap3"


os.makedirs(modelPath, exist_ok=True)

X_train_expanded_path = os.path.join(modelPath, "X_train_expanded.pkl")
y_train_expanded_path = os.path.join(modelPath, "y_train_expanded.pkl")


if os.path.isfile(X_train_expanded_path) and os.path.isfile(y_train_expanded_path):
    # Both cache files exist: reuse the previously built lists of arrays.
    X_train_expanded = joblib.load(X_train_expanded_path)
    y_train_expanded = joblib.load(y_train_expanded_path)

else:
    # Start from the original images, then add one shifted copy of the whole
    # training set per direction (right, left, down, up by one pixel).
    X_train_expanded = [X_train]
    y_train_expanded = [y_train]

    for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
        shifted_images = np.apply_along_axis(
            shift_digit, axis=1, arr=X_train, dx=dx, dy=dy)
        X_train_expanded.append(shifted_images)
        y_train_expanded.append(y_train)

    # Write the cache once, after all shifts are done. Dumping inside the
    # loop rewrote the files four times and left a partial dataset on disk
    # if the run was interrupted, which a later run would load silently.
    joblib.dump(X_train_expanded, X_train_expanded_path)
    joblib.dump(y_train_expanded, y_train_expanded_path)


print(X_train_expanded)
print(y_train_expanded)


In [None]:
X_train_expanded = np.concatenate(X_train_expanded)
y_train_expanded = np.concatenate(y_train_expanded)
X_train_expanded.shape, y_train_expanded.shape


In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf_optimized = KNeighborsClassifier(weights='distance', n_neighbors=4)
knn_clf_optimized = fit_load("knn_clf_optimized",
                             X=X_train_expanded,
                             y=y_train_expanded)


In [None]:

import os
import joblib

modelPath = "cache\\Chap3"


os.makedirs(modelPath, exist_ok=True)

y_knn_expanded_pred_path = os.path.join(
    modelPath, "y_knn_expanded_pred_path.pkl")


if os.path.isfile(y_knn_expanded_pred_path):
    y_knn_expanded_pred = joblib.load(y_knn_expanded_pred_path)

else:
    y_knn_expanded_pred = knn_clf_optimized.predict(X_test)

    joblib.dump(y_knn_expanded_pred, y_knn_expanded_pred_path)




In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_knn_expanded_pred)


In [None]:
ambiguous_digit = X_test[2589]
knn_clf_optimized.predict_proba([ambiguous_digit])

In [None]:
plot_digit(ambiguous_digit)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Reuse the estimator configured earlier (weights='distance', n_neighbors=4)
# instead of rebinding knn_clf_optimized to a new, unfitted object: the
# cross-validation works on clones, and the fitted model stays usable if the
# prediction cells above are re-run.
# NOTE(review): X_train_expanded contains shifted copies of every training
# image, so a fold may hold near-duplicates of its held-out samples and the
# resulting accuracy is probably optimistic - confirm before relying on it.
knn_clf_optimized_cross_predict = cross_val_function("knn_clf_optimized_cross_predict",
                                                     cross_val_predict,
                                                     estimator=knn_clf_optimized,
                                                     X=X_train_expanded,
                                                     y=y_train_expanded,
                                                     cv=6,
                                                     verbose=2)


In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_train_expanded, knn_clf_optimized_cross_predict)
