In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current pyplot figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the log line.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        Used both as the file suffix and as the ``format`` passed to savefig.
    resolution : int
        Passed to savefig as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [2]:
# Fetch the 'mnist_784' dataset (version 1) through scikit-learn's OpenML loader.
# as_frame=False: the cells below slice and reshape the data as plain arrays.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()  # show which fields the returned object exposes

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
# Pull the feature matrix and the labels out of the fetched dataset.
X, y = mnist["data"], mnist["target"]
X.shape  # one row per image, 784 values each (reshaped to 28x28 by plot_digit)

(70000, 784)

In [4]:
# Store the labels as unsigned 8-bit integers.
y = y.astype(np.uint8)

In [5]:
def plot_digit(data, ax):
    """Draw one flattened 28x28 image on the given axes.

    Parameters
    ----------
    data : array with a ``reshape`` method
        The 784 pixel values of a single image.
    ax : Matplotlib axes
        Target axes; its axis decorations are switched off after drawing.
    """
    pixels = data.reshape(28, 28)
    ax.imshow(pixels, interpolation="nearest", cmap=mpl.cm.binary)
    ax.axis("off")

In [6]:
# The first 60,000 rows are used for training; everything after is the test set.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Base k-nearest-neighbours classifier. Note that n_jobs=-1 is set on the
# estimator itself, not on GridSearchCV.
clf = KNeighborsClassifier(n_jobs=-1)

# 5 values of n_neighbors x 2 weighting schemes = 10 candidate settings.
p_grid =  {
    "n_neighbors":[1, 2, 3, 4, 5],
    "weights": ["uniform", "distance"]
}

# 3-fold cross-validation over the grid (10 candidates x 3 folds = 30 fits);
# verbose=2 logs one line per fit, including its duration.
gscv = GridSearchCV(clf, param_grid=p_grid, cv=3, verbose=2)
gscv.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END .....................n_neighbors=1, weights=uniform; total time=  13.4s
[CV] END .....................n_neighbors=1, weights=uniform; total time=  13.8s
[CV] END .....................n_neighbors=1, weights=uniform; total time=  13.8s
[CV] END ....................n_neighbors=1, weights=distance; total time=  13.2s
[CV] END ....................n_neighbors=1, weights=distance; total time=  13.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=  12.9s
[CV] END .....................n_neighbors=2, weights=uniform; total time=  13.9s
[CV] END .....................n_neighbors=2, weights=uniform; total time=  14.0s
[CV] END .....................n_neighbors=2, weights=uniform; total time=  14.3s
[CV] END ....................n_neighbors=2, weights=distance; total time=  13.8s
[CV] END ....................n_neighbors=2, weights=distance; total time=  13.6s
[CV] END ....................n_neighbors=2, weig

GridSearchCV(cv=3, estimator=KNeighborsClassifier(n_jobs=-1),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5],
                         'weights': ['uniform', 'distance']},
             verbose=2)

In [14]:
# Hyperparameter combination the grid search selected as best.
gscv.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

In [15]:
# Score the grid search recorded for its best candidate.
gscv.best_score_

0.9703500000000002

In [16]:
# Take the winning model from the grid search. This binds `clf` to the very
# same object as gscv.best_estimator_ (no copy is made).
clf = gscv.best_estimator_

# Evaluate it on the held-out test rows.
clf.score(X_test, y_test)

0.9714

In [17]:
def shift(img, fill_value=0):
    """Return four one-pixel-shifted copies of a flattened 28x28 image.

    The copies come back in a fixed order: axis 0 shifted by -1, axis 0 by +1,
    axis 1 by -1, axis 1 by +1. Each copy is flattened back to 784 values.

    A bare ``np.roll`` is circular: the row/column pushed off one edge
    reappears on the opposite edge. For a translation that is an artifact, so
    the wrapped-around line is overwritten with ``fill_value``.

    Parameters
    ----------
    img : array-like of 784 values
        A single flattened image; it is not modified.
    fill_value : scalar, default 0
        Value written into the row/column vacated by the shift.
        NOTE(review): 0 is assumed to be the background intensity — confirm
        for data other than the MNIST pixels used in this notebook.

    Returns
    -------
    list of numpy.ndarray
        Four arrays of shape (784,).
    """
    img_2d = np.asarray(img).reshape(28, 28)
    shifted = []

    for axis in (0, 1):
        for step in (-1, 1):
            # np.roll returns a new array, so editing it leaves `img` untouched.
            moved = np.roll(img_2d, shift=step, axis=axis)
            # A +1 shift wraps data into index 0; a -1 shift wraps into index -1.
            wrapped_line = [slice(None), slice(None)]
            wrapped_line[axis] = 0 if step > 0 else -1
            moved[tuple(wrapped_line)] = fill_value
            shifted.append(moved.ravel())

    return shifted
    

In [18]:
# Augment the training set: keep every original image and append its four
# one-pixel shifts. Row order is: all originals first, then the shifts grouped
# per source image.
shifted_images = []
# Iterating the array directly gives tqdm a known length, so the bar shows
# real progress instead of "0it".
for X_i in tqdm(X_train, total=len(X_train), desc="shifting"):
    shifted_images.extend(shift(X_i))

X_train_shift = np.concatenate([X_train, np.asarray(shifted_images)])
# Each label is repeated 4 times, matching the 4 shifted copies per image.
y_train_shift = np.concatenate([y_train, np.repeat(y_train, 4)])

# Sanity check: 1 original + 4 shifts per training image, labels aligned.
assert len(X_train_shift) == len(y_train_shift) == 5 * len(X_train)

print(X_train_shift.shape)
print(y_train_shift.shape)

0it [00:00, ?it/s]

(300000, 784)
(300000,)


In [19]:
# Train the tuned classifier on the augmented data using a fresh, unfitted copy.
# Fitting gscv.best_estimator_ directly would overwrite the grid-search model
# in place and change what earlier cells report when they are re-run.
from sklearn.base import clone

clf = clone(gscv.best_estimator_)
clf.fit(X_train_shift, y_train_shift)

# Score on the same held-out test rows as before, for comparison.
clf.score(X_test, y_test)

0.9763

In [20]:
# Fetch the Titanic dataset from OpenML by its numeric dataset id.
# data_id is passed as an integer, as the API documents it, rather than a string.
# as_frame=True makes explicit that the next cell needs a pandas DataFrame.
titanic = fetch_openml(data_id=40945, as_frame=True)

In [26]:
# Summary statistics of the feature columns. Counts below 1309 (e.g. age, body)
# show that those columns have missing values.
titanic.data.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,body
count,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,39.0,1.0,0.0,31.275,256.0
max,3.0,80.0,8.0,9.0,512.3292,328.0
