# This is the sample code for chapter 3
This chapter focus on simply creating a binary or multiclass classifier. The following code is my solution to the exercises.

## Exercise 1: Create a k-neighbour classifier for MNIST
Requirement: over 97% precision on MNIST. KNeighborsClassifier is effective. Try search for the best parameters.

In [None]:
## Load dataset
import scipy.io as sio
import numpy as np

mnist = sio.loadmat("mnist/mnist-original.mat")
X, y = mnist["data"], mnist["label"]
X_train, X_test = X[:, :60000], X[:, 60000:]
y_train, y_test = y[:, :60000], y[:, 60000:]

X_train = np.transpose(X_train)
X_test = np.transpose(X_test)
y_train = np.transpose(y_train)[:, 0]
y_test = np.transpose(y_test)[:, 0]

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# have a look at the digit
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
some_digit = X[:, 36000]
some_digit_image = some_digit.reshape((28,28))
plt.imshow(some_digit_image, cmap='binary', interpolation='nearest')
plt.axis('off')
plt.show()
y[0, 36000]

In [None]:
# create a classifier to do classification
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(weights="distance", n_neighbors=6)
knc.fit(X_train, y_train)
y_pred = knc.predict(X_test)

# # create grid search to find the best parameters within search field
# from sklearn.model_selection import GridSearchCV
# param_grid = [
#     {"weights": ["uniform", "distance"]},
#     {"n_neighbors": [6, 7, 8, 9, 10, 11, 12]}
# ]
# grid_search = GridSearchCV(knc, param_grid, cv=5, scoring="neg_mean_squared_error")
# grid_search.fit(X_train, y_train)
# weights = grid_search.best_param_["weights"]
# n_neighbors = grid_search.best_param_["n_neighbors"]

# best_knc= grid_search.best_estimator_
# best_knc.fit(X_train, y_train)
# y_pred = best_knc.predict(X_test)

In [None]:
# evaluate the precision of the model
from sklearn.metrics import precision_score
precision_score(y_test, y_pred, average="micro")

## Exercise 2: Train set augmentation
Requirement: write a function that moves the digit one pixel up, down, left and right. Use the augmented dataset to train the model again.

In [None]:
from scipy.ndimage import shift

class DirectionError(Exception):
    pass

def move_image(input_image, direction):
    if direction not in ["up", "down", "left", "right"]:
        raise DirectionError("Invalid direction " + direction + ", only **'up, down, left, right'** are allowed.")
    else:
        image = input_image.reshape((28, 28))
        if direction == "up":
            result = shift(image, [-1, 0], cval=0)
        elif direction == "down":
            result = shift(image, [11, 0], cval=0)
        elif direction == "left":
            result = shift(image, [0, -1], cval=0)
        elif direction == "right":
            result = shift(image, [0, 1], cval=0)
        
        result = result.reshape(784)
        return result

In [None]:
# Now use the function to augment train set. Reuse the code cell above.
new_X_train = []
new_y_train = []

for index in range(len(X_train)):
    pic = X_train[index]
    label = y_train[index]
    
    new_X_train.append(pic)
    new_X_train.append(move_image(pic, "up"))
    new_X_train.append(move_image(pic, "down"))
    new_X_train.append(move_image(pic, "left"))
    new_X_train.append(move_image(pic, "right"))
    for i in range(5):
        new_y_train.append(label)

new_X_train = np.array(new_X_train)
new_y_train = np.array(new_y_train)
new_X_train.shape, new_y_train.shape

In [None]:
# Train on the augmented dataset
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(weights="distance", n_neighbors=6)
knc.fit(new_X_train, new_y_train)
y_pred = knc.predict(X_test)

In [None]:
# evaluate the precision of the model
from sklearn.metrics import precision_score
precision_score(y_test, y_pred, average="micro")

## Exercise 3: Deal with Titanic dataset on Kaggle

First use pandas to load .csv file

In [1]:
# load Titanic dataset
import pandas as pd
train_set = pd.read_csv("titanic/train.csv")
# train_set.head()

Before training the model, we need to preprocess the data.

In [2]:
# one hot encoder for Sex, Embarked

from sklearn.preprocessing import LabelEncoder

def my_encoder(input_data, tag):
    if tag not in ["Sex", "Embarked"]:
        raise NameError("Invalid tag!")
    else:
        encoder = LabelEncoder()
        my_result = input_data
        tmp_result = encoder.fit_transform(my_result[tag])
        my_result[tag] = tmp_result
        return my_result

In [3]:
# write a convenient function to complete preprocessing
def preprocess_data(input_data, is_train):
    result = input_data
    # drop Cabin, PassengerID, Name, Ticket
    result = result.drop(["PassengerId"], axis=1)
    result = result.drop(["Name"], axis=1)
    result = result.drop(["Cabin"], axis=1)
    result = result.drop(["Ticket"], axis=1)
    # fill NaN value for Age and Embarked.
    result["Age"] = result["Age"].fillna(0)
    result["Embarked"] = result["Embarked"].fillna("N")
    result["Fare"] = result["Fare"].fillna(0.0)
    # encode
    result = my_encoder(result, "Sex")
    result = my_encoder(result, "Embarked")
    if is_train:
        data, label = result.drop(["Survived"], axis=1), result["Survived"]
        data = data.values
        label = label.values
        return data, label
    else:
        data = result
        data = data.values
        return data

When all data are processed, we can train the model. Here I use RandomForestClassifier.

In [4]:
# create a RandomForestClassifier and k_fold evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

X_train, y_train = preprocess_data(train_set, is_train=True)

skfolds = StratifiedKFold(n_splits=5, random_state=42)

model = RandomForestClassifier()

for train_index, val_index in skfolds.split(X_train, y_train):
    X_train_fold = X_train[train_index]
    y_train_fold = y_train[train_index]
    
    X_val_fold = X_train[val_index]
    y_val_fold = y_train[val_index]
    
    sgd_clf = clone(model)
    sgd_clf.fit(X_train_fold, y_train_fold)
    
    y_pred = sgd_clf.predict(X_val_fold)
    print("Validation precision: ", roc_auc_score(y_val_fold, y_pred))
    
model.fit(X_train, y_train)

Validation precision:  0.7850461133069829
Validation precision:  0.8005270092226614
Validation precision:  0.7966577540106954
Validation precision:  0.7451871657754011
Validation precision:  0.8052482460874258




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Finally make predictions with the model on test set and save it as required.

In [6]:
# evaluate the model on test set
test_set = pd.read_csv("titanic/test.csv")
X_test = preprocess_data(test_set, is_train=False)
predictions = model.predict(X_test)
passenger_id = test_set["PassengerId"]
passenger_id = passenger_id.values
# write to .csv file
import csv
with open("my_submission.csv", "w") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_head = ["PassengerId", "Survived"]
    csv_writer.writerow(csv_head)
    for i in range(len(predictions)):
        content = [passenger_id[i], predictions[i]]
        csv_writer.writerow(content)

Now submit on Kaggle and see how the model performs!

## Exercise 4: Create a spam mail classifier