# Objective

Use a Random Forest to fit a classification model to hand-written digits (MNIST).

Data: https://www.kaggle.com/c/digit-recognizer/data

# Setup

Import the relevant packages.

In [None]:
import os
import random

import numpy as np
import pandas as pd

# packages for modelling and evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split

try:
    # plot_confusion_matrix needs scikit-learn >= 0.24.2; it was deprecated in 1.0
    # and removed in 1.2, so the import is guarded to keep this cell runnable.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

# packages for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# One seed for the whole notebook: it initialises NumPy's global RNG and Python's
# `random` module here, and is passed as `random_state` to the scikit-learn calls
# further down, so repeated runs give the same results.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Read Data

The data are given as a CSV table. The first column is the label; the other columns are the pixels of the flattened image.

In [None]:
# Directory containing the Kaggle digit-recognizer CSV files. The shared cluster
# location is the default; set the MNIST_DATA_DIR environment variable to run the
# notebook against a copy stored somewhere else.
path = os.environ.get("MNIST_DATA_DIR", "/work/ka1176/shared_data/training/MNIST")
data = pd.read_csv(os.path.join(path, "train.csv"))

# Preview the first rows (label column followed by the pixel columns).
data.head()

# Generate Train and Validation Set

Use the ```train_test_split``` method from the ```sklearn``` package (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).

In [None]:
# Separate the table into model inputs and targets: column 0 holds the digit
# label, every remaining column holds one pixel of the flattened image.
labels = data.iloc[:, 0]
pixels = data.iloc[:, 1:]
X, y = np.array(pixels), np.array(labels)
print(f'Data shape: X: {X.shape}, y: {y.shape}')

In [None]:
# Hold out 20% of the samples as a validation set. Stratifying on the label keeps
# the relative frequency of every digit the same in both subsets, so validation
# metrics are not skewed by an unlucky draw; RANDOM_STATE makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Sanity check: every sample ended up in exactly one of the two subsets.
assert len(X_train) + len(X_valid) == len(X)

print('Data shape after splitting:')
print(f'Train Dataset: X: {X_train.shape}, y: {y_train.shape}')
print(f'Validation Dataset: X: {X_valid.shape}, y: {y_valid.shape}')

# Visualize the Data

In [None]:
# Turn each flattened training row back into a 28x28 pixel grid for display.
X_image = X_train.reshape(len(X_train), 28, 28)

# Show the first 20 training digits in a 2x10 grid, titled with their labels.
fig, axes = plt.subplots(2, 10, figsize=(20, 5))

for idx, panel in enumerate(axes.ravel()):
    panel.imshow(X_image[idx], cmap='gray')
    panel.set_title(f'Label = {y_train[idx]}')
    panel.axis('off')

In [None]:
# Plot the number of training samples per digit to see if the dataset is balanced.
# The labels must be passed as `x=`: recent seaborn releases bind the first
# positional argument to `data`, which would not produce one bar per class.
ax = sns.countplot(x=y_train)
ax.set(xlabel="Digit", ylabel="Number of samples", title="Class distribution (training set)");

# Predictive Modelling

Use the random forest classifier from the ```sklearn``` package to fit a model. See https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html for more details.

In [None]:
# Size of the forest, i.e. how many decision trees ("estimators") are trained.
n_estimators = 100

# Build the classifier and train it on the training split; fit() hands back the
# fitted estimator, so construction and training are chained.
rf = RandomForestClassifier(
    random_state=RANDOM_STATE,
    n_estimators=n_estimators,
).fit(X_train, y_train)

# Predicted digit for every sample in the validation split.
y_pred = rf.predict(X_valid)

# Evaluation

The classification report gives a summary of the most common metrics for each class and the overall accuracy. The confusion matrix visualizes how well your model performs. See https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html and
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html for more details.

In [None]:
# Heading followed by the per-class precision / recall / F1 table computed from
# the validation labels and the model's predictions.
report = classification_report(y_valid, y_pred)
print("Classification Report", report, sep="\n")

In [None]:
# Draw the confusion matrix of the fitted forest on the validation split.
# scikit-learn >= 1.0 provides ConfusionMatrixDisplay.from_estimator; the older
# plot_confusion_matrix helper (removed in 1.2) is only used as a fallback on
# installations that predate it.
print("Confusion Matrix")
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(
        rf, X_valid, y_valid, cmap=plt.cm.Blues, values_format="d"
    )
else:
    plot_confusion_matrix(
        rf, X_valid, y_valid, cmap=plt.cm.Blues, values_format="d"
    )
plt.show()

# Plot Predictions

Visualize the model results with some example plots from the validation set.

In [None]:
# Show the first 40 validation digits in a 4x10 grid. Each title pairs the
# predicted digit with the true one and is coloured green when they agree,
# red when they differ.
fig, axes = plt.subplots(4, 10, figsize=(20, 10))

for idx, panel in enumerate(axes.ravel()):
    predicted, actual = y_pred[idx], y_valid[idx]
    if predicted == actual:
        title_color = "green"
    else:
        title_color = "red"
    panel.imshow(X_valid[idx].reshape(28, 28), cmap="gray")
    panel.set_title(f"Pred: {predicted} - True: {actual}", color=title_color)
    panel.axis('off')