In [None]:
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np

In [None]:
# Read the training set (which has a 'label' column) and the test set from the data directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

# Separating targets and features

`X_train` contains the features: the 784 pixel values of each image, one column per pixel

`Y_train` contains the targets: the numerical labels from 0-9

In [None]:
# Split the training frame: every column except 'label' is a feature, 'label' is the target.
X_train = train.drop('label', axis=1)
Y_train = train.loc[:, 'label']

# Distribution

The data is more or less evenly distributed. Category 5 contains roughly 1,000 fewer examples than Category 1, but that still leaves plenty of training examples.

In [None]:
# Count how many training examples there are for each label value.
Y_train.value_counts()

In [None]:
# The labels are discrete classes (10 of them, matching num_classes used for the
# one-hot encoding), so use one bin per class centred on the integer value
# (edges at -0.5, 0.5, ..., 9.5) rather than the default evenly spaced bins,
# and label the figure so it can be read on its own.
fig, ax = plt.subplots()
Y_train.hist(ax=ax, bins=np.arange(11) - 0.5, rwidth=0.9)
ax.set_xticks(range(10))
ax.set(title='Training examples per class', xlabel='Label', ylabel='Number of examples')
plt.show()

# Data preparation

1. Transform our data from 0-255 pixel value to 0-1 pixel value
2. Transform our labels from numerical values to one-hot encoded values

In [None]:
# Rescale the 0-255 pixel intensities to the 0-1 range. The training features and the
# test set go through exactly the same scaling.
X_train = X_train.div(255.0)
test = test.div(255.0)

In [None]:
# Sanity check: show the (rows, columns) shape of the feature matrix.
X_train.values.shape

In [None]:
# One-hot encode the integer labels into 10 columns, matching the 10-unit softmax
# output and the categorical cross-entropy loss used further down.
# NOTE: this overwrites Y_train with its encoded form, so run the cell only once
# per kernel session; re-running it would encode the already-encoded array.
Y_train = keras.utils.to_categorical(Y_train, num_classes=10)

In [None]:
# Peek at the first five encoded label rows.
Y_train[:5]

# Splitting our data

We use an 80/20 train/validation split: there's quite a lot of data, and we want to train on as much of it as possible. The random state is fixed to 42 (The One True Number) so the split is repeatable.

In [None]:
# Hold out 20% of the labelled data for validation (an 80/20 split); the fixed
# random_state makes the split reproducible across runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)

In [None]:
# The feature splits are still pandas DataFrames at this point; convert both to
# plain NumPy arrays so they can be indexed by row position and are passed to the
# model in the same form. np.asarray leaves an existing array untouched, so this
# cell can be re-run safely (calling .values on an array would raise AttributeError).
X_train = np.asarray(X_train)
X_val = np.asarray(X_val)
print(X_train.shape)
print(Y_train.shape)

# Images of our data

Which ones could the model have trouble with?

In [None]:
# Show the first 30 training images on a 3x10 grid, each row of 784 values
# reshaped back into a 28x28 picture, with the axes hidden.
fig, axes = plt.subplots(3, 10, figsize=(10, 4))
for idx, ax in enumerate(axes.flat):
    ax.imshow(X_train[idx].reshape((28, 28)))
    ax.axis('off')
plt.show()

# Building our network

We use a model based on 3Blue1Brown's here. The input is 784 values (1 per pixel), followed by two hidden layers of 32 neurons each with ReLU activation, and finally a softmax output layer of 10 neurons (1 per class).

In [None]:
# Fully connected classifier: 784 inputs -> two hidden ReLU layers of 32 units
# each -> 10-unit softmax output.
model = keras.models.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(784,)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(10, activation='softmax'),
])

# Compiling the model

The Adam optimizer is generally a solid default, but new optimizers come out every year, so challenge it!

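The loss is categorical cross-entropy because the labels are one-hot encoded and the network ends in a softmax layer; accuracy is the metric we report.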

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the labels
# are one-hot encoded) and accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print an overview of the model's layers.
model.summary()

# Training the model

Generally 5 epochs is enough to do better than most humans. Let's do 50 in the lecture as well to see the difference.

In [None]:
# Train for 50 epochs in mini-batches of 32, passing the held-out split as
# validation data.
model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=50,
    batch_size=32,
)

# The predictions

Unfortunately, the `predict` method just returns the softmax layer's values, so we still need to retrieve the actual prediction from them. `np.argmax` does that: it returns the index of the highest value. With `axis=1` it does so for each row (one row per image), and because index *i* corresponds to class *i*, that index is the predicted label.

In [None]:
# Run the model on the test set: each row of `predictions` holds the softmax
# outputs for one test image.
predictions = model.predict(test.values)
print(predictions[:5])

# The predicted class is the position of the largest output in each row.
results = predictions.argmax(axis=1)
print(results[:5])

In [None]:
# Build the submission table: a 1-based ImageId next to each predicted label.
my_submission = pd.DataFrame({
    'ImageId': range(1, len(results) + 1),
    'label': results,
})

# Any filename would work; 'submission.csv' is the one used here.
my_submission.to_csv('submission.csv', index=False)