In [None]:
# import a package that can plot graphics for us
import matplotlib.pyplot as plt

# import a tool that will help us visualize the data
from ipywidgets import interact

# import the dataset
from sklearn.datasets import load_digits

# import a package that will help us "preprocess" the data (make it understandable to our classifiers)
import numpy as np

Let's load the data and see what it looks like.
We can use the `interact` function to scroll through the images in the data set.

In [None]:
# Load the digits dataset
digits = load_digits()

def display_n(n):
    """Plot the n-th digit in `digits.images`, titled with its label.

    n: index into `digits.images` / `digits.target` (supplied by the slider).
    """
    # Reshape to 8x8 so the slider keeps working after `digits.images` is
    # flattened into rows of 64 values; an 8x8 image is left unchanged.
    digit_image = digits.images[n].reshape(8, 8)
    digit_label = digits.target[n]

    # Display the n-th digit
    ## create a figure (and its axes) to plot on
    fig, ax = plt.subplots(figsize=(5, 5))

    ## set the title of the plot
    ax.set_title("This is a {}".format(digit_label))

    ## plot the digit as an image
    ax.imshow(digit_image, cmap=plt.cm.gray_r, interpolation='nearest')

    ## remove the numbers on the axes
    ax.set_xticks([])
    ax.set_yticks([])

    ## show the plot
    plt.show()

# create a slider-bar that will let us scroll through and look at the images
# (the trailing `;` stops the notebook from also printing the function that
# `interact` returns)
interact(display_n, n=(0, digits.images.shape[0] - 1));

We can see that the dataset contains images and a "label" (or target) for each image.
This label tells us that this is an image of an eight (for example).
We will need to use both the images and the labels to train our classifiers.

Let's take a closer look at how the images are actually represented:

In [None]:
# the length of the image array is the number of images it holds
num_images = len(digits.images)
print("There are {} images in the dataset.".format(num_images))

In [None]:
# `ndim` is the number of axes of the array holding the first image
num_dimensions = digits.images[0].ndim
print("Each image is stored in a {}-D array.".format(num_dimensions))

In [None]:
# shape of the array holding the first image (one entry per dimension)
example_image_shape = digits.images[0].shape
print("The array the image is stored in has a shape of {} (this is the size in each dimension).".format(example_image_shape))

In [None]:
# print the raw values stored for the first image
print("The array actually looks like this:")
example_image = digits.images[0]
print(example_image)

In [None]:
# smallest value found anywhere in the image array
min_value = np.min(digits.images)
print("The minimum value in the images is '{}'.".format(min_value))

In [None]:
# largest value found anywhere in the image array
max_value = np.max(digits.images)
print("The maximum value in the images is '{}'.".format(max_value))

We can see that our images are stored in 2-D arrays and that the values in the arrays range from 0 to 16.

We will need to change our data slightly so that our models will be able to learn from them:

* flatten the images: some of our classifiers cannot learn from 2-D arrays, thus we will need to change all of our images to 1-D arrays;
* zero-mean the images: subtract from each pixel the average value of that pixel across all images; and
* normalize the images: scale our array to only contain values between -1 and 1.

You may wonder why we are normalizing and zero-meaning the images.
Machine learning models usually train better with input values that range between -1 and 1.
For a short explanation why, look [here](https://www.coursera.org/lecture/deep-neural-network/normalizing-inputs-lXv6U).

In [None]:
# flatten: turn every image into a single row of 8 * 8 = 64 values
digits.images = np.reshape(digits.images, (-1, 8 * 8))

# double check: dimensions of one image, and how long each row now is
flat_image_length = digits.images.shape[1]
num_dimensions = digits.images[0].ndim
print("All images are now {}-D arrays that are {} long.".format(num_dimensions, flat_image_length))

In [None]:
# zero-mean
# `axis=0` averages over the images, giving one mean per pixel position;
# subtracting it in place centres every pixel position on zero
means = digits.images.mean(axis=0)
digits.images -= means

# double check
# every per-pixel mean should now be zero, up to floating-point rounding
assert np.allclose(digits.images.mean(axis=0), 0)
print("The average of the data is '{}' (which is essentially zero).".format(digits.images.mean()))

In [None]:
# normalize
# smallest and largest value of every pixel position (taken over axis 0)
min_values = np.min(digits.images, axis=0)
max_values = np.max(digits.images, axis=0)

# largest absolute value per pixel position
max_magnitudes = np.maximum(np.abs(min_values), max_values)

# a magnitude of zero cannot be divided by, so swap it for 1
# (dividing by 1 leaves those values untouched)
max_magnitudes = np.where(max_magnitudes == 0, 1, max_magnitudes)

# scale the data in place
digits.images /= max_magnitudes

# double check
min_value, max_value = digits.images.min(), digits.images.max()
print("The minimum value in the images is '{}' and the maximum is '{}'.".format(min_value, max_value))

In [None]:
# print the first image again, now in its preprocessed form
print("Now the previous example image looks like this:")
example_image = digits.images[0]
print(example_image)

Now that our input data (images) are ready for our model, we need to make sure our output data (the labels / targets) are also formatted in a way that our model will understand.

Let's look at what the labels look like right now.

In [None]:
# count the labels from the target array itself and report that count
# (not the image count)
num_labels = digits.target.shape[0]
print("There are {} labels / targets in the dataset.".format(num_labels))
print("This makes sense since each image needs a label - so that we know what number / digit that image is.")

# make the claim above explicit: there must be exactly one label per image
assert num_labels == digits.images.shape[0], "images and labels are not paired"

# a single label has an empty shape, i.e. it is a 0-D (scalar) value
num_dimensions = len(digits.target[0].shape)
print("Each label is stored in a {}-D array.".format(num_dimensions))

So, we can see that the labels are scalar values that are all stored in the 1-D array `digits.target`.

Let's see what these scalars look like:

In [None]:
# show the labels of the first 100 images
print(digits.target[0:100])

We see that the scalar values are the integers corresponding to the digits shown in the images.
While it is possible to train a regression model to directly output a continuous number and see what label it is closest to, this approach doesn't work very well and gives us no insight into how sure the model is of its prediction.
Instead, we train our models to output a probability for each label.
For instance, Pr(label == 0 | image[0]) = 0.9; Pr(label == 1 | image[0]) = 0.01; ...
and we put all of these predictions in an array and have our model give this as output:

Model(image[0]) -> [Pr(label == 0 | image[0]), Pr(label == 1 | image[0]), ..., Pr(label == 9 | image[0])]

Since we want our model to produce output in this form, we need to give it examples of output that is in this form.
We need to change each scalar value into an array with the corresponding probability values.
We call this new form the "one-hot encoding".

The one-hot encoding for each digit is as follows:

0 -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] (as we know this is a zero, so there is a probability of 1 for this label being zero and a probability of 0 for this label being something else)

1 -> [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] (as we know this is a one, so there is a probability of 1 for this label being one and a probability of 0 for this label being something else)

2 -> [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] (...)

3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] (...)

4 -> [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] (...)

5 -> [0, 0, 0, 0, 0, 1, 0, 0, 0, 0] (...)

6 -> [0, 0, 0, 0, 0, 0, 1, 0, 0, 0] (...)

7 -> [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] (...)

8 -> [0, 0, 0, 0, 0, 0, 0, 0, 1, 0] (...)

9 -> [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] (...)

This is how you create a one-hot encoding of the labels using numpy:

In [None]:
# one row per label and one column per digit (0 to 9), all zeros to start with
num_classes = 10
one_hot_labels = np.zeros((num_labels, num_classes))

# in row i, set the column given by the i-th label to 1
row_indices = np.arange(num_labels)
one_hot_labels[row_indices, digits.target] = 1

I will leave it as an exercise for you to figure out how and why this works.

Let's have a look at what our one-hot encodings look like now:

In [None]:
# show the first 10 integer labels, followed by their one-hot rows
print(digits.target[0:10])
print(one_hot_labels[0:10])

Great, we can see that it is working properly.

Now we need to divide our data into training, test and validation sets.
The first thing we need to do is make sure that each set has some examples of each digit and that the data is in no particular order (if the data is in some order, the model might learn this pattern and assume that it will always be present, which is not necessarily the case - there are also other reasons for doing this).
We can achieve both of these by shuffling the data.
But, remember that we need to shuffle both the images and the one-hot encodings in a way that they still match and if we shuffle both arrays individually, this will almost certainly not be the case.
This is how I suggest going about the shuffling process:

In [None]:
# Fixed seed so that the shuffle - and everything derived from the shuffled
# arrays - comes out the same on every run of the notebook.
SHUFFLE_SEED = 42
rng = np.random.default_rng(SHUFFLE_SEED)

# one random ordering of the row indices ...
shuffled_indices = rng.permutation(digits.images.shape[0])

# ... applied to both arrays, so every image stays paired with its label
shuffled_images = digits.images[shuffled_indices]
shuffled_labels = one_hot_labels[shuffled_indices]

That should do it, let's make sure they all still match:

In [None]:
def display_preprocessed(n):
    """Plot the n-th shuffled image, titled with the digit its label encodes.

    n: index into `shuffled_images` / `shuffled_labels` (supplied by the slider).
    """
    # turn the flat row back into an 8x8 picture
    image = shuffled_images[n].reshape((8, 8))
    # the position of the largest entry of the one-hot row is the digit
    label = np.argmax(shuffled_labels[n])

    # Display the n-th digit
    ## create a figure (and its axes) to plot on
    fig, ax = plt.subplots(figsize=(5, 5))

    ## set the title of the plot
    ax.set_title("This is a {}".format(label))

    ## plot the digit as an image
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')

    ## remove the numbers on the axes
    ax.set_xticks([])
    ax.set_yticks([])

    ## show the plot
    plt.show()

# create a slider-bar
# (the trailing `;` stops the notebook from also printing the function that
# `interact` returns)
interact(display_preprocessed, n=(0, shuffled_images.shape[0] - 1));

Note that the images look a little different than before, this is ok and expected.
The important thing is that we can see that the labels do still match the images.

Now it is time to split the data:

In [None]:
# boundaries: the first 80% of the rows go to training, the next 10% to
# testing and whatever is left to validation
train_split_index = int(num_images * 0.8)
test_split_index = int(num_images * 0.9)
split_points = [train_split_index, test_split_index]

# cut images and labels at the same two positions so they stay aligned
train_input, test_input, validation_input = np.split(shuffled_images, split_points)
train_output, test_output, validation_output = np.split(shuffled_labels, split_points)

Let's see if this split correctly:

In [None]:
# number of images in each set
print("There are {} images in our training set.".format(len(train_input)))
print("There are {} images in our test set.".format(len(test_input)))
print("There are {} images in our validation set.".format(len(validation_input)))

# number of labels in each set (should match the image counts above)
print("There are {} labels in our training set.".format(len(train_output)))
print("There are {} labels in our test set.".format(len(test_output)))
print("There are {} labels in our validation set.".format(len(validation_output)))

# the three sets together should account for every image
total_images = sum(len(part) for part in (train_input, test_input, validation_input))
print("There are {} images in total.".format(total_images))

Perfect, it all works out.
Now that we have gone through all the effort of formatting our data for our model, let's save it:

In [None]:
# write every array to its own file, named after the variable it holds
# (np.save appends the ".npy" extension)
arrays_to_save = (
    ("train_input", train_input),
    ("train_output", train_output),
    ("test_input", test_input),
    ("test_output", test_output),
    ("validation_input", validation_input),
    ("validation_output", validation_output),
)
for file_name, array in arrays_to_save:
    np.save(file_name, array)