In [1]:
# import a package that can plot graphics for us
import matplotlib.pyplot as plt

# import a tool that will help us visualize the data
from ipywidgets import interact

# import the dataset
from sklearn.datasets import load_digits

# import a package that will help us "preprocess" the data (make it understandable to our classifiers)
import numpy as np

Let's load the data and see what it looks like.
We can use the `interact` function to scroll through the images in the data set.

In [2]:
# Load the digits dataset; the cells below read its `images` and `target`
# attributes.
digits = load_digits()

def display_n(n):
    """Plot the n-th digit of the dataset, titled with its label.

    Parameters
    ----------
    n : int
        Index into ``digits.images`` / ``digits.target``.
    """
    # Always view the sample as an 8x8 grid. This is a no-op while the images
    # are still stored as 2-D arrays, and it keeps the slider working after
    # ``digits.images`` has been flattened to 64-long rows further down
    # (imshow cannot draw a 1-D array).
    digit_image = digits.images[n].reshape(8, 8)
    digit_label = digits.target[n]

    # Display the n-th digit
    ## create a figure and axes to plot on
    fig, ax = plt.subplots(figsize=(5, 5))

    ## set the title of the plot
    ax.set_title("This is a {}".format(digit_label))

    ## plot the digit as an image
    ax.imshow(digit_image, cmap=plt.cm.gray_r, interpolation='nearest')

    ## remove the numbers on the axes
    ax.set_xticks([])
    ax.set_yticks([])

    ## show the plot (needed so the widget output refreshes on every call)
    plt.show()
    
# create a slider-bar that will let us scroll through and look at the images;
# the trailing semicolon stops the notebook from echoing the function object
# that `interact` returns
interact(display_n, n=(0, digits.images.shape[0] - 1));

interactive(children=(IntSlider(value=898, description='n', max=1796), Output()), _dom_classes=('widget-intera…

<function __main__.display_n(n)>

We can see that the dataset contains images and a "label" (or target) for each image.
The label tells us which digit the image shows (for example, that it is an image of an eight).
We will need to use both the images and the labels to train our classifiers.

Let's take a closer look at how the images are actually represented:

In [3]:
# the first axis of the image array indexes the individual images
num_images = len(digits.images)
print("There are {} images in the dataset.".format(num_images))

There are 1797 images in the dataset.


In [4]:
# number of axes of a single image
num_dimensions = digits.images[0].ndim
print("The images are stored in a {}-D array.".format(num_dimensions))

The images are stored in a 2-D array.


In [5]:
# size of the first image along each of its axes
first_image = digits.images[0]
example_image_shape = first_image.shape
print("The array the image is stored has a shape of {} (this is the size in each dimension).".format(example_image_shape))

The array the image is stored has a shape of (8, 8) (this is the size in each dimension).


In [6]:
# print the raw pixel values of the first image, below a short heading
example_image = digits.images[0]
print("The array actually looks like this:", example_image, sep="\n")

The array actually looks like this:
[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]


In [7]:
# smallest pixel value over the whole image array
min_value = np.min(digits.images)
print("The minimum value in the images is '{}'.".format(min_value))

The minimum value in the images is '0.0'.


In [8]:
# largest pixel value over the whole image array
max_value = np.max(digits.images)
print("The maximum value in the images is '{}'.".format(max_value))

The maximum value in the images is '16.0'.


We can see that our images are stored in 2-D arrays and that the values in the arrays range from 0 to 16.

We will need to change our data slightly so that our models will be able to learn from them:

* flatten the images: some of our classifiers cannot learn from 2-D arrays, thus we will need to change all of our images to 1-D arrays;
* zero-mean the images: subtract from each pixel the average value of that pixel position across all images; and
* normalize the images: scale our array to only contain values between -1 and 1.

You may wonder why we are normalizing and zero-meaning the images.
Many machine learning models train faster and more reliably when their input values are centred on zero and lie in a small range such as -1 to 1.
For a short explanation why, look [here](https://www.coursera.org/lecture/deep-neural-network/normalizing-inputs-lXv6U).

In [9]:
# flatten: turn every image into a single row of 8*8 = 64 pixel values
digits.images = np.reshape(digits.images, (-1, 8 * 8))

# double check: one row per image, so a single image is now 1-D
first_flat_image = digits.images[0]
num_dimensions = first_flat_image.ndim
flat_image_length = len(first_flat_image)
print("All images are now {}-D arrays that are {} long.".format(num_dimensions, flat_image_length))

All images are now 1-D arrays that are 64 long.


In [10]:
# zero-mean: compute the average along axis 0 and subtract it in place,
# so the array bound to `digits.images` itself is modified
means = np.mean(digits.images, axis=0)
np.subtract(digits.images, means, out=digits.images)

# double check: the overall mean should now be numerically zero
print("The average of the data is '{}' (which is essensially zero).".format(np.mean(digits.images)))

The average of the data is '-2.7678348081918204e-17' (which is essensially zero).


In [11]:
# normalize
# extremes along axis 0 (one value per column of the flattened data)
min_values, max_values = digits.images.min(axis=0), digits.images.max(axis=0)

# the largest absolute value per column is whichever extreme lies further from zero
max_magnitudes = np.maximum(np.abs(min_values), max_values)

# a magnitude of zero would mean dividing by zero; divide such columns by 1
# instead, which leaves them unchanged
max_magnitudes = np.where(max_magnitudes == 0, 1, max_magnitudes)

# scale every column in place by its largest magnitude
np.divide(digits.images, max_magnitudes, out=digits.images)

# double check: report the extremes of the scaled data
min_value, max_value = digits.images.min(), digits.images.max()
print("The minimum value in the images is '{}' and the maximum is '{}'.".format(min_value, max_value))

The minimum value in the images is '-1.0' and the maximum is '1.0'.


In [12]:
# print the first image again, below a short heading, to see the effect of the steps above
example_image = digits.images[0]
print("Now the previous example image looks like this:", example_image, sep="\n")

Now the previous example image looks like this:
[  0.00000000e+00  -3.94793926e-02  -1.89700500e-02   9.83591142e-02
  -2.40383261e-01  -4.67977345e-01  -9.30656934e-02  -8.71940723e-03
  -2.79017857e-03  -1.42357662e-01   2.52130568e-01   2.52148465e-01
  -2.71762668e-02   8.34671930e-01   2.22811984e-01  -9.07814693e-03
  -1.39314572e-03   2.97379242e-02   5.14666217e-01  -5.54306191e-01
  -7.97337001e-01   3.89771801e-01   4.37074164e-01  -6.29987400e-03
  -1.11420613e-03   1.22129946e-01   3.19948583e-01  -1.00000000e+00
  -1.00000000e+00   5.30891846e-02   4.48047389e-01  -2.23089794e-03
   0.00000000e+00   2.28166460e-01   3.99358889e-02  -1.00000000e+00
  -1.00000000e+00   2.92751225e-02   4.59006523e-01   0.00000000e+00
  -2.23089794e-03   1.67605960e-01   4.51666056e-01  -8.24018271e-01
  -8.01202807e-01   4.56928586e-01   2.82507431e-01  -4.56535917e-03
  -9.05103391e-04   8.46976643e-02   7.64513170e-01  -4.75848792e-01
   6.19939720e-02   3.70099752e-01  -3.03472663e-01  -1