# Load data notebook

This notebook contains cells which load or create different data sets. Copy these cells over to your other notebooks when you want to try a method out on a particular data set! For more data, look at these websites:
* https://cs.nyu.edu/~roweis/data.html
* https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py

## Olivetti faces

In [None]:
# This cell loads the Olivetti Faces data set
# More information can be found here: https://cs.nyu.edu/~roweis/data.html
# NOTE: the file is opened by relative path, so 'olivettiFaces.mat' must be in
# the notebook's working directory

# Import SciPy so we can load Matlab .mat files
import scipy.io as sio

# Load the faces data set and extract the faces
# loadmat returns a dictionary keyed by Matlab variable name; the images are
# read from its 'faces' entry
# (the array layout is not visible from this cell -- check faces.shape before use)
faces_full = sio.loadmat('olivettiFaces.mat')
faces = faces_full['faces']

## MNIST handwritten digits

In [None]:
# This cell loads the MNIST handwritten digits data set
# More information can be found here: https://cs.nyu.edu/~roweis/data.html
# NOTE: the file is opened by relative path, so 'mnist_all.mat' must be in
# the notebook's working directory

# Import SciPy so we can load Matlab .mat files
import scipy.io as sio

# Load the MNIST data set
# It is partitioned by digit and by training and testing data (the latter for supervised classification)
# loadmat returns a dictionary keyed by Matlab variable name, so each partition
# is a separate entry of mnist_full
# (presumably one key per digit and split -- list mnist_full.keys() to confirm the names)
mnist_full = sio.loadmat('mnist_all.mat')

## Squares images

In [None]:
# This cell defines the make_squares function, which creates the images with small squares in them

# We'll need numpy for this
import numpy as np

# This is the function
def make_squares(N, n):
    '''
    squares = MAKE_SQUARES(N, n)
    Builds the squares data set: one image for every pixel position, each
    containing a single small square of ones centred on that pixel. Squares
    that run past an edge wrap around to the opposite side of the image.
    INPUTS:  N: Side length of the images (each image is NxN)
             n: Half-width of the small squares, which are (2n+1)x(2n+1)
    OUTPUTS: squares: Numpy array of size N^2 x N x N; squares[i*N + j,:,:] is
                      the image whose small square is centred on pixel (i, j)
    '''
    # Offsets from the centre pixel that span one side of a small square
    offsets = np.arange(-n, n + 1)

    images = np.zeros([N**2, N, N])
    for row in range(N):
        # Rows covered by the square, wrapped periodically into [0, N)
        rows = np.mod(row + offsets, N)
        for col in range(N):
            # Columns covered by the square, wrapped the same way
            cols = np.mod(col + offsets, N)
            # Images are stored in row-major order of their centre pixel;
            # np.ix_ selects the full rows x cols patch in one assignment
            images[row * N + col][np.ix_(rows, cols)] = 1

    return images

## Noisy circles

In [None]:
# Load the datasets module from Sklearn; also load pyplot to display
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline

# Generate the data and display it
# (random_state pins the noise so the cell produces the same points on every run;
# set it to None to get a fresh random draw each time)
noisy_circles = datasets.make_circles(n_samples=1024, factor=.5, noise=.05, random_state=0)
# make_circles returns a (points, labels) pair
noisy_circles_data, noisy_circles_labels = noisy_circles
# Colour each point by its class label. The labels have to be passed as c=:
# the third positional argument of scatter is the marker size s, and an empty
# list there (the Matlab scatter(x, y, [], c) idiom) is rejected by current Matplotlib
plt.scatter(noisy_circles_data[:,0], noisy_circles_data[:,1], c=noisy_circles_labels)
plt.axis('equal');

## Noisy moons

In [None]:
# Load the datasets module from Sklearn; also load pyplot to display
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline

# Generate the data and display it
# (random_state pins the noise so the cell produces the same points on every run;
# set it to None to get a fresh random draw each time)
noisy_moons = datasets.make_moons(n_samples=1024, noise=.05, random_state=0)
# make_moons returns a (points, labels) pair
noisy_moons_data, noisy_moons_labels = noisy_moons
# Colour each point by its class label. The labels have to be passed as c=:
# the third positional argument of scatter is the marker size s, and an empty
# list there (the Matlab scatter(x, y, [], c) idiom) is rejected by current Matplotlib
plt.scatter(noisy_moons_data[:,0], noisy_moons_data[:,1], c=noisy_moons_labels)
plt.axis('equal');