# Instructions
* One vs all logistic regression
* Softmax regression = generalization to handle multiple classes
* Neural network with one hidden layer, and numerically checking the gradient
* Now 2 hidden layers and different activation f'ns, see what performs best
* With best model, do confusion matrix on test set
* 4 page report

In [1]:
# notebook setup
import random 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from sklearn.decomposition import PCA

# IPython magic: draw matplotlib figures inside the notebook output cells
%matplotlib inline
# default figure size as (width, height), in inches
plt.rcParams['figure.figsize'] = (10.0, 8.0)
# show image pixels as-is (no smoothing between pixels) ...
plt.rcParams['image.interpolation'] = 'nearest'
# ... and use a grayscale colormap by default for single-channel images
plt.rcParams['image.cmap'] = 'gray'

# Importing the data

We have 50 000 32x32 images in the train set and a "labels" file with 50 000 lines.

For the images, we'll store them as a 50000x3072 np array with one image per row. Each 32x32x3 image is flattened pixel by pixel, so the red, green and blue values of a given pixel sit next to each other: columns 0-2 hold the first pixel, columns 3-5 the second, and so on.

In [None]:
def get_data(cifar_dirname="/home/zoug/Cours/MC886/tarefa2/cifar-10/train/", n_images=50000):
    """Load the CIFAR-10 training images and their labels from a directory.

    The directory must contain PNG files named ``00000.png``, ``00001.png``,
    ... and a text file named ``labels``.

    Args:
        cifar_dirname: directory holding the PNG files and the ``labels`` file.
        n_images: how many images to read, starting at ``00000.png``.

    Returns:
        A tuple ``(X, y)``.
        ``X`` is a ``uint8`` array of shape ``(n_images, 3072)`` holding one
        flattened image per row. The flattening is row-major over the array
        PIL returns, so for a 32x32 RGB image the three channel values of
        each pixel are adjacent (R, G, B, R, G, B, ...).
        ``y`` is whatever ``np.loadtxt`` parses from the ``labels`` file; it
        is NOT truncated to ``n_images``.

    Raises:
        ValueError: if an image does not contain exactly 3072 values.
    """
    import os  # local import so the function does not rely on other cells

    # NOTE(review): the previous version of this cell listed the class names
    # ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship',
    # 'truck'] without using them; presumably the labels index into that
    # list -- confirm against the labels file.

    # Preallocate the final uint8 array and fill it row by row. Growing the
    # array with np.vstack inside the loop recopies everything loaded so far
    # on every iteration, and seeding it with np.empty((1, 3072)) leaves an
    # extra uninitialised first row that shifts every image by one index
    # relative to the labels.
    X = np.empty(shape=(n_images, 3072), dtype=np.uint8)
    for i in range(n_images):
        # to have an update of where we're at once in a while
        if i % 1000 == 0:
            print(i)
        image_path = os.path.join(cifar_dirname, str(i).zfill(5) + ".png")
        # the context manager closes the file handle once the pixels are copied
        with Image.open(image_path) as current_im:
            X[i] = np.asarray(current_im, dtype=np.uint8).reshape(3072)

    # we load the labels from the txt file
    y = np.loadtxt(os.path.join(cifar_dirname, "labels"))
    return X, y

# Run the full (slow) load from the PNG files.
X, y = get_data()
# Bare expressions: a notebook only echoes the last expression of a cell,
# so these two lines display nothing when the whole cell is run.
X.shape
y.shape

# since loading the images took a lot of time (~3hr), we'll save them in a binary file (.npy)
# np.save appends the ".npy" extension when the given name lacks it, so this writes ".../images.npy"
np.save("/home/zoug/Cours/MC886/tarefa2/cifar-10/train/images", X)

0


In [2]:
# now to get the nparray back
# (reads the array previously written with np.save, skipping the slow PNG decoding)
X = np.load("/home/zoug/Cours/MC886/tarefa2/cifar-10/train/images.npy")
# also, for the y array, like earlier:
# the labels were not cached as .npy, so they are parsed again from the text file
y = np.loadtxt("/home/zoug/Cours/MC886/tarefa2/cifar-10/train/labels")

# Preprocessing the data

We need to do feature scaling on the images before feeding them into the algorithms. We first bring the values to roughly between -1 and 1: since they're all between 0 and 255, we divide by 127 and subtract 1 (so 255 maps to about 1.008).

We then calculate the mean of each image and subtract that value from every pixel of the image.

In [6]:
def preprocessing_data(data):
    """Rescale pixel values and remove each image's own mean.

    Every value is mapped to ``value / 127 - 1`` and then the mean of each
    row (one row = one image) is subtracted from that row, so every row of
    the result averages to zero.

    NOTE: with 8-bit input the rescaled range is [-1, 255/127 - 1], i.e. the
    upper bound is slightly above 1; the divisor is kept at 127 so results
    match the values this notebook has always produced.

    Args:
        data: 2-D array-like of shape (n_images, n_values). It is not
            modified; the work is done on a float64 copy.

    Returns:
        A new C-contiguous float64 array with the same shape as ``data``.
    """
    # Make exactly one float64 copy and do everything else in place.
    # The earlier approach allocated several full-size float64 temporaries
    # (np.divide result, an astype copy, the transposed difference), which is
    # what ran out of memory on the 50000x3072 array.
    data = np.array(data, dtype="float64")

    # get values between (roughly) -1 and 1
    data /= 127
    data -= 1

    # subtract each image's mean from its own row; keepdims gives the means
    # shape (n_images, 1) so they broadcast along the row without transposing
    data -= data.mean(axis=1, keepdims=True)
    return data

# Replace the raw pixel array with its preprocessed version
X = preprocessing_data(X)
# preview the first ten rows as a sanity check
X[0:10]

MemoryError: 

X_scaled.dtype

We then try to reduce the number of features by projecting onto a principal subspace with the PCA algorithm. To do so we first fit it on the data, then print the variances and decide how many components we want to keep.

In [8]:
# Rescale pixel values with value / 127 - 1 (roughly the range [-1, 1]).
# scikit-learn's PCA centers each feature before the SVD but does not rescale
# it, so only the mean removal is handled by PCA itself.
# NOTE(review): this cell rescales whatever X currently holds; if the
# preprocessing cell above had completed, X would be rescaled a second time
# here -- confirm which X this cell is meant to receive.
X = np.divide(X, 127).astype("float64")
X -= 1
# n_components is left unset here so the variance of every component can be
# inspected before choosing how many to keep
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [9]:
# For each threshold 10^k (k = -10 .. 9), report how many principal components
# have an explained variance strictly below it.
# NOTE: the printed label says "e^k" but the threshold really is a power of
# ten (10^k), not a power of Euler's number.
variances = np.asarray(pca.explained_variance_)
for exponent in range(-10, 10):
    threshold = pow(10, exponent)
    n_below = int(np.count_nonzero(variances < threshold))
    print("< e^" + str(exponent) + ": " + str(n_below))

< e^-10: 0
< e^-9: 0
< e^-8: 0
< e^-7: 0
< e^-6: 0
< e^-5: 36
< e^-4: 583
< e^-3: 1364
< e^-2: 2140
< e^-1: 2758
< e^0: 3006
< e^1: 3061
< e^2: 3071
< e^3: 3072
< e^4: 3072
< e^5: 3072
< e^6: 3072
< e^7: 3072
< e^8: 3072
< e^9: 3072


We'll try to keep 2400 components, thus discarding the ~600 components with a variance below 10^-4 (the counts above are labelled "e^i", but the thresholds are powers of ten). This will already help a lot with the calculations.

In [None]:
# Change the number of components on the existing estimator; the new value
# only takes effect at the next fit.
pca.n_components = 2400
# fit_transform refits the PCA on X and returns X projected onto the kept components
X_reduced = pca.fit_transform(X)
# expected to be (number of rows of X, 2400)
X_reduced.shape