# Instructions
* One vs all logistic regression
* Softmax regression = generalization to handle multiple classes
* Neural network with one hidden layer, and numerically checking the gradient
* Now 2 hidden layers and different activation f'ns, see what performs best
* With best model, do confusion matrix on test set
* 4 page report

In [2]:
# notebook setup
import random 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from sklearn.decomposition import PCA

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Importing the data

We have 50 000 32x32 images in the train set and a "labels" file with 50 000 lines.

For the images, we'll store them as a 50000x3072 np array, so the first 1024 columns are the value of the red pixel in the image of that row, and the next two 1024 columns are the green and blue values.

In [3]:
def get_data(cifar_dirname="D:\\Unicamp\\MC886\\Git\\T2\\train\\", upperbound=50000):    
    classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    X = np.empty(shape=(1,3072))
    # we load the 50 000 images in X one after the other, one image being 1x3072, to obtain the 50000x3072 array
    for i in range(upperbound):
        # to have an update of where we're at once in a while
        if i % 1000 == 0:
            print(i)
        # open the current image
        current_im  = Image.open(cifar_dirname + str(i).zfill(5) + ".png" )
        # reshape it into an (1,3072) array, so the red values of all the pixels, then green then blue (32*32*3)
        reshaped_im = np.reshape(np.asarray(current_im, 'uint8'), (1,3072))
        # vertical stack the image into X, what will contain all the images
        X           = np.vstack([X, reshaped_im])
        
    # we want the array to be of type unsigned int on 8 bit (between 0 and 255), so that it occupies the minimum space
    X = X.astype("uint8")
    X = np.delete(X, (0), axis=0) # delete the first row that's empty
    return X

In [8]:
#X = get_data()
# we load the 50 000 labels for the txt file
y = np.loadtxt("D:\\Unicamp\\MC886\\Git\\T2\\train\\labels")
y.shape
# since loading the images took a lot of time (~3hr), we'll save them in a binary file (.npy)
np.save("D:\\Unicamp\\MC886\\Git\\T2\\images", X)

In [9]:
# now to get the nparray back
X = np.load("D:\\Unicamp\\MC886\\Git\\T2\\images.npy")
# also, for the y array, like earlier:
y = np.loadtxt("D:\\Unicamp\\MC886\\Git\\T2\\train\\labels")

# One vs all classifiers

We need to do feature scaling on the images before feeding them into the algorithms. We first get the values between -1 and 1: since they're all between 0 and 255, we'll divide by 127 and substract 1.

We then calculate the mean of each image and substract each row by that value.

We then try to reduce the number of feutres by projecting on a principal subspace, with the PCA algorithm. To do so we first run it on the data, then print the variances and decide the number of features we want to keep.

In [12]:
# standardization of the data
X = np.divide(X, 127).astype("float64")
X -= 1

# calculate the mean of each image
mean = np.mean(X, axis=1) # shape (50000,)
X = (X.transpose() - mean) # substract each row by corresponding mean
X = X.transpose() # transpose X back

# apply PCA
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [13]:
for i in range(-10,10):
    a = 0
    for x in pca.explained_variance_:
        if x < pow(10,i):
            a = a + 1
    print("< e^"+str(i)+": "+str(a))

< e^-10: 2447
< e^-9: 2900
< e^-8: 3042
< e^-7: 3068
< e^-6: 3072
< e^-5: 3072
< e^-4: 3072
< e^-3: 3072
< e^-2: 3072
< e^-1: 3072
< e^0: 3072
< e^1: 3072
< e^2: 3072
< e^3: 3072
< e^4: 3072
< e^5: 3072
< e^6: 3072
< e^7: 3072
< e^8: 3072
< e^9: 3072


We'll try to keep 2400 features, thus disregarding the ~600 features with a covariance inferiour to 10^-4. This will already help a lot with the calculations.

In [14]:
pca.n_components = 2400
X_reduced = pca.fit_transform(X)
X_reduced.shape
np.save("D:\\Unicamp\\MC886\\Git\\T2\\images_reduced.npy", X_reduced)

In [15]:
X_reduced = np.load("D:\\Unicamp\\MC886\\Git\\T2\\images_reduced.npy")
y = np.loadtxt("D:\\Unicamp\\MC886\\Git\\T2\\train\\labels")

In [16]:
X_reduced[0:10]

array([[  7.20556531e-04,   2.69734476e-04,  -6.58077960e-04, ...,
         -4.34945687e-07,  -6.77224855e-07,  -9.77754013e-07],
       [  4.72056205e-05,   7.20395722e-04,   1.06322873e-04, ...,
          1.20694764e-07,  -4.70810173e-08,   1.56268736e-06],
       [ -1.71341482e-03,   3.18460165e-05,  -8.72501861e-05, ...,
         -1.86572820e-09,  -7.97928204e-08,   1.82248983e-07],
       ..., 
       [  5.39099319e-04,  -3.98727739e-04,  -1.54468594e-04, ...,
          1.54848572e-07,   2.76047903e-07,   2.01948899e-07],
       [ -4.02377977e-05,  -5.16206682e-05,   9.83688422e-04, ...,
         -9.71375751e-07,  -5.52291076e-08,  -3.27327538e-07],
       [ -7.44432452e-04,  -3.01846087e-04,  -9.84664717e-05, ...,
          6.63267718e-07,   4.47269885e-07,  -4.51978997e-07]])

We now train our 10 different logistic regressions, using the one vs all method (one class at 1, the others at 0), and collect our 10 classifiers in a list.

In [17]:
from sklearn.linear_model import LogisticRegression

def classifier_onevsall(data, labels, num_class):
    labels_onevsall = (labels == num_class).astype(int)
    logreg = LogisticRegression()
    logreg.fit(data, labels_onevsall)
    return logreg

classifiers = []
for i in range(10):
    print(i)
    classifiers.append(classifier_onevsall(X_reduced, y, i))

print(classifiers)

0
1
2
3
4
5
6
7
8
9
[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random

We'll now evaluate our models on the training data and test data. We hence first need to import and process the test data, we'll write a f'n for speeding up the preprocessing.

In [18]:
def preprocessing_data(data):
    # get values between -1 and 1
    data = np.divide(data, 127).astype("float64")
    data -= 1
    
    # calculate the mean of each image
    mean = np.mean(data, axis=1) # shape (50000,)
    data = (data.transpose() - mean) # substract each row by corresponding mean
    # note that data is now transposed, shape (3072, 50000), let's put it back
    data = data.transpose()
    
    # apply the PCA algorithm
    pca = PCA(n_components=2400)
    data = pca.fit_transform(data)
    print(data.shape)
    return data

In [19]:
X_test = get_data("C:/Users/yassine.DESKTOP-NR3SF42/Downloads/cifar-10/test/", 10000)
np.save("C:/Users/yassine.DESKTOP-NR3SF42/images_test.npy", X_test)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [13]:
X_test = np.load("C:/Users/yassine.DESKTOP-NR3SF42/images_test.npy")
y_test = np.loadtxt("C:/Users/yassine.DESKTOP-NR3SF42/Downloads/cifar-10/test/labels")

In [16]:
X_test_reduced = preprocessing_data(X_test)
np.save("C:/Users/yassine.DESKTOP-NR3SF42/images_test_reduced.npy", X_test_reduced)

(10000, 2400)


Now we run the models on our test data, and see how well they perform.

In [31]:
def test_onevsall(classifiers, data, nbr_classes=10):
    predictions = np.empty((data.shape[0],nbr_classes))
    for i in range(nbr_classes):
        predictor = classifiers[i]
        # put the predicted values by each classifier for the whole data (shape (nbr_elements,nbr_classes))
        predictions[:, i] = predictor.predict(data)
    # return the indice of highest element (so where 1 is predicted)
    # if more than one classifier returned 1 for that sample, first encountered is kept
    print(predictions.shape)
    pred_indices = np.argmax(predictions, axis=1)
    return pred_indices

In [32]:
train_predictions = test_onevsall(classifiers, X_reduced)

(50000, 10)


In [33]:
test_predictions = test_onevsall(classifiers, X_test_reduced, 10)

(10000, 10)


In [34]:
# now that we have our predictions, we'll calculate the percentage of right answers
print(np.mean(y == train_predictions))
print(np.mean(y_test == test_predictions))

0.268
0.1153
