# Instructions
* One vs all logistic regression
* Softmax regression = generalization to handle multiple classes
* Neural network with one hidden layer, and numerically checking the gradient
* Now 2 hidden layers and different activation f'ns, see what performs best
* With best model, do confusion matrix on test set
* 4 page report

In [1]:
# notebook setup
import random 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from sklearn.decomposition import PCA

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Importing the data

We have 50 000 32x32 images in the train set and a "labels" file with 50 000 lines.

For the images, we'll store them as a 50000x3072 np array, so the first 1024 columns are the value of the red pixel in the image of that row, and the next two 1024 columns are the green and blue values.

In [2]:
def get_data(cifar_dirname="D:\\Unicamp\\MC886\\Git\\T2\\train\\", upperbound=50000):    
    classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
    X = np.empty(shape=(1,3072))
    # we load the 50 000 images in X one after the other, one image being 1x3072, to obtain the 50000x3072 array
    for i in range(upperbound):
        # to have an update of where we're at once in a while
        if i % 1000 == 0:
            print(i)
        # open the current image
        current_im  = Image.open(cifar_dirname + str(i).zfill(5) + ".png" )
        # reshape it into an (1,3072) array, so the red values of all the pixels, then green then blue (32*32*3)
        reshaped_im = np.reshape(np.asarray(current_im, 'uint8'), (1,3072))
        # vertical stack the image into X, what will contain all the images
        X           = np.vstack([X, reshaped_im])
        
    # we want the array to be of type unsigned int on 8 bit (between 0 and 255), so that it occupies the minimum space
    X = X.astype("uint8")
    X = np.delete(X, (0), axis=0) # delete the first row that's empty
    return X

In [8]:
#X = get_data()
# we load the 50 000 labels for the txt file
y = np.loadtxt("D:\\Unicamp\\MC886\\Git\\T2\\train\\labels")
y.shape
# since loading the images took a lot of time (~3hr), we'll save them in a binary file (.npy)
np.save("D:\\Unicamp\\MC886\\Git\\T2\\images", X)

In [3]:
# now to get the nparray back
X = np.load("D:\\Unicamp\\MC886\\Git\\T2\\images.npy")
# also, for the y array, like earlier:
y = np.loadtxt("D:\\Unicamp\\MC886\\Git\\T2\\train\\labels")

# One vs all classifiers

We need to do feature scaling on the images before feeding them into the algorithms. We first get the values between -1 and 1: since they're all between 0 and 255, we'll divide by 127 and substract 1.

We then calculate the mean of each image and substract each row by that value.

We then try to reduce the number of feutres by projecting on a principal subspace, with the PCA algorithm. To do so we first run it on the data, then print the variances and decide the number of features we want to keep.

In [4]:
# standardization of the data
X = np.divide(X, 127).astype("float64")
X -= 1

# calculate the mean of each image
mean = np.mean(X, axis=1) # shape (50000,)
X = (X.transpose() - mean) # substract each row by corresponding mean
X = X.transpose() # transpose X back

# apply PCA
pca = PCA()
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [5]:
for i in range(-10,10):
    a = 0
    for x in pca.explained_variance_:
        if x < pow(10,i):
            a = a + 1
    print("< e^"+str(i)+": "+str(a))

< e^-10: 1
< e^-9: 1
< e^-8: 1
< e^-7: 1
< e^-6: 1
< e^-5: 37
< e^-4: 584
< e^-3: 1365
< e^-2: 2141
< e^-1: 2759
< e^0: 3007
< e^1: 3062
< e^2: 3071
< e^3: 3072
< e^4: 3072
< e^5: 3072
< e^6: 3072
< e^7: 3072
< e^8: 3072
< e^9: 3072


We'll try to keep 2400 features, thus disregarding the ~600 features with a covariance inferiour to 10^-4. This will already help a lot with the calculations.

In [6]:
pca.n_components = 2400
X_reduced = pca.fit_transform(X)
X_reduced.shape
np.save("D:\\Unicamp\\MC886\\Git\\T2\\images_reduced.npy", X_reduced)

In [7]:
X_reduced = np.load("D:\\Unicamp\\MC886\\Git\\T2\\images_reduced.npy")
y = np.loadtxt("D:\\Unicamp\\MC886\\Git\\T2\\train\\labels")

In [8]:
X_reduced[0:10]

array([[  1.16218563e+01,   4.35054736e+00,  -1.06141394e+01, ...,
          4.35081176e-03,   1.32960065e-03,   1.23443474e-03],
       [  7.61379453e-01,   1.16192626e+01,   1.71488161e+00, ...,
         -2.13114500e-03,   2.52875689e-03,  -6.36138839e-03],
       [ -2.76356677e+01,   5.13644401e-01,  -1.40725825e+00, ...,
         -1.36695424e-03,  -1.30393012e-03,  -7.70575473e-03],
       ..., 
       [  8.69513291e+00,  -6.43107970e+00,  -2.49142396e+00, ...,
         -2.55107013e-03,   1.80564627e-03,  -4.57465015e-03],
       [ -6.48995440e-01,  -8.32589757e-01,   1.58659106e+01, ...,
         -8.67655850e-04,  -5.13843359e-03,   2.22656382e-02],
       [ -1.20069510e+01,  -4.86847553e+00,  -1.58816572e+00, ...,
         -3.69728451e-04,  -2.61162974e-03,   1.34713022e-04]])

We now train our 10 different logistic regressions, using the one vs all method (one class at 1, the others at 0), and collect our 10 classifiers in a list.

In [9]:
from sklearn.linear_model import LogisticRegression

def classifier_onevsall(data, labels, num_class):
    labels_onevsall = (labels == num_class).astype(int)
    logreg = LogisticRegression()
    logreg.fit(data, labels_onevsall)
    return logreg

classifiers = []
for i in range(10):
    print(i)
    classifiers.append(classifier_onevsall(X_reduced, y, i))

print(classifiers)

0
1
2
3
4
5
6
7
8
9
[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random

We'll now evaluate our models on the training data and test data. We hence first need to import and process the test data, we'll write a f'n for speeding up the preprocessing.

In [10]:
def preprocessing_data(data):
    # get values between -1 and 1
    data = np.divide(data, 127).astype("float64")
    data -= 1
    
    # calculate the mean of each image
    mean = np.mean(data, axis=1) # shape (50000,)
    data = (data.transpose() - mean) # substract each row by corresponding mean
    # note that data is now transposed, shape (3072, 50000), let's put it back
    data = data.transpose()
    
    # apply the PCA algorithm
    pca = PCA(n_components=2400)
    data = pca.fit_transform(data)
    print(data.shape)
    return data

In [12]:
X_test = get_data("D:\\Unicamp\\MC886\\Git\\T2\\test\\", 10000)
np.save("D:\\Unicamp\\MC886\\Git\\T2\\images_test.npy", X_test)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [13]:
X_test_reduced = preprocessing_data(X_test)
np.save("D:\\Unicamp\\MC886\\Git\\T2\\images_test_reduced.npy", X_test_reduced)

(10000, 2400)


In [14]:
X_test = np.load("D:\\Unicamp\\MC886\\Git\\T2\\images_test_reduced.npy")
y_test = np.loadtxt("D:\\Unicamp\\MC886\\Git\\T2\\test\\labels")

Now we run the models on our test data, and see how well they perform.

In [15]:
def test_onevsall(classifiers, data, nbr_classes=10):
    predictions = np.empty((data.shape[0],nbr_classes))
    for i in range(nbr_classes):
        predictor = classifiers[i]
        # put the predicted values by each classifier for the whole data (shape (nbr_elements,nbr_classes))
        predictions[:, i] = predictor.predict(data)
    # return the indice of highest element (so where 1 is predicted)
    # if more than one classifier returned 1 for that sample, first encountered is kept
    print(predictions.shape)
    pred_indices = np.argmax(predictions, axis=1)
    return pred_indices

In [16]:
train_predictions = test_onevsall(classifiers, X_reduced)

(50000, 10)


In [17]:
test_predictions = test_onevsall(classifiers, X_test_reduced, 10)

(10000, 10)


In [18]:
# now that we have our predictions, we'll calculate the percentage of right answers
print(np.mean(y == train_predictions))
print(np.mean(y_test == test_predictions))

0.2683
0.115


On our train data we do fairly well (26,8%), but on the test data we have very bad results, only 11,5% which is only slightly better than just choosing the same class everytime (10%).

We'll hence try a more powerful model and build a Softmax regression.

We'll now create a function to create layers of neurons

In [27]:
import tensorflow as tf

def neuron_layer(X, n_neurons, name, activation=None):
    with tf.name_scope(name):
        n_inputs = int(X.get_shape()[1])
        stddev = 2 / np.sqrt(n_inputs)
        init = tf.truncated_normal((n_inputs, n_neurons), stddev=stddev)
        W = tf.Variable(init, name="kernel")
        b = tf.Variable(tf.zeros([n_neurons]), name="bias")
        Z = tf.matmul(X,W) + b
        
        if activation is not None:
            return activation(Z)
        else:
            return Z

We'll now proceed to create the layers

First we'll create 1 layer and analyze it's output

Notice that we'll already leave the 2nd hidden layer already setted.
Also, a good way to choose the number of neurons on each layer is to remember a funnel in order to narrow and filter through each layer

In [None]:
X = tf.placeholder(tf.float32, [None, 2400])
Y = tf.placeholder(tf.float32, None)
n_inputs = 2400
n_hidden1 = 1000
n_hidden2 = 100
n_outputs = 10

with tf.name_scope("dnn"):
    hidden1 = neuron_layer(X, n_hidden1, name="hidden1")

In [26]:
import tensorflow as tf
# placeholder for our data, shape (nbr_samples, 2400)
x = tf.placeholder(tf.float32, [None, 2400])
# our weights W (2400, 10) and biases b for each class (10)
W = tf.Variable(tf.zeros([2400, 10]), tf.float32)
b = tf.Variable(tf.zeros([10], tf.float32))
# executing Wx + b 
y = tf.matmul(x, W) + b

# now to define our cost f'n (cross entropy) and do softmax, then get the computed losses
total_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(_sentinel=None, labels=y, logits=y_test))
# choose our optimization algo (gradient descent) minimizing our cost
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(total_loss)

ValueError: No gradients provided for any variable, check your graph for ops that do not support gradients, between variables ["<tf.Variable 'Variable:0' shape=(2400, 10) dtype=float32_ref>", "<tf.Variable 'Variable_1:0' shape=(10,) dtype=float64_ref>", "<tf.Variable 'Variable_2:0' shape=(2400, 10) dtype=float32_ref>", "<tf.Variable 'Variable_3:0' shape=(10,) dtype=float32_ref>", "<tf.Variable 'Variable_4:0' shape=(2400, 10) dtype=float32_ref>", "<tf.Variable 'Variable_5:0' shape=(10,) dtype=float32_ref>", "<tf.Variable 'Variable_6:0' shape=(2400, 10) dtype=float32_ref>", "<tf.Variable 'Variable_7:0' shape=(10,) dtype=float32_ref>", "<tf.Variable 'Variable_8:0' shape=(2400, 10) dtype=float32_ref>", "<tf.Variable 'Variable_9:0' shape=(10,) dtype=float32_ref>", "<tf.Variable 'Variable_10:0' shape=(2400, 10) dtype=float32_ref>", "<tf.Variable 'Variable_11:0' shape=(10,) dtype=float32_ref>"] and loss Tensor("Mean:0", shape=(), dtype=float64).