# CIFAR - 10

In [None]:
import cifar10
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import GridSearchCV
import pandas as pd

## Loading Data

In [None]:
# Tell the cifar10 helper module which directory to use for the dataset files

cifar10.data_path = "data/CIFAR-10/"

In [None]:
# Download and unpack the dataset via the cifar10 helper
# (the helper's name suggests it skips work that is already done — confirm in cifar10.py)

cifar10.maybe_download_and_extract()

In [None]:
# Load the names of the classification classes and display them

class_names = cifar10.load_class_names()
class_names

In [None]:
# Load the training and the test data provided by the cifar10 helper.
# Each loader returns three values: the images, the class of every image
# (used later as the target vector) and the labels of every image.

images_train, cls_train, labels_train = cifar10.load_training_data()
images_test, cls_test, labels_test = cifar10.load_test_data()

## Exploratory Data Analysis

In [None]:
# Dimensions of the training image array
images_train.shape

The training data contains 50,000 images of 32 × 32 pixels each. Every pixel has a red, a green and a blue value, so each image consists of three 32 × 32 matrices.

In [None]:
# Look at the label entry of a single training image (index 4)
labels_train[4]

The labels (outputs) are one-hot encoded as arrays of length 10.

In [None]:
# Class frequency: count how many training images fall into each class
pd.DataFrame(cls_train).value_counts()

In [None]:
# Plot the first 32 training images to get a better understanding of the data.
# A 4 x 8 grid holds exactly 32 panels, so no part of the figure is left empty;
# the figure height is halved accordingly so each panel keeps the same size.
fig = plt.figure(figsize=(20, 10))
for i in range(32):
    # add_subplot(number of rows, number of columns, position starting from 1)
    ax = fig.add_subplot(4, 8, i + 1)
    # The images carry red/green/blue channels, and imshow ignores `cmap`
    # for RGB input, so no colour map is passed.
    ax.imshow(images_train[i])
plt.show()


## Feature Extraction

### We will treat each pixel value as a feature, so we reshape the three 32 × 32 matrices of every image into a single array of 3 × 1024 = 3072 elements, where each element is one feature

In [None]:
# Flatten every image into a single row of pixel-channel features.
# The sample count is read from the data and -1 lets NumPy infer the number
# of features per image, so nothing here depends on a hard-coded dataset size.
X_train = np.reshape(images_train, (len(images_train), -1))
Y_train = cls_train

X_test = np.reshape(images_test, (len(images_test), -1))

In [None]:
# Check the shape of the flattened training matrix
X_train.shape

In [None]:
# Check the shape of the flattened test matrix
X_test.shape

## Dimensionality reduction using PCA

In [None]:
# Fit a PCA without limiting the number of components, so that the variance
# explained by each component can be inspected in the next step
pca = PCA()
pca.fit(X_train)

### We will now find the number of features (principal components) to keep in order to retain 99 percent of the variance in the data

In [None]:
# Count how many leading principal components are needed before the
# accumulated explained variance reaches 99 percent of the total.
variances = pca.explained_variance_
total = sum(variances)
current = 0
k = 0
for variance in variances:
    # Stop as soon as the components taken so far cover 99 percent.
    if current / total >= 0.99:
        break
    # Otherwise take the next component and add its variance.
    current += variance
    k += 1
k

## Feature Scaling 

### We will now scale our data with MinMaxScaler so that no feature overpowers another simply because it has larger values

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature into the range 0 to 1.
# The scaler is fitted on the training data only; the same fitted scaler
# is then applied to the test data.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Now that we know how many features to keep and the data has been scaled, we can fit the final PCA.

In [None]:
# Fit the final PCA on the training data only, keeping k components,
# and project the training set onto them.
pca = PCA(n_components = k , whiten = True)
X_train = pca.fit_transform(X_train)

# Project the test set with the PCA that was fitted on the training set.
# Calling fit() here would re-fit the model on the test data and return the
# PCA object itself instead of the reduced test matrix.
X_test = pca.transform(X_test)

# Model Training

## Trying Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Logistic regression with C = 10, trained on the prepared training features
lr = LogisticRegression(C = 10)
lr.fit(X_train , Y_train)

In [None]:
# Use the test-set classes as ground truth and predict them with logistic regression
Y_test = cls_test
Y_pred = lr.predict(X_test)

In [None]:
# Print the classification report and the confusion matrix for the current Y_pred
print(classification_report(Y_test , Y_pred))
print(confusion_matrix(Y_test , Y_pred))

### We got satisfactory results with logistic regression so we will try our next algorithm

## Trying Random-Forest

In [None]:
from sklearn import ensemble
# Random forest created without arguments, i.e. with the library defaults
rf = ensemble.RandomForestClassifier()
rf.fit(X_train, Y_train)

In [None]:
# Score of the random forest on the test set
rf.score(X_test , Y_test)

In [None]:
# Random-forest predictions for the test set
Y_pred = rf.predict(X_test)

In [None]:
# Print the classification report and the confusion matrix for the current Y_pred
print(classification_report(Y_test , Y_pred))
print(confusion_matrix(Y_test , Y_pred))

### We got results similar to logistic regression, so we will skip hyper-parameter tuning here and try the next algorithm

## Trying KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# K-nearest-neighbours classifier that uses 4 neighbours
clf = KNeighborsClassifier(n_neighbors = 4)
clf.fit(X_train , Y_train)

In [None]:
# Score of the KNN classifier on the test set
clf.score(X_test , Y_test)

In [None]:
# Predict with the KNN classifier of this section (clf), not the random forest,
# so that the report printed next describes KNN.
Y_pred = clf.predict(X_test)

In [None]:
# Print the classification report and the confusion matrix for the current Y_pred
print(classification_report(Y_test , Y_pred))
print(confusion_matrix(Y_test , Y_pred))

## Trying Gaussian Naive Bayes 

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes created without arguments, i.e. with the library defaults
gb = GaussianNB()
gb.fit(X_train, Y_train)

In [None]:
# Score of the Gaussian Naive Bayes model on the test set
gb.score(X_test , Y_test)

In [None]:
# Predict with the Gaussian Naive Bayes model of this section (gb), not the
# random forest, so that the report printed next describes GaussianNB.
Y_pred = gb.predict(X_test)

In [None]:
# Print the classification report and the confusion matrix for the current Y_pred
print(classification_report(Y_test , Y_pred))
print(confusion_matrix(Y_test , Y_pred))

## Trying Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# NOTE(review): X_train at this point is the output of a whitened PCA, which
# presumably contains negative values; MultinomialNB is documented for
# non-negative (count-like) features — confirm this cell runs as intended.
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)

In [None]:
# Score of the Multinomial Naive Bayes model on the test set
mnb.score(X_test , Y_test)

In [None]:
# Predict with the Multinomial Naive Bayes model of this section (mnb), not the
# random forest, so that the report printed next describes MultinomialNB.
Y_pred = mnb.predict(X_test)

In [None]:
# Print the classification report and the confusion matrix for the current Y_pred
print(classification_report(Y_test , Y_pred))
print(confusion_matrix(Y_test , Y_pred))

## Trying Support Vector Machine (SVM)

In [None]:
from sklearn import svm
# Support vector classifier created without arguments, i.e. with the library defaults
sv = svm.SVC()
sv.fit(X_train , Y_train)

In [None]:
# Score of the default SVM on the test set
sv.score(X_test , Y_test)

## Hyper-Parameter Tuning

#### We got decent results with SVM, so we will look for the best model using grid search.

In [None]:
# Grid search over SVC hyper-parameters, run with 2 parallel jobs.
param_grid = {
    'C': [0.1, 1, 10, 100, 500],
    'gamma': [0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear', 'sigmoid'],
}
clf_temp = svm.SVC()
gs = GridSearchCV(estimator=clf_temp, param_grid=param_grid, n_jobs=2)
gs.fit(X_train, Y_train)

# Keep the best estimator found by the search.
t = gs.best_estimator_

In [None]:
# Final SVM, trained on X_train / Y_train like every other model in this
# notebook (the previously referenced X_new_train_temp / Y_train_temp are not
# defined anywhere in it).
# NOTE: gamma is a coefficient of the rbf/poly/sigmoid kernels and has no effect
# with kernel = 'linear'; it is kept as originally written.
# To use the grid-search winner instead: clf = svm.SVC(**gs.best_params_)
clf = svm.SVC(C = 100 , gamma = 0.005 , kernel = 'linear')
clf.fit(X_train , Y_train)

In [None]:
def convert(Y_pred):
    """Translate numeric CIFAR-10 class codes into their class names.

    Parameters
    ----------
    Y_pred : sequence of numbers
        Predicted class codes. Any length is accepted (including empty);
        the result always has one entry per input element.

    Returns
    -------
    numpy.ndarray
        Array of class-name strings, in the same order as ``Y_pred``.
        Codes 0-8 map to "airplane" ... "ship"; every other value maps to
        "truck", which keeps the fallback of the original if/elif chain.
    """
    names = (
        "airplane",
        "automobile",
        "bird",
        "cat",
        "deer",
        "dog",
        "frog",
        "horse",
        "ship",
        "truck",
    )
    fallback = names[-1]
    # Only codes 0-8 are looked up explicitly; anything else falls back to "truck".
    lookup = dict(enumerate(names[:-1]))
    # Fixed width of 10 characters fits the longest name ("automobile") and
    # keeps the dtype independent of which classes happen to be present.
    return np.array([lookup.get(code, fallback) for code in Y_pred], dtype="<U10")