# Imports And Setup

In [2]:
import pickle
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

In [3]:
# Locations of the pickled CIFAR files, relative to the notebook:
# five training batches, the test batch, and the label metadata
batch1_name, batch2_name, batch3_name, batch4_name, batch5_name = (
    "CIFAR_data/data_batch_1",
    "CIFAR_data/data_batch_2",
    "CIFAR_data/data_batch_3",
    "CIFAR_data/data_batch_4",
    "CIFAR_data/data_batch_5",
)
batchT_name, meta_name = "CIFAR_data/test_batch", "CIFAR_data/batches.meta"

# Data Loading And Cleaning

In [4]:
def unpickle(file):
    """Load one pickled data file and return the object stored in it.

    Relies on the notebook-level ``import pickle`` from the imports cell.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file to read (opened in binary mode).

    Returns
    -------
    object
        The unpickled object. ``encoding='bytes'`` is passed to the loader, so
        8-bit strings written by Python 2 (including dictionary keys) come
        back as ``bytes`` rather than ``str``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    pickle.UnpicklingError
        If the file content is not a valid pickle.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only call this on files from a trusted source.
    with open(file, 'rb') as fo:
        return pickle.load(fo, encoding='bytes')

In [5]:
# Read the five training batches and the test batch, in that order
batch1, batch2, batch3, batch4, batch5, batchT = (
    unpickle(path)
    for path in (
        batch1_name,
        batch2_name,
        batch3_name,
        batch4_name,
        batch5_name,
        batchT_name,
    )
)

In [6]:
# Peek at the first training batch. One output line per item, in this order:
# keys, batch label, first ten labels, label container type,
# first image's pixel row, data container type, first ten filenames
print(
    batch1.keys(),
    batch1[b'batch_label'],
    batch1[b'labels'][:10],
    type(batch1[b'labels']),
    batch1[b'data'][0],
    type(batch1[b'data']),
    batch1[b'filenames'][:10],
    sep="\n",
)

dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
b'training batch 1 of 5'
[6, 9, 9, 4, 1, 1, 2, 7, 8, 3]
<class 'list'>
[ 59  43  50 ... 140  84  72]
<class 'numpy.ndarray'>
[b'leptodactylus_pentadactylus_s_000004.png', b'camion_s_000148.png', b'tipper_truck_s_001250.png', b'american_elk_s_001521.png', b'station_wagon_s_000293.png', b'coupe_s_001735.png', b'cassowary_s_001300.png', b'cow_pony_s_001168.png', b'sea_boat_s_001584.png', b'tabby_s_001355.png']


In [7]:
# Mirror each batch's byte-keyed fields under plain string keys:
# the label list becomes an int8 numpy array, the pixel data (already an
# ndarray) is referenced as-is
for batch in (batch1, batch2, batch3, batch4, batch5, batchT):
    batch['labels'] = np.array(batch[b'labels'], dtype=np.int8)
    batch['data'] = batch[b'data']

In [8]:
# Spot-check the string-keyed copies: first ten labels, then the first image row
print(batch1['labels'][:10], batch1['data'][0], sep="\n")

[6 9 9 4 1 1 2 7 8 3]
[ 59  43  50 ... 140  84  72]


In [9]:
# Load in the metadata (the pickle file that meta_name points to);
# like the batches, its keys come back as bytes
meta = unpickle(meta_name)

In [10]:
# Visualize the metadata: the list of class names (stored as bytes).
# The position of a name in this list is used as its numeric class id
# by the lookup tables built from it.
meta[b'label_names']

[b'airplane',
 b'automobile',
 b'bird',
 b'cat',
 b'deer',
 b'dog',
 b'frog',
 b'horse',
 b'ship',
 b'truck']

In [11]:
# Convert the metadata to dictionaries: class id -> name and name -> class id.
# The names are bytes objects, so decode them instead of slicing the
# "b'...'" repr, which would mangle names containing quotes, backslashes
# or non-ASCII characters.
meta_numToLab = {i: name.decode("utf-8") for i, name in enumerate(meta[b'label_names'])}
meta_labToNum = {name.decode("utf-8"): i for i, name in enumerate(meta[b'label_names'])}

In [12]:
# Display the class id -> class name lookup
meta_numToLab

{0: 'airplane',
 1: 'automobile',
 2: 'bird',
 3: 'cat',
 4: 'deer',
 5: 'dog',
 6: 'frog',
 7: 'horse',
 8: 'ship',
 9: 'truck'}

In [13]:
# Display the class name -> class id lookup
meta_labToNum

{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [14]:
# Merge the five training batches (in order) and the test batch into a new
# nested dictionary: batchC[split]["Y"] holds the int8 labels and
# batchC[split]["x"] holds the uint8 pixel rows
batchC = {
    "train": {
        "Y": np.concatenate(
            (batch1["labels"], batch2["labels"], batch3["labels"], batch4["labels"], batch5["labels"]),
            dtype=np.int8,
        ),
        "x": np.concatenate(
            (batch1["data"], batch2["data"], batch3["data"], batch4["data"], batch5["data"]),
            dtype=np.uint8,
        ),
    },
    "test": {
        "Y": np.array(batchT["labels"], dtype=np.int8),
        "x": np.array(batchT["data"], dtype=np.uint8),
    },
}

In [15]:
# Sanity-check the merge: the first ten training rows must come from batch 1
# and the last ten from batch 5, for both labels and pixel data
assert all(
    np.array_equal(merged, source)
    for merged, source in (
        (batchC["train"]["Y"][:10], batch1["labels"][:10]),
        (batchC["train"]["Y"][-10:], batch5["labels"][-10:]),
        (batchC["train"]["x"][:10], batch1["data"][:10]),
        (batchC["train"]["x"][-10:], batch5["data"][-10:]),
    )
)

# Same check for the test split against the original test batch
assert all(
    np.array_equal(merged, source)
    for merged, source in (
        (batchC["test"]["Y"][:10], batchT["labels"][:10]),
        (batchC["test"]["Y"][-10:], batchT["labels"][-10:]),
        (batchC["test"]["x"][:10], batchT["data"][:10]),
        (batchC["test"]["x"][-10:], batchT["data"][-10:]),
    )
)

In [16]:
# Shapes of the combined label vectors and pixel matrices, one per line
print(
    f'Train Labels: {batchC["train"]["Y"].shape}',
    f'Train Data:   {batchC["train"]["x"].shape}',
    f'Test Labels:  {batchC["test"]["Y"].shape}',
    f'Test Data:    {batchC["test"]["x"].shape}',
    sep="\n",
)

Train Labels: (50000,)
Train Data:   (50000, 3072)
Test Labels:  (10000,)
Test Data:    (10000, 3072)


# SGDClassifier

In [22]:
# Create the classifier.
# random_state fixes the data shuffling done during SGD training so that a
# fresh "Restart & Run All" reproduces the same model and scores.
# NOTE(review): the inputs are unscaled uint8 pixel values; scikit-learn
# advises scaling features for SGD - consider standardizing them first.
sgdc = SGDClassifier(max_iter=1000, random_state=0)

In [None]:
# Train the classifier on the combined training split
# (50,000 rows of 3,072 pixel values each, with their class ids)
sgdc.fit(batchC["train"]["x"], batchC["train"]["Y"])

In [None]:
# Check the train and test accuracy (score() on a classifier is mean accuracy).
# The values are kept in variables and also printed, so the cell actually
# shows the result it is named for.
sgdc_train_acc = sgdc.score(batchC["train"]["x"], batchC["train"]["Y"])
sgdc_test_acc = sgdc.score(batchC["test"]["x"], batchC["test"]["Y"])
print(f'Train: {sgdc_train_acc}')
print(f'Test: {sgdc_test_acc}')

In [None]:
# Get a per-class report (precision / recall / F1) for the classification model.
# classification_report returns a multi-line string; it must be printed,
# because as a bare last expression the notebook shows its repr with
# literal "\n" escapes instead of a table.
# NOTE(review): this evaluates on the training split only - consider
# reporting on batchC["test"] as well.
preds = sgdc.predict(batchC["train"]["x"])
print(classification_report(batchC["train"]["Y"], preds))

# Linear Regression

In [16]:
# Create the model: a linear regression, reached through the `sk` package alias.
# NOTE(review): this is a regressor trained on integer class ids, so the ids
# are treated as ordered numeric targets - confirm that is intended.
LR = sk.linear_model.LinearRegression()

In [17]:
# Fit the model on the combined training split; fit() is the cell's last
# expression, so its return value is what the notebook displays
LR.fit(batchC["train"]["x"], batchC["train"]["Y"])

LinearRegression()

In [18]:
# Report score() on each split. For a regressor score() is R^2
# (coefficient of determination), not classification accuracy.
for split in ("train", "test"):
    print(f'{split.capitalize()}: {LR.score(batchC[split]["x"], batchC[split]["Y"])}')

Train: 0.15024781123171782
Test: 0.02634806717868443


# Decision Tree Regressor

In [16]:
# Two regression trees that differ only in their depth limit (3 and 5)
DCR1, DCR2 = (DecisionTreeRegressor(max_depth=depth) for depth in (3, 5))

In [17]:
# Fit the models on the combined training split.
# The second fit() is the cell's last expression, so only DCR2's repr is displayed.
DCR1.fit(batchC["train"]["x"], batchC["train"]["Y"])
DCR2.fit(batchC["train"]["x"], batchC["train"]["Y"])

DecisionTreeRegressor(max_depth=5)

In [20]:
# Report score() for each tree on each split. These are regressors, so the
# number is R^2 (coefficient of determination), not classification accuracy.
for number, tree in enumerate((DCR1, DCR2), start=1):
    for split in ("train", "test"):
        print(f'Model {number} {split.capitalize()}: {tree.score(batchC[split]["x"], batchC[split]["Y"])}')

Model 1 Train: 0.0678039144532292
Model 1 Test: 0.07334185391129788
Model 2 Train: 0.10136146644358612
Model 2 Test: 0.08654625896435797


# Random Forest Classifier

In [17]:
# Create the models: two forests that differ only in the number of trees.
# random_state fixes the bootstrap sampling and feature selection so that
# re-running the notebook reproduces the same forests and scores.
RFC1 = RandomForestClassifier(n_estimators=100, random_state=0)
RFC2 = RandomForestClassifier(n_estimators=200, random_state=0)

In [None]:
# Fit the models on the combined training split (first the 100-tree forest,
# then the 200-tree forest)
RFC1.fit(batchC["train"]["x"], batchC["train"]["Y"])
RFC2.fit(batchC["train"]["x"], batchC["train"]["Y"])

In [None]:
# Print train and test accuracy for each forest, one line per model/split
for number, forest in enumerate((RFC1, RFC2), start=1):
    for split in ("train", "test"):
        print(f'Model {number} {split.capitalize()}: {forest.score(batchC[split]["x"], batchC[split]["Y"])}')

# KNeighbors Classifier

In [16]:
# Four k-NN classifiers covering the grid n_neighbors in {5, 10} x leaf_size in {30, 100}
# (KNN1: 5/30, KNN2: 5/100, KNN3: 10/30, KNN4: 10/100).
# NOTE(review): per the scikit-learn docs leaf_size tunes tree build/query
# speed and memory, so it is not expected to change predictions - confirm.
KNN1, KNN2, KNN3, KNN4 = (
    KNeighborsClassifier(n_neighbors=k, leaf_size=leaf)
    for k in (5, 10)
    for leaf in (30, 100)
)

In [17]:
# Fit the models on the combined training split.
# The last fit() is the cell's final expression, so only KNN4's repr is displayed.
KNN1.fit(batchC["train"]["x"], batchC["train"]["Y"])
KNN2.fit(batchC["train"]["x"], batchC["train"]["Y"])
KNN3.fit(batchC["train"]["x"], batchC["train"]["Y"])
KNN4.fit(batchC["train"]["x"], batchC["train"]["Y"])

KNeighborsClassifier(leaf_size=100, n_neighbors=10)

In [None]:
# Print train and test accuracy for each k-NN model. Each line is printed as
# soon as its score is computed, so partial results stay visible if the
# cell is interrupted.
for number, knn in enumerate((KNN1, KNN2, KNN3, KNN4), start=1):
    for split in ("train", "test"):
        print(f'Model {number} {split.capitalize()}: {knn.score(batchC[split]["x"], batchC[split]["Y"])}')

Model 1 Train: 0.50534
Model 1 Test: 0.3398
Model 2 Train: 0.50534
Model 2 Test: 0.3398
Model 3 Train: 0.43236


# OvR Classifier

In [18]:
# Create the one-vs-rest model around an *unfitted* SVC.
# OneVsRestClassifier fits its own copies of the base estimator (one per
# class) when OvR.fit() is called. Pre-fitting SVC() on the full training
# set here therefore only cost time and still left OvR unfitted, so scoring
# it before calling OvR.fit() raises NotFittedError.
OvR = OneVsRestClassifier(SVC())

In [22]:
# Fit the model: trains the one-vs-rest wrapper itself on the combined
# training split. This must run before OvR.score(), otherwise scoring
# raises NotFittedError.
OvR.fit(batchC["train"]["x"], batchC["train"]["Y"])

OneVsRestClassifier(estimator=SVC())

In [19]:
# Print the one-vs-rest model's accuracy on the train split, then the test split
for split in ("train", "test"):
    print(f'Model {split.capitalize()}: {OvR.score(batchC[split]["x"], batchC[split]["Y"])}')

NotFittedError: This OneVsRestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.