# The Fashion MNIST Dataset

### Loading the data

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from PIL import Image
from helpers import display_dataframe # This is a custom function I wrote to display the images within the pandas dataframe.

In [2]:
# Loading the data: load_data() returns two (images, labels) pairs, unpacked here as the
# training split (Xtr, ytr) and the test split (Xte, yte).
(Xtr, ytr), (Xte, yte) = tf.keras.datasets.fashion_mnist.load_data()

In [3]:
# Class names for the integer targets, listed in label order so that a name's
# position in the list is the label id it stands for (0 -> 'Top', ..., 9 -> 'Boot').
label_dict = dict(enumerate([
    'Top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Boot',
]))

def generate_dataframe(X, y, label_dict=label_dict):
    """Build a DataFrame with one row per (image, label) pair.

    Parameters
    ----------
    X : iterable of numpy arrays
        Image arrays; each one is converted with ``Image.fromarray`` and
        also stored flattened via ``ravel()``.
    y : iterable
        Labels, paired positionally with ``X``.
    label_dict : mapping, optional
        Maps each raw label to the value stored in the ``target`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``image`` (PIL image), ``array`` (flattened pixels) and
        ``target`` (label after mapping through ``label_dict``).
    """
    records = []
    for pixels, label in zip(X, y):
        records.append({
            'image': Image.fromarray(pixels),
            'array': pixels.ravel(),
            'target': label,
        })

    frame = pd.DataFrame(records)
    # Swap the raw labels for their mapped names.
    frame['target'] = frame['target'].map(label_dict)
    return frame

# Build the image/array/target frame from the training split.
data = generate_dataframe(X=Xtr, y=ytr)

In [4]:
# Preview the first rows of the frame (head() shows five by default).
data.head()

Unnamed: 0,array,image,target
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",<PIL.Image.Image image mode=L size=28x28 at 0x...,Boot
1,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 41, 188, 103, 5...",<PIL.Image.Image image mode=L size=28x28 at 0x...,Top
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 118, 24, 0, 0,...",<PIL.Image.Image image mode=L size=28x28 at 0x...,Top
3,"[0, 0, 0, 0, 0, 0, 0, 0, 33, 96, 175, 156, 64,...",<PIL.Image.Image image mode=L size=28x28 at 0x...,Dress
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",<PIL.Image.Image image mode=L size=28x28 at 0x...,Top


In [5]:
# The helper function display_dataframe (imported from helpers) is meant to show the images
# in the 'image' column inline, instead of the PIL object repr that data.head() prints.
display_dataframe(data.head())

Unnamed: 0,array,image,target
0,"A numpy array of shape (784,) and dtype uint8",,Boot
1,"A numpy array of shape (784,) and dtype uint8",,Top
2,"A numpy array of shape (784,) and dtype uint8",,Top
3,"A numpy array of shape (784,) and dtype uint8",,Dress
4,"A numpy array of shape (784,) and dtype uint8",,Top


In [6]:
# (number of rows, number of columns) of the full frame; one row per image.
data.shape

(60000, 3)

# Generating a "Simple" version of this dataset

In [7]:
from sklearn.model_selection import train_test_split

# 60,000 images are a few too many for most computers, so let's shrink that down a bit.
# The goal is to train on 4,000 images and test on 1,000.
N_SAMPLES = 5000   # images drawn from the full frame
TEST_SIZE = 0.2    # fraction of the sample held out for testing
SEED = 1           # shared seed so both the sample and the split are reproducible

data_small = data.sample(N_SAMPLES, random_state=SEED).reset_index(drop=True)

# Stack the flattened images directly into an (n_images, n_pixels) array: one row per image,
# one column per pixel. This gives the same values and dtype as building a pixel-by-image
# frame column by column and transposing it, without the intermediate frame.
X = pd.DataFrame(np.stack([pixels.ravel() for pixels in data_small['array']]))
y = data_small['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)

# Finding the baseline

In [8]:
# First (and pretty much foremost) find the baseline: the share of each class in the test set.
# The largest share is the accuracy obtained by always predicting the most common class.
y_test.value_counts(normalize=True)

Trouser     0.110
Shirt       0.109
Sandal      0.108
Sneaker     0.106
Bag         0.102
Dress       0.101
Coat        0.097
Pullover    0.094
Boot        0.092
Top         0.081
Name: target, dtype: float64

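### The baseline is 11%
Always predicting the most common class in the test set (Trouser) is correct 11% of the time, so any accuracy below 11% is worse than that trivial guess. Uniform random guessing over the 10 classes would score about 10%, so even a one-in-five correct prediction represents roughly a 100% improvement over random.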

# Basic Model

- This model simply uses a 3-layer decision tree to predict on the images


In [9]:
import warnings
# Silence every warning from here on, presumably to keep the cell outputs uncluttered.
# NOTE(review): this is a blanket filter, so it also hides deprecation and numerical
# warnings from the libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
    
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline


# A depth-3 decision tree fitted directly on the pixel columns.
# random_state is fixed (to the same seed, 1, that the sampling and the train/test split use):
# scikit-learn permutes the features at every split, so when several splits are equally good
# the chosen one, and therefore the score, could otherwise change from run to run.
model = Pipeline([('dtc', DecisionTreeClassifier(max_depth=3, random_state=1))])

In [10]:
# fit() returns the fitted pipeline, so training and scoring chain into a single
# expression; the cell output is the accuracy on the held-out test set.
model.fit(X_train, y_train).score(X_test, y_test)

0.477

# Not so basic Model

- This model first scales the data so that each of the 784 pixels' brightness has mean 0 and standard deviation 1
- It then reduces the 784 dimensional data to 20 dimensions by using linear PCA
- Lastly it implements a 3-layer decision tree

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize each pixel, project onto 20 principal components, then fit a depth-3 tree.
# Both stochastic steps are seeded (with 1, the seed already used for sampling and splitting):
# with the default svd_solver='auto', PCA may pick the randomized solver for data of this
# size, and the tree breaks ties between equally good splits at random.
model = Pipeline([('scaler', StandardScaler()),
                  ('pca', PCA(n_components=20, random_state=1)),
                  ('dtc', DecisionTreeClassifier(max_depth=3, random_state=1))])

In [12]:
# Train the scaler -> PCA -> tree pipeline and report its test-set accuracy in one
# chained expression (fit() hands back the fitted pipeline).
model.fit(X_train, y_train).score(X_test, y_test)

0.483

# A Non-basic Model

- This model first scales the data so that each of the 784 pixels' brightness has mean 0 and standard deviation 1
- It then reduces the 784 dimensional data to 20 dimensions by using linear PCA
- It then expands these 20 features into 1,771 (i.e. $\binom{20+3}{3}$, counting the constant bias term) by considering all polynomial combinations up to degree 3
- It then reduces the 1,771-dimensional data to 20 dimensions by using linear PCA
- Lastly it implements a 3-layer decision tree

In [13]:
from sklearn.preprocessing import PolynomialFeatures

# Standardize -> PCA to 20 components -> degree-3 polynomial expansion -> PCA back to 20 -> depth-3 tree.
# Every stochastic step is seeded (with 1, the seed already used for sampling and splitting):
# with the default svd_solver='auto', PCA may pick the randomized solver for data of this
# size, and the tree breaks ties between equally good splits at random.
model = Pipeline([('scaler', StandardScaler()),
                  ('pca1', PCA(n_components=20, random_state=1)),
                  ('poly', PolynomialFeatures(degree=3)),
                  ('pca2', PCA(n_components=20, random_state=1)),
                  ('dtc', DecisionTreeClassifier(max_depth=3, random_state=1))])

In [14]:
# Fit the full polynomial pipeline on the training split, then score it on the test
# split; chaining works because fit() returns the pipeline itself.
model.fit(X_train, y_train).score(X_test, y_test)

0.542

# Your model

In [16]:
# here