In [48]:
#####################################################################################################
### Creator: Ricky Blake Manus
### Objective: Analyze the MNIST dataset to accurately classify and predict the value of
### handwritten digits (0-9) from a 28x28 pixel source image
###
### MNIST Dataset & Testing Data obtained from Kaggle.com
### Data and Competition can be found at https://www.kaggle.com/c/digit-recognizer/overview
####################################################################################################

In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from matplotlib import pyplot as plt
import tensorflow as tf

%matplotlib inline

In [50]:
### Notebook options and optimizations
# Raise the pandas display cap to 500 columns before any frame is shown.
pd.options.display.max_columns = 500

In [None]:
### Load the training and testing CSV files
train_data = pd.read_csv('data-sets/train.csv')
test_data = pd.read_csv('data-sets/test.csv')

# Detach the 'label' column from the training frame and keep it as a numpy array
label_column = train_data.pop('label')
labels = np.array(label_column)

train_data.head(5)

### Layout of the training data:
###   - the raw CSV holds 42000 rows and 785 columns
###   - 1 'label' column with the digit each row represents (now held in `labels`)
###   - 784 pixel columns, each a grayscale value from 0-255 (inclusive);
###     these are the only columns left in train_data after the pop above
###   - every row is one 28x28 pixel image (784 pixels)

In [None]:
### The first thing I wish to try is a heatmap of the data,
###    so that I can get a better visualization of it.
###############################################################

# Column-wise totals: one summed value per pixel column
sum_data = train_data.sum(axis=0)

# Convert the totals to a numpy array and lay them out
# in the 28x28 pixel grid in a single step
heat_data = sum_data.to_numpy().reshape(28, 28)

ax = sns.heatmap(data=heat_data, cmap='mako')

In [None]:
# One-hot encode the digit labels.
# LabelEncoder maps each distinct label to an integer class index; the [:, None]
# indexing then adds the column axis, because OneHotEncoder expects 2-D input.
# The ndim guard keeps this cell safe to re-run: labels that are already 2-D
# (i.e. already encoded by a previous run) are left untouched instead of raising.
if np.ndim(labels) == 1:
    label_ids = LabelEncoder().fit_transform(labels)[:, None]
    # .toarray() yields a plain ndarray; .todense() would yield the legacy
    # numpy.matrix type, whose use NumPy discourages.
    labels = OneHotEncoder().fit_transform(label_ids).toarray()

In [None]:
### Convert the pixel data to a scaled float32 numpy array, then
###   reshape it into one 28x28 single-channel image per row: (n, 28, 28, 1)
# Keep the fitted scaler in a variable so that the identical transform can be
# applied to test_data later via pixel_scaler.transform(...).
# NOTE(review): the scaler is fitted on all training rows, including the ones
# later held out for validation — confirm this is acceptable.
pixel_scaler = StandardScaler()
train_data = pixel_scaler.fit_transform(np.float32(train_data.values))
train_data = train_data.reshape(-1, 28, 28, 1)

In [47]:
### Split the data into training and validation data with an 80-20 split.
### The final 20% of rows are held out for validation (no shuffling).
VALIDATION_FRACTION = 0.2

# Data and labels must be row-aligned for the parallel slices below to match up
assert len(train_data) == len(labels), 'train_data and labels differ in length'

n_validation = round(len(train_data) * VALIDATION_FRACTION)
# Split on a positive index: a negative slice such as [:-n] would silently
# return an empty training set if n were ever 0.
split_at = len(train_data) - n_validation

tr_data, v_data = train_data[:split_at], train_data[split_at:]
tr_labels, v_labels = labels[:split_at], labels[split_at:]
print(f'tr_data {tr_data.shape} |||| v_data {v_data.shape}'
      f' |||| train_data {train_data.shape} |||| test_data {test_data.shape}')

tr_data (33600, 784) |||| v_data (8400, 784) |||| train_data (42000, 784) |||| test_data (28000, 784)
