In [1]:
import tensorflow
import pandas as pd
import numpy as np
np.random.seed(1212)
import keras
from keras.models import Model
from keras.layers import *
from keras import optimizers
from tensorflow.keras.utils import to_categorical

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df_test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_features = df_train.iloc[:,1:785]
df_label = df_train.iloc[:,0]

X_test = df_test.iloc[:,0:784]

print(X_test.shape)

(28000, 784)


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_cv, y_train, y_cv = train_test_split(df_features, df_label, test_size = 0.2, random_state = 1212)
# The (cv) is the training and testing *labels*
X_train = X_train.to_numpy().reshape(33600,784)
X_cv = X_cv.to_numpy().reshape(8400,784)
X_test = X_test.to_numpy().reshape(28000,784)

In [8]:
print((min(X_train[1]), max(X_train[1]))) # Min and max values of the pixels in the training set

(0, 255)


In [9]:
# Normalizing the features of the training data
X_train = X_train.astype("float32") 
X_cv = X_cv.astype("float32") 
X_test = X_test.astype("float32")
X_train /= 255 
X_cv /= 255 
X_test /= 255

# Convert labels to (OneHot Encoded)
num_digits = 10
y_train = tensorflow.keras.utils.to_categorical(y_train, num_digits)
y_cv = tensorflow.keras.utils.to_categorical(y_cv, num_digits)
# We want to encode the (y) object into a number of classes to be divided into (num_digits)
# Basically, you want to represent the (y) object into a numpy array and divide it into labels

In [10]:
# Printing two examples of labels after conversion
print(y_train[0])  # The number two  #[] is the index position
print(y_train[3])  # The number seven

[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]


In [11]:
# Input parameters for our functional model
n_input = 784  # Number of features for our model
n_hidden_1 = 300  # Number of nodes per layer
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 200
num_digits = 10   # Number of outputs

In [13]:
# Building the model (we will build several and compare outputs to find the best model)
Inp = Input(shape = (784,))  # There are 784 inputs or nodes to the model
x = Dense(n_hidden_1, activation = "relu", name = "Hidden_layer_1")(Inp) # 300 nodes and connected to input layer
x = Dense(n_hidden_2, activation = "relu", name = "Hidden_layer_2")(x) # 100 nodes and connected to previous layer
x = Dense(n_hidden_3, activation = "relu", name = "Hidden_layer_3")(x) # 100 nodes and connected to previous layer
x = Dense(n_hidden_4, activation = "relu", name = "Hidden_layer_4")(x) # 200 nodes and connected to previous layer
output = Dense(num_digits, activation = "softmax", name = "Output_Layer")(x) # ten outputs  and connected to previous layer
# Softmax does the sigmoid function for each layer and all output probabilities should add up to 1

In [15]:
# Our model has six layers, one input, four hidden, and one output layer
model = Model(Inp, output)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 784)]             0         
                                                                 
 Hidden_layer_1 (Dense)      (None, 300)               235500    
                                                                 
 Hidden_layer_2 (Dense)      (None, 100)               30100     
                                                                 
 Hidden_layer_3 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_layer_4 (Dense)      (None, 200)               20200     
                                                                 
 Output_Layer (Dense)        (None, 10)                2010      
                                                                 
Total params: 297,910
Trainable params: 297,910
Non-trainab

In [19]:
# Inserting the hyperparameters
learning_rate = .1
training_epochs = 20  # Number of iterations to be run
batch_size = 100
sgd = tensorflow.keras.optimizers.SGD(lr = learning_rate) # Stochastic gradient descent(SGD)

In [20]:
# Stochastic gradient descent is our optimizing methodology
model.compile(loss = "categorical_crossentropy",
             optimizer = "sgd",
             metrics = ["accuracy"])

In [22]:
history1 = model.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = training_epochs,
                    verbose = 2,
                    validation_data = (X_cv, y_cv))

Epoch 1/20
336/336 - 17s - loss: 1.8762 - accuracy: 0.4937 - val_loss: 1.0044 - val_accuracy: 0.7692 - 17s/epoch - 50ms/step
Epoch 2/20
336/336 - 3s - loss: 0.6343 - accuracy: 0.8374 - val_loss: 0.4577 - val_accuracy: 0.8755 - 3s/epoch - 8ms/step
Epoch 3/20
336/336 - 3s - loss: 0.4092 - accuracy: 0.8820 - val_loss: 0.3667 - val_accuracy: 0.8937 - 3s/epoch - 8ms/step
Epoch 4/20
336/336 - 3s - loss: 0.3402 - accuracy: 0.9021 - val_loss: 0.3168 - val_accuracy: 0.9100 - 3s/epoch - 8ms/step
Epoch 5/20
336/336 - 3s - loss: 0.3009 - accuracy: 0.9127 - val_loss: 0.3024 - val_accuracy: 0.9111 - 3s/epoch - 8ms/step
Epoch 6/20
336/336 - 3s - loss: 0.2733 - accuracy: 0.9200 - val_loss: 0.2798 - val_accuracy: 0.9190 - 3s/epoch - 8ms/step
Epoch 7/20
336/336 - 3s - loss: 0.2512 - accuracy: 0.9263 - val_loss: 0.2572 - val_accuracy: 0.9236 - 3s/epoch - 8ms/step
Epoch 8/20
336/336 - 3s - loss: 0.2331 - accuracy: 0.9311 - val_loss: 0.2398 - val_accuracy: 0.9324 - 3s/epoch - 8ms/step
Epoch 9/20
336/336 - 

In [23]:
# Let's build another model (model itself is the same)
Inp = Input(shape = (784,)) 
x = Dense(n_hidden_1, activation = "relu", name = "Hidden_layer_1")(Inp) 
x = Dense(n_hidden_2, activation = "relu", name = "Hidden_layer_2")(x) 
x = Dense(n_hidden_3, activation = "relu", name = "Hidden_layer_3")(x) 
x = Dense(n_hidden_4, activation = "relu", name = "Hidden_layer_4")(x)
output = Dense(num_digits, activation = "softmax", name = "Output_Layer")(x)

In [28]:
# This time we'll use a different optimizer to run the model(Adam)
adam = tensorflow.keras.optimizers.Adam(lr = learning_rate)

  super(Adam, self).__init__(name, **kwargs)


In [29]:
# Adam is our optimizing methodology
model2 = Model(Inp, output)
model2.compile(loss = "categorical_crossentropy",
             optimizer = "adam",
             metrics = ["accuracy"])

In [30]:
history2 = model2.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = training_epochs,
                    verbose = 2,
                    validation_data = (X_cv, y_cv))

Epoch 1/20
336/336 - 4s - loss: 0.3352 - accuracy: 0.9004 - val_loss: 0.1667 - val_accuracy: 0.9513 - 4s/epoch - 13ms/step
Epoch 2/20
336/336 - 3s - loss: 0.1201 - accuracy: 0.9625 - val_loss: 0.1059 - val_accuracy: 0.9674 - 3s/epoch - 8ms/step
Epoch 3/20
336/336 - 3s - loss: 0.0841 - accuracy: 0.9730 - val_loss: 0.1218 - val_accuracy: 0.9649 - 3s/epoch - 8ms/step
Epoch 4/20
336/336 - 3s - loss: 0.0569 - accuracy: 0.9819 - val_loss: 0.0995 - val_accuracy: 0.9710 - 3s/epoch - 8ms/step
Epoch 5/20
336/336 - 3s - loss: 0.0437 - accuracy: 0.9855 - val_loss: 0.0841 - val_accuracy: 0.9752 - 3s/epoch - 8ms/step
Epoch 6/20
336/336 - 3s - loss: 0.0352 - accuracy: 0.9885 - val_loss: 0.1058 - val_accuracy: 0.9695 - 3s/epoch - 8ms/step
Epoch 7/20
336/336 - 3s - loss: 0.0296 - accuracy: 0.9900 - val_loss: 0.0986 - val_accuracy: 0.9739 - 3s/epoch - 8ms/step
Epoch 8/20
336/336 - 3s - loss: 0.0293 - accuracy: 0.9910 - val_loss: 0.0899 - val_accuracy: 0.9774 - 3s/epoch - 8ms/step
Epoch 9/20
336/336 - 3s

In [31]:
# Model 2a (Different hyperparameters)
Inp = Input(shape = (784,)) 
x = Dense(n_hidden_1, activation = "relu", name = "Hidden_layer_1")(Inp) 
x = Dense(n_hidden_2, activation = "relu", name = "Hidden_layer_2")(x) 
x = Dense(n_hidden_3, activation = "relu", name = "Hidden_layer_3")(x) 
x = Dense(n_hidden_4, activation = "relu", name = "Hidden_layer_4")(x)
output = Dense(num_digits, activation = "softmax", name = "Output_Layer")(x)

In [32]:
# Change the learning rate (hyperparameter)
learning_rate = 0.01
adam = tensorflow.keras.optimizers.Adam(learning_rate = learning_rate)

In [37]:
model2a = Model(Inp, output)
model2a.compile(loss = "categorical_crossentropy",
             optimizer = "adam",
             metrics = ["accuracy"])

In [34]:
history2a = model2a.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = training_epochs,
                    verbose = 2,
                    validation_data = (X_cv, y_cv))

Epoch 1/20
336/336 - 4s - loss: 0.3473 - accuracy: 0.8965 - val_loss: 0.1480 - val_accuracy: 0.9554 - 4s/epoch - 11ms/step
Epoch 2/20
336/336 - 3s - loss: 0.1217 - accuracy: 0.9612 - val_loss: 0.1143 - val_accuracy: 0.9636 - 3s/epoch - 9ms/step
Epoch 3/20
336/336 - 3s - loss: 0.0788 - accuracy: 0.9757 - val_loss: 0.1240 - val_accuracy: 0.9627 - 3s/epoch - 9ms/step
Epoch 4/20
336/336 - 3s - loss: 0.0588 - accuracy: 0.9815 - val_loss: 0.1354 - val_accuracy: 0.9605 - 3s/epoch - 8ms/step
Epoch 5/20
336/336 - 3s - loss: 0.0440 - accuracy: 0.9857 - val_loss: 0.1313 - val_accuracy: 0.9614 - 3s/epoch - 8ms/step
Epoch 6/20
336/336 - 3s - loss: 0.0333 - accuracy: 0.9893 - val_loss: 0.1012 - val_accuracy: 0.9733 - 3s/epoch - 8ms/step
Epoch 7/20
336/336 - 3s - loss: 0.0321 - accuracy: 0.9899 - val_loss: 0.0940 - val_accuracy: 0.9733 - 3s/epoch - 8ms/step
Epoch 8/20
336/336 - 3s - loss: 0.0240 - accuracy: 0.9921 - val_loss: 0.1079 - val_accuracy: 0.9730 - 3s/epoch - 8ms/step
Epoch 9/20
336/336 - 3s

In [35]:
# Model  2b
Inp = Input(shape = (784,)) 
x = Dense(n_hidden_1, activation = "relu", name = "Hidden_layer_1")(Inp) 
x = Dense(n_hidden_2, activation = "relu", name = "Hidden_layer_2")(x) 
x = Dense(n_hidden_3, activation = "relu", name = "Hidden_layer_3")(x) 
x = Dense(n_hidden_4, activation = "relu", name = "Hidden_layer_4")(x)
output = Dense(num_digits, activation = "softmax", name = "Output_Layer")(x)

In [36]:
learning_rate = 0.5
adam = tensorflow.keras.optimizers.Adam(learning_rate = learning_rate)

In [38]:
model2b = Model(Inp, output)
model2b.compile(loss = "categorical_crossentropy",
             optimizer = "adam",
             metrics = ["accuracy"])

In [39]:
history2b = model2b.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = training_epochs,
                    verbose = 2,
                    validation_data = (X_cv, y_cv))

Epoch 1/20
336/336 - 4s - loss: 0.3356 - accuracy: 0.8994 - val_loss: 0.1526 - val_accuracy: 0.9537 - 4s/epoch - 11ms/step
Epoch 2/20
336/336 - 3s - loss: 0.1236 - accuracy: 0.9617 - val_loss: 0.1150 - val_accuracy: 0.9655 - 3s/epoch - 8ms/step
Epoch 3/20
336/336 - 3s - loss: 0.0781 - accuracy: 0.9758 - val_loss: 0.1002 - val_accuracy: 0.9694 - 3s/epoch - 8ms/step
Epoch 4/20
336/336 - 3s - loss: 0.0586 - accuracy: 0.9816 - val_loss: 0.0913 - val_accuracy: 0.9727 - 3s/epoch - 8ms/step
Epoch 5/20
336/336 - 3s - loss: 0.0451 - accuracy: 0.9858 - val_loss: 0.1018 - val_accuracy: 0.9683 - 3s/epoch - 10ms/step
Epoch 6/20
336/336 - 3s - loss: 0.0359 - accuracy: 0.9881 - val_loss: 0.0863 - val_accuracy: 0.9758 - 3s/epoch - 10ms/step
Epoch 7/20
336/336 - 3s - loss: 0.0273 - accuracy: 0.9914 - val_loss: 0.1066 - val_accuracy: 0.9729 - 3s/epoch - 9ms/step
Epoch 8/20
336/336 - 3s - loss: 0.0209 - accuracy: 0.9934 - val_loss: 0.1045 - val_accuracy: 0.9752 - 3s/epoch - 9ms/step
Epoch 9/20
336/336 - 

In [40]:
# Model 3 (Add an extra layer)
n_input = 784 
n_hidden_1 = 300  
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 100
n_hidden_5 = 200
num_digits = 10 

In [41]:
Inp = Input(shape = (784,)) 
x = Dense(n_hidden_1, activation = "relu", name = "Hidden_layer_1")(Inp) 
x = Dense(n_hidden_2, activation = "relu", name = "Hidden_layer_2")(x) 
x = Dense(n_hidden_3, activation = "relu", name = "Hidden_layer_3")(x) 
x = Dense(n_hidden_4, activation = "relu", name = "Hidden_layer_4")(x)
x = Dense(n_hidden_5, activation = "relu", name = "Hidden_layer_5")(x)
output = Dense(num_digits, activation = "softmax", name = "Output_Layer")(x)

In [42]:
# Our model has seven layers, one input, five hidden, and one output layer
model3 = Model(Inp, output)
model3.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 784)]             0         
                                                                 
 Hidden_layer_1 (Dense)      (None, 300)               235500    
                                                                 
 Hidden_layer_2 (Dense)      (None, 100)               30100     
                                                                 
 Hidden_layer_3 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_layer_4 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_layer_5 (Dense)      (None, 200)               20200     
                                                                 
 Output_Layer (Dense)        (None, 10)                2010

In [43]:
learning_rate = 0.01
adam = tensorflow.keras.optimizers.Adam(learning_rate = learning_rate)

In [44]:
model3.compile(loss = "categorical_crossentropy",
             optimizer = "adam",
             metrics = ["accuracy"])

In [45]:
history3 = model3.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = training_epochs,
                    verbose = 2,
                    validation_data = (X_cv, y_cv))

Epoch 1/20
336/336 - 4s - loss: 0.3490 - accuracy: 0.8924 - val_loss: 0.1561 - val_accuracy: 0.9537 - 4s/epoch - 12ms/step
Epoch 2/20
336/336 - 3s - loss: 0.1254 - accuracy: 0.9613 - val_loss: 0.1189 - val_accuracy: 0.9645 - 3s/epoch - 8ms/step
Epoch 3/20
336/336 - 3s - loss: 0.0857 - accuracy: 0.9749 - val_loss: 0.1325 - val_accuracy: 0.9598 - 3s/epoch - 8ms/step
Epoch 4/20
336/336 - 3s - loss: 0.0613 - accuracy: 0.9817 - val_loss: 0.1125 - val_accuracy: 0.9665 - 3s/epoch - 9ms/step
Epoch 5/20
336/336 - 3s - loss: 0.0538 - accuracy: 0.9830 - val_loss: 0.1076 - val_accuracy: 0.9708 - 3s/epoch - 9ms/step
Epoch 6/20
336/336 - 3s - loss: 0.0425 - accuracy: 0.9862 - val_loss: 0.0962 - val_accuracy: 0.9744 - 3s/epoch - 8ms/step
Epoch 7/20
336/336 - 3s - loss: 0.0345 - accuracy: 0.9886 - val_loss: 0.0980 - val_accuracy: 0.9737 - 3s/epoch - 8ms/step
Epoch 8/20
336/336 - 3s - loss: 0.0299 - accuracy: 0.9905 - val_loss: 0.1199 - val_accuracy: 0.9680 - 3s/epoch - 8ms/step
Epoch 9/20
336/336 - 3s

In [46]:
# Building another model, but add the dropout parameter
n_input = 784  
n_hidden_1 = 300
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 200
num_digits = 10 

In [47]:
# Adding dropout to the model(randomly drops nodes with each iteration)
Inp = Input(shape = (784,)) 
x = Dense(n_hidden_1, activation = "relu", name = "Hidden_layer_1")(Inp)
x = Dropout(0.3)(x)
x = Dense(n_hidden_2, activation = "relu", name = "Hidden_layer_2")(x) 
x = Dropout(0.3)(x)
x = Dense(n_hidden_3, activation = "relu", name = "Hidden_layer_3")(x) 
x = Dropout(0.3)(x)
x = Dense(n_hidden_4, activation = "relu", name = "Hidden_layer_4")(x)
output = Dense(num_digits, activation = "softmax", name = "Output_Layer")(x)

In [48]:
model4 = Model(Inp, output)
model4.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 784)]             0         
                                                                 
 Hidden_layer_1 (Dense)      (None, 300)               235500    
                                                                 
 dropout (Dropout)           (None, 300)               0         
                                                                 
 Hidden_layer_2 (Dense)      (None, 100)               30100     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 Hidden_layer_3 (Dense)      (None, 100)               10100     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0   

In [49]:
model4.compile(loss = "categorical_crossentropy",
             optimizer = "adam",
             metrics = ["accuracy"])

In [50]:
history4 = model4.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = training_epochs,
                    verbose = 2,
                    validation_data = (X_cv, y_cv))

Epoch 1/20
336/336 - 4s - loss: 0.5740 - accuracy: 0.8163 - val_loss: 0.1814 - val_accuracy: 0.9485 - 4s/epoch - 12ms/step
Epoch 2/20
336/336 - 3s - loss: 0.2312 - accuracy: 0.9324 - val_loss: 0.1360 - val_accuracy: 0.9583 - 3s/epoch - 9ms/step
Epoch 3/20
336/336 - 3s - loss: 0.1733 - accuracy: 0.9503 - val_loss: 0.1194 - val_accuracy: 0.9658 - 3s/epoch - 9ms/step
Epoch 4/20
336/336 - 3s - loss: 0.1413 - accuracy: 0.9578 - val_loss: 0.1056 - val_accuracy: 0.9689 - 3s/epoch - 9ms/step
Epoch 5/20
336/336 - 3s - loss: 0.1266 - accuracy: 0.9630 - val_loss: 0.0942 - val_accuracy: 0.9726 - 3s/epoch - 9ms/step
Epoch 6/20
336/336 - 4s - loss: 0.1084 - accuracy: 0.9682 - val_loss: 0.0956 - val_accuracy: 0.9736 - 4s/epoch - 11ms/step
Epoch 7/20
336/336 - 4s - loss: 0.0995 - accuracy: 0.9716 - val_loss: 0.0908 - val_accuracy: 0.9746 - 4s/epoch - 11ms/step
Epoch 8/20
336/336 - 4s - loss: 0.0892 - accuracy: 0.9727 - val_loss: 0.0977 - val_accuracy: 0.9752 - 4s/epoch - 10ms/step
Epoch 9/20
336/336 -

In [51]:
# Making and showing the predictions on the test(validation) set. 
test_pred = pd.DataFrame(model4.predict(X_test, batch_size = 200))
test_pred = pd.DataFrame(test_pred.idxmax(axis = 1))
test_pred.index.name = "ImageID"
test_pred = test_pred.rename(columns = {0: "Label"}).reset_index()
test_pred["ImageID"] = test_pred["ImageID"] + 1

test_pred.head()

Unnamed: 0,ImageID,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3


In [52]:
test_pred.to_csv("mnist-submission.csv", index = False)

In [53]:
data = pd.read_csv("mnist-submission.csv")

In [54]:
data

Unnamed: 0,ImageID,Label
0,1,2
1,2,0
2,3,9
3,4,9
4,5,3
...,...,...
27995,27996,9
27996,27997,7
27997,27998,3
27998,27999,9
