In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Read the two arrays (inputs X, labels Y) from their .npy files.
X = np.load('x_digits.npy')
Y = np.load('y_digits.npy')

# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Give every sample the shape 129 x 71 x 1 (explicit trailing axis of size 1).
X_train, X_test = (split.reshape(-1, 129, 71, 1) for split in (X_train, X_test))



Okay, the data is now split into train and test sets. The validation set will be taken from the training data during fitting (via `validation_split`).

Let's start with a very simple fully connected NN (neural network). We will have just three hidden layers with ReLU activation and the Adam optimizer. For all of the NNs we will use the 'sparse_categorical_crossentropy' loss function, as this is a classification problem. The NN will output 10 probabilities, one for each digit the given input could be. More about this later.

In [2]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, which is not enough to make Keras runs repeatable.
tf.keras.utils.set_random_seed(42)

# Baseline: fully connected network with three hidden layers of 100 ReLU units.
model = tf.keras.Sequential()
# NOTE(review): X_train is reshaped to (N, 129, 71, 1) in the first cell while the
# declared input shape is (129, 71) — confirm that relying on Keras to reconcile
# the trailing axis is intended.
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (100, 100, 100):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
# Output layer: 10 softmax units, one per class.
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# validation_split=0.2 makes Keras hold out 20% of the training arrays for validation.
history = model.fit(X_train, Y_train, epochs = 20, validation_split = 0.2)

Epoch 1/20
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.2723 - loss: 5.5723 - val_accuracy: 0.4243 - val_loss: 2.2670
Epoch 2/20
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.4500 - loss: 1.9334 - val_accuracy: 0.5590 - val_loss: 1.5598
Epoch 3/20
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.5459 - loss: 1.5884 - val_accuracy: 0.6034 - val_loss: 1.3059
Epoch 4/20
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.5876 - loss: 1.3821 - val_accuracy: 0.6388 - val_loss: 1.1997
Epoch 5/20
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.6557 - loss: 1.0954 - val_accuracy: 0.6101 - val_loss: 1.2124
Epoch 6/20
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6820 - loss: 0.9983 - val_accuracy: 0.6651 - val_loss: 1.0661
Epoch 7/20
[1m713/713[0m 

74% for the validation, nice. However, we can see that the difference in accuracy and loss between train and validation is significant which may suggest overfitting. Let's use AdamW as optimizer to apply some built-in regularization.
           
(Comment from ChatGPT about the usage of AdamW:      
Better generalization: The direct weight decay helps prevent weights from growing too large during training. By penalizing large weights, the model is encouraged to find simpler solutions, improving generalization to the validation/test sets and reducing overfitting.

Reduces overfitting impact: Proper weight decay leads to better regularization, helping the model avoid fitting to noise or irrelevant patterns in the training data.)

In [3]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, so runs with different weight decays would not start
# from comparable conditions.
tf.keras.utils.set_random_seed(42)

# Same three-hidden-layer network as before; only the optimizer changes.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (100, 100, 100):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

# AdamW = Adam with decoupled weight decay; 0.005 is the first value tried.
optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.005)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2)

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.2802 - loss: 7.4380 - val_accuracy: 0.4408 - val_loss: 2.0133
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.4399 - loss: 2.0635 - val_accuracy: 0.4731 - val_loss: 1.8141
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5234 - loss: 1.5710 - val_accuracy: 0.5734 - val_loss: 1.4581
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5783 - loss: 1.4064 - val_accuracy: 0.5983 - val_loss: 1.3549
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.6087 - loss: 1.3209 - val_accuracy: 0.6315 - val_loss: 1.1808
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.6397 - loss: 1.1375 - val_accuracy: 0.6262 - val_loss: 1.1963
Epoch 7/30
[1m713/713[0m 

Well, it did slightly better, but it is still overfitting. Let's try doubling the weight decay.

In [4]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, so runs with different weight decays would not start
# from comparable conditions.
tf.keras.utils.set_random_seed(42)

# Three hidden layers of 100 ReLU units, unchanged from the previous experiment.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (100, 100, 100):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

# Weight decay doubled from 0.005 to 0.01.
optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.01)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2)

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.2476 - loss: 5.8731 - val_accuracy: 0.3750 - val_loss: 2.3753
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.3607 - loss: 2.3506 - val_accuracy: 0.3973 - val_loss: 1.8316
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.4198 - loss: 1.8850 - val_accuracy: 0.4526 - val_loss: 1.7538
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.4711 - loss: 1.6564 - val_accuracy: 0.4831 - val_loss: 1.6498
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.4909 - loss: 1.5914 - val_accuracy: 0.5155 - val_loss: 1.5205
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 27ms/step - accuracy: 0.5246 - loss: 1.4292 - val_accuracy: 0.5318 - val_loss: 1.3765
Epoch 7/30
[1m713/713[0m

Okay, this has slightly improved the relation between loss and val_loss, but the relation between accuracy and val_accuracy is still the same. The only difference is that now we might be underfitting, as the overall accuracy has dropped by about 10%. Let's try meeting halfway and set the weight decay to 0.0075.

In [5]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, so runs with different weight decays would not start
# from comparable conditions.
tf.keras.utils.set_random_seed(42)

# Three hidden layers of 100 ReLU units, unchanged from the previous experiments.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (100, 100, 100):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

# Weight decay halfway between the two values tried so far (0.005 and 0.01).
optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.0075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2)

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.2411 - loss: 5.9462 - val_accuracy: 0.4296 - val_loss: 1.9344
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.4228 - loss: 1.9505 - val_accuracy: 0.4461 - val_loss: 1.7504
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.4497 - loss: 1.8743 - val_accuracy: 0.4450 - val_loss: 1.8845
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.4838 - loss: 1.6717 - val_accuracy: 0.4504 - val_loss: 1.7301
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5024 - loss: 1.5194 - val_accuracy: 0.5075 - val_loss: 1.5381
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.5300 - loss: 1.3941 - val_accuracy: 0.5483 - val_loss: 1.3225
Epoch 7/30
[1m713/713[0m 

Let's compare to see which weight decay value performs the best:   

* 0: accuracy: 0.8227 - loss: 0.5262 - val_accuracy: 0.7481 - val_loss: 0.8269
* 0.005: accuracy: 0.8347 - loss: 0.4720 - val_accuracy: 0.7693 - val_loss: 0.7910
* 0.0075: accuracy: 0.7592 - loss: 0.6894 - val_accuracy: 0.6823 - val_loss: 1.0159
* 0.01: accuracy: 0.7401 - loss: 0.7571 - val_accuracy: 0.6764 - val_loss: 1.0275

We can clearly see that 0.005 performs the best: it gives the highest accuracy and the lowest loss values. Let's go with this one then. Let's add some callbacks to see if we can reduce the overfitting further: early stopping and a learning-rate scheduler. Let's also rework the layers (the next model uses four hidden layers arranged as a funnel).

In [None]:
# Stop training once val_loss has gone 5 epochs without improving.
# NOTE(review): the training cells further down build their own callbacks
# (patience=4, restore_best_weights=True) right before fitting, replacing these.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)
# Halve the learning rate after 2 epochs without improvement of the monitored metric.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.5,
    patience=2,
)

In [8]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, which is not enough to make Keras runs repeatable.
tf.keras.utils.set_random_seed(42)

# Four hidden layers narrowing like a funnel: 200 -> 125 -> 75 -> 40.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (200, 125, 75, 40):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.0075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Stop after 4 epochs without val_loss improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# Halve the learning rate after 2 epochs without improvement of the monitored metric.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience= 2)

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2, callbacks=[early_stopping_cb, lr_scheduler])

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 14ms/step - accuracy: 0.2557 - loss: 6.1764 - val_accuracy: 0.4599 - val_loss: 1.6256 - learning_rate: 0.0010
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.4882 - loss: 1.5545 - val_accuracy: 0.5581 - val_loss: 1.3270 - learning_rate: 0.0010
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.6032 - loss: 1.1675 - val_accuracy: 0.6457 - val_loss: 1.0175 - learning_rate: 0.0010
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.6658 - loss: 0.9630 - val_accuracy: 0.6820 - val_loss: 0.9267 - learning_rate: 0.0010
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.7091 - loss: 0.8782 - val_accuracy: 0.7101 - val_loss: 0.8627 - learning_rate: 0.0010
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

Cool! Adding callbacks improved the situation as well as adding some extra layers with number of neurons formed in a funnel. We have actually changed the weight decay to 0.0075 as it performs better with more layers now. Let's see if we can reduce the number of neurons.

In [9]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, which is not enough to make Keras runs repeatable.
tf.keras.utils.set_random_seed(42)

# Smaller funnel than the previous experiment: 150 -> 100 -> 50 -> 25.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (150, 100, 50, 25):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.0075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Stop after 4 epochs without val_loss improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# Halve the learning rate after 2 epochs without improvement of the monitored metric.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience= 2)

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2, callbacks=[early_stopping_cb, lr_scheduler])

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.2786 - loss: 5.3940 - val_accuracy: 0.4106 - val_loss: 1.8056 - learning_rate: 0.0010
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.5060 - loss: 1.4617 - val_accuracy: 0.5887 - val_loss: 1.2743 - learning_rate: 0.0010
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.6222 - loss: 1.1256 - val_accuracy: 0.6022 - val_loss: 1.1517 - learning_rate: 0.0010
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.6765 - loss: 0.9525 - val_accuracy: 0.6573 - val_loss: 1.0550 - learning_rate: 0.0010
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.6949 - loss: 0.9280 - val_accuracy: 0.7108 - val_loss: 0.8545 - learning_rate: 0.0010
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

Okay, this didn't work out well. Let's go the other way: add two more layers and increase the number of neurons.

In [10]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, which is not enough to make Keras runs repeatable.
tf.keras.utils.set_random_seed(42)

# Larger network: six hidden layers narrowing from 400 down to 50 units.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (400, 300, 200, 150, 100, 50):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.0075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Stop after 4 epochs without val_loss improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# Halve the learning rate after 2 epochs without improvement of the monitored metric.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience= 2)

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2, callbacks=[early_stopping_cb, lr_scheduler])

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.2368 - loss: 6.5989 - val_accuracy: 0.5385 - val_loss: 1.4101 - learning_rate: 0.0010
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 17ms/step - accuracy: 0.5508 - loss: 1.3667 - val_accuracy: 0.6718 - val_loss: 1.0056 - learning_rate: 0.0010
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 18ms/step - accuracy: 0.6528 - loss: 1.0452 - val_accuracy: 0.7095 - val_loss: 0.8799 - learning_rate: 0.0010
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 18ms/step - accuracy: 0.7201 - loss: 0.8362 - val_accuracy: 0.7279 - val_loss: 0.8394 - learning_rate: 0.0010
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 19ms/step - accuracy: 0.7496 - loss: 0.7459 - val_accuracy: 0.7557 - val_loss: 0.7209 - learning_rate: 0.0010
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

Now that's definitely overfitting. Let's reduce the layers and number of neurons to be slightly above what we had two steps before.

In [11]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, which is not enough to make Keras runs repeatable.
tf.keras.utils.set_random_seed(42)

# Five hidden layers narrowing from 250 down to 50 units.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (250, 200, 150, 100, 50):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.0075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Stop after 4 epochs without val_loss improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# Halve the learning rate after 2 epochs without improvement of the monitored metric.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience= 2)

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2, callbacks=[early_stopping_cb, lr_scheduler])

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.2486 - loss: 4.8609 - val_accuracy: 0.4829 - val_loss: 1.5324 - learning_rate: 0.0010
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.5277 - loss: 1.4197 - val_accuracy: 0.6173 - val_loss: 1.1616 - learning_rate: 0.0010
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.6406 - loss: 1.0830 - val_accuracy: 0.7051 - val_loss: 0.8908 - learning_rate: 0.0010
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.7044 - loss: 0.8897 - val_accuracy: 0.7399 - val_loss: 0.8219 - learning_rate: 0.0010
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.7423 - loss: 0.7698 - val_accuracy: 0.7637 - val_loss: 0.7699 - learning_rate: 0.0010
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

Still not good, let's try early stopping on val_accuracy.

In [12]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only; without full seeding this run cannot be compared
# fairly with the previous one, which uses the identical network.
tf.keras.utils.set_random_seed(42)

# Five hidden layers narrowing from 250 down to 50 units.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
for units in (250, 200, 150, 100, 50):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.0075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Early stopping now watches val_accuracy instead of val_loss.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=4, restore_best_weights=True)
# The scheduler is left on its default monitored metric; only early stopping changes.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience= 2)

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2, callbacks=[early_stopping_cb, lr_scheduler])

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.2793 - loss: 6.5954 - val_accuracy: 0.5103 - val_loss: 1.6185 - learning_rate: 0.0010
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.5402 - loss: 1.5098 - val_accuracy: 0.6494 - val_loss: 1.0709 - learning_rate: 0.0010
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.6444 - loss: 1.0674 - val_accuracy: 0.6801 - val_loss: 0.9679 - learning_rate: 0.0010
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.7074 - loss: 0.8786 - val_accuracy: 0.6946 - val_loss: 0.9500 - learning_rate: 0.0010
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.7354 - loss: 0.7921 - val_accuracy: 0.7395 - val_loss: 0.7604 - learning_rate: 0.0010
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

Also doesn't work, let's add BatchNormalization layer to the NN with 4 hidden layers.

(ChatGPT: Batch Normalization: Add batch normalization layers to stabilize and accelerate training, reduce overfitting, and help with generalization. )

In [13]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, which is not enough to make Keras runs repeatable.
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
# The first three hidden layers are each followed by batch normalization.
for units in (200, 125, 75):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
    model.add(tf.keras.layers.BatchNormalization())
# The last hidden layer feeds the output layer directly, without normalization.
model.add(tf.keras.layers.Dense(40, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.0075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Stop after 4 epochs without val_loss improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# Halve the learning rate after 2 epochs without improvement of the monitored metric.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience= 2)

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2, callbacks=[early_stopping_cb, lr_scheduler])

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.3785 - loss: 1.7941 - val_accuracy: 0.5839 - val_loss: 1.1941 - learning_rate: 0.0010
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.6607 - loss: 0.9796 - val_accuracy: 0.7200 - val_loss: 0.8295 - learning_rate: 0.0010
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.7357 - loss: 0.7563 - val_accuracy: 0.7181 - val_loss: 0.8450 - learning_rate: 0.0010
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.7841 - loss: 0.6305 - val_accuracy: 0.7437 - val_loss: 0.7733 - learning_rate: 0.0010
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.8107 - loss: 0.5510 - val_accuracy: 0.7557 - val_loss: 0.7417 - learning_rate: 0.0010
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

Still a 12% difference, so it is overfitting; not satisfied. BatchNormalization did not work well. Let's try a Dropout layer.

In [21]:
# Seed Python's `random`, NumPy and TensorFlow in one call. `tf.random.set_seed`
# covers TensorFlow only, which is not enough to make Keras runs repeatable.
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape = [129, 71]))
model.add(tf.keras.layers.Flatten()) # Flattens a multidim input to 1D so it can be used for Dense layers
model.add(tf.keras.layers.Dense(200, activation = "relu", kernel_initializer="he_normal"))
# Dropout (rate 0.2) is applied only after the first hidden layer.
model.add(tf.keras.layers.Dropout(0.2))
for units in (125, 75, 40):
    model.add(tf.keras.layers.Dense(units, activation = "relu", kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation = "softmax"))

# NOTE(review): weight_decay is 0.075 here, ten times the 0.0075 used by the
# preceding cells — confirm this is intentional and not a typo; it changes
# together with the Dropout layer, so their effects cannot be told apart.
optimizer = tf.keras.optimizers.AdamW(learning_rate = 0.001, weight_decay=0.075)
model.compile(loss = "sparse_categorical_crossentropy", optimizer = optimizer, metrics = ["accuracy"])

# Stop after 4 epochs without val_loss improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# Halve the learning rate after 2 epochs without improvement of the monitored metric.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor = 0.5, patience= 2)

history = model.fit(X_train, Y_train, epochs = 30, validation_split = 0.2, callbacks=[early_stopping_cb, lr_scheduler])

Epoch 1/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.1010 - loss: 4.5751 - val_accuracy: 0.1256 - val_loss: 2.3335 - learning_rate: 0.0010
Epoch 2/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.1213 - loss: 2.2875 - val_accuracy: 0.1303 - val_loss: 2.2580 - learning_rate: 0.0010
Epoch 3/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.1254 - loss: 2.2586 - val_accuracy: 0.1584 - val_loss: 2.2084 - learning_rate: 0.0010
Epoch 4/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1460 - loss: 2.2249 - val_accuracy: 0.1870 - val_loss: 2.1275 - learning_rate: 0.0010
Epoch 5/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.1834 - loss: 2.1224 - val_accuracy: 0.2205 - val_loss: 2.0117 - learning_rate: 0.0010
Epoch 6/30
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

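YEAAAAH!! Randomly dropping neurons after the first hidden layer was the key to finding a model which doesn't overfit. Note that this cell also raises the weight decay from 0.0075 to 0.075, so the improvement cannot be attributed to dropout alone. The model runs longer now, but it performs better. This is our FINAL model for the fully connected NN.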

In [22]:
# model.evaluate returns [loss, accuracy] for the compiled model; report it
# first on the training arrays, then on the held-out test arrays.
train_metrics = model.evaluate(X_train, Y_train)
print("Loss + accuracy on train data: {}".format(train_metrics))

test_metrics = model.evaluate(X_test, Y_test)
print("Loss + accuracy on test data: {}".format(test_metrics))

[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9311 - loss: 0.2181
Loss + accuracy on train data: [0.2505785822868347, 0.9210286140441895]
[1m223/223[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8593 - loss: 0.4124
Loss + accuracy on test data: [0.425045371055603, 0.8575838208198547]
