In [1]:
%matplotlib inline

# Load the modules
import pickle
import math

import numpy as np
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt

# Reload the dataset splits that were saved to disk.
# NOTE(review): pickle.load can run arbitrary code from the file; only load a
# pickle that comes from a trusted source.
pickle_file = 'notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    pickle_data = pickle.load(f)

# Unpack every split into its own module-level name.
train_features, train_labels = pickle_data['train_dataset'], pickle_data['train_labels']
valid_features, valid_labels = pickle_data['valid_dataset'], pickle_data['valid_labels']
test_features, test_labels = pickle_data['test_dataset'], pickle_data['test_labels']
del pickle_data  # Drop the container dict; the arrays stay bound above

print('Data and modules loaded.')

2025-11-03 08:22:46.863188: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762154566.908829   15797 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762154566.922576   15797 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762154566.960404   15797 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1762154566.960450   15797 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1762154566.960454   15797 computation_placer.cc:177] computation placer alr

Data and modules loaded.


In [2]:
# Data has been normalized

# Hyperparameters
learning_rate = 0.008
num_output = 10
epochs = 100
batch_size = 500

# Number of mini-batches per epoch: sample count divided by batch size,
# rounded up to a whole number of batches.
total_samples = len(train_features)
steps_per_epoch = math.ceil(total_samples / batch_size)

In [3]:
# Cleaning Data
# Labels are used exactly as stored in the pickle (no dtype conversion here).
train_data = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

# Cast the evaluation splits to float32. The validation array is written back
# to `valid_features`, which is the name the training loop evaluates on;
# previously the cast result was only bound to `val_features` and never used.
# `val_features` is kept as an alias for any code that relied on that name.
valid_features = np.array(valid_features, np.float32)
test_features = np.array(test_features, np.float32)
val_features = valid_features

# Data has been normalized

I0000 00:00:1762154580.821260   15797 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6096 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2070 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5


In [4]:
# Input pipeline: shuffle first, then repeat, so that one pass over the data
# is fully emitted before the next pass starts. Repeating before shuffling
# lets the shuffle buffer mix elements from consecutive passes, so a sample
# can be drawn again before other samples have been seen.
# NOTE: this rebinds `train_data`; re-running this cell without re-running the
# cell that builds the dataset would stack a second shuffle/repeat/batch.
train_data = (
    train_data
    .shuffle(5000)
    .repeat()
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [5]:
# Input dimensionality: the size of axis 1 of the training feature array.
features = train_features.shape[1]

In [6]:
# Trainable parameters of the linear classifier. `name` is passed to
# tf.Variable itself (it was previously passed to the initializer op by a
# misplaced parenthesis), so the variables carry the intended names.
W = tf.Variable(tf.random.normal([features, num_output]), name="weights1")
B = tf.Variable(tf.zeros([num_output]), name="bias1")

# Stochastic gradient descent with the configured learning rate.
optimizer = tf.optimizers.SGD(learning_rate)

# Model
class MyModel(tf.Module):
    """Softmax regression over the module-level parameters ``W`` and ``B``.

    The parameters are globals rather than attributes of this module, so the
    module object itself holds no variables.
    """

    def __call__(self, X):
        """Return the softmax of ``X @ W + B`` for the input batch ``X``."""
        scores = tf.matmul(X, W) + B
        return tf.nn.softmax(scores)

def cross_entropy(y_pred, y_true):
    """Mean cross-entropy between predicted probabilities and target labels.

    Args:
        y_pred: Predicted probabilities; clipped to [1e-9, 1] before the log.
        y_true: Targets multiplied element-wise with the log-probabilities
            (the labels were one-hot encoded before being pickled, so no
            ``tf.one_hot`` conversion is applied here).

    Returns:
        Scalar tensor: the per-sample losses (summed over axis 1) averaged
        over the batch.
    """
    # Keep probabilities away from zero so the logarithm stays finite.
    safe_probs = tf.clip_by_value(y_pred, 1e-9, 1.)
    per_sample_loss = -tf.reduce_sum(y_true * tf.math.log(safe_probs), axis=1)
    return tf.reduce_mean(per_sample_loss)

def accuracy(y_pred, y_true):
    """Fraction of samples whose arg-max prediction equals the label.

    ``y_true`` may be given as class indices or as a 2-D array with more than
    one column, in which case it is reduced to indices with arg-max first.
    """
    labels_are_2d = len(y_true.shape) > 1 and y_true.shape[1] > 1
    if labels_are_2d:
        y_true = tf.argmax(y_true, axis=1)

    predicted = tf.argmax(y_pred, 1)
    expected = tf.cast(y_true, tf.int64)
    hits = tf.cast(tf.equal(predicted, expected), tf.float32)
    return tf.reduce_mean(hits)

def run_optimizer(X, Y):
    """Run one optimizer step on ``W`` and ``B`` for the batch ``(X, Y)``.

    The loss is the cross-entropy of the model output against ``Y``. The
    parameters are updated through the module-level ``optimizer``; nothing is
    returned.
    """
    params = [W, B]
    with tf.GradientTape() as tape:
        # `model` returns softmax probabilities, which cross_entropy consumes.
        probs = model(X)
        loss = cross_entropy(probs, Y)

    grads = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(grads, params))

def batch_data(X, Y, batch_size):
    """Yield consecutive ``(batch_X, batch_Y)`` slices of ``X`` and ``Y``.

    Args:
        X: Sliceable sequence of features; its length sets the number of batches.
        Y: Sliceable sequence of labels, sliced with the same offsets as ``X``.
        batch_size: Maximum number of samples per batch; must be positive.

    Yields:
        Tuples ``(X[start:end], Y[start:end])`` covering the data in order.
        The final batch is shorter when ``len(X)`` is not a multiple of
        ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised when iteration
            starts, since this is a generator).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    sample_size = len(X)
    # range() already advances in strides of batch_size, so each value is the
    # start offset of a batch and must not be multiplied by batch_size again.
    for start in range(0, sample_size, batch_size):
        end = start + batch_size
        yield X[start:end], Y[start:end]

model = MyModel()

# MyModel holds no variables of its own (it reads the module-level W and B),
# so tracking only `model` would save checkpoints without the trained
# parameters. W and B are therefore registered with the checkpoint explicitly.
checkpoint = tf.train.Checkpoint(model=model, weights=W, bias=B)
checkpoint.save("./checkpoints/mymodel")  # Snapshot of the initial parameters
# Keeps only the three most recent checkpoints in ./checkpoints.
manager = tf.train.CheckpointManager(checkpoint, "./checkpoints", max_to_keep=3)

In [7]:
# Training: each epoch runs `steps_per_epoch` mini-batch updates, then
# computes metrics, saves a checkpoint and prints a summary line.
for epoch in range(1, epochs + 1):
    epoch_batches = train_data.take(steps_per_epoch)
    for step, (batch_X, batch_Y) in enumerate(epoch_batches, start=1):
        run_optimizer(batch_X, batch_Y)

    # Training metrics are measured on the last mini-batch of the epoch only.
    train_pred = model(batch_X)
    train_loss = cross_entropy(train_pred, batch_Y)
    train_acc = accuracy(train_pred, batch_Y)

    # Validation metrics are measured on the whole validation split.
    val_pred = model(valid_features)
    val_loss = cross_entropy(val_pred, valid_labels)
    val_acc = accuracy(val_pred, valid_labels)

    manager.save()

    print(f"Epoch: {epoch}, train_loss {train_loss}, train_accuracy {train_acc}, val_loss {val_loss}, val_accuracy {val_acc}")

2025-11-03 08:23:08.869182: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 1, train_loss 12.522168159484863, train_accuracy 0.1720000058412552, val_loss 12.631797790527344, val_accuracy 0.17319999635219574


2025-11-03 08:23:14.286171: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 2, train_loss 9.753608703613281, train_accuracy 0.23999999463558197, val_loss 10.080270767211914, val_accuracy 0.21306666731834412
Epoch: 3, train_loss 7.980541229248047, train_accuracy 0.27399998903274536, val_loss 7.923323154449463, val_accuracy 0.27266666293144226


2025-11-03 08:23:25.583201: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 4, train_loss 6.340033054351807, train_accuracy 0.3659999966621399, val_loss 6.648892402648926, val_accuracy 0.3574666678905487
Epoch: 5, train_loss 5.933877468109131, train_accuracy 0.44200000166893005, val_loss 5.985538005828857, val_accuracy 0.4365333318710327
Epoch: 6, train_loss 5.2399067878723145, train_accuracy 0.49399998784065247, val_loss 5.596011638641357, val_accuracy 0.4763999879360199
Epoch: 7, train_loss 5.588754177093506, train_accuracy 0.5019999742507935, val_loss 5.330770015716553, val_accuracy 0.5063999891281128


2025-11-03 08:23:41.802731: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 8, train_loss 5.226409435272217, train_accuracy 0.5180000066757202, val_loss 5.133976459503174, val_accuracy 0.5306666493415833
Epoch: 9, train_loss 5.433060169219971, train_accuracy 0.5120000243186951, val_loss 4.976741790771484, val_accuracy 0.5455999970436096
Epoch: 10, train_loss 4.473424911499023, train_accuracy 0.5740000009536743, val_loss 4.840924263000488, val_accuracy 0.5580000281333923
Epoch: 11, train_loss 5.08305549621582, train_accuracy 0.5619999766349792, val_loss 4.700588226318359, val_accuracy 0.5678666830062866
Epoch: 12, train_loss 3.4731485843658447, train_accuracy 0.628000020980835, val_loss 4.48259162902832, val_accuracy 0.5767999887466431
Epoch: 13, train_loss 3.8564810752868652, train_accuracy 0.578000009059906, val_loss 3.893706798553467, val_accuracy 0.5734666585922241
Epoch: 14, train_loss 3.2322206497192383, train_accuracy 0.5879999995231628, val_loss 3.4596240520477295, val_accuracy 0.5899999737739563
Epoch: 15, train_loss 2.8741748332977295, train_ac

2025-11-03 08:24:18.794506: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 16, train_loss 2.864553928375244, train_accuracy 0.6539999842643738, val_loss 3.0723369121551514, val_accuracy 0.6285333037376404
Epoch: 17, train_loss 2.5251049995422363, train_accuracy 0.6700000166893005, val_loss 2.9617342948913574, val_accuracy 0.6386666893959045
Epoch: 18, train_loss 2.6259777545928955, train_accuracy 0.6779999732971191, val_loss 2.8771626949310303, val_accuracy 0.6481333374977112
Epoch: 19, train_loss 2.806623697280884, train_accuracy 0.6639999747276306, val_loss 2.807987928390503, val_accuracy 0.6570666432380676
Epoch: 20, train_loss 2.811281442642212, train_accuracy 0.6899999976158142, val_loss 2.7500667572021484, val_accuracy 0.6634666919708252
Epoch: 21, train_loss 2.2527706623077393, train_accuracy 0.7120000123977661, val_loss 2.7000057697296143, val_accuracy 0.6701333522796631
Epoch: 22, train_loss 2.593998432159424, train_accuracy 0.6840000152587891, val_loss 2.654911994934082, val_accuracy 0.6751999855041504
Epoch: 23, train_loss 2.5726823806762695

2025-11-03 08:25:36.942251: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 32, train_loss 2.405752182006836, train_accuracy 0.6880000233650208, val_loss 2.366525411605835, val_accuracy 0.7078666687011719
Epoch: 33, train_loss 2.4867794513702393, train_accuracy 0.7020000219345093, val_loss 2.3459396362304688, val_accuracy 0.7097333073616028
Epoch: 34, train_loss 2.425861358642578, train_accuracy 0.7099999785423279, val_loss 2.326657295227051, val_accuracy 0.7116000056266785
Epoch: 35, train_loss 1.8550870418548584, train_accuracy 0.7639999985694885, val_loss 2.3084332942962646, val_accuracy 0.7134666442871094
Epoch: 36, train_loss 2.468632698059082, train_accuracy 0.7200000286102295, val_loss 2.2906339168548584, val_accuracy 0.713866651058197
Epoch: 37, train_loss 1.9633485078811646, train_accuracy 0.75, val_loss 2.2739338874816895, val_accuracy 0.7161333560943604
Epoch: 38, train_loss 2.1409718990325928, train_accuracy 0.7080000042915344, val_loss 2.2576684951782227, val_accuracy 0.7177333235740662
Epoch: 39, train_loss 2.3442511558532715, train_accura

2025-11-03 08:28:15.113735: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch: 64, train_loss 1.8490206003189087, train_accuracy 0.7519999742507935, val_loss 1.9679304361343384, val_accuracy 0.7411999702453613
Epoch: 65, train_loss 1.715474009513855, train_accuracy 0.7480000257492065, val_loss 1.9599409103393555, val_accuracy 0.7414666414260864
Epoch: 66, train_loss 1.7911757230758667, train_accuracy 0.7419999837875366, val_loss 1.9521111249923706, val_accuracy 0.7423999905586243
Epoch: 67, train_loss 1.7309987545013428, train_accuracy 0.7459999918937683, val_loss 1.9445120096206665, val_accuracy 0.7432000041007996
Epoch: 68, train_loss 2.1737499237060547, train_accuracy 0.7120000123977661, val_loss 1.936683177947998, val_accuracy 0.7440000176429749
Epoch: 69, train_loss 2.0930395126342773, train_accuracy 0.7480000257492065, val_loss 1.9295439720153809, val_accuracy 0.7442666888237
Epoch: 70, train_loss 1.998889446258545, train_accuracy 0.7419999837875366, val_loss 1.9221898317337036, val_accuracy 0.7442666888237
Epoch: 71, train_loss 1.6590121984481812, t

In [10]:
# Final evaluation: loss and accuracy of the model on the test split.
test_pred = model(test_features)
test_loss, test_acc = cross_entropy(test_pred, test_labels), accuracy(test_pred, test_labels)
print(f"Test_loss {test_loss}, Test_accuracy {test_acc}")

Test_loss 1.0555355548858643, Test_accuracy 0.8327999711036682
