In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init

class NeuralNetwork:
    """Fully connected sigmoid network trained with mini-batch SGD.

    Every layer, the output layer included, is an ``nn.Linear`` followed by an
    ``nn.Sigmoid``. Weights are Xavier-uniform initialised with the sigmoid
    gain and biases start at zero. The training objective is half of the
    summed squared error between the network output and the target.

    Attributes:
        noflayers: Number of entries in ``neuronsperlayer``.
        device: Torch device holding the parameters (always the CPU).
        net: The ``nn.Sequential`` stack of Linear/Sigmoid layers.
        loss_function: ``nn.MSELoss`` with ``reduction="sum"``.
        optimizer: Plain SGD (momentum 0) over the parameters of ``net``.
        loss_history: Average loss of every epoch of the most recent
            ``learn`` call (empty before the first call).
    """

    def __init__(self, neuronsperlayer, learning_rate):
        """Build the layer stack, initialise its weights and create the optimizer.

        Args:
            neuronsperlayer: Layer widths from input to output, e.g.
                ``[784, 128, 64, 10]``. At least two entries are required.
            learning_rate: Step size handed to ``optim.SGD``.

        Raises:
            ValueError: If fewer than two layer sizes are given.
        """
        super().__init__()
        neuronsperlayer = list(neuronsperlayer)
        if len(neuronsperlayer) < 2:
            # Without this check the failure surfaces later, inside the
            # optimizer, as an "empty parameter list" error.
            raise ValueError(
                "neuronsperlayer needs at least an input and an output size, "
                f"got {neuronsperlayer!r}"
            )

        self.noflayers = len(neuronsperlayer)
        self.device = torch.device("cpu")
        self.loss_history = []
        layers = []

        for i in range(self.noflayers - 1):
            layers.append(nn.Linear(neuronsperlayer[i], neuronsperlayer[i + 1]))
            layers.append(nn.Sigmoid())

        self.net = nn.Sequential(*layers).to(self.device)

        def init_weights(m):
            # Only Linear modules carry weights; Sigmoid modules are skipped.
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight, gain=init.calculate_gain("sigmoid"))
                init.zeros_(m.bias)

        self.net.apply(init_weights)

        self.loss_function = nn.MSELoss(reduction="sum")
        self.optimizer = optim.SGD(self.net.parameters(), lr=learning_rate, momentum=0)

    def forward(self, input_layer): #Forward Pass through Neural Network
        """Run ``input_layer`` through the network.

        Args:
            input_layer: Anything ``torch.as_tensor`` accepts; it is converted
                to a float32 tensor on ``self.device``.

        Returns:
            The output tensor of the last sigmoid layer.
        """
        input_layer = torch.as_tensor(input_layer, dtype=torch.float32, device=self.device)
        return self.net(input_layer)

    def loss(self, output_layer, target): #Loss Function
        """Return half of the summed squared error between output and target."""
        return 0.5 * self.loss_function(output_layer, target)

    def learn(self, training_data, target, epochs, batch_size, log_every=500): #Stochastic Gradient Descent
        """Train the network with shuffled mini-batches.

        The average loss of every epoch (summed loss divided by
        ``samples * outputs``) is stored in ``self.loss_history``.

        Args:
            training_data: Inputs, one sample per row.
            target: Targets as a 2-D array of shape (samples, outputs).
            epochs: Number of passes over the data.
            batch_size: Mini-batch size handed to the ``DataLoader``.
            log_every: Print the average loss on epoch 1 and on every
                ``log_every``-th epoch. ``0`` or ``None`` disables printing.

        Raises:
            ValueError: If ``target`` is not 2-D or ``training_data`` is empty.
        """
        X = torch.as_tensor(training_data, dtype=torch.float32, device=self.device)
        Y = torch.as_tensor(target, dtype=torch.float32, device=self.device)

        if Y.dim() != 2:
            raise ValueError(
                f"target must be 2-D (samples, outputs), got shape {tuple(Y.shape)}"
            )
        if X.numel() == 0:
            raise ValueError("training_data is empty")

        dataset = torch.utils.data.TensorDataset(X, Y)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Loop-invariant normaliser that turns a summed loss into a per-value average.
        nofvalues = len(dataset) * Y.size(1)
        self.loss_history = []

        for epoch in range(1, epochs + 1):
            running_loss = 0.0

            for x, y in loader:
                self.optimizer.zero_grad()
                predictions = self.net(x)
                loss = self.loss(predictions, y)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()

            average_loss = running_loss / nofvalues
            self.loss_history.append(average_loss)

            if log_every and (epoch % log_every == 0 or epoch == 1):
                print(f"Epoch {epoch:4d}/{epochs} — Average Loss: {average_loss:.4f}")


In [2]:
import numpy as np
import pickle

# Tunable settings for this run.
DATA_PATH = "mnist.pkl"
SEED = 0
LAYER_SIZES = [784, 128, 64, 10]
LEARNING_RATE = 0.001
EPOCHS = 2000
BATCH_SIZE = 128

# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only open a mnist.pkl that comes from a trusted source.
with open(DATA_PATH, "rb") as f: #Loading in Data
    (train_X, train_y), (valid_X, valid_y), (test_X, test_y) = pickle.load(f, encoding="latin1")

#Not Using Validation Data so Stacking it with the Training Data
train_X = np.vstack([train_X, valid_X])
train_y = np.hstack([train_y, valid_y])

#Changing the y/Output Data as Vectors
#Ex: 3 --> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
num_classes = 10
eye = np.eye(num_classes, dtype=np.float32)  # float32 matches the dtype used for training
train_Y_1hot = eye[train_y]
test_Y_1hot = eye[test_y]

#Inputs are Cast to float32, the dtype of the Network Parameters
train_X = train_X.astype(np.float32)
test_X = test_X.astype(np.float32)

#Inputs Should Lie between 0 and 1 so the Sigmoid Units do not Saturate
#NOTE(review): presumably mnist.pkl already stores pixels in [0, 1], in which
#case this guard does nothing; it only rescales raw intensities (e.g. 0..255).
pixel_max = float(train_X.max())
if pixel_max > 1.0:
    train_X = train_X / pixel_max
    test_X = test_X / pixel_max

#Seeding Torch makes the Weight Initialisation and Batch Shuffling Repeatable
torch.manual_seed(SEED)

NN = NeuralNetwork(neuronsperlayer=LAYER_SIZES, learning_rate=LEARNING_RATE)
NN.learn(train_X, train_Y_1hot, epochs=EPOCHS, batch_size=BATCH_SIZE)


Epoch    1/2000 — Average Loss: 0.0460
Epoch  500/2000 — Average Loss: 0.0011
Epoch 1000/2000 — Average Loss: 0.0004
Epoch 1500/2000 — Average Loss: 0.0003
Epoch 2000/2000 — Average Loss: 0.0002


In [3]:
# Score the trained network on the test split. Gradient tracking is switched
# off because nothing is back-propagated during evaluation.
with torch.no_grad():
    X_test_corrected = torch.from_numpy(test_X).to(torch.float32)
    logits = NN.forward(X_test_corrected)
    # The predicted digit is the index of the largest output unit in each row.
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# Fraction of test samples whose predicted digit equals the true label.
accuracy = np.mean(predictions == test_y)
print(f"\nTest Accuracy: {accuracy*100:.2f}%")



Test Accuracy: 97.95%
