In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from IPython.core.debugger import set_trace
import warnings
warnings.filterwarnings('ignore')
from typing import List
from tqdm import tqdm

# New section

In [44]:
# for dirname, _, filenames in os.walk(('/kaggle/input')):
#   for filename in filenames:
#     print(os.path.join(dirname, filename))

# Loading and Preprocessing the dataset

In [45]:
train_df=pd.read_csv('~/Programming/COMP551/COMP551_A3/dataset/sign_mnist_train.csv')
test_df=pd.read_csv('~/Programming/COMP551/COMP551_A3/dataset/sign_mnist_test.csv')

In [46]:
train_df.describe()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,...,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0,27455.0
mean,12.318813,145.419377,148.500273,151.247714,153.546531,156.210891,158.411255,160.472154,162.339683,163.954799,...,141.104863,147.495611,153.325806,159.125332,161.969259,162.736696,162.906137,161.966454,161.137898,159.824731
std,7.287552,41.358555,39.942152,39.056286,38.595247,37.111165,36.125579,35.016392,33.661998,32.651607,...,63.751194,65.512894,64.427412,63.708507,63.738316,63.444008,63.50921,63.298721,63.610415,64.396846
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,121.0,126.0,130.0,133.0,137.0,140.0,142.0,144.0,146.0,...,92.0,96.0,103.0,112.0,120.0,125.0,128.0,128.0,128.0,125.5
50%,13.0,150.0,153.0,156.0,158.0,160.0,162.0,164.0,165.0,166.0,...,144.0,162.0,172.0,180.0,183.0,184.0,184.0,182.0,182.0,182.0
75%,19.0,174.0,176.0,178.0,179.0,181.0,182.0,183.0,184.0,185.0,...,196.0,202.0,205.0,207.0,208.0,207.0,207.0,206.0,204.0,204.0
max,24.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [47]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27455 entries, 0 to 27454
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 164.4 MB


In [48]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7172 entries, 0 to 7171
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 43.0 MB


The train_df dataset consit of 1st column representing labels 1 to 24. The label is loaded in a separate dataframe called 'train_label' and the 'label' column is dropped from the original training dataframe which now consist of only 784 pixel values for each image.

In [49]:
# Drop the label column for the train_df
train_label = train_df['label']
trainset = train_df.drop(['label'], axis=1)
# Convert the dataframe to numpy array
X_train = trainset.values.astype(np.float64)

# Same thing for the test_df
test_label = test_df['label']
testset = test_df.drop(['label'], axis=1)
# Convert the dataframe to numpy array
X_test = testset.values.astype(np.float64)

One-hot encodding

In [50]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y_train = lb.fit_transform(train_label)
y_test = lb.fit_transform(test_label)

# Normalizatioin and Vectorization

In [51]:
X_train_mean = np.mean(X_train)
X_train_std = np.std(X_train, axis=0)
# For images, subtract a single data from all pixels
X_train -= X_train_mean
X_train /= X_train_std + 1e-5

X_test -= X_train_mean
X_test /= X_train_std + 1e-5

# Vectorization
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

2 hidden layers MLP

In [52]:
class NeuralNetlayer:
  def __init__(self):
    self.gradient = None
    self.parameters = None

  def forward(self, x):
    raise NotImplementedError

  def backward(self, gradient):
    raise NotImplementedError

In [53]:
class LinearLayer(NeuralNetlayer):
  def __init__(self, input_size, output_size):
    super().__init__()
    self.ni = input_size
    self.no = output_size
    # For a layer with ReLU activation
    # He initialization
    std = np.sqrt(2. / input_size)
    self.w = np.random.randn(output_size, input_size) * std
    self.b = np.random.randn(output_size)
    self.cur_input = None
    self.parameters = [self.w, self.b]

  def forward(self, x):
    self.cur_input = x
    return x @ self.w.T + self.b

  def backward(self, gradient):
    assert self.cur_input is not None, "Must call forward before backward!"
    dw = gradient.T @ self.cur_input
    db = gradient.sum(axis=0)
    self.gradient = [dw, db]
    return gradient.dot(self.w)

In [54]:
class ReLULayer(NeuralNetlayer):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        self.gradient = np.where(x > 0, 1.0, 0.0)
        return np.maximum(0, x)

    def backward(self, gradient):
        assert self.gradient is not None, "Must call forward before backward"
        return gradient * self.gradient

In [55]:
class SoftmaxOutputLayer(NeuralNetlayer):
    def __init__(self):
        super().__init__()
        self.cur_probs = None

    def forward(self, x):
        exps = np.exp(x)
        probs = exps / np.sum(exps, axis=-1)[:, None]
        self.cur_probs = probs
        return probs

    def backward(self, target):
        assert self.cur_probs is not None, "Must call forward before backward"
        return self.cur_probs - target

In [56]:
class MLP:
    def __init__(self, *args: List[NeuralNetlayer]):
        self.layers = args

    def forward(self, x):
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, target):
        for layer in self.layers[::-1]:
            target = layer.backward(target)

    def predict(self, x):
        return self.forward(x)
    
    def fit(self, x, y, optimizer, epochs, lr):
        for epoch in range(epochs):
            y_pred = self.forward(x)
            loss = -np.sum(y * np.log(y_pred)) / len(y)
            print(f"Epoch {epoch+1}/{epochs} Loss: {loss}")
            self.backward(y)
            optimizer.step()

In [57]:
class Optimizer:
    def __init__(self, net: MLP):
        self.net = net

    def step(self):
        for layer in self.net.layers[::-1]:
            if layer.parameters is not None:
                self.update(layer.parameters, layer.gradient)

    def update(self, params, gradient):
        raise NotImplementedError

class GradientDescentOptimizer(Optimizer):
    def __init__(self, net: MLP, lr: float, lamda: float = 0.0):
        super().__init__(net)
        self.lr = lr
        self.lamda = lamda

    # L2 regularization
    def update(self, params, gradient):
        w, b = params
        dw, db = gradient
        w -= self.lr * (dw + self.lamda * w)
        b -= self.lr * db

In [58]:
def train(mlp: MLP, optimizer: Optimizer, data_x, data_y, steps):
    losses = []
    labels = data_y
    for _ in tqdm(range(steps)):
        predictions = mlp.forward(data_x)
        loss = -(labels * np.log(predictions)).sum(axis=-1).mean()
        losses.append(loss)
        mlp.backward(labels)
        optimizer.step()
    plt.plot(losses)
    plt.xlabel("Epoch")
    plt.ylabel("Cross entropy loss")

In [59]:
def train_sgd(mlp, optimizer, X_train, y_train, epochs):
    for epoch in range(epochs):
        # Shuffle the training data
        perm = np.random.permutation(X_train.shape[0])
        X_train_shuffled = X_train[perm]
        y_train_shuffled = y_train[perm]
        
        for i in range(X_train.shape[0]):
            # Select a single training example
            X_mini = X_train_shuffled[i:i+1]
            y_mini = y_train_shuffled[i:i+1]
            
            # Perform a forward pass and compute the loss
            predictions = mlp.forward(X_mini)
            loss = -(y_mini * np.log(predictions)).sum()  # Example for cross-entropy loss
            
            # Backward pass to compute gradients
            mlp.backward(y_mini)
            
            # Update model parameters
            optimizer.step()

        # Optionally print the loss here to monitor training progress
        print(f"Epoch {epoch+1}, Loss: {loss}")

In [60]:
def evaluate_acc(mlp: MLP, data_x, data_y):
    predictions = mlp.forward(data_x)
    return np.mean(np.argmax(predictions, axis=-1) == np.argmax(data_y, axis=-1))

In [61]:
n_features = X_train.shape[-1]
HIDDEN_SIZE = 64
OUTPUT_SIZE = 24
GRADIENT_STEPS = 200

mlp2 = MLP(
    LinearLayer(n_features, HIDDEN_SIZE),
    ReLULayer(),
    LinearLayer(HIDDEN_SIZE, HIDDEN_SIZE),
    ReLULayer(),
    LinearLayer(HIDDEN_SIZE, OUTPUT_SIZE),
    SoftmaxOutputLayer()
)
opt2 = GradientDescentOptimizer(mlp2, 1e-3, lamda=1e-2)

# train(mlp2, opt2, X_train, y_train, GRADIENT_STEPS)
mlp2.fit(X_train, y_train, opt2, GRADIENT_STEPS, 1e-2)
print("Training accuracy: ", evaluate_acc(mlp2, X_test, y_test))

Epoch 1/200 Loss: 5.643200090234787


Epoch 2/200 Loss: nan
Epoch 3/200 Loss: nan
Epoch 4/200 Loss: nan
Epoch 5/200 Loss: nan
Epoch 6/200 Loss: nan
Epoch 7/200 Loss: nan
Epoch 8/200 Loss: nan
Epoch 9/200 Loss: nan
Epoch 10/200 Loss: nan
Epoch 11/200 Loss: nan
Epoch 12/200 Loss: nan
Epoch 13/200 Loss: nan
Epoch 14/200 Loss: nan
Epoch 15/200 Loss: nan
Epoch 16/200 Loss: nan
Epoch 17/200 Loss: nan
Epoch 18/200 Loss: nan
Epoch 19/200 Loss: nan
Epoch 20/200 Loss: nan


KeyboardInterrupt: 

In [None]:
print("Test accuracy:", evaluate_acc(mlp2, X_test, y_test))