# Assignment 3 Option 2

In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

from milligrad import Tensor, Adam

import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt

## 0.1 Loading and processing data

In [2]:
names, labels = [], []
with open("data/ascii_names.txt", "r") as f:
    for line in f:
        *name, label = line.split(" ") # *name since people can have names split by spaces like: De Santis
        name = " ".join(name)
        names.append(name.upper())
        # Append the label to the labels list, converting it to an integer
        labels.append(int(label.replace("\n","")))

In [3]:
ids_to_chr = {i:c for i,c in enumerate(sorted(set("".join(names))))} # find all unique characters, sort them and give them a unique id

# add unique <PAD> token to pad the names to have the same length as the longest name in a batch (cannot construct a Tensor if they have variable lengths)
PAD = "." # normally <PAD> is used but that requires extra considerations when tokenizing (longest substring), so I just use a space
PAD_ID = len(ids_to_chr)
ids_to_chr[PAD_ID] = PAD

chr_to_ids = {c:i for i,c in ids_to_chr.items()}

In [4]:
ids_to_chr.items()

dict_items([(0, ' '), (1, "'"), (2, 'A'), (3, 'B'), (4, 'C'), (5, 'D'), (6, 'E'), (7, 'F'), (8, 'G'), (9, 'H'), (10, 'I'), (11, 'J'), (12, 'K'), (13, 'L'), (14, 'M'), (15, 'N'), (16, 'O'), (17, 'P'), (18, 'Q'), (19, 'R'), (20, 'S'), (21, 'T'), (22, 'U'), (23, 'V'), (24, 'W'), (25, 'X'), (26, 'Y'), (27, 'Z'), (28, '.')])

In [5]:
### Pad all names to have the same length as the longest name (14 characters)
n_len = max(len(n) for n in names)
names = [n + " "*(n_len-len(n)) for n in names]

In [6]:
tokenized_names = np.array([[chr_to_ids[c] for c in name] for name in names])
tokenized_names = np.eye(len(chr_to_ids))[tokenized_names].swapaxes(1,2) # one hot encode and make the length be the last dimension
labels = np.array(labels)
labels_ohe = np.eye(max(labels))[labels-1] # one hot encode

In [7]:
np.random.seed(1337)

N_TRAIN = int(0.7 * len(names))
N_VAL = int(0.2 * len(names))
N_TEST = len(names) - N_TRAIN - N_VAL

shuffle = np.random.permutation(np.arange(len(names)))

x_train, y_train = tokenized_names[shuffle[:N_TRAIN]], labels_ohe[shuffle[:N_TRAIN]]
x_val, y_val = tokenized_names[shuffle[N_TRAIN:N_TRAIN+N_VAL]], labels_ohe[shuffle[N_TRAIN:N_TRAIN+N_VAL]]
x_test, y_test = tokenized_names[shuffle[N_TRAIN+N_VAL:]], labels_ohe[shuffle[N_TRAIN+N_VAL:]]

## 0.2 Hyperparameters

In [8]:
CHANNELS_IN = len(chr_to_ids)
CHANNELS_HIDDEN = 64
KERNEL_SIZE = 7
PADDING = 0 # with stride = 1 and kernel_size = 3, the length of the sequence is preserved

SEQ_LEN = n_len
NUM_CLASSES = max(labels)

## 0.3-0.4 Setting up the model

In [9]:
# does not have special considerations for the sparsity of the data since my aim was to make a generalized convolution 
class SurnameConvNet:
    def __init__(
            self, c_in, c_hidden, kernel_size, padding,
            orig_seq_len, num_classes
        ):
        self.c_in, self.c_hidden = c_in, c_hidden
        self.kernel_size, self.padding = kernel_size, padding
        self.w_out = (orig_seq_len - kernel_size + 2*padding + 1) - kernel_size + 2*padding + 1

        self.k1 = Tensor.randn(c_in, kernel_size, c_hidden) * 0.01
        self.b1 = Tensor.zeros(c_hidden, 1)  # 1 to broadcast over sequence
        self.k2 = Tensor.randn(c_hidden, kernel_size, 16) * 0.01
        self.b2 = Tensor.zeros(16, 1)

        self.w = Tensor.xavier(self.w_out * 16, num_classes)
        self.b = Tensor.zeros(num_classes)

    def __call__(self, x:Tensor)->Tensor:
        x = (x.conv1d(self.k1, padding=self.padding) + self.b1).relu()
        x = (x.conv1d(self.k2, padding=self.padding) + self.b2).relu()
        x = x.reshape(-1, self.w_out * 16) # squeeze away unary dimension (k2 has one filter)
        return x @ self.w + self.b
    
    def parameters(self)->list[Tensor]:
        return [self.k1, self.b1, self.k2, self.b2, self.w, self.b]


model = SurnameConvNet(CHANNELS_IN, CHANNELS_HIDDEN, KERNEL_SIZE, PADDING, SEQ_LEN, NUM_CLASSES)
model(Tensor.randn(32, CHANNELS_IN, SEQ_LEN))

Tensor([[-6.86093832e-03, -2.38754724e-02,  3.80054692e-02,
         1.94992784e-02,  1.73318441e-02, -6.70823566e-03,
         6.50129098e-03, -1.09559665e-02,  1.30422325e-02,
        -5.16477029e-03,  1.51775263e-02, -6.13543536e-04,
        -6.34425790e-03, -1.01101519e-02,  5.17153368e-03,
         3.27473606e-02, -1.93275337e-02,  2.70306716e-02],
       [ 1.76888753e-02,  1.14361475e-02, -8.42982944e-03,
        -2.83563076e-02,  1.48339172e-02, -3.95716283e-02,
        -2.46209099e-03, -2.57989595e-02, -7.76967021e-03,
         1.35986318e-02,  1.01126149e-02,  8.49001093e-03,
         3.58099965e-02,  9.62410815e-03, -2.11195619e-02,
         8.12461218e-03,  4.02766132e-03,  4.60398200e-02],
       [-1.12716670e-02, -7.34374829e-03, -3.26429531e-02,
        -1.86318847e-02,  8.75591783e-04, -1.63100354e-02,
        -1.91175271e-02, -9.92847722e-03, -2.24514147e-02,
        -1.08090186e-02, -2.00467853e-02,  9.63255123e-03,
         2.84967137e-02, -4.52542221e-03, -8.06507512

In [10]:
EPOCHS = 10
BATCH_SIZE = 128
LR = 1e-3

model = SurnameConvNet(CHANNELS_IN, CHANNELS_HIDDEN, KERNEL_SIZE, PADDING, SEQ_LEN, NUM_CLASSES)
optim = Adam(model.parameters())

for epoch in range(EPOCHS):
    train_shuffle = np.random.permutation(len(x_train))
    train_tqdm = tqdm(range(0, len(x_train) // BATCH_SIZE - 1), desc=f"Epoch {epoch + 1}/{EPOCHS} Training")
    for i in train_tqdm:
        idxs = train_shuffle[i*BATCH_SIZE:(i+1)*BATCH_SIZE]

        x = Tensor(x_train[idxs])
        y = Tensor(y_train[idxs])

        y_hat = model(x)

        loss = -(y * y_hat.log_softmax()).sum(-1).mean() # sum over classes, mean over batch

        optim.zero_grad()
        loss.backward()
        optim.step()

        train_tqdm.set_postfix({"loss": loss.data})

Epoch 1/10 Training:   0%|          | 0/108 [00:00<?, ?it/s]




TypeError: Tensor.backward() missing 1 required positional argument: 'model'

In [None]:
def accuracy(y, y_hat):
    return (y.argmax(axis=-1)==y_hat.argmax(axis=-1)).mean()

accuracy(y_train, model(Tensor(x_train)).data), accuracy(y_test, model(Tensor(x_test)).data)

(0.7862486640541504, 0.7625935162094764)

In [None]:
from milligrad import topological_sort

[x._grad_fn for x in reversed(topological_sort(loss))]

['-',
 '/',
 'sum',
 'sum',
 '*',
 '',
 'log_softmax',
 '+',
 '@',
 'reshape',
 'relu',
 '+',
 'conv1d',
 'relu',
 '+',
 'conv1d',
 '',
 '*',
 '',
 '',
 '',
 '*',
 '',
 '',
 '',
 '',
 '',
 '']

In [None]:
from milligrad.tensor import broadcast_to

a = Tensor.randn(32, 3, 8, 3)
b = Tensor.randn(8,1)

a+b

Tensor([[[[-0.13666803, -1.02179063, -1.1397524 ],
         [ 3.03002466,  0.91948846, -0.14527703],
         [ 0.88138631, -0.26962489, -0.1883629 ],
         ...,
         [ 2.21184893,  2.15595311,  3.0939568 ],
         [-2.34537385, -0.90707117, -1.88460634],
         [-0.94775035,  1.00241172, -0.91442963]],

        [[-1.79805579, -0.84006114, -0.96236712],
         [-0.73585787,  1.61624028, -0.10198324],
         [ 0.56348433,  0.91862188,  2.25115165],
         ...,
         [ 0.88859012,  1.95467816,  1.27430212],
         [-1.65460244, -1.97226765, -0.99884009],
         [ 0.40449658,  0.59991765, -2.17228369]],

        [[-2.68890982, -1.97764927, -1.29330094],
         [ 0.37209507, -1.08527477,  0.08621177],
         [-0.14142108,  1.69203629,  0.43796505],
         ...,
         [ 0.39458471,  0.38526879,  0.11454493],
         [-0.83742129,  0.05410293, -0.73463601],
         [ 0.40484396,  0.46059399,  1.47915641]]],


       [[[-3.25932127, -0.15217316, -2.88694839],