# Assignment 3 Option 2

In [None]:
# autoreload
%load_ext autoreload
%autoreload 2

from milligrad import Tensor, Adam

import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt

## 0.1 Loading and processing data

In [None]:
# Parse lines of the form "<name> <label>". A name may itself contain spaces
# (e.g. "De Santis"), so only the last space-separated field is the label.
names, labels = [], []
with open("data/ascii_names.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.rstrip()  # drop the newline and any trailing whitespace
        if not line:
            continue  # tolerate blank lines, e.g. an empty last line in the file
        name, label = line.rsplit(" ", 1)
        names.append(name.upper())
        labels.append(int(label))

In [None]:
# Vocabulary: every distinct character occurring in the names, sorted, numbered from 0
ids_to_chr = dict(enumerate(sorted(set("".join(names)))))

# Extra padding symbol so that names can be brought to a common length
# (a Tensor cannot be constructed from sequences of different lengths).
# A single character is used rather than a multi-character "<PAD>" token, since the
# latter would need longest-substring matching when tokenizing.
PAD = "."
PAD_ID = len(ids_to_chr)  # next free id after the characters found in the data
ids_to_chr[PAD_ID] = PAD

# Reverse lookup: character -> id
chr_to_ids = {ch: idx for idx, ch in ids_to_chr.items()}

In [None]:
# Display the id -> character vocabulary (including the PAD entry) for inspection
ids_to_chr.items()

In [None]:
# Right-pad every name with the dedicated PAD character up to the length of the longest
# name, so that padding is not confused with the real spaces inside names such as "DE SANTIS".
n_len = max(len(n) for n in names)
names = [n.ljust(n_len, PAD) for n in names]

In [None]:
# Map every character of every (padded) name to its integer id -> shape (n_names, n_len)
token_ids = np.array([[chr_to_ids[ch] for ch in padded] for padded in names])

# Row i of the identity matrix is the one-hot vector for id i; indexing with the id
# matrix gives (n_names, n_len, vocab), and the swap moves the length axis last.
char_one_hot = np.eye(len(chr_to_ids))
tokenized_names = char_one_hot[token_ids].swapaxes(1, 2)

labels = np.array(labels)
# Labels are shifted by one when indexing, i.e. they are treated as 1-based class ids
class_one_hot = np.eye(max(labels))
labels_ohe = class_one_hot[labels - 1]

In [None]:
np.random.seed(1337)  # fixed seed so the shuffle below is reproducible

# 70 % training, 20 % validation, the remainder for testing
N_TRAIN = int(0.7 * len(names))
N_VAL = int(0.2 * len(names))
N_TEST = len(names) - N_TRAIN - N_VAL

shuffle = np.random.permutation(np.arange(len(names)))

# Consecutive, non-overlapping slices of the shuffled indices
train_idx = shuffle[:N_TRAIN]
val_idx = shuffle[N_TRAIN:N_TRAIN + N_VAL]
test_idx = shuffle[N_TRAIN + N_VAL:]

x_train, y_train = tokenized_names[train_idx], labels_ohe[train_idx]
x_val, y_val = tokenized_names[val_idx], labels_ohe[val_idx]
x_test, y_test = tokenized_names[test_idx], labels_ohe[test_idx]

## 0.2 Hyperparameters

In [None]:
CHANNELS_IN = len(chr_to_ids)  # one input channel per vocabulary entry (names are one-hot encoded)
CHANNELS_HIDDEN = 64  # passed to SurnameConvNet as c_hidden (output channels of the first conv)
KERNEL_SIZE = 7  # used by both conv layers of SurnameConvNet
PADDING = 0 # no padding: per the w_out formula in SurnameConvNet, each conv then shortens the sequence by KERNEL_SIZE - 1

SEQ_LEN = n_len  # length of the longest name; all names are padded to this length
NUM_CLASSES = max(labels)  # assumes class labels are numbered 1..NUM_CLASSES (the one-hot encoding indexes with labels-1)

## 0.3-0.4 Setting up the model

In [None]:
# does not have special considerations for the sparsity of the data since my aim was to make a generalized convolution
class SurnameConvNet:
    """Two stacked 1-D convolutions (each followed by ReLU) and a linear classifier.

    Args:
        c_in: number of input channels (first dimension of the first kernel).
        c_hidden: number of output channels of the first convolution.
        kernel_size: kernel width shared by both convolutions.
        padding: padding forwarded to both ``conv1d`` calls.
        orig_seq_len: length of the input sequences.
        num_classes: number of output logits.
        c_out: number of output channels of the second convolution
            (defaults to 16, the value that was previously hard-coded).

    Raises:
        ValueError: if the sequence is too short to survive both convolutions.
    """

    def __init__(
            self, c_in, c_hidden, kernel_size, padding,
            orig_seq_len, num_classes, c_out=16
        ):
        self.c_in, self.c_hidden, self.c_out = c_in, c_hidden, c_out
        self.kernel_size, self.padding = kernel_size, padding

        # Each convolution changes the sequence length by 2*padding - kernel_size + 1
        # (assumes conv1d uses stride 1 -- TODO confirm against milligrad).
        len_after_conv1 = orig_seq_len - kernel_size + 2*padding + 1
        self.w_out = len_after_conv1 - kernel_size + 2*padding + 1
        if len_after_conv1 <= 0 or self.w_out <= 0:
            raise ValueError(
                f"orig_seq_len={orig_seq_len} is too short for two convolutions with "
                f"kernel_size={kernel_size} and padding={padding}"
            )

        # Kernels are created as (in_channels, kernel_size, out_channels); small init scale
        self.k1 = Tensor.randn(c_in, kernel_size, c_hidden) * 0.01
        self.b1 = Tensor.zeros(c_hidden, 1)  # 1 to broadcast over sequence
        self.k2 = Tensor.randn(c_hidden, kernel_size, c_out) * 0.01
        self.b2 = Tensor.zeros(c_out, 1)  # 1 to broadcast over sequence

        # Linear head on the flattened (c_out * w_out) conv features
        self.w = Tensor.xavier(self.w_out * c_out, num_classes)
        self.b = Tensor.zeros(num_classes)

    def __call__(self, x:Tensor)->Tensor:
        """Return the class logits for a batch ``x`` (presumably (batch, c_in, seq_len) -- TODO confirm)."""
        x = (x.conv1d(self.k1, padding=self.padding) + self.b1).relu()
        x = (x.conv1d(self.k2, padding=self.padding) + self.b2).relu()
        x = x.reshape(-1, self.w_out * self.c_out) # flatten channels and positions into one feature vector per sample
        return x @ self.w + self.b

    def parameters(self)->list[Tensor]:
        """Return all trainable tensors, in the order handed to the optimizer."""
        return [self.k1, self.b1, self.k2, self.b2, self.w, self.b]


# Smoke test: build the model and push one random batch of 32 sequences through it
model = SurnameConvNet(
    c_in=CHANNELS_IN,
    c_hidden=CHANNELS_HIDDEN,
    kernel_size=KERNEL_SIZE,
    padding=PADDING,
    orig_seq_len=SEQ_LEN,
    num_classes=NUM_CLASSES,
)
dummy_batch = Tensor.randn(32, CHANNELS_IN, SEQ_LEN)
model(dummy_batch)

In [None]:
EPOCHS = 10
BATCH_SIZE = 128
LR = 1e-3  # NOTE(review): not passed to Adam below, so the optimizer runs with its own default -- confirm intended

model = SurnameConvNet(CHANNELS_IN, CHANNELS_HIDDEN, KERNEL_SIZE, PADDING, SEQ_LEN, NUM_CLASSES)
optim = Adam(model.parameters())

for epoch in range(EPOCHS):
    # fresh sample order every epoch
    train_shuffle = np.random.permutation(len(x_train))

    # Step through the shuffled indices one batch at a time. Every sample is visited once
    # per epoch; the last batch may be smaller than BATCH_SIZE. (The previous
    # `len(x_train) // BATCH_SIZE - 1` bound skipped the last full batch and the remainder.)
    batch_starts = range(0, len(x_train), BATCH_SIZE)
    train_tqdm = tqdm(batch_starts, desc=f"Epoch {epoch + 1}/{EPOCHS} Training")
    for start in train_tqdm:
        idxs = train_shuffle[start:start + BATCH_SIZE]

        x = Tensor(x_train[idxs])
        y = Tensor(y_train[idxs])

        y_hat = model(x)

        # cross-entropy against one-hot targets
        loss = -(y * y_hat.log_softmax()).sum(-1).mean() # sum over classes, mean over batch

        optim.zero_grad()
        loss.backward()
        optim.step()

        train_tqdm.set_postfix({"loss": loss.data})

In [None]:
def accuracy(y, y_hat):
    """Return the fraction of rows where the arg-max over the last axis of y and y_hat agree."""
    true_cls = y.argmax(axis=-1)
    pred_cls = y_hat.argmax(axis=-1)
    hits = true_cls == pred_cls
    return hits.mean()

# (train accuracy, test accuracy), each from a single forward pass over the whole split
train_acc = accuracy(y_train, model(Tensor(x_train)).data)
test_acc = accuracy(y_test, model(Tensor(x_test)).data)
train_acc, test_acc

In [None]:
from milligrad import topological_sort

# List the _grad_fn of every node reachable from the last training loss,
# walking the topological order in reverse.
graph_nodes = topological_sort(loss)
[node._grad_fn for node in reversed(graph_nodes)]

In [None]:
from milligrad.tensor import broadcast_to

# Scratch check that Tensor addition broadcasts a bias-like (8, 1) operand against a
# (32, 3, 8, 3) tensor (presumably following NumPy broadcasting rules -- TODO confirm).
a = Tensor.randn(32, 3, 8, 3)
b = Tensor.randn(8,1)

a+b  # last expression of the cell, so the result is displayed