In [None]:
import re

import torch
import pandas as pd
import matplotlib.pyplot as plt


from utils.forward_pass import *
from utils.backward_pass import *

from utils.weight import *

In [None]:
# Remote CSV source (disabled):
# df = pd.read_csv("https://raw.githubusercontent.com/amitness/gender-data/refs/heads/master/genders.csv")
df = pd.read_csv("names.csv")
# Optional cleanup (disabled): drop parenthesised fragments from each name.
# df["name"] = df["name"].apply(lambda x: re.sub(r"\(.*?\)", "", x))

# Distinct values of the "name" column.
names = df["name"].unique()

In [None]:
# Tally how many names begin with each character, then plot the tallies.
count = {}
for nm in names:
  first = nm[0]
  count[first] = count.get(first, 0) + 1

plt.plot(count.keys(), count.values())

In [None]:
# Vocabulary: "." is placed first (index 0), followed by every distinct
# character that occurs in `names`, in sorted order.
vowels = [".", *sorted(set("".join(names)))]

# Lookup tables in both directions between characters and their indices.
ix_to_char = dict(enumerate(vowels))
char_to_ix = {ch: ix for ix, ch in ix_to_char.items()}
ix_to_char

In [None]:
# Illustration only: for the first two names, print each sliding context of
# `block_size` characters next to the character that follows it.
block_size = 3
for raw_name in names[:2]:
  terminated = raw_name + "."
  print(terminated)
  context = ["."] * block_size
  for target in terminated:
    print(context, target)
    context = [*context[1:], target]

In [None]:
def generate_data(data, block_size, mapping=None):
  """Turn names into (context, next-character) index pairs.

  Each name gets a trailing "." appended as an end-of-name marker. For every
  character of the terminated name one row is emitted: the `block_size`
  indices that precede it (left-padded with index 0) and the index of the
  character itself.

  Args:
    data: iterable of strings.
    block_size: number of preceding character indices in each context row.
    mapping: optional character -> index dict. Defaults to the notebook-level
      `char_to_ix`, where "." is placed first in the vocabulary, so the
      padding index 0 stands for ".".

  Returns:
    (X, Y): integer arrays of shape (n, block_size) and (n,), where n is the
    total number of characters including one "." per name. Empty `data`
    yields shapes (0, block_size) and (0,).

  Raises:
    KeyError: if a character is missing from the mapping.
  """
  if mapping is None:
    mapping = char_to_ix  # notebook-level vocabulary built in an earlier cell

  X = []
  Y = []
  for d in data:
    block = [0] * block_size
    for ch in d + ".":  # "." indicates the end of a name
      ix = mapping[ch]
      X.append(block)
      Y.append(ix)
      # Slide the window. Slicing after appending keeps the length at
      # block_size even when block_size is 0.
      block = (block + [ix])[1:]
  # X, Y = torch.tensor(X), torch.tensor(Y)
  # Explicit dtype and shape so that empty input still produces integer arrays
  # of the expected rank (np.array([]) alone would be float64 with shape (0,)).
  n = len(X)
  X = np.array(X, dtype=int).reshape(n, block_size)
  Y = np.array(Y, dtype=int)
  return X, Y

In [None]:
# 80/20 split by position in `names`; the order of `names` is used as-is.
total_names = len(names)
num_train = round(0.8 * total_names)
num_test = round(0.2 * total_names)  # informational; the slices below use num_train only

block_size = 3

train_names = names[:num_train]
test_names = names[num_train:]
X_train, Y_train = generate_data(train_names, block_size)
X_test, Y_test = generate_data(test_names, block_size)

X_train[:2]

In [None]:
# Sizes used throughout the model.
V = len(vowels)  # number of entries in the vocabulary
D = 10           # embedding width per character
hidden = 100     # width of the hidden layer

# Seed handed to every initialize_weight call below.
seed = None
# seed = 2147483647

init_method = "xavier"

# Embedding table with one row per vocabulary entry.
# char_embeddings = torch.randn(V, D)
char_embeddings = initialize_weight((V, D), None, seed)

# First affine layer: concatenated context embeddings -> hidden.
W1 = initialize_weight((block_size * D, hidden), init_method, seed)
b1 = np.zeros(hidden)

# Second affine layer: hidden -> one score per vocabulary entry.
W2 = initialize_weight((hidden, V), init_method, seed)
b2 = np.zeros(V)

# Parameters passed to the batch-norm layer, starting at ones / zeros.
# gamma = initialize_weight((hidden,))
# beta = initialize_weight((hidden,))
gamma = np.ones(hidden)
beta = np.zeros(hidden)

bn_param = dict(mode="train")


In [None]:

# Learning-rate sweep (disabled). To use it, also re-enable the lr_history
# lines below and the matching plot call.
# learning_exp = np.linspace(-3, -1.5, 1000)
# learning_rates = 10**learning_exp

N, block_size = X_train.shape
batch_size = 500
num_iterations = 4000

loss_history = []
# lr_history = []

lr = 0.002

# The sampling cell sets bn_param["mode"] = "test"; reset it here so that
# re-running this cell after sampling trains in "train" mode again.
bn_param["mode"] = "train"

# embed -> affine -> batch norm -> tanh -> affine -> softmax
for i in range(num_iterations):
  # Mini-batch of row indices drawn with replacement.
  random_indices = np.random.randint(0, N, (batch_size,))
  X_batch, Y_batch = X_train[random_indices], Y_train[random_indices]

  # --------------------------------
  #         forward pass
  # --------------------------------

  # word embedding
  out, embed_cache = word_embedding_forward(X_batch, char_embeddings)

  # affine
  out, cache1 = affine_forward(out, W1, b1)

  # batch norm
  out, bn1_cache = batchnorm_forward(out, gamma, beta, bn_param)

  # tanh
  out, tan_cache = tanh_forward(out)

  # affine
  out, cache2 = affine_forward(out, W2, b2)

  loss, dscores = softmax_loss(out, Y_batch)
  loss_history.append(loss)
  # lr_history.append(learning_exp[i])

  if i % 100 == 0:
    print(f"Loss at iteration: {i} -> {loss}")

  # --------------------------------
  #         backward pass
  # --------------------------------

  # All gradients are computed first; parameters are only modified afterwards,
  # so no backward call can observe a partially updated model.
  dx, dW2, db2 = affine_backward(dscores, cache2)
  dx = tanh_backward(dx, tan_cache)
  dx, dgamma, dbeta = batchnorm_backward(dx, bn1_cache)
  dx, dW1, db1 = affine_backward(dx, cache1)

  # Un-flatten the gradient back to one D-wide slice per context position.
  dx = dx.reshape(batch_size, block_size, D)
  dembed = word_embedding_backward(dx, embed_cache)

  # --------------------------------
  #   parameter update (plain SGD)
  # --------------------------------

  W2 -= lr * dW2
  b2 -= lr * db2

  gamma -= lr * dgamma
  # Fixed: this used to read `dbeta -= lr * dbeta`, which modified the
  # gradient and left `beta` at its initial value for the whole run.
  beta -= lr * dbeta

  W1 -= lr * dW1
  b1 -= lr * db1

  char_embeddings -= lr * dembed


In [None]:
# Loss recorded once per training iteration. For the learning-rate sweep, plot
# against lr_history instead:
# plt.plot(lr_history, loss_history)
fig, ax = plt.subplots()
ax.plot(loss_history)
# Label the figure so it can be read without the surrounding code.
ax.set(title="Training loss", xlabel="Iteration", ylabel="Mini-batch softmax loss")
plt.show()

### Learning

Increasing the batch size from 100 to 500 helped with the variety of the generated names. However, the gradients exploded because the learning rate was too high,
so I reduced the learning rate from 0.02 to 0.002. The names look much more plausible now. With the smaller batch size, most of the generated names started
with the letter "p".

In [None]:
# sampling
# Generate names one character at a time: feed the last `block_size` indices
# through the network, draw the next index from the returned distribution, and
# stop as soon as index 0 is drawn (index 0 maps to "." in the vocabulary).
np.random.seed(None)  # re-seed NumPy's global generator from fresh entropy
num_samples = 10

# The mode does not change between characters, so set it once instead of on
# every step. It is left at "test" when this cell finishes.
bn_param["mode"] = "test"

for _ in range(num_samples):
  output = []
  name = [0] * block_size  # initial context: all index 0
  while True:
    name_ = np.array(name).reshape(1, block_size)

    # Same layer stack as training; the caches are not needed here.
    # `_cache` is used instead of `_` so the outer loop variable is untouched.
    out, _cache = word_embedding_forward(name_, char_embeddings)

    # affine
    out, _cache = affine_forward(out, W1, b1)

    # batch normalization
    out, _cache = batchnorm_forward(out, gamma, beta, bn_param)

    # tanh
    out, _cache = tanh_forward(out)

    # affine
    out, _cache = affine_forward(out, W2, b2)

    # Called without labels; row 0 of the result is used as the sampling
    # distribution over the V indices -- presumably the softmax of `out`
    # (confirm in utils).
    probs = softmax_loss(out)
    index = np.random.choice(a=V, size=1, p=probs[0]).item()

    if index == 0:
      break

    # Slide the context window and remember the drawn character.
    name = name[1:] + [index]
    output.append(index)
  print(''.join(ix_to_char[ix] for ix in output))
