In [None]:
import random
import requests
import string
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset

!pip install pytorch_lightning
from pytorch_lightning.callbacks.progress import TQDMProgressBar, RichProgressBar
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import pytorch_lightning as pl


# Encoder

Download a list of baby names:

In [None]:
# Fetch the baby-names CSV and load it into a DataFrame.
url = 'https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
content = response.content
df = pd.read_csv(io.StringIO(content.decode('utf-8')))
df.head()

In [None]:
df = df[['name', 'sex']].drop_duplicates().reset_index(drop=True)
df.head()

In [None]:
print(df['sex'].value_counts())

In [None]:
df_vals = df['name'].value_counts()
df_vals[df_vals==2]

In [None]:
df_unique = df['name'].drop_duplicates().reset_index(drop=True)
len(df_unique)

In [None]:
# Fraction of the unique names assigned to each split.
train_percent = .8
val_percent = .1
test_percent = .1
assert abs(train_percent + val_percent + test_percent - 1.0) < 1e-9, \
    'split fractions must sum to 1'

# Share of the held-out (non-train) names that goes to validation; derived from
# the fractions above so that changing them actually changes the split.
val_share = val_percent / (val_percent + test_percent)

df_train = df_unique.sample(frac=train_percent, random_state=0xC0FFEE)
# Everything not in train, shuffled, is divided between validation and test.
df_val_test = df_unique.drop(df_train.index).sample(frac=1.0, random_state=0xC0FFEE)
df_val = df_val_test.sample(frac=val_share, random_state=0xC0FFEE)
df_test = df_val_test.drop(df_val.index)

In [None]:
totals = len(df_train), len(df_val), len(df_test)
print(totals, sum(totals))

In [None]:
# Vocabulary: '<pad>' takes index 0, then '<sos>', the letters a-z, and '<eos>' last.
chars = ['<pad>', '<sos>', *string.ascii_lowercase, '<eos>']
idx_to_char = dict(enumerate(chars))
char_to_idx = {char: idx for idx, char in idx_to_char.items()}
vocab_size = len(char_to_idx)
print(vocab_size)

In [None]:
char_to_idx['a']

In [None]:
idx_to_char[2]

In [None]:
def get_tokenized_names(df):
  """Split names into per-character token lists wrapped in '<sos>' / '<eos>'.

  Args:
    df: Either a DataFrame with a 'name' column or a Series of names.

  Returns:
    A list with one entry per name, each of the form
    ['<sos>', <lower-cased characters...>, '<eos>'].

  Raises:
    KeyError: If a DataFrame without a 'name' column is passed.
  """
  # Dispatch on the input type explicitly rather than with a bare `except`,
  # which would also hide unrelated errors (including KeyboardInterrupt).
  names = df['name'].values if isinstance(df, pd.DataFrame) else df.values
  return [['<sos>'] + list(name.lower()) + ['<eos>'] for name in names]

In [None]:
get_tokenized_names(df_val)[:5]

In [None]:
class LM(nn.Module):
  """Bidirectional LSTM that scores the next token given a padded prefix.

  Token id 0 is treated as padding: sequence lengths are obtained by counting
  the non-zero ids in each row.
  """
  def __init__(
      self,
      vocab_size: int,
      embedding_dim: int,
      hidden_dim: int,
      num_layers: int
    ):
    """
    Args:
      vocab_size: Number of distinct token ids (also the output width).
      embedding_dim: Size of each token embedding.
      hidden_dim: LSTM hidden size per direction.
      num_layers: Number of stacked LSTM layers.
    """
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(
        embedding_dim, hidden_dim, num_layers,
        bidirectional=True, batch_first=True
    )
    # Forward and backward final states are concatenated, hence 2 * hidden_dim.
    self.fc = nn.Linear(2 * hidden_dim, vocab_size)

  def forward(self, seqs):
    """Return next-token logits for a batch of zero-padded id sequences.

    Args:
      seqs: Integer tensor of token ids, one sequence per row, padded with 0.

    Returns:
      The output of the final linear layer, one row per sequence. The vector
      fed to that layer is also stored on `self.hidden`.
    """
    # Count the non-pad tokens; pack_padded_sequence needs the lengths on CPU.
    seq_lens = torch.count_nonzero(seqs, dim=1).cpu()
    embeddings = self.embedding(seqs)

    # Feed through LSTM
    packed = nn.utils.rnn.pack_padded_sequence(
        embeddings,
        seq_lens,
        batch_first=True,
        enforce_sorted=False
    )
    _, (hn, _) = self.lstm(packed)

    # hn stacks the final states layer by layer (forward then backward for each
    # layer), so the top layer's two directions are the LAST two entries.
    # Indexing hn[0]/hn[1] would read the first layer and leave any deeper
    # layers unused.
    # Save off hidden state before FC
    self.hidden = torch.cat((hn[-2], hn[-1]), dim=1)
    return self.fc(self.hidden)

lm = LM(vocab_size, 8, 64, 2)
lm

In [None]:
class NameDataset(Dataset):
  """Next-token prediction samples built from tokenized names.

  Every name with tokens t0..tk yields k samples: the ids of t0..ti paired
  with the id of t(i+1), for i = 0..k-1.
  """
  def __init__(self, tokenized_names, char_to_id):
    """
    Args:
      tokenized_names: Iterable of token lists.
      char_to_id: Mapping from token to integer id.
    """
    self.tok_names = tokenized_names
    self.char_to_id = char_to_id

    self._samples = []
    for tok_name in self.tok_names:
      prefix_ids = []
      # Pair each token with its successor and grow the prefix one id at a time.
      for tok, following in zip(tok_name, tok_name[1:]):
        prefix_ids.append(self.char_to_id[tok])
        target_id = self.char_to_id[following]
        # Copy the prefix so later appends do not alter stored samples.
        self._samples.append((list(prefix_ids), target_id))

  def __getitem__(self, idx):
    """Return the (prefix ids, next-token id) sample(s) at `idx`."""
    return self._samples[idx]

  def __len__(self):
    """Return the total number of samples."""
    return len(self._samples)

In [None]:
val_names = get_tokenized_names(df_val)
val_ds = NameDataset(val_names, char_to_idx)
val_ds[:len(val_names[0])-1]



In [None]:
for partial, next_tok in val_ds[:len(val_names[0])-1]:
  for tok in partial:
    print(idx_to_char[tok], end=' ')
  print('->', idx_to_char[next_tok])

In [None]:
def collate(batch):
  """Turn a list of (prefix ids, next-token id) samples into batch tensors.

  Returns:
    A pair: the prefixes right-padded with 0 to a common length (batch first),
    and a 1-D tensor holding the next-token ids.
  """
  partials = [torch.tensor(partial) for partial, _ in batch]
  next_toks = [next_tok for _, next_tok in batch]
  padded = pad_sequence(partials, batch_first=True, padding_value=0)
  return padded, torch.tensor(next_toks)

In [None]:
# Without a custom collate_fn the default collation cannot stack prefixes of
# different lengths, so this call is expected to fail. Catch the error and show
# it so the notebook still runs top to bottom.
try:
  next(iter(DataLoader(val_ds, batch_size=2)))
except RuntimeError as err:
  print('Default collation failed:', err)

In [None]:
next(iter(DataLoader(val_ds, batch_size=3, collate_fn=collate, shuffle=True)))

In [None]:
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    NameDataset(get_tokenized_names(df_train), char_to_idx),
    batch_size=256,
    collate_fn=collate,
    shuffle=True
)

# Validation only measures the loss, so the order is kept fixed; Lightning
# warns when a validation loader is shuffled.
val_dataloader = DataLoader(
    NameDataset(get_tokenized_names(df_val), char_to_idx),
    batch_size=256,
    collate_fn=collate,
    shuffle=False
)

In [None]:
class LitModel(pl.LightningModule):
  """Lightning wrapper that trains the wrapped encoder with cross-entropy loss."""
  def __init__(self, encoder):
    super().__init__()
    self.encoder = encoder

  def forward(self, seq):
    """Delegate to the wrapped encoder."""
    return self.encoder(seq)

  def _generic_step(self, batch, batch_idx):
    """Compute the cross-entropy loss for one (inputs, targets) batch."""
    inputs, targets = batch
    return F.cross_entropy(self.encoder(inputs), targets)

  def training_step(self, batch, batch_idx):
    """Return the training loss for the batch."""
    return self._generic_step(batch, batch_idx)

  def validation_step(self, batch, batch_idx):
    """Compute the validation loss and log it as 'val_loss'."""
    val_loss = self._generic_step(batch, batch_idx)
    self.log('val_loss', val_loss, prog_bar=True)
    return val_loss

  def configure_optimizers(self):
    """Adam over the encoder's parameters."""
    return torch.optim.Adam(self.encoder.parameters(), lr=2e-3)

In [None]:
# Fresh language model: 8-dim embeddings, hidden size 64, 2 LSTM layers.
encoder = LM(vocab_size, 8, 64, 2)
lit_model = LitModel(encoder)

# Informational only; the Trainer below picks its accelerator automatically.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using', device)

trainer = pl.Trainer(
    accelerator='auto', # or `device` from above
    max_epochs=100,
    log_every_n_steps=25,
    callbacks=[
        RichProgressBar(refresh_rate=50),
        # Stop once 'val_loss' has not improved for 3 validation runs.
        EarlyStopping(monitor='val_loss', mode='min', patience=3),
    ],
)

trainer.fit(lit_model, train_dataloader, val_dataloader)

Now we can create a deep copy of the encoder and replace its fully connected layer with an `nn.Identity`. As a result, calling `forward()` returns the hidden-state vector (`self.hidden`) instead of the scores over the vocabulary.

In [None]:
import copy

# df_test holds names indexed by their position in df_unique, whose index was
# reset after de-duplication. Those labels do not identify the same rows in df,
# so the held-out rows are selected by name rather than with df.loc[df_test.index].
df_unseen = df[df['name'].isin(df_test)]

# Independent copy of the trained encoder, switched to eval mode, whose final
# linear layer is replaced by a pass-through.
enc = copy.deepcopy(lit_model.encoder).eval()
enc.fc = nn.Identity()
enc

In [None]:
def name2token_ids(name):
  """Encode one name as a (1, length) tensor of token ids.

  The name is lower-cased, split into characters, wrapped in '<sos>' and
  '<eos>', and looked up in the notebook-level `char_to_idx` mapping.
  """
  tokens = ['<sos>', *name.lower(), '<eos>']
  # The outer list supplies the leading batch dimension of size 1.
  return torch.tensor([[char_to_idx[tok] for tok in tokens]])

Notice that the output is now a 128-dimensional vector (2 directions × 64 hidden units) rather than a score for each token in the vocabulary!

In [None]:
enc(name2token_ids('chris')).shape

# Fine-tuning

In [None]:
# Half of df_unseen is used to train the classifier.
df_unseen_train = df_unseen.sample(frac=.5, random_state=0xC0FFEE)
# Shuffle the remaining half, then take 10% of it for validation and keep the
# other 90% as the test set.
df_useen_val_test = df_unseen.drop(df_unseen_train.index).sample(frac=1.0, random_state=0xC0FFEE)
df_unseen_val = df_useen_val_test.sample(frac=0.1, random_state=0xC0FFEE)
df_unseen_test = df_useen_val_test.drop(df_unseen_val.index)

# Sizes: all of df_unseen, then the train / val / test parts.
print(len(df_unseen), len(df_unseen_train), len(df_unseen_val), len(df_unseen_test))

In [None]:
df_unseen_train

In [None]:
df_unseen_train.sex.value_counts()

In [None]:
class NameSexDataset(Dataset):
  """Pairs each name's token ids with a two-element one-hot sex label.

  'boy' is encoded as [0, 1] and 'girl' as [1, 0].
  """
  def __init__(self, df, char_to_idx):
    """
    Args:
      df: Table providing the names (via `get_tokenized_names`) and a 'sex'
        column whose values are 'boy' or 'girl'.
      char_to_idx: Mapping from token to integer id.
    """
    self.char_to_idx = char_to_idx
    one_hot = {'boy': [0, 1], 'girl': [1, 0]}
    tokenized_names = get_tokenized_names(df)
    self._samples = [
        ([char_to_idx[tok] for tok in tok_name], one_hot[sex])
        for tok_name, sex in zip(tokenized_names, df['sex'].values)
    ]

  def __len__(self):
    """Return the number of (ids, label) samples."""
    return len(self._samples)

  def __getitem__(self, idx):
    """Return the (token ids, one-hot label) sample at `idx`."""
    return self._samples[idx]


In [None]:
def collate_name_sex(batch):
  """Turn a list of (token ids, one-hot label) samples into batch tensors.

  Returns:
    A pair: the id sequences right-padded with 0 to a common length (batch
    first), and the labels as a float tensor.
  """
  names = [torch.tensor(name) for name, _ in batch]
  sexes = [sex for _, sex in batch]
  padded_names = pad_sequence(names, batch_first=True, padding_value=0)
  return padded_names, torch.tensor(sexes).float()


In [None]:
class SexPredictor(nn.Module):
  """Two-layer MLP that maps a name vector to two logits."""
  def __init__(self, hidden_dim, input_dim=128):
    """
    Args:
      hidden_dim: Width of the hidden layer.
      input_dim: Width of the incoming name vector. Defaults to 128, the
        value that was previously hard-coded.
    """
    super().__init__()
    self.l1 = nn.Linear(input_dim, hidden_dim)
    self.out = nn.Linear(hidden_dim, 2)

  def forward(self, x):
    """Return the two raw output scores for each row of `x`."""
    x = F.relu(self.l1(x))
    return self.out(x)

In [None]:
# Training batches are reshuffled every epoch.
unseen_train_dataloader = DataLoader(
    NameSexDataset(df_unseen_train, char_to_idx),
    batch_size=32,
    collate_fn=collate_name_sex,
    shuffle=True
)

# Evaluation loaders keep a fixed order; shuffling adds nothing when only
# aggregate metrics are computed, and Lightning warns about shuffled
# validation loaders.
unseen_val_dataloader = DataLoader(
    NameSexDataset(df_unseen_val, char_to_idx),
    batch_size=32,
    collate_fn=collate_name_sex,
    shuffle=False
)

# The whole test set is delivered as a single batch.
unseen_test_dataloader = DataLoader(
    NameSexDataset(df_unseen_test, char_to_idx),
    batch_size=len(df_unseen_test),
    collate_fn=collate_name_sex,
    shuffle=False
)

In [None]:
class LitModelPredictor(pl.LightningModule):
  """Lightning wrapper that trains `model` on the outputs of a frozen encoder.

  The encoder is put in eval mode, its parameters have `requires_grad`
  disabled, and it is always called under `torch.no_grad()`; only `model`'s
  parameters are handed to the optimizer.
  """
  def __init__(self, model, enc):
    super().__init__()
    self.model = model
    self.enc = enc.eval()
    for frozen_param in self.enc.parameters():
      frozen_param.requires_grad = False

  def forward(self, seq):
    """Encode `seq` without gradients, then apply the trainable model."""
    with torch.no_grad():
      name_vec = self.enc(seq)
    return self.model(name_vec)

  def _generic_step(self, batch, batch_idx):
    """Binary cross-entropy (on logits) for one (inputs, labels) batch."""
    inputs, labels = batch
    with torch.no_grad():
      name_vec = self.enc(inputs)
    logits = self.model(name_vec)
    return F.binary_cross_entropy_with_logits(logits, labels)

  def training_step(self, batch, batch_idx):
    """Return the training loss for the batch."""
    return self._generic_step(batch, batch_idx)

  def validation_step(self, batch, batch_idx):
    """Compute the validation loss and log it as 'val_loss'."""
    val_loss = self._generic_step(batch, batch_idx)
    self.log('val_loss', val_loss, prog_bar=True)
    return val_loss

  def configure_optimizers(self):
    """Adam over the trainable model's parameters only."""
    return torch.optim.Adam(self.model.parameters(), lr=1e-3)

In [None]:
# Classifier head with a 64-unit hidden layer, trained on the frozen encoder's output.
predictor = SexPredictor(64)
lit_model_pred = LitModelPredictor(predictor, enc)

trainer = pl.Trainer(
    accelerator='auto',
    max_epochs=100,
    log_every_n_steps=25,
    callbacks=[
        RichProgressBar(refresh_rate=50),
        # Stop once 'val_loss' has not improved for 3 validation runs.
        EarlyStopping(monitor='val_loss', mode='min', patience=3),
    ],
)

trainer.fit(lit_model_pred, unseen_train_dataloader, unseen_val_dataloader)

In [None]:
# Accuracy on the test set (the test loader yields it as one batch).
lit_model_pred = lit_model_pred.eval()
with torch.no_grad():
  X, y = next(iter(unseen_test_dataloader))
  # Put the batch on whatever device the model is on, so this also works if
  # the model is still on an accelerator after training.
  X, y = X.to(lit_model_pred.device), y.to(lit_model_pred.device)
  # Predicted class = position of the larger logit; true class = position of
  # the 1 in the one-hot label.
  out = lit_model_pred(X).argmax(dim=-1)
  acc = (out == y.argmax(dim=-1)).float().mean()
  print('accuracy', acc.item())
lit_model_pred = lit_model_pred.train()

In [None]:
df_unseen_test['sex'].value_counts()

Not bad! We froze the weights from the original encoder and used just the output from it to train a smaller network with ~8.4k weights. We also fine-tuned on a smaller amount of data and still ended up with almost 77% accuracy on the test set! That's pretty good!

# Decoding schemes


In [None]:
def predict_next_letter(encoder, partial, char_to_idx, idx_to_char, n=1):
  """Return the `n` most probable next tokens after `partial`.

  Args:
    encoder: Module mapping a (1, length) tensor of token ids to scores over
      the vocabulary.
    partial: Characters generated so far (may be empty); '<sos>' is prepended.
    char_to_idx: Mapping from token to id.
    idx_to_char: Mapping from id to token.
    n: How many candidates to return.

  Returns:
    (tokens, probs): the top-`n` tokens and a NumPy array with their softmax
    probabilities, both ordered from most to least probable.

  Raises:
    KeyError: If `partial` contains a character missing from `char_to_idx`.
  """
  # Remember the caller's mode so it can be put back afterwards, instead of
  # unconditionally switching the encoder to train mode.
  was_training = encoder.training
  encoder.train(False)
  try:
    with torch.no_grad():
      seq_ids = [char_to_idx[ch] for ch in ['<sos>', *partial]]
      seq_tensor = torch.tensor(seq_ids).unsqueeze(0)
      # Run on the encoder's own device (it may live on an accelerator).
      first_param = next(encoder.parameters(), None)
      if first_param is not None:
        seq_tensor = seq_tensor.to(first_param.device)
      out = F.softmax(encoder(seq_tensor), dim=-1)
      probs, idxs = torch.sort(out, descending=True)
      probs = probs.cpu().numpy().flatten()[:n]
      idxs = idxs.cpu().numpy().flatten()[:n]
      next_char = [idx_to_char[idx] for idx in idxs]
  finally:
    # Restore the mode even if the lookup or the forward pass raised.
    encoder.train(was_training)
  return next_char, probs

In [None]:
predict_next_letter(lit_model.encoder, 'chri', char_to_idx, idx_to_char, n=5)

In [None]:
import math

def greedy_decoder(encoder, max_len, char_to_idx, idx_to_char, partial_name=''):
  """Extend `partial_name` by always taking the single most probable next token.

  Decoding stops when '<eos>' is predicted or when the name reaches `max_len`
  characters, whichever happens first.

  Args:
    encoder: Model handed to `predict_next_letter`.
    max_len: Maximum number of characters in the returned name.
    char_to_idx: Mapping from token to id.
    idx_to_char: Mapping from id to token.
    partial_name: Optional prefix to start from.

  Returns:
    (name, prob): the decoded name and the product of the probabilities of
    every token chosen along the way (including '<eos>' when it was chosen).
  """
  ret = partial_name
  curr_tok = None
  # Sum log-probabilities and exponentiate once at the end.
  total_log_prob = 0.0

  # The length is re-read on every iteration; caching it before the loop would
  # make the max_len limit ineffective and loop forever if '<eos>' never wins.
  while curr_tok != '<eos>' and len(ret) < max_len:
    curr_toks, probs = predict_next_letter(encoder, ret, char_to_idx, idx_to_char)
    curr_tok = curr_toks[0]
    total_log_prob += math.log(probs[0])
    if curr_tok != '<eos>':
      ret += curr_tok
  return ret, math.exp(total_log_prob)

In [None]:
greedy_decoder(lit_model.encoder, 20, char_to_idx, idx_to_char)

In [None]:
greedy_decoder(lit_model.encoder, 20, char_to_idx, idx_to_char, partial_name='b')

In [None]:
greedy_decoder(lit_model.encoder, 20, char_to_idx, idx_to_char, partial_name='c')

In [None]:
greedy_decoder(lit_model.encoder, 20, char_to_idx, idx_to_char, partial_name='chr')

In [None]:
toks, probs = predict_next_letter(lit_model.encoder, 'a', char_to_idx, idx_to_char, n=len(idx_to_char))
nucleus = 0.9
# Walk down the sorted probabilities until their running total reaches the
# nucleus mass; `i` ends up as the index of the last token needed.
curr = 0
for i, p in enumerate(probs):
    curr += p
    if curr >= nucleus:
        break

fig, ax = plt.subplots(1, 1, figsize=(6,6))
ax.plot(probs)
ax.set_xlabel('next token (most to least probable)')
ax.set_ylabel('probability')
plt.title(f'Most probable from "a" with nucleus={nucleus}')
plt.xticks(list(range(len(probs))), toks, rotation=70)
plt.axvline(x=i + 1, color='r')
plt.show()

print(np.sum(probs[:i + 1]))
print('tokens needed', toks[:i + 1])
# The reported threshold is taken from `nucleus` so the message stays correct
# if the value above is changed.
print(f'Total tokens: {i+1}, % toks needed for {nucleus:.0%} nucleus: ' +\
  f'{(i+1)/len(idx_to_char)*100:,.02f}%')

In [None]:
def nucleus_decoder(encoder, max_len, idx_to_char, char_to_idx, partial='', nucleus=.9):
  """Generate a name with nucleus (top-p) sampling.

  At every step the most probable tokens are kept until their cumulative
  probability reaches `nucleus`; the next token is sampled from that set with
  the probabilities renormalized. At least one token is always sampled.
  Decoding stops when '<eos>' is sampled or once the name has at least
  `max_len - 1` characters.

  Args:
    encoder: Model handed to `predict_next_letter`.
    max_len: Length limit (see the stopping rule above).
    idx_to_char: Mapping from id to token.
    char_to_idx: Mapping from token to id.
    partial: Optional prefix to start from.
    nucleus: Cumulative probability mass to keep at each step.

  Returns:
    (name, prob): the generated name and the product of the original (not
    renormalized) probabilities of every sampled token.
  """
  partial_seq = partial
  ret_prob = 0
  while True:
    toks, probs = predict_next_letter(encoder, partial_seq, char_to_idx, idx_to_char, len(idx_to_char))
    curr = 0
    for i, p in enumerate(probs):
      curr += p
      if curr >= nucleus:
        break

    candidates = toks[: i + 1]
    candidate_probs = probs[: i + 1]
    # Re-distribute the probs
    new_probs = candidate_probs / sum(candidate_probs)

    # Sample a position instead of a token so no reverse lookup is needed.
    idx = np.random.choice(len(candidates), p=new_probs)
    next_char = candidates[idx]
    ret_prob += math.log(candidate_probs[idx])
    if next_char == '<eos>':
      break
    partial_seq += next_char

    # `>=` rather than `==`: a prefix that is already longer than the limit
    # must still terminate the loop.
    if len(partial_seq) >= (max_len - 1):
      break

  return partial_seq, math.exp(ret_prob)


In [None]:
np.random.seed(0)
# Build the lookup once; a set makes each membership test O(1) instead of a
# scan over the train and validation arrays on every iteration.
seen_names = set(df_train.values) | set(df_val.values)
for _ in range(10):
  name, prob = nucleus_decoder(lit_model.encoder, 20, idx_to_char, char_to_idx)
  print(name, prob)
  name = name.capitalize()
  # "Original" means the generated name is in neither the train nor the val split.
  original = name not in seen_names
  # The literal already supplies the space before "original", so the
  # placeholder contributes either nothing or " not".
  print(f'Name is{"" if original else " not"} original')