In [None]:
! pip install names-dataset

In [None]:
from names_dataset import NameDataset, NameWrapper
import numpy as np
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
nd = NameDataset()

# Top names for country code 'AT'; the 'M' and 'F' entries are concatenated
# into one flat list.
top_names_at = nd.get_top_names(country_alpha2='AT')['AT']
names = top_names_at['M'] + top_names_at['F']

# Sorted set of lower-cased characters occurring in any name, and the
# character -> integer index mapping derived from that order.
letters = sorted(set(''.join(names).lower()))
lookup = {letter: position for position, letter in enumerate(letters)}

In [None]:
class NamesDs(Dataset):
    """Character-level dataset turning names into fixed-length index tensors.

    Every name is lower-cased and encoded as
    ``[bos, c_1, ..., c_n, eos, pad, pad, ...]`` with total length ``seq_len``.

    Args:
        names: list of name strings.
        seq_len: length of every encoded sequence (including bos/eos/padding).
        lookup: optional existing ``token -> index`` mapping (characters plus
            the ``'bos'``, ``'eos'`` and ``'pad'`` entries). When ``None`` the
            mapping is built from the characters found in ``names``. When
            given, the very same dict object is stored (not copied).
    """

    def __init__(self, names: list, seq_len: int, lookup: dict = None):
        self.seq_len = seq_len
        self.names = names
        self.start_token = 'bos'
        self.end_token = 'eos'
        self.padding_token = 'pad'

        if lookup is None:
            letters = sorted(set(''.join(names).lower()))
            lookup = {letter: i for i, letter in enumerate(letters)}
            # Special tokens take the three indices right after the letters.
            lookup[self.start_token] = len(letters)
            lookup[self.end_token] = len(letters) + 1
            lookup[self.padding_token] = len(letters) + 2
        self.lookup = lookup

        # Build the reverse mapping unconditionally: previously it was only
        # created when no lookup was passed in, so datasets sharing a lookup
        # crashed with AttributeError in idx2name().
        self.idx2letter = self._idx2letter()

    def _idx2letter(self):
        """Return the inverse of ``self.lookup`` (index -> token)."""
        return {index: token for token, index in self.lookup.items()}

    def num_letters(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.lookup)

    def get_start_token(self):
        """Return the index of the begin-of-sequence token."""
        return self.lookup[self.start_token]

    def get_end_token(self):
        """Return the index of the end-of-sequence token."""
        return self.lookup[self.end_token]

    def get_padding_token(self):
        """Return the index of the padding token."""
        return self.lookup[self.padding_token]

    def __len__(self):
        return len(self.names)

    def idx2name(self, idx):
        """Decode a sequence of token indices back into a string.

        Args:
            idx: iterable of indices; elements may be plain ints or
                one-element tensors (e.g. a 1-D tensor or a list of
                shape-``(1,)`` tensors).

        Returns:
            The decoded characters up to (excluding) the first end token.
            Start and padding tokens are skipped.

        Raises:
            KeyError: if an index is not part of the vocabulary.
        """
        start = self.get_start_token()
        end = self.get_end_token()
        pad = self.get_padding_token()
        ret = []
        for i in idx:
            i = int(i)  # accepts ints as well as one-element tensors
            if i == end:
                break
            if i == start or i == pad:
                continue
            ret.append(self.idx2letter[i])
        return ''.join(ret)

    def name2idx(self, name):
        """Encode ``name`` as a long tensor of length ``seq_len``.

        Raises:
            KeyError: if ``name`` contains a character missing from the lookup.
            AssertionError: if the name plus bos/eos does not fit in ``seq_len``.
        """
        ret = [self.lookup[self.start_token]]
        ret += [self.lookup[c] for c in name]
        ret.append(self.lookup[self.end_token])
        assert self.seq_len >= len(ret), 'sequnce length exceeds maximal sequence length'
        # Right-pad up to the fixed sequence length.
        ret = ret + [self.lookup[self.padding_token]] * (self.seq_len - len(ret))
        return torch.tensor(ret).long()

    def __getitem__(self, idx):
        """Return the encoded (lower-cased) name at position ``idx``."""
        return self.name2idx(self.names[idx].lower())

In [None]:
class NameGenerator(nn.Module):
    """Character-level LSTM language model for generating names.

    Architecture: embedding -> single-layer LSTM (batch first) -> linear
    projection onto the vocabulary.

    Args:
        embedding_dim: size of the character embeddings.
        hidden_dim: size of the LSTM hidden and cell state.
        num_letters: vocabulary size (letters plus special tokens).
        seq_len: maximal sequence length; stored for reference only, the
            network itself accepts sequences of any length.
    """

    def __init__(self, embedding_dim, hidden_dim, num_letters, seq_len):
        super(NameGenerator, self).__init__()
        self.num_letters = num_letters
        self.seq_len = seq_len
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(num_letters, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.out_layer = nn.Linear(hidden_dim, num_letters)

    def get_init_hc(self, batch_size):
        """Return zero-initialised ``(h, c)``, each ``(1, batch_size, hidden_dim)``.

        The states are created on the same device/dtype as the model weights.
        """
        ref = self.out_layer.weight
        h = torch.zeros(1, batch_size, self.hidden_dim, device=ref.device, dtype=ref.dtype)
        c = torch.zeros_like(h)
        return h,c

    def forward(self, x):
        """Compute next-token logits for every position.

        Args:
            x: long tensor of token indices, shape ``(batch, time)``.

        Returns:
            Unnormalised logits of shape ``(batch, time, num_letters)``.
        """
        hc = self.get_init_hc(x.shape[0])
        embedded = self.word_embeddings(x)
        hidden, _ = self.lstm(embedded, hc)
        out = self.out_layer(hidden)
        return out

    def generate(self, bos_idx, eos_idx, read_idx=None, max_iter=10, from_top_k=1):
        """Sample a token sequence from the model.

        Args:
            bos_idx: index of the begin-of-sequence token.
            eos_idx: index of the end-of-sequence token; generation stops
                right after it has been produced.
            read_idx: optional prefix of shape ``(1, T)`` starting with
                ``bos_idx``; the model is primed on it before sampling.
            max_iter: maximal number of newly generated tokens.
            from_top_k: each new token is drawn from the ``from_top_k`` most
                likely candidates, weighted by their renormalised softmax
                probabilities (``1`` means greedy decoding).

        Returns:
            List of long tensors of shape ``(1,)`` on the CPU: the prefix
            (at least the bos token) followed by the generated tokens.
        """
        device = self.out_layer.weight.device
        if read_idx is None:
            prefix = torch.full((1, 1), bos_idx, dtype=torch.long, device=device)
        else:
            assert read_idx[0,0] == bos_idx, f'invalid beginning of sequence! Got {read_idx[0]} but expected {bos_idx}'
            prefix = read_idx.long().to(device)

        seq = [tok.reshape(1).cpu() for tok in prefix[0]]
        # Clamp k into [1, vocabulary size] so topk never fails.
        k = max(1, min(int(from_top_k), self.num_letters))

        with torch.no_grad():
            (h,c) = self.get_init_hc(1)
            # Run the whole prefix through the LSTM to prime the state.
            hidden, (h, c) = self.lstm(self.word_embeddings(prefix), (h, c))
            for i in range(max_iter):
                logits = self.out_layer(hidden[:, -1, :])             # (1, vocab)
                top_logits, top_idx = torch.topk(logits, k, dim=-1)   # (1, k)
                probs = torch.softmax(top_logits, dim=-1)
                choice = torch.multinomial(probs, 1)                  # (1, 1)
                next_tok = top_idx.gather(-1, choice).reshape(1)
                seq.append(next_tok.cpu())
                if next_tok.item() == eos_idx:
                    break
                # Feed the sampled token back in, carrying the state along.
                hidden, (h, c) = self.lstm(self.word_embeddings(next_tok.reshape(1, 1)), (h, c))
        return seq

In [None]:
def split_data(names, p_train):
    """Randomly partition ``names`` into a training and a validation list.

    Args:
        names: sequence of items to split.
        p_train: fraction of the items assigned to the training part; the
            training size is ``int(len(names) * p_train)``.

    Returns:
        Tuple ``(train_data, val_data)`` of lists in shuffled order.
    """
    total = len(names)
    cut = int(total * p_train)
    # A full-length sample without replacement is a random permutation.
    order = random.sample(range(total), total)
    shuffled = [names[pos] for pos in order]
    return shuffled[:cut], shuffled[cut:]

In [None]:
seq_len = 50

# Random 90/10 partition of the names.
train_split, val_split = split_data(names, 0.9)

# The validation set reuses the training lookup so both splits share the
# same token -> index mapping.
train_data = NamesDs(names=train_split, seq_len=seq_len)
val_data = NamesDs(names=val_split, seq_len=seq_len, lookup=train_data.lookup)

# Only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_data, batch_size=4, shuffle=True, num_workers=2)
val_loader = DataLoader(dataset=val_data, batch_size=4, shuffle=False, num_workers=2)

In [None]:
def train(epoch, dataloader, model, loss_func, optimizer):
    """Run one training epoch of next-token prediction.

    Each batch of token indices ``(batch, time)`` is split into inputs
    (all but the last position) and targets (all but the first position).

    Args:
        epoch: current epoch number (unused, kept for the caller's interface).
        dataloader: iterable yielding long tensors of shape ``(batch, time)``.
        model: module mapping ``(batch, time)`` indices to logits whose last
            dimension is the vocabulary, e.g. ``(batch, time, vocab)``.
        loss_func: criterion taking ``(N, vocab)`` logits and ``(N,)`` targets.
        optimizer: optimizer updating the parameters of ``model``.

    Returns:
        Mean of the per-batch losses as a float, or ``nan`` when the
        dataloader yields no batches.
    """
    model.train()
    first_param = next(model.parameters(), None)
    loss_ls = []
    for i, batch in enumerate(dataloader):
        if first_param is not None:
            batch = batch.to(first_param.device)
        # Teacher forcing: predict token t+1 from tokens up to t.
        inputs, targets = batch[:, :-1], batch[:, 1:]

        optimizer.zero_grad()
        logits = model(inputs)
        loss = loss_func(logits.reshape(-1, logits.shape[-1]), targets.reshape(-1))
        loss.backward()
        optimizer.step()

        loss_ls.append(loss.item())

    if not loss_ls:
        return float('nan')
    return sum(loss_ls) / len(loss_ls)

In [None]:
def val(epoch, dataloader, model, loss_func):
    """Evaluate next-token prediction loss without updating the model.

    Each batch of token indices ``(batch, time)`` is split into inputs
    (all but the last position) and targets (all but the first position).

    Args:
        epoch: current epoch number (unused, kept for the caller's interface).
        dataloader: iterable yielding long tensors of shape ``(batch, time)``.
        model: module mapping ``(batch, time)`` indices to logits whose last
            dimension is the vocabulary, e.g. ``(batch, time, vocab)``.
        loss_func: criterion taking ``(N, vocab)`` logits and ``(N,)`` targets.

    Returns:
        Average of the batch losses weighted by the number of samples per
        batch, or ``nan`` when the dataloader yields no samples.
    """
    model.eval()
    first_param = next(model.parameters(), None)

    n_samples = 0
    loss_sum = 0
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            if first_param is not None:
                batch = batch.to(first_param.device)
            inputs, targets = batch[:, :-1], batch[:, 1:]
            logits = model(inputs)
            loss = loss_func(logits.reshape(-1, logits.shape[-1]), targets.reshape(-1))
            # Weight by batch size so a smaller final batch is not over-counted.
            batch_size = batch.shape[0]
            loss_sum += loss.item() * batch_size
            n_samples += batch_size

    if n_samples == 0:
        return float('nan')
    avg_loss = loss_sum/n_samples
    return avg_loss

In [None]:
def main(n_epochs=20):
    """Train a ``NameGenerator`` on the module-level ``names`` list.

    Args:
        n_epochs: number of passes over the training split.

    Returns:
        The trained model.
    """
    seq_len = 50

    # create split
    train_split, val_split = split_data(names, 0.9)

    # One vocabulary for both splits. Letting the validation set derive its
    # own lookup (as before) could assign different indices to the same
    # character -- including the padding index ignored by the loss -- and
    # could exceed the embedding size. Building it from all names also
    # avoids KeyErrors for characters that only occur in one split.
    vocab = NamesDs(names, seq_len).lookup

    # create datasets
    train_data = NamesDs(train_split, seq_len, lookup=vocab)
    val_data = NamesDs(val_split, seq_len, lookup=vocab)

    # create loader
    train_loader = DataLoader(train_data, batch_size=4, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_data, batch_size=4, shuffle=False, num_workers=2)

    # create loss; padded positions do not contribute
    loss_fun = nn.CrossEntropyLoss(ignore_index=train_data.get_padding_token())

    # instantiate model
    model = NameGenerator(32, 16, train_data.num_letters(), seq_len)

    # optimizer
    optim = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.1)

    for epoch in range(n_epochs):
        t_loss = train(epoch, train_loader, model, loss_fun, optim)
        v_loss = val(epoch, val_loader, model, loss_fun)
        print(f"epoch: [{epoch}/{n_epochs}]: train_loss = {t_loss:.5f} | val_loss = {v_loss:.5f}")
    return model

In [None]:
# Train for 40 epochs; the returned model is kept at module level so the
# following cell can sample from it.
model = main(n_epochs=40)

In [None]:
# Take the special-token indices from the dataset instead of hard-coding
# 26/27: those values are only right for an alphabet of exactly 26 characters.
# NOTE: `main` builds its own datasets; decoding with this cell's `train_data`
# is only valid if both share the same character vocabulary.
out = model.generate(
    bos_idx=train_data.get_start_token(),
    eos_idx=train_data.get_end_token(),
    from_top_k=2,
)
train_data.idx2name(out)