This notebook trains a minimal Transformer-based model for generating Lithuanian names based on gender.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import requests
from bs4 import BeautifulSoup

### Scrape Lithuanian Names from VLKK Website

In [None]:
# Collect one {'name', 'gender'} record per link found on the VLKK name-list
# pages, then persist everything to 'lithuanian_names.csv'.
names = []

# (gender label, exact CSS class string of the matching <a> elements).
# Male links are collected before female links on every page.
link_classes = [
    ('male', 'names_list__links names_list__links--man'),
    ('female', 'names_list__links names_list__links--woman'),
]

for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url = f'https://vardai.vlkk.lt/sarasas/{key}/'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for gender, css_class in link_classes:
        for link in soup.find_all('a', class_=css_class):
            # Strip surrounding whitespace: the model uses ' ' as its
            # end-of-name token, so stray spaces/newlines must not leak
            # into the training names.
            name = link.get_text(strip=True)
            if name:
                names.append({'name': name, 'gender': gender})

# Do not overwrite the CSV with an empty, column-less file if the page
# layout changed and nothing matched.
if not names:
    raise RuntimeError('No names were scraped; the page structure may have changed.')

df = pd.DataFrame(names)
df.to_csv('lithuanian_names.csv', index=False)

### Dataset Class and DataLoader

In [None]:
class NameDataset(Dataset):
    """Character-level dataset of names with a gender label.

    Reads a CSV with ``name`` and ``gender`` columns. The vocabulary is every
    character occurring in the names plus a single space; each item has a
    space appended to the name before it is encoded.
    """

    def __init__(self, csv_file):
        """Load ``csv_file`` and build the character and gender lookup tables."""
        frame = pd.read_csv(csv_file)
        self.names = frame['name'].values
        self.genders = frame['gender'].values

        # Sorted so that the character -> index mapping is deterministic.
        alphabet = set(''.join(self.names)) | {' '}
        self.chars = sorted(alphabet)
        self.char_to_int = dict(zip(self.chars, range(len(self.chars))))
        self.int_to_char = dict(enumerate(self.chars))
        self.vocab_size = len(self.chars)

        self.gender_to_int = {'male': 0, 'female': 1}
        self.int_to_gender = {0: 'male', 1: 'female'}

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return ``(encoded_name, encoded_gender)`` tensors for row ``idx``.

        The encoded name includes the index of the trailing space that is
        appended to every name.
        """
        text = self.names[idx] + ' '
        lookup = self.char_to_int
        char_ids = torch.tensor([lookup[ch] for ch in text])
        gender_id = torch.tensor(self.gender_to_int[self.genders[idx]])
        return char_ids, gender_id

def pad_collate(batch):
    """Collate ``(name_tensor, gender_tensor)`` pairs into a padded batch.

    Each name is split into a next-character prediction pair: the input is
    the name without its last token and the target is the name without its
    first token.

    Args:
        batch: Iterable of ``(name, gender)`` tensor pairs, where ``name`` is
            a 1-D tensor of character indices.

    Returns:
        Tuple ``(input_seq, target_seq, genders)``:
        ``input_seq`` is right-padded with 0, ``target_seq`` is right-padded
        with -100, and ``genders`` is the stacked gender tensors.
    """
    names, genders = zip(*batch)
    input_seq = pad_sequence([name[:-1] for name in names],
                             batch_first=True, padding_value=0)
    # 0 is a real vocabulary index, so padding targets with it would train
    # the model on (and average the loss over) padded positions. -100 is the
    # default ignore_index of nn.CrossEntropyLoss, so those positions are
    # skipped by the loss without any change to the training code.
    target_seq = pad_sequence([name[1:] for name in names],
                              batch_first=True, padding_value=-100)
    genders = torch.stack(genders)
    return input_seq, target_seq, genders

# Load the names CSV and batch it: 32 names per batch, reshuffled each
# epoch, with pad_collate turning each batch into padded tensors.
dataset = NameDataset(csv_file='lithuanian_names.csv')
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=pad_collate,
)

### Define Minimal Transformer Model

In [None]:
class MinimalTransformer(nn.Module):
    """Single-layer, gender-conditioned, causal character-level Transformer.

    Every position's input is the sum of a character embedding, a learned
    positional embedding and a gender embedding. A causal mask lets each
    position attend only to itself and earlier positions, so the logits at
    position ``t`` can be used to predict the character at ``t + 1``.

    Args:
        vocab_size: Number of distinct character indices.
        embed_size: Embedding / model dimension.
        num_heads: Number of attention heads.
        forward_expansion: Feed-forward width as a multiple of ``embed_size``.
        gender_size: Number of distinct gender indices.
    """

    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion, gender_size):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gender_embed = nn.Embedding(gender_size, embed_size)
        # Learned positional embeddings; caps the usable sequence length at 100.
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            # forward_expansion was previously accepted but never used.
            dim_feedforward=forward_expansion * embed_size,
            # forward() feeds (batch, seq, embed). With the default
            # batch_first=False the layer would attend across the batch
            # dimension, mixing different names instead of the characters
            # of one name.
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x, gender):
        """Compute next-character logits.

        Args:
            x: Integer tensor of character indices, shape ``(batch, seq)``.
            gender: Integer tensor of gender indices, shape ``(batch,)``.

        Returns:
            Float tensor of logits, shape ``(batch, seq, vocab_size)``.

        Raises:
            ValueError: If ``seq`` exceeds the number of learned positions.
        """
        seq_len = x.size(1)
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )

        # (batch, 1, embed): broadcasts over the sequence dimension.
        gender_emb = self.gender_embed(gender).unsqueeze(1)
        x = self.embed(x) + self.positional_encoding[:, :seq_len, :] + gender_emb

        # -inf strictly above the diagonal: position i may not attend to any
        # position j > i, so training cannot peek at the characters it is
        # asked to predict.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        x = self.transformer_encoder(x, mask=causal_mask)
        return self.output_layer(x)

### Training Loop

In [None]:
def train_model(model, dataloader, epochs=200):
    """Train ``model`` with Adam and cross-entropy, printing the mean loss per epoch.

    Args:
        model: Module called as ``model(input_seq, gender)`` that returns
            per-position class scores.
        dataloader: Iterable of ``(input_seq, target_seq, gender)`` batches;
            must support ``len()``.
        epochs: Number of full passes over ``dataloader``.
    """
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    for epoch_idx in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets, genders in dataloader:
            optimizer.zero_grad()
            logits = model(inputs, genders)
            # CrossEntropyLoss expects the class dimension at index 1, so
            # swap the last two axes of the model output.
            batch_loss = criterion(logits.permute(0, 2, 1), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        mean_loss = running_loss / len(dataloader)
        print(f"Epoch {epoch_idx + 1}, Average Loss: {mean_loss}")

def sample(model, dataset, start_str='A', gender='male', max_length=20, temperature=1.0):
    """Generate one name by sampling characters autoregressively.

    Args:
        model: Module called as ``model(input_seq, gender)`` that returns
            logits of shape ``(batch, seq, vocab)``.
        dataset: Object providing ``char_to_int``, ``int_to_char`` and
            ``gender_to_int`` lookup dicts.
        start_str: Non-empty prefix the generated name starts with.
        gender: Key into ``dataset.gender_to_int``.
        max_length: Maximum number of characters sampled after the prefix.
        temperature: Positive softmax temperature. Values below 1 make the
            sampling more conservative, values above 1 more varied; 1.0
            samples from the unmodified model distribution.

    Returns:
        ``start_str`` followed by the sampled characters. Generation stops
        early when a space (the end-of-name marker) is drawn; the space is
        not included in the result.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
        KeyError: If ``start_str`` contains a character outside the
            vocabulary or ``gender`` is unknown.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Build tensors on the model's device so sampling also works when the
    # model has been moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    with torch.no_grad():
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars, device=device).unsqueeze(0)
        gender_tensor = torch.tensor([dataset.gender_to_int[gender]], device=device)
        result = start_str
        for _ in range(max_length):
            # Only the logits at the last position predict the next character.
            logits = model(input_seq, gender_tensor)[:, -1, :]
            probs = torch.softmax(logits / temperature, dim=-1)
            next_char_idx = torch.multinomial(probs, num_samples=1).item()
            next_char = dataset.int_to_char[next_char_idx]
            if next_char == ' ':
                break
            result += next_char
            next_token = torch.tensor([[next_char_idx]], device=device)
            input_seq = torch.cat([input_seq, next_token], dim=1)
        return result

# Build the model sized to the dataset's vocabulary and train it for 200 epochs.
model = MinimalTransformer(
    vocab_size=dataset.vocab_size,
    embed_size=32,
    num_heads=2,
    forward_expansion=4,
    gender_size=2,
)
train_model(model, dataloader, epochs=200)

### Sample Generated Names

In [None]:
# Draw one name per gender, each seeded with a fixed first letter.
male_name = sample(model, dataset, start_str='T', gender='male')
female_name = sample(model, dataset, start_str='I', gender='female')
print(
    f'Generated Male Name: {male_name}',
    f'Generated Female Name: {female_name}',
    sep='\n',
)

Generated Male Name: Tomas
Generated Female Name: Ieva

In [None]:
# Continue training the same model instance for another 200 epochs.
train_model(model=model, dataloader=dataloader, epochs=200)

Epoch 1, Average Loss: 1.4913895290359678
Epoch 2, Average Loss: 1.3163452709145225
Epoch 3, Average Loss: 1.3060753943891865
...
Epoch 200, Average Loss: 1.2050035916298274

Conservative male names:
Ralinijus
Rengas
Ralvas
Rìris
Raũgijus

Creative female names:
Salgicira
Sássė
Slávkò
Smėlonia
Stanensara