In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Data analysis

In [5]:
male_dataset = pd.read_csv('data/pop_vardai_vyrai.txt',delimiter=' ',encoding="UTF-16")
female_dataset = pd.read_csv('data/pop_vardai_moterys.txt',delimiter=' ',encoding="UTF-16")

In [6]:
print(male_dataset)

            name  popularity
0           Abas           0
1        Abdijus           0
2        Abdonas           0
3         Abdula           0
4         Abelis           2
...          ...         ...
3845  Žigimantas          25
3846  Žigymantas           0
3847    Žigintas           0
3848    Žilvynas           0
3849    Žimantas           0

[3850 rows x 2 columns]


In [7]:
print(female_dataset.shape)
print(male_dataset.shape)

(4235, 2)
(3850, 2)


We have around 8085 names total. How many of those are 0? How many are >100?

In [None]:
print(f"Number of male names with 0 popularity: {(male_dataset['popularity']==0).sum()}")
print(f"Number of male names with >=100 popularity: {(male_dataset['popularity']>=100).sum()}")
print(f"Max popularity for male name: {(male_dataset['popularity']).max()}")
print(f"Average popularity for male name: {(male_dataset['popularity']).mean().round(2)}\n")
print(f"Number of female names with 0 popularity: {(female_dataset['popularity']==0).sum()}")
print(f"Number of female names with >=100 popularity: {(female_dataset['popularity']>=100).sum()}")
print(f"Max popularity for female name: {(female_dataset['popularity']).max()}")
print(f"Average popularity for female name: {(female_dataset['popularity']).mean().round(2)}")



Number of male names with 0 popularity: 1391
Number of male names with >=100 popularity: 404
Max popularity for male name: 17236

Average popularity for male name: 141.73

Number of female names with 0 popularity: 1583
Number of female names with >=100 popularity: 499
Average popularity for female name: 164.3


There are a lot of words with 0 popularity in both datasets. These names are either improper, foreign or otherwise unused in the modern naming scheme.

For the sake of a more accurate model, we will be omiting names with 0 popularity.

Additionally, we will try to see what kind of model we can get if we take the n most popular names as the training data.

# Namesformer model

In [66]:
class NameDataset(Dataset):
    def __init__(self, csv_file):
        self.names = pd.read_csv(csv_file)['name'].values
        self.chars = sorted(list(set(''.join(self.names) + ' ')))  # Including a padding character
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.vocab_size = len(self.chars)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx] + ' '  # Adding padding character at the end
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name)

In [73]:
dataset = NameDataset('data/english_names.txt')
[dataset.int_to_char[num] for num in dataset[0].numpy()]

['e', 'm', 'm', 'a', ' ']

In [52]:
def pad_collate(batch):
    padded_seqs = pad_sequence(batch, batch_first=True, padding_value=0)
    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]
    return input_seq, target_seq

dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)

In [53]:
class MinimalTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        positions = torch.arange(0, x.size(1)).unsqueeze(0)
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.output_layer(x)
        return x

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cpu


In [75]:
model = model.to(device)

In [54]:
def train_model(model, dataloader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train()  # Ensure the model is in training mode
        total_loss = 0.0
        batch_count = 0

        for batch_idx, (input_seq, target_seq) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(input_seq)
            loss = criterion(output.transpose(1, 2), target_seq)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            batch_count += 1

        average_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')

In [55]:

model = MinimalTransformer(vocab_size=dataset.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
train_model(model, dataloader)



Epoch 1, Average Loss: 1.4406895644650488
Epoch 2, Average Loss: 1.4062788443651029
Epoch 3, Average Loss: 1.398684309687681
Epoch 4, Average Loss: 1.401524607173935
Epoch 5, Average Loss: 1.3975820101187852
Epoch 6, Average Loss: 1.3991666962643583
Epoch 7, Average Loss: 1.3947380464709924
Epoch 8, Average Loss: 1.3956587194444652
Epoch 9, Average Loss: 1.3965631889963817
Epoch 10, Average Loss: 1.3933474934267664


# Sampling

In [58]:
def sample(model, dataset, start_str='a', max_length=20, temperature=1.0):
    assert temperature > 0, "Temperature must be greater than 0"
    model.eval()  # Switch model to evaluation mode
    with torch.no_grad():
        # Convert start string to tensor
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)  # Add batch dimension
        
        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq)
            
            # Apply temperature scaling
            logits = output[0, -1] / temperature
            probabilities = torch.softmax(logits, dim=0)
            
            # Sample a character from the probability distribution
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]
            
            if next_char == ' ':  # Assume ' ' is your end-of-sequence character
                break
            
            output_name += next_char
            # Update the input sequence for the next iteration
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)
        
        return output_name

In [60]:
print('More confident:')
for _ in range(10):
    print(' ', sample(model, dataset, start_str='r', temperature=0.5))  # More confident

print('\nMore diverse/creative:')
for _ in range(10):
    print(' ', sample(model, dataset, start_str='r', temperature=1.5))  # More diverse

More confident:
  raia
  rasly
  raris
  rion
  reniely
  rari
  riaynin
  rene
  relann
  reva

More diverse/creative:
  rucadom
  rijnan
  ro
  rampchia
  rihey
  royciy
  ronlyawn
  rnsud
  rihiepph
  rahr


In [None]:
torch.save(model, 'models/namesformer_model.pt')