In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import unicodedata
import requests
from bs4 import BeautifulSoup
import random

In [None]:
# Scrape the male given names (links carrying the "--man" CSS class) from the
# VLKK name registry, one index page per letter key, and store them in a
# one-column CSV file with a "name" header.
names = []
for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url = f'https://vardai.vlkk.lt/sarasas/{key}/'
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', class_='names_list__links names_list__links--man')
    # Strip surrounding whitespace: the dataset uses ' ' as its end-of-name
    # symbol, so stray spaces inside the data would be mistaken for it.
    link_texts = (link.get_text(strip=True) for link in links)
    names += [text for text in link_texts if text]

# Write UTF-8 explicitly: the names contain non-ASCII letters and the file is
# later read back by pandas, which expects UTF-8 by default.
np.savetxt('vardai.txt', names, fmt='%s', header='name', comments='', newline='\n',
           encoding='utf-8')

Panaikinau kirčius, kad nesigautų tokių vardų kaip „Siontr̃rovyìs“.
Taip pat dėl kodo efektyvumo panaikinau didžiąsias raides.

In [None]:
class NameDataset(Dataset):
    """Character-level dataset of given names read from a CSV file.

    The file must have a ``name`` column. Every name is lower-cased and its
    stress marks are removed; each item is the name followed by a single
    ``' '`` (the end-of-name / padding symbol), encoded as a 1-D integer
    tensor of vocabulary indices.

    Attributes:
        names: list of preprocessed name strings.
        chars: sorted list of vocabulary characters.
        char_to_int / int_to_char: vocabulary lookup tables.
        vocab_size: number of characters in the vocabulary.
    """

    # Combining marks used only to show stress: grave, acute and tilde.
    # Other combining marks (caron, ogonek, macron, dot above) are part of
    # ordinary Lithuanian letters such as č, ą, ū, ė and must be kept.
    STRESS_MARKS = frozenset('\u0300\u0301\u0303')

    def __init__(self, csv_file):
        # keep_default_na=False stops pandas from turning a literal name such
        # as "Nan" into a float NaN, which would crash the normalisation.
        raw_names = pd.read_csv(csv_file, keep_default_na=False)['name'].values
        self.names = self._preprocess_names(raw_names)

        # Build the vocabulary: every character seen in the data, the
        # Lithuanian diacritic letters (so they are always encodable), and
        # the ' ' end-of-name / padding symbol.
        lithuanian_letters = "ąčęėįšųūž"
        self.chars = sorted(set(''.join(self.names) + lithuanian_letters + ' '))
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.vocab_size = len(self.chars)

    def _preprocess_names(self, names):
        """Strip stress marks from each name and lower-case it.

        Each name is decomposed (NFD), the stress marks listed in
        ``STRESS_MARKS`` are dropped, and the result is recomposed (NFC) so
        that letters such as ``š`` or ``ū`` stay single characters.

        Args:
            names: iterable of name strings.

        Returns:
            List of cleaned, lower-cased name strings in the same order.
        """
        cleaned = []
        for name in names:
            decomposed = unicodedata.normalize('NFD', str(name))
            without_stress = ''.join(
                c for c in decomposed if c not in self.STRESS_MARKS
            )
            cleaned.append(unicodedata.normalize('NFC', without_stress).lower())
        return cleaned

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return name ``idx`` plus a trailing ' ' as a tensor of indices."""
        # The trailing ' ' marks the end of the name for the model.
        name = self.names[idx] + ' '
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name)

In [None]:
dataset = NameDataset('vardai.txt')

# Index of the ' ' symbol, which serves both as end-of-name marker and as
# padding. Looking it up (instead of hard-coding 0) keeps padding correct even
# if some character ever sorts before ' ' in the vocabulary.
PAD_IDX = dataset.char_to_int[' ']


def pad_collate(batch):
    """Pad a batch of encoded names and split it into inputs and targets.

    Args:
        batch: list of 1-D integer tensors, as produced by ``NameDataset``.

    Returns:
        ``(input_seq, target_seq)``: the padded batch without its last column
        and the same batch shifted one step to the left, so that
        ``target_seq[:, t]`` is the character following ``input_seq[:, t]``.
    """
    padded_seqs = pad_sequence(batch, batch_first=True, padding_value=PAD_IDX)
    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]
    return input_seq, target_seq


dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)

Epochų skaičių palikau ties 50, nes su didesniu skaičiumi vardai tampa prasti.

In [None]:
class MinimalTransformer(nn.Module):
    """Single-layer causal Transformer for next-character prediction.

    Args:
        vocab_size: number of distinct characters (size of input and output).
        embed_size: embedding / model dimension.
        num_heads: number of attention heads (must divide ``embed_size``).
        forward_expansion: the feed-forward hidden size is
            ``forward_expansion * embed_size``.
        max_len: longest sequence the learned positional table supports.

    Input is a ``(batch, seq)`` tensor of character indices; output is a
    ``(batch, seq, vocab_size)`` tensor of logits, where position ``t`` only
    depends on input positions ``<= t``.
    """

    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion, max_len=100):
        super().__init__()
        self.max_len = max_len
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # batch_first=True is essential: the data arrives as (batch, seq, embed).
        # With the default layout the layer would attend across the *batch*
        # axis, mixing unrelated names and never looking along a name.
        # The layer is a local: TransformerEncoder stores its own copy, so
        # registering the template as well would only add unused parameters.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=forward_expansion * embed_size,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return next-character logits for every position of ``x``.

        Raises:
            ValueError: if the sequence is longer than ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f'sequence length {seq_len} exceeds max_len={self.max_len}'
            )
        # True above the diagonal = "may not attend": each position sees only
        # itself and earlier characters, so the targets cannot leak in.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        x = self.embed(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer_encoder(x, mask=causal_mask)
        return self.output_layer(x)

# Training Loop
def train_model(model, dataloader, epochs=50):
    """Train ``model`` with Adam (lr=0.001) and cross-entropy loss.

    Args:
        model: module mapping a batch of inputs to ``(batch, seq, vocab)`` logits.
        dataloader: iterable yielding ``(input_seq, target_seq)`` pairs.
        epochs: number of passes over ``dataloader``.

    Prints the mean batch loss after every epoch; returns nothing.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=0.001)

    for epoch_number in range(1, epochs + 1):
        # Re-enter training mode each epoch in case it was switched off.
        model.train()
        running_loss = 0.0
        seen_batches = 0

        for inputs, targets in dataloader:
            adam.zero_grad()
            logits = model(inputs)
            # CrossEntropyLoss expects the class dimension second:
            # (batch, vocab, seq) against targets of shape (batch, seq).
            batch_loss = loss_fn(logits.permute(0, 2, 1), targets)
            batch_loss.backward()
            adam.step()

            running_loss += batch_loss.item()
            seen_batches += 1

        average_loss = running_loss / seen_batches
        print(f'Epoch {epoch_number}, Average Loss: {average_loss}')

# Fix the RNG seeds so that weight initialisation, batch shuffling and the
# later name sampling repeat across "Restart & Run All".
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

model = MinimalTransformer(vocab_size=dataset.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
train_model(model, dataloader)



Epoch 1, Average Loss: 1.3011297884066242
Epoch 2, Average Loss: 1.16333576224067
Epoch 3, Average Loss: 1.1374777299313505
Epoch 4, Average Loss: 1.139126143179649
Epoch 5, Average Loss: 1.1274003553981624
Epoch 6, Average Loss: 1.1301050196009235
Epoch 7, Average Loss: 1.119457402012565
Epoch 8, Average Loss: 1.120065768888174
Epoch 9, Average Loss: 1.1212796278236326
Epoch 10, Average Loss: 1.117823187969933
Epoch 11, Average Loss: 1.1093977118326612
Epoch 12, Average Loss: 1.1058679247690626
Epoch 13, Average Loss: 1.1042931439462773
Epoch 14, Average Loss: 1.10312314959597
Epoch 15, Average Loss: 1.101318230313703
Epoch 16, Average Loss: 1.1069104129617864
Epoch 17, Average Loss: 1.1092007303040874
Epoch 18, Average Loss: 1.1016023188583122
Epoch 19, Average Loss: 1.1042215533492978
Epoch 20, Average Loss: 1.100310335474566
Epoch 21, Average Loss: 1.1083885997780099
Epoch 22, Average Loss: 1.0956577008420771
Epoch 23, Average Loss: 1.0988395534271052
Epoch 24, Average Loss: 1.0996

Įdiegiau keletą papildomų sąlygų, kad vyriški vardai būtų lietuviški:


1.   Ilgiausias vardas Lietuvoje yra Konstantinas – 12 raidžių, todėl modelis generuoja ne ilgesnius kaip 13 raidžių vardus
2.   Vardai turi baigtis raide 's'
3.   Varduose prieš raidę 's' turi būti balsė.





In [None]:
# Vowels accepted in front of the final 's' (plain and Lithuanian diacritic).
_NAME_VOWELS = frozenset('aeiouyąęėįųū')
# Vowels used when an ending has to be synthesised. A string (not a set) keeps
# the order fixed, so random.choice is reproducible under a fixed seed.
_ENDING_VOWELS = 'aeiouy'


def _enforce_name_ending(name, protected_len, max_length):
    """Return ``name`` adjusted so that it ends with a vowel followed by 's'.

    A name that already satisfies the rule is returned unchanged. Otherwise
    the last generated character is dropped (more if needed to fit
    ``max_length``) and a random vowel plus 's' is appended. The first
    ``protected_len`` characters (the caller's start string) are never removed,
    so the result may exceed ``max_length`` only when the start string itself
    leaves no room for the two-character ending.
    """
    if len(name) >= 2 and name[-1] == 's' and name[-2] in _NAME_VOWELS:
        return name
    stem_len = max(protected_len, min(len(name) - 1, max_length - 2))
    return name[:stem_len] + random.choice(_ENDING_VOWELS) + 's'


def sample(model, dataset, start_str='a', max_length=13,temperature =1):
    """Generate one name with ``model``, starting from ``start_str``.

    Args:
        model: trained module mapping ``(1, seq)`` indices to ``(1, seq, vocab)`` logits.
        dataset: object providing ``char_to_int`` and ``int_to_char`` tables.
        start_str: non-empty prefix of the name; its characters must be in the vocabulary.
        max_length: maximum number of characters in the generated name.
        temperature: softmax temperature (> 0); lower values sample more conservatively.

    Returns:
        The generated name, always beginning with ``start_str`` and ending
        with a vowel followed by 's'.

    Raises:
        ValueError: if ``start_str`` is empty.
        KeyError: if ``start_str`` contains a character outside the vocabulary.
    """
    assert temperature > 0
    if not start_str:
        raise ValueError('start_str must contain at least one character')

    model.eval()  # disable dropout while sampling
    with torch.no_grad():
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)  # add the batch dimension

        output_name = start_str
        for _ in range(max_length - len(start_str)):
            # Logits for the character following the current sequence.
            logits = model(input_seq)[0, -1] / temperature
            probabilities = torch.softmax(logits, dim=0)
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]

            if next_char == ' ':  # ' ' is the end-of-name symbol
                break

            output_name += next_char
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)

    # Apply the ending rule to every name, including those that hit max_length.
    return _enforce_name_ending(output_name, len(start_str), max_length)

# With the trained model, draw a handful of names that begin with "e".
# A temperature of 0.5 sharpens the distribution towards likelier characters.
sample_count = 15
for _ in range(sample_count):
    generated_name = sample(model, dataset, start_str='e', temperature=0.5)
    print(generated_name.capitalize())

Emidlis
Emidinas
Emiris
Emifelis
Emisys
Emidonas
Emidis
Emijas
Eminantonas
Emijutas
Emimanijus
Emiurtijus
Emivys
Emimas
Emivydas
