<a href="https://colab.research.google.com/github/Eglza/Vard-generatorius/blob/main/Projektas3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Lietuviškų vardų generatorius**


In [2]:
!pip install streamlit
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.40.2-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m115.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m


In [3]:
import requests
from bs4 import BeautifulSoup

# Create function to scrape and save names
def download_names():
    names_male = []
    names_female = []

    for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
        url = f'https://vardai.vlkk.lt/sarasas/{key}/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Scrape male names
        links_male = soup.find_all('a', class_='names_list__links names_list__links--man')
        names_male += [name.text for name in links_male]

        # Scrape female names
        links_female = soup.find_all('a', class_='names_list__links names_list__links--woman')
        names_female += [name.text for name in links_female]

    # Save male names to a text file
    with open('vardai_male.txt', 'w', encoding='utf-8') as f:
        for name in names_male:
            f.write(f"{name}\n")

    # Save female names to a text file
    with open('vardai_female.txt', 'w', encoding='utf-8') as f:
        for name in names_female:
            f.write(f"{name}\n")

    print(f"Downloaded {len(names_male)} male names and {len(names_female)} female names.")

# Call the function to download names
download_names()


Downloaded 3850 male names and 4235 female names.


In [4]:
import torch
from torch.utils.data import DataLoader
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence

# Define the NameDataset class to load and process name datasets
class NameDataset(torch.utils.data.Dataset):
    def __init__(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            self.names = f.read().splitlines()

        # Create char-to-int and int-to-char mappings
        self.char_to_int = {char: idx for idx, char in enumerate(set("".join(self.names)))}
        self.int_to_char = {idx: char for char, idx in self.char_to_int.items()}
        self.vocab_size = len(self.char_to_int)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx]
        int_sequence = [self.char_to_int[char] for char in name]
        return torch.tensor(int_sequence)

# Define the NameGenerator model (LSTM-based model for name generation)
class NameGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super(NameGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out)
        return output

# Define a function for padding sequences within a batch
def pad_collate(batch):
    padded_sequences = pad_sequence(batch, batch_first=True, padding_value=0)
    targets = padded_sequences[:, 1:]
    inputs = padded_sequences[:, :-1]
    return inputs, targets

# Define a function for training the model

def train_model(dataset, model, num_epochs=0, batch_size=32):  # Change num_epochs to 100
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output.contiguous().view(-1, dataset.vocab_size), targets.contiguous().view(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


# Train the male model
dataset_male = NameDataset('vardai_male.txt')
model_male = NameGenerator(dataset_male.vocab_size)
train_model(dataset_male, model_male)
torch.save(model_male.state_dict(), 'model_male.pth')

# Train the female model
dataset_female = NameDataset('vardai_female.txt')
model_female = NameGenerator(dataset_female.vocab_size)
train_model(dataset_female, model_female)
torch.save(model_female.state_dict(), 'model_female.pth')


In [5]:
!ls


app.py	model_female.pth  model_male.pth  sample_data  vardai_female.txt  vardai_male.txt


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch.nn.utils.rnn import pad_sequence

# 1. Define the NameDataset class
import pandas as pd
from torch.utils.data import Dataset

class NameDataset(Dataset):
    def __init__(self, txt_file):
        # Read names from the TXT file, one name per line
        with open(txt_file, 'r', encoding='utf-8') as f:
            self.names = [line.strip() for line in f] #strip() removes leading/trailing whitespace

        # Create a character set from the names and include a space (padding character)
        self.chars = sorted(list(set(''.join(self.names)))) + [' ']
        self.vocab_size = len(self.chars)

        # Create dictionaries for character-to-index and index-to-character mapping
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for i, c in enumerate(self.chars)}

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx] + ' '  # Adding padding character at the end
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name)


In [7]:
# Custom collate function for padding
def pad_collate(batch):
    padded_seqs = pad_sequence(batch, batch_first=True, padding_value=0)  # Pad sequences to the same length
    input_seq = padded_seqs[:, :-1]  # All but the last character for input
    target_seq = padded_seqs[:, 1:]  # All but the first character for target
    return input_seq, target_seq


In [9]:
# 2. Define the MinimalTransformer model
class MinimalTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)  # Embedding layer for character input
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))  # Positional encoding
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)  # Output layer to predict next character

    def forward(self, x):
        positions = torch.arange(0, x.size(1)).unsqueeze(0)  # Positional encoding for each input
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :]  # Add positional encoding
        x = self.transformer_encoder(x)  # Pass through the transformer encoder
        x = self.output_layer(x)  # Output layer to get predictions
        return x


In [10]:
# 3. Training Loop
def train_model(model, dataloader, epochs=10):
    criterion = nn.CrossEntropyLoss()  # Loss function for classification (predicting the next character)
    optimizer = optim.Adam(model.parameters())  # Optimizer (Adam)

    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        total_loss = 0.0
        batch_count = 0

        for batch_idx, (input_seq, target_seq) in enumerate(dataloader):
            optimizer.zero_grad()  # Zero the gradients
            output = model(input_seq)  # Get model predictions
            loss = criterion(output.transpose(1, 2), target_seq)  # Compute the loss
            loss.backward()  # Backpropagate the loss
            optimizer.step()  # Update the model parameters

            total_loss += loss.item()  # Accumulate total loss
            batch_count += 1

        average_loss = total_loss / batch_count  # Calculate average loss for the epoch
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')


In [11]:
# 4. Sampling function
def sample(model, dataset, start_str='a', max_length=10, temperature=1):
    assert temperature > 0, "Temperature must be greater than 0"
    model.eval()  # Switch model to evaluation mode
    with torch.no_grad():
        # Convert start string to tensor
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)  # Add batch dimension

        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq)

            # Apply temperature scaling
            logits = output[0, -1] / temperature
            probabilities = torch.softmax(logits, dim=0)

            # Sample a character from the probability distribution
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]

            if next_char == ' ':  # Stop if end-of-sequence character (space) is reached
                break

            output_name += next_char
            # Update the input sequence for the next iteration
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)

        return output_name


In [12]:
# 5. Load the datasets and create dataloaders
dataset_male = NameDataset('vardai_male.txt')
dataset_female = NameDataset('vardai_female.txt')

dataloader_male = DataLoader(dataset_male, batch_size=32, shuffle=True, collate_fn=pad_collate)
dataloader_female = DataLoader(dataset_female, batch_size=32, shuffle=True, collate_fn=pad_collate)

# 6. Initialize and train the models for male and female datasets separately

# Model for male names
model_male = MinimalTransformer(vocab_size=dataset_male.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
train_model(model_male, dataloader_male, epochs=10)

# Model for female names
model_female = MinimalTransformer(vocab_size=dataset_female.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
train_model(model_female, dataloader_female, epochs=10)




Epoch 1, Average Loss: 1.5472252625079195
Epoch 2, Average Loss: 1.306915528025509
Epoch 3, Average Loss: 1.281148790820571
Epoch 4, Average Loss: 1.2563166007522708
Epoch 5, Average Loss: 1.2453235506026212
Epoch 6, Average Loss: 1.2437746657812891
Epoch 7, Average Loss: 1.236407816902665
Epoch 8, Average Loss: 1.240038552560097
Epoch 9, Average Loss: 1.2262902560312885
Epoch 10, Average Loss: 1.2238801299047863
Epoch 1, Average Loss: 1.6172374746853249
Epoch 2, Average Loss: 1.3963527544996792
Epoch 3, Average Loss: 1.3697000051799573
Epoch 4, Average Loss: 1.3459201441671615
Epoch 5, Average Loss: 1.3384021305500116
Epoch 6, Average Loss: 1.3347384786247312
Epoch 7, Average Loss: 1.3272630101756047
Epoch 8, Average Loss: 1.320498504136738
Epoch 9, Average Loss: 1.325978676179298
Epoch 10, Average Loss: 1.315123378782344


In [14]:
# 7. Generate names with the trained models

# Generate male names
print('Male names:')
for _ in range(10):
    generated_name_male = sample(model_male, dataset_male, start_str='A', temperature=0.5)
    print(generated_name_male)

# Generate female names
print('\nFemale names:')
for _ in range(10):
    generated_name_female = sample(model_female, dataset_female, start_str='At', temperature=1)
    print(generated_name_female)


Male names:
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA
AAAAAAAAAA

Female names:
Atelė
Atìlì
Atìnė́nija
Atedmatė
Atãsė
Atì
Atãžė
Atene
Ateosė
Atrẽtė


In [35]:
# Load datasets
dataset_male = NameDataset('vardai_male.txt')
dataset_female = NameDataset('vardai_female.txt')

# Initialize dataloaders
dataloader_male = DataLoader(dataset_male, batch_size=32, shuffle=True, collate_fn=pad_collate)
dataloader_female = DataLoader(dataset_female, batch_size=32, shuffle=True, collate_fn=pad_collate)

# Initialize models
model_male = MinimalTransformer(vocab_size=dataset_male.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
model_female = MinimalTransformer(vocab_size=dataset_female.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)

# Train models
train_model(model_male, dataloader_male, epochs=50)
train_model(model_female, dataloader_female, epochs=50)

# Save models
torch.save(model_male.state_dict(), 'model_male.pth')
torch.save(model_female.state_dict(), 'model_female.pth')

print("Models saved as 'model_male.pth' and 'model_female.pth'.")


Epoch 1, Average Loss: 1.5628551628964007
Epoch 2, Average Loss: 1.3031216712037395
Epoch 3, Average Loss: 1.2637528703232441
Epoch 4, Average Loss: 1.2639398909797352
Epoch 5, Average Loss: 1.248161835611359
Epoch 6, Average Loss: 1.2535044339077532
Epoch 7, Average Loss: 1.236381307613751
Epoch 8, Average Loss: 1.2319335986760036
Epoch 9, Average Loss: 1.2153853685402674
Epoch 10, Average Loss: 1.2235587431379586
Epoch 11, Average Loss: 1.224590501509422
Epoch 12, Average Loss: 1.2308696094623282
Epoch 13, Average Loss: 1.2175002925652119
Epoch 14, Average Loss: 1.2109722564043093
Epoch 15, Average Loss: 1.2168528910510796
Epoch 16, Average Loss: 1.2166226151560948
Epoch 17, Average Loss: 1.2082349288562113
Epoch 18, Average Loss: 1.2043270857866146
Epoch 19, Average Loss: 1.204900816945005
Epoch 20, Average Loss: 1.2142493463744801
Epoch 21, Average Loss: 1.199482084798419
Epoch 22, Average Loss: 1.1995751217377086
Epoch 23, Average Loss: 1.2083317706407595
Epoch 24, Average Loss: 1

In [38]:
print('Male names:')
for _ in range(10):
    generated_name_male = sample(model_male, dataset_male, start_str='A', temperature= 1)
    print(generated_name_male)

# Generate female names
print('\nFemale names:')
for _ in range(10):
    generated_name_female = sample(model_female, dataset_female, start_str='Be', temperature=1)
    print(generated_name_female)

Male names:
Arovintãn
Ãdvijus
Aurvijus
Ãdelaris
Aimántas
Ailmastas
Aìnius
Ãrfimis
Aìntas
Ãgaras

Female names:
Belve
Belingė
Berija
Bevicija
Bečidzijà
Belė
Berija
Beina
Beviū́ija
Beninė


In [44]:
%%writefile app.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import os
import streamlit as st

# 1. Define the MinimalTransformer model
class MinimalTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)  # Embedding layer for character input
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))  # Positional encoding
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)  # Output layer to predict next character

    def forward(self, x):
        positions = torch.arange(0, x.size(1)).unsqueeze(0)  # Positional encoding for each input
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :]  # Add positional encoding
        x = self.transformer_encoder(x)  # Pass through the transformer encoder
        x = self.output_layer(x)  # Output layer to get predictions
        return x


# 2. Define the NameDataset class
class NameDataset(Dataset):
    def __init__(self, txt_file):
        with open(txt_file, 'r', encoding='utf-8') as f:
            self.names = [line.strip() for line in f]
        self.chars = sorted(list(set(''.join(self.names)))) + [' ']
        self.vocab_size = len(self.chars)
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for i, c in enumerate(self.chars)}

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx] + ' '
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name)


# Function to generate a name from the trained model
def generate_name(model, dataset, start_chars, max_length=30, temperature=1.0):
    model.eval()
    start_chars = start_chars.capitalize()  # Ensure only the first character is uppercase
    input_tensor = torch.tensor([dataset.char_to_int[char] for char in start_chars], dtype=torch.long).unsqueeze(0)
    generated_name = start_chars

    with torch.no_grad():
        for _ in range(max_length - len(start_chars)):
            output = model(input_tensor)
            logits = output[0, -1, :] / temperature  # Adjust temperature to control randomness
            probabilities = F.softmax(logits, dim=-1).cpu().numpy()

            # Sample from the probabilities to choose the next character
            predicted_char_idx = torch.multinomial(torch.tensor(probabilities), num_samples=1).item()
            predicted_char = dataset.int_to_char[predicted_char_idx]

            if predicted_char == ' ':
                break
            generated_name += predicted_char
            input_tensor = torch.cat([input_tensor, torch.tensor([[predicted_char_idx]], dtype=torch.long)], dim=1)

    return generated_name


# Streamlit interface
st.title("Lietuviškų vardų generatorius")

# User selects name type
name_type = st.selectbox("Pasirinkite vardo tipą", ["Vyriškas", "Moteriškas"])

# Load the correct dataset and model
if not os.path.exists('model_male.pth') or not os.path.exists('model_female.pth'):
    st.error("Model files are missing. Please ensure 'model_male.pth' and 'model_female.pth' are in the correct directory.")
else:
    if name_type == "Vyriškas":
        dataset = NameDataset('vardai_male.txt')
        model = MinimalTransformer(vocab_size=dataset.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
        model.load_state_dict(torch.load('model_male.pth'), strict=False)
    else:
        dataset = NameDataset('vardai_female.txt')
        model = MinimalTransformer(vocab_size=dataset.vocab_size, embed_size=128, num_heads=8, forward_expansion=4)
        model.load_state_dict(torch.load('model_female.pth'), strict=False)

    # Input field for the starting letter(s)
    start_chars = st.text_input("Įveskite pradines raides", "")

    # Slider for temperature

    temperature = st.slider("Pasirinkite atsitiktinumo lygį (temperatūra)", 0.5, 2.0, 1.0)

    # Button to generate a name
    if st.button("Generuoti vardą"):
        if start_chars:
            # Generate and display the name
            name = generate_name(model, dataset, start_chars, temperature=temperature)
            st.write(f"Sugeneruotas Vardas: {name}")
        else:
            st.warning("Prašome įvesti pradines raides!")


Overwriting app.py


In [45]:
from pyngrok import ngrok, conf

# Configure ngrok with your authentication token
ngrok.set_auth_token("2pA2nwpPDy9sd9LDIWbHgGVGePk_7r2jWaiKrYhnRDMpQd7r1")

# Configure ngrok to use the default free server region
conf.get_default().region = "us"

# Set up the Streamlit tunnel on port 8501 (the default for Streamlit)
tunnel = ngrok.connect(8501)

# Get the public URL of the tunnel
public_url = tunnel.public_url
print(f"Streamlit app is live at: {public_url}")

Streamlit app is live at: https://4930-34-125-74-18.ngrok-free.app


In [46]:
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.74.18:8501[0m
[0m
  model.load_state_dict(torch.load('model_male.pth'), strict=False)
2024-12-03 13:34:26.699 Examining the path of torch.classes raised: Tried to instantiate class '__path__._path', but it does not exist! Ensure that it is registered via torch::class_




[34m  Stopping...[0m
[34m  Stopping...[0m
