In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader

import utils_prep
import RNNEncoder

# Загрузка и анализ данных (№2, a)

In [6]:
# Read the baby-names table from a path relative to the working directory.
df = pd.read_csv("data/babynames_extended.csv")

In [7]:
# Preview the loaded table (columns Name, Gender, Name_RU); pandas abbreviates
# the middle rows of a long frame when rendering it.
df

Unnamed: 0,Name,Gender,Name_RU
0,John,boy,Джон
1,William,boy,Уильям
2,James,boy,Джеймс
3,Charles,boy,Чарльз
4,George,boy,Джордж
...,...,...,...
6777,Laylah,girl,Лейла
6778,Carleigh,girl,Карли
6779,Kenley,girl,Кенли
6780,Sloane,girl,Слоан


In [8]:
# Percentage of rows per gender: per-class counts divided by the total number of rows.
df["Gender"].value_counts().div(len(df)).mul(100)

Gender
boy     50.678266
girl    49.321734
Name: count, dtype: float64

# Подготовка данных (№2, b-d)

In [28]:
# Single-character markers used when encoding names:
#   PAD_token - padding, SOS_token - start of sequence, EOS_token - end of sequence.
# NOTE(review): these are ordinary printable characters; this assumes none of
# them occurs inside a real name - TODO confirm against the dataset.
PAD_token, SOS_token, EOS_token = ".", "<", ">"

In [29]:
# Normalise both text columns in place: lower-case first, then trim
# surrounding whitespace. The operation is idempotent, so re-running is safe.
for column in ('Name', 'Name_RU'):
    df[column] = df[column].str.lower().str.strip()

In [30]:
# Build one character vocabulary per language; each call returns a
# char->index and an index->char lookup and receives the three special tokens.
eng_char2idx, eng_idx2char = utils_prep.build_vocab(
    df['Name'].tolist(),
    specials=[SOS_token, EOS_token, PAD_token],
)
rus_char2idx, rus_idx2char = utils_prep.build_vocab(
    df['Name_RU'].tolist(),
    specials=[SOS_token, EOS_token, PAD_token],
)

In [31]:
# Show rows whose Russian value contains a space. In the displayed result these
# are literal translations (e.g. "ivory" -> "слоновая кость") rather than
# transliterations of the name.
df.loc[df['Name_RU'].str.contains(" ")]

Unnamed: 0,Name,Gender,Name_RU
712,ivory,boy,слоновая кость
962,gee,boy,ну и дела
1009,deforest,boy,вырубка леса
1235,josephus,boy,иосиф флавий
3818,arie,girl,орлиное гнездо
4528,nova,girl,новая звезда
5871,sunshine,girl,солнечный свет


In [32]:
# Encode every English / Russian name with its own vocabulary.
# presumably encode_name maps characters to indices and adds the SOS/EOS
# markers - confirm in utils_prep.
X = [
    utils_prep.encode_name(eng_name, eng_char2idx, SOS_token, EOS_token)
    for eng_name in df['Name'].tolist()
]
y = [
    utils_prep.encode_name(rus_name, rus_char2idx, SOS_token, EOS_token)
    for rus_name in df['Name_RU'].tolist()
]

In [33]:
# Pad the encoded sequences with each language's PAD index. Every call yields
# the padded sequences together with a maximum-length value.
# presumably all sequences are padded to that maximum length - confirm in utils_prep.
X_padded, X_max_length = utils_prep.pad_sequences(
    X,
    eng_char2idx[PAD_token],
)
y_padded, y_max_length = utils_prep.pad_sequences(
    y,
    rus_char2idx[PAD_token],
)

In [58]:
# Two-stage split: hold out 20% of the padded pairs, then halve that hold-out
# into validation and test parts. The fixed random_state keeps both splits repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(X_padded, y_padded, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [59]:
# Turn every split into an int64 tensor (inputs first, then targets).
X_train_t, X_valid_t, X_test_t = (
    torch.tensor(split, dtype=torch.long) for split in (X_train, X_valid, X_test)
)
y_train_t, y_valid_t, y_test_t = (
    torch.tensor(split, dtype=torch.long) for split in (y_train, y_valid, y_test)
)

# Pair each input tensor with its target tensor.
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(inputs, targets)
    for inputs, targets in (
        (X_train_t, y_train_t),
        (X_valid_t, y_valid_t),
        (X_test_t, y_test_t),
    )
)

In [60]:
BATCH_SIZE = 64

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# RNN Encoder. №3

In [68]:
# Seed torch's global RNG before building the model so that torch-driven
# randomness executed from this point on (e.g. DataLoader shuffling) repeats
# between runs; the seed matches the random_state used for the data split.
# presumably RNNEncoder initialises its weights through torch as well - confirm.
torch.manual_seed(42)

# The model receives the size of the English character vocabulary.
model = RNNEncoder.RNNEncoder(len(eng_char2idx))

In [55]:
# Number of training epochs passed to RNNEncoder.train_rnn_encoder below.
num_epochs = 10

In [71]:
%%time
train_losses, valid_losses = RNNEncoder.train_rnn_encoder(
    model, train_loader, valid_loader, pad_idx = eng_char2idx[PAD_token], epochs=num_epochs
)

Epoch 1: 100%|██████████| 85/85 [00:00<00:00, 113.55it/s]


Epoch 1: Train=1.9105, Valid=2.0285


Epoch 2: 100%|██████████| 85/85 [00:00<00:00, 158.00it/s]


Epoch 2: Train=1.8906, Valid=2.0189


Epoch 3: 100%|██████████| 85/85 [00:00<00:00, 154.18it/s]


Epoch 3: Train=1.8733, Valid=2.0180


Epoch 4: 100%|██████████| 85/85 [00:00<00:00, 153.22it/s]


Epoch 4: Train=1.8586, Valid=2.0189


Epoch 5: 100%|██████████| 85/85 [00:00<00:00, 148.84it/s]


Epoch 5: Train=1.8432, Valid=2.0159


Epoch 6: 100%|██████████| 85/85 [00:00<00:00, 153.32it/s]


Epoch 6: Train=1.8273, Valid=2.0148


Epoch 7: 100%|██████████| 85/85 [00:00<00:00, 121.84it/s]


Epoch 7: Train=1.8136, Valid=2.0129


Epoch 8: 100%|██████████| 85/85 [00:00<00:00, 125.47it/s]


Epoch 8: Train=1.7999, Valid=2.0128


Epoch 9: 100%|██████████| 85/85 [00:00<00:00, 148.75it/s]


Epoch 9: Train=1.7862, Valid=2.0163


Epoch 10: 100%|██████████| 85/85 [00:00<00:00, 160.82it/s]

Epoch 10: Train=1.7730, Valid=2.0135
CPU times: user 5.64 s, sys: 341 ms, total: 5.98 s
Wall time: 3.99 s





In [73]:
# Generate names with the trained model, handing over the special-token indices,
# the index->char table and X_max_length as the length cap.
generated_names = RNNEncoder.generate_names(
    model,
    sos_idx=eng_char2idx[SOS_token],
    eos_idx=eng_char2idx[EOS_token],
    pad_idx=eng_char2idx[PAD_token],
    eng_idx2char=eng_idx2char,
    max_len=X_max_length,
)

# Print the result as a numbered list starting at 1.
print("\nGenerated English names:")
for position, generated in enumerate(generated_names, start=1):
    print(f"{position:2d}. {generated}")


Generated English names:
 1. iveria
 2. alla
 3. lariah
 4. kashan
 5. leona
 6. alva
 7. talissa
 8. dissal
 9. zendra
10. jaeline


# RNN Machine Translation. Decoder. №4