In [None]:
import os
import torch
from torch.utils.data import DataLoader

import numpy as np

from src.data_utils import clean_text
from src.next_token_dataset import CustomDataset, make_collate_fn
from src.eval_lstm import compute_rouge
from src.lstm_model import RNN
from src.train_loop import train_epoch, evaluate

from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from datasets import Dataset

print(torch.cuda.is_available())

In [122]:
# Read txt-file
with open('data/raw_dataset.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# delete \n
texts = [line.strip() for line in lines if line.strip()]

# clean data
cleaned_dataset = [clean_text(text) for text in texts]


In [123]:
cleaned_dataset[:5]

[" user url awww that's a bummer you shoulda got david carr of third day to do it d",
 "is upset that he can't update his facebook by texting it and might cry as a result school today also blah ",
 ' user i dived many times for the ball managed to save 50 the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 " user no it's not behaving at all i'm mad why am i here because i can't see you all over there "]

In [124]:
# save clearned txt

output_path = 'data/cleaned_data.txt'

os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned_dataset))

In [125]:
model_name = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [126]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

pad_id = tokenizer.pad_token_id
sep_id = tokenizer.sep_token_id
unk_id = tokenizer.unk_token_id
vocab_size = tokenizer.vocab_size

print(f'pad_id - {pad_id}, sep_id - {sep_id}, unk_id - {unk_id}, vocab_size - {vocab_size}')

pad_id - 0, sep_id - 102, unk_id - 100, vocab_size - 30522


In [127]:
ds = Dataset.from_dict({"text": cleaned_dataset})

In [128]:
# токенизация батчами, без добавления [CLS]/[SEP] в каждую строку
def tokenize_fn(batch):
    return tokenizer(batch["text"], add_special_tokens=False)

In [None]:
ds_tok = ds.map(
    lambda batch: tokenizer(batch["text"], add_special_tokens=False),
    batched=True,
    batch_size=1000,
    remove_columns=["text"]
)

ds_tok.set_format(type="torch")
ds_tok.save_to_disk("data/tokenized_dataset")

Map:   0%|          | 0/1600498 [00:00<?, ? examples/s]

Map: 100%|██████████| 1600498/1600498 [00:41<00:00, 38133.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1600498/1600498 [00:00<00:00, 2981927.65 examples/s]


In [130]:
print(ds_tok[0]["input_ids"][:20])
print(tokenizer.decode(ds_tok[0]["input_ids"][:20]))

tensor([ 5310, 24471,  2140, 22091,  2860,  2860,  2008,  1005,  1055,  1037,
        26352,  5017,  2017,  2323,  2050,  2288,  2585, 12385,  1997,  2353])
user url awww that ' s a bummer you shoulda got david carr of third


In [None]:
all_ids = ds_tok['input_ids']

try:
    N = len(all_ids)
except Exception:
    all_ids = list(all_ids)
    N = len(all_ids)

print("Total sequences:", N)

Total sequences: 1600498


In [None]:
indices = np.arange(N)
train_idx, test_idx = train_test_split(indices, test_size=0.10, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.111111, random_state=42)

print("Counts (indices):", len(train_idx), len(val_idx), len(test_idx))

In [136]:
train_ds = ds_tok.select(train_idx)
val_ds   = ds_tok.select(val_idx)
test_ds  = ds_tok.select(test_idx)

In [137]:
os.makedirs("data", exist_ok=True)

# сохраняем
train_ds.save_to_disk("data/train_ds")
val_ds.save_to_disk("data/val_ds")
test_ds.save_to_disk("data/test_ds")

Saving the dataset (1/1 shards): 100%|██████████| 1280398/1280398 [00:08<00:00, 151945.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 160050/160050 [00:01<00:00, 153994.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 160050/160050 [00:01<00:00, 159443.63 examples/s]


In [None]:
# Переводим в torch формат

train_ds.set_format(type="torch", columns=["input_ids"])
val_ds.set_format(type="torch", columns=["input_ids"])
test_ds.set_format(type="torch", columns=["input_ids"])

train_dataset = CustomDataset(train_ds)
val_dataset   = CustomDataset(val_ds)
test_dataset   = CustomDataset(test_ds)

pad_id = tokenizer.pad_token_id
collate = make_collate_fn(pad_id=pad_id)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate, num_workers=0, pin_memory=True)
val_loader   = DataLoader(val_dataset,   batch_size=128, shuffle=False, collate_fn=collate, num_workers=0, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=128, shuffle=False, collate_fn=collate, num_workers=0, pin_memory=True)

print(f'Loader is ready. Example batches: {len(train_ds)}')

In [145]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=256, hidden=512, padding_idx=pad_id).to(device)
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002531905600554039)

In [150]:
# Пример использования в цикле обучения
num_epochs = 5
for epoch in range(num_epochs):
    print(device)
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f'Обучение {epoch+1} прошло, оцениваем val_loss')
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Оценили val_los на {epoch+1} эпохе, считаем rouge_scores')
    

    print(f"Epoch {epoch+1}/{num_epochs}  TrainLoss={train_loss:.6f}  ValLoss={val_loss:.6f} ")

cuda
Обучение 1 прошло, оцениваем val_loss
Оценили val_los на 1 эпохе, считаем rouge_scores
Epoch 1/5  TrainLoss=4.773986  ValLoss=4.657450 
cuda
Обучение 2 прошло, оцениваем val_loss
Оценили val_los на 2 эпохе, считаем rouge_scores
Epoch 2/5  TrainLoss=4.755271  ValLoss=4.640446 
cuda
Обучение 3 прошло, оцениваем val_loss
Оценили val_los на 3 эпохе, считаем rouge_scores
Epoch 3/5  TrainLoss=4.742275  ValLoss=4.640017 
cuda
Обучение 4 прошло, оцениваем val_loss
Оценили val_los на 4 эпохе, считаем rouge_scores
Epoch 4/5  TrainLoss=4.732778  ValLoss=4.627054 
cuda
Обучение 5 прошло, оцениваем val_loss
Оценили val_los на 5 эпохе, считаем rouge_scores
Epoch 5/5  TrainLoss=4.726251  ValLoss=4.627170 


In [151]:
rouge_scores = compute_rouge(model, val_loader, tokenizer, device, pad_id)
print(f"ROUGE-1={rouge_scores['rouge1']:.4f}  ROUGE-2={rouge_scores['rouge2']:.4f}  ROUGE-L={rouge_scores['rougeL']:.4f}")

ROUGE-1=0.0020  ROUGE-2=0.0002  ROUGE-L=0.0020


In [152]:
# Путь для сохранения
model_path = "models/rnn_lm_3.pt"

# Сохраняем веса
torch.save(model.state_dict(), model_path)

# Для загрузки
# 1. создаём объект модели с такой же архитектурой
loaded_model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=128, hidden=256, padding_idx=pad_id).to(device)

# 2. загружаем веса
loaded_model.load_state_dict(torch.load(model_path, map_location=device))
loaded_model.eval() 

  loaded_model.load_state_dict(torch.load(model_path, map_location=device))


RuntimeError: Error(s) in loading state_dict for RNN:
	size mismatch for emb.weight: copying a param with shape torch.Size([30522, 256]) from checkpoint, the shape in current model is torch.Size([30522, 128]).
	size mismatch for rnn.weight_ih_l0: copying a param with shape torch.Size([2048, 256]) from checkpoint, the shape in current model is torch.Size([1024, 128]).
	size mismatch for rnn.weight_hh_l0: copying a param with shape torch.Size([2048, 512]) from checkpoint, the shape in current model is torch.Size([1024, 256]).
	size mismatch for rnn.bias_ih_l0: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1024]).
	size mismatch for rnn.bias_hh_l0: copying a param with shape torch.Size([2048]) from checkpoint, the shape in current model is torch.Size([1024]).
	size mismatch for norm.weight: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for norm.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([256]).
	size mismatch for out.weight: copying a param with shape torch.Size([30522, 512]) from checkpoint, the shape in current model is torch.Size([30522, 256]).

In [118]:
seed = train_ds[0]["input_ids"][:5]  

# генерируем 30 токенов
gen_tokens = loaded_model.generate(seed, max_len=30, temperature=1.0, pad_id=pad_id, device=device)

# обратно в текст
generated_text = tokenizer.decode(gen_tokens)
print(tokenizer.decode(seed))
print(generated_text)

user facebook iphone app is


user facebook iphone app is on aim anymore luckily i have a blackberry bold but its not available yet either sometimes they never seem to be happening anymore its not available anywhere near me anymore


In [35]:
import optuna

In [42]:
def objective(trial):
    hidden = trial.suggest_categorical('hidden', [128, 256])
    emb = trial.suggest_categorical('emb', [64, 128])
    lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
    bs = trial.suggest_categorical('batch_size', [64, 128])

    print(f'hidden - {hidden}, emb - {emb}, lr - {lr}, bs - {bs}')

    model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=emb, hidden=hidden, padding_idx=pad_id).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True, collate_fn=collate)
    val_loader = DataLoader(val_ds, batch_size=bs, collate_fn=collate)

    # тренируем 2-3 эпохи для оценки
    for epoch in range(3):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)

    val_loss, _ = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}  TrainLoss={train_loss:.6f}  ValLoss={val_loss:.6f} ")
    
    return val_loss

In [43]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # 20 экспериментов
print(study.best_params)

[I 2025-11-07 18:49:21,133] A new study created in memory with name: no-name-973431bd-507a-43ae-b899-7b770b774ed4
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)


hidden - 256, emb - 64, lr - 0.0017035508745426862, bs - 128


[I 2025-11-07 19:47:44,809] Trial 0 finished with value: 4.803939693408152 and parameters: {'hidden': 256, 'emb': 64, 'lr': 0.0017035508745426862, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=4.990696  ValLoss=4.803940 
hidden - 256, emb - 64, lr - 0.0001526215550204981, bs - 64


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 20:16:21,679] Trial 1 finished with value: 5.0267214323454485 and parameters: {'hidden': 256, 'emb': 64, 'lr': 0.0001526215550204981, 'batch_size': 64}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.237299  ValLoss=5.026721 
hidden - 128, emb - 64, lr - 0.0023673698576573182, bs - 64


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 20:36:08,485] Trial 2 finished with value: 4.94033248873874 and parameters: {'hidden': 128, 'emb': 64, 'lr': 0.0023673698576573182, 'batch_size': 64}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.146778  ValLoss=4.940332 
hidden - 128, emb - 64, lr - 0.00024138561861527735, bs - 64


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 20:56:30,589] Trial 3 finished with value: 5.112500692681913 and parameters: {'hidden': 128, 'emb': 64, 'lr': 0.00024138561861527735, 'batch_size': 64}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.349908  ValLoss=5.112501 
hidden - 256, emb - 64, lr - 0.00017227434526483054, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 21:22:43,093] Trial 4 finished with value: 5.037428610973325 and parameters: {'hidden': 256, 'emb': 64, 'lr': 0.00017227434526483054, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.245498  ValLoss=5.037429 
hidden - 256, emb - 128, lr - 0.00047277333714852574, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 21:48:34,038] Trial 5 finished with value: 4.847082013499232 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.00047277333714852574, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.047105  ValLoss=4.847082 
hidden - 128, emb - 128, lr - 0.001615389305921212, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 22:08:52,256] Trial 6 finished with value: 4.908108684507977 and parameters: {'hidden': 128, 'emb': 128, 'lr': 0.001615389305921212, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.130844  ValLoss=4.908109 
hidden - 128, emb - 128, lr - 0.0005478540553670885, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 22:29:24,902] Trial 7 finished with value: 4.984924593742334 and parameters: {'hidden': 128, 'emb': 128, 'lr': 0.0005478540553670885, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.222438  ValLoss=4.984925 
hidden - 128, emb - 64, lr - 0.004762147342500464, bs - 64


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 22:50:29,415] Trial 8 finished with value: 4.973390865043016 and parameters: {'hidden': 128, 'emb': 64, 'lr': 0.004762147342500464, 'batch_size': 64}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.175940  ValLoss=4.973391 
hidden - 128, emb - 64, lr - 0.00018677366964054245, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 23:09:10,645] Trial 9 finished with value: 5.180746935363222 and parameters: {'hidden': 128, 'emb': 64, 'lr': 0.00018677366964054245, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.424884  ValLoss=5.180747 
hidden - 256, emb - 128, lr - 0.0061540248639914315, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 23:33:04,641] Trial 10 finished with value: 4.8612921114637455 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.0061540248639914315, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.034828  ValLoss=4.861292 
hidden - 256, emb - 128, lr - 0.0006468897805320586, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-07 23:57:04,339] Trial 11 finished with value: 4.821091150964733 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.0006468897805320586, 'batch_size': 128}. Best is trial 0 with value: 4.803939693408152.


Epoch 3/3  TrainLoss=5.018331  ValLoss=4.821091 
hidden - 256, emb - 128, lr - 0.0009125617229595656, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 00:21:39,014] Trial 12 finished with value: 4.796645478445817 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.0009125617229595656, 'batch_size': 128}. Best is trial 12 with value: 4.796645478445817.


Epoch 3/3  TrainLoss=4.989104  ValLoss=4.796645 
hidden - 256, emb - 128, lr - 0.001497242640437095, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 00:46:07,080] Trial 13 finished with value: 4.772119694380594 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.001497242640437095, 'batch_size': 128}. Best is trial 13 with value: 4.772119694380594.


Epoch 3/3  TrainLoss=4.960064  ValLoss=4.772120 
hidden - 256, emb - 128, lr - 0.00272942869848902, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 01:10:56,974] Trial 14 finished with value: 4.77304741113222 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.00272942869848902, 'batch_size': 128}. Best is trial 13 with value: 4.772119694380594.


Epoch 3/3  TrainLoss=4.955103  ValLoss=4.773047 
hidden - 256, emb - 128, lr - 0.003632813230835672, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 01:35:34,987] Trial 15 finished with value: 4.78775882877082 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.003632813230835672, 'batch_size': 128}. Best is trial 13 with value: 4.772119694380594.


Epoch 3/3  TrainLoss=4.967879  ValLoss=4.787759 
hidden - 256, emb - 128, lr - 0.009046847475227915, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 02:00:15,913] Trial 16 finished with value: 4.96074625543443 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.009046847475227915, 'batch_size': 128}. Best is trial 13 with value: 4.772119694380594.


Epoch 3/3  TrainLoss=5.124602  ValLoss=4.960746 
hidden - 256, emb - 128, lr - 0.002531905600554039, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 02:24:59,805] Trial 17 finished with value: 4.768335079518013 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.002531905600554039, 'batch_size': 128}. Best is trial 17 with value: 4.768335079518013.


Epoch 3/3  TrainLoss=4.952781  ValLoss=4.768335 
hidden - 256, emb - 128, lr - 0.0012421043864707787, bs - 64


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 02:53:00,996] Trial 18 finished with value: 4.788338110526319 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.0012421043864707787, 'batch_size': 64}. Best is trial 17 with value: 4.768335079518013.


Epoch 3/3  TrainLoss=4.977030  ValLoss=4.788338 
hidden - 256, emb - 128, lr - 0.0003026424639703648, bs - 128


  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)
[I 2025-11-08 03:17:44,308] Trial 19 finished with value: 4.8934129484978435 and parameters: {'hidden': 256, 'emb': 128, 'lr': 0.0003026424639703648, 'batch_size': 128}. Best is trial 17 with value: 4.768335079518013.


Epoch 3/3  TrainLoss=5.097664  ValLoss=4.893413 
{'hidden': 256, 'emb': 128, 'lr': 0.002531905600554039, 'batch_size': 128}


In [44]:
print(study.best_params)

{'hidden': 256, 'emb': 128, 'lr': 0.002531905600554039, 'batch_size': 128}
