## Загрузим необходимые библиотеки

In [None]:
import os
import torch
from torch.utils.data import DataLoader

import numpy as np

from src.data_utils import clean_text
from src.next_token_dataset import CustomDataset, make_collate_fn
from src.eval_lstm import compute_rouge
from src.lstm_model import RNN
from src.train_loop import train_epoch, evaluate
from src.hyper_pram_optim import make_objective

from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from datasets import Dataset

import optuna

print(torch.cuda.is_available())

  from .autonotebook import tqdm as notebook_tqdm


True


## Загрузим и очистим данные

In [2]:
# Read txt-file
with open('data/raw_dataset.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# delete \n
texts = [line.strip() for line in lines if line.strip()]

# clean data
cleaned_dataset = [clean_text(text) for text in texts]

cleaned_dataset[:5]

[" user url awww that's a bummer you shoulda got david carr of third day to do it d",
 "is upset that he can't update his facebook by texting it and might cry as a result school today also blah ",
 ' user i dived many times for the ball managed to save 50 the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 " user no it's not behaving at all i'm mad why am i here because i can't see you all over there "]

In [3]:
# save clearned txt

output_path = 'data/cleaned_data.txt'

os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned_dataset))

## Токенизируем данные

In [4]:
model_name = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [5]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

pad_id = tokenizer.pad_token_id
sep_id = tokenizer.sep_token_id
unk_id = tokenizer.unk_token_id
vocab_size = tokenizer.vocab_size

print(f'pad_id - {pad_id}, sep_id - {sep_id}, unk_id - {unk_id}, vocab_size - {vocab_size}')

pad_id - 0, sep_id - 102, unk_id - 100, vocab_size - 30522


In [6]:
ds = Dataset.from_dict({"text": cleaned_dataset})

# токенизация батчами, без добавления [CLS]/[SEP] в каждую строку
def tokenize_fn(batch):
    return tokenizer(batch["text"], add_special_tokens=False)

In [7]:
ds_tok = ds.map(
    lambda batch: tokenizer(batch["text"], add_special_tokens=False),
    batched=True,
    batch_size=1000,
    remove_columns=["text"]
)

ds_tok.set_format(type="torch")
ds_tok.save_to_disk("data/tokenized_dataset")

Map: 100%|██████████| 1600498/1600498 [00:42<00:00, 37658.31 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1600498/1600498 [00:00<00:00, 2990966.10 examples/s]


In [8]:
print(ds_tok[0]["input_ids"][:20])
print(tokenizer.decode(ds_tok[0]["input_ids"][:20]))

tensor([ 5310, 24471,  2140, 22091,  2860,  2860,  2008,  1005,  1055,  1037,
        26352,  5017,  2017,  2323,  2050,  2288,  2585, 12385,  1997,  2353])
user url awww that ' s a bummer you shoulda got david carr of third


In [9]:
all_ids = ds_tok['input_ids']

try:
    N = len(all_ids)
except Exception:
    all_ids = list(all_ids)
    N = len(all_ids)

print("Total sequences:", N)

Total sequences: 1600498


## Разделим данные на train, val и test

In [10]:
indices = np.arange(N)
train_idx, test_idx = train_test_split(indices, test_size=0.10, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=0.111111, random_state=42)

print("Counts (indices):", len(train_idx), len(val_idx), len(test_idx))

Counts (indices): 1280398 160050 160050


In [11]:
train_ds = ds_tok.select(train_idx)
val_ds   = ds_tok.select(val_idx)
test_ds  = ds_tok.select(test_idx)

In [12]:
os.makedirs("data", exist_ok=True)

# сохраняем
train_ds.save_to_disk("data/train_ds")
val_ds.save_to_disk("data/val_ds")
test_ds.save_to_disk("data/test_ds")

Saving the dataset (1/1 shards): 100%|██████████| 1280398/1280398 [00:08<00:00, 158498.37 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 160050/160050 [00:01<00:00, 157033.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 160050/160050 [00:00<00:00, 210060.77 examples/s]


## Подготовим данные 

In [13]:
# Переводим в torch формат

train_ds.set_format(type="torch", columns=["input_ids"])
val_ds.set_format(type="torch", columns=["input_ids"])
test_ds.set_format(type="torch", columns=["input_ids"])

train_dataset = CustomDataset(train_ds)
val_dataset   = CustomDataset(val_ds)
test_dataset   = CustomDataset(test_ds)

pad_id = tokenizer.pad_token_id
collate = make_collate_fn(pad_id=pad_id)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate, num_workers=0, pin_memory=True)
val_loader   = DataLoader(val_dataset,   batch_size=128, shuffle=False, collate_fn=collate, num_workers=0, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=128, shuffle=False, collate_fn=collate, num_workers=0, pin_memory=True)

print(f'Loader is ready. Example batches: {len(train_ds)}')

Loader is ready. Example batches: 1280398


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=256, hidden=512, padding_idx=pad_id).to(device)
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002531905600554039)

## Запустим циклы обучения

In [None]:
num_epochs = 5
for epoch in range(num_epochs):
    print(device)
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f'Обучение {epoch+1} прошло, оцениваем val_loss')
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Оценили val_los на {epoch+1} эпохе, считаем rouge_scores')
    

    print(f"Epoch {epoch+1}/{num_epochs}  TrainLoss={train_loss:.6f}  ValLoss={val_loss:.6f} ")

cuda
Обучение 1 прошло, оцениваем val_loss
Оценили val_los на 1 эпохе, считаем rouge_scores
Epoch 1/5  TrainLoss=4.773986  ValLoss=4.657450 
cuda
Обучение 2 прошло, оцениваем val_loss
Оценили val_los на 2 эпохе, считаем rouge_scores
Epoch 2/5  TrainLoss=4.755271  ValLoss=4.640446 
cuda
Обучение 3 прошло, оцениваем val_loss
Оценили val_los на 3 эпохе, считаем rouge_scores
Epoch 3/5  TrainLoss=4.742275  ValLoss=4.640017 
cuda
Обучение 4 прошло, оцениваем val_loss
Оценили val_los на 4 эпохе, считаем rouge_scores
Epoch 4/5  TrainLoss=4.732778  ValLoss=4.627054 
cuda
Обучение 5 прошло, оцениваем val_loss
Оценили val_los на 5 эпохе, считаем rouge_scores
Epoch 5/5  TrainLoss=4.726251  ValLoss=4.627170 


## Оценим качество модели

In [151]:
rouge_scores = compute_rouge(model, val_loader, tokenizer, device, pad_id)
print(f"ROUGE-1={rouge_scores['rouge1']:.4f}  ROUGE-2={rouge_scores['rouge2']:.4f}  ROUGE-L={rouge_scores['rougeL']:.4f}")

ROUGE-1=0.0020  ROUGE-2=0.0002  ROUGE-L=0.0020


## Сохраним модель

In [None]:
# Путь для сохранения
model_path = "models/rnn_lm_3.pt"

# Сохраняем веса
torch.save(model.state_dict(), model_path)

# Для загрузки
# 1. создаём объект модели с такой же архитектурой
loaded_model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=256, hidden=512, padding_idx=pad_id).to(device)

# 2. загружаем веса
loaded_model.load_state_dict(torch.load(model_path, map_location=device))
loaded_model.eval() 

## Сгенерируем текст

In [118]:
seed = train_ds[0]["input_ids"][:5]  

# генерируем 30 токенов
gen_tokens = loaded_model.generate(seed, max_len=30, temperature=1.0, pad_id=pad_id, device=device)

# обратно в текст
generated_text = tokenizer.decode(gen_tokens)
print(tokenizer.decode(seed))
print(generated_text)

user facebook iphone app is


user facebook iphone app is on aim anymore luckily i have a blackberry bold but its not available yet either sometimes they never seem to be happening anymore its not available anywhere near me anymore


## Код для нахождения гиперпараметров

In [18]:
objective = make_objective(tokenizer, pad_id, collate, train_ds, val_ds, train_epoch, criterion, evaluate, num_epochs=3, device=device)

In [21]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # 20 экспериментов
print(study.best_params)

[I 2025-11-09 13:26:42,329] A new study created in memory with name: no-name-bac1e20e-6e89-4474-8b2c-ca43ed95ae2f
  lr = trial.suggest_loguniform('lr', 1e-4, 1e-2)


hidden - 256, emb - 128, lr - 0.002877720935023081, bs - 64


[W 2025-11-09 13:28:51,830] Trial 0 failed with parameters: {'hidden': 256, 'emb': 128, 'lr': 0.002877720935023081, 'batch_size': 64} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\tatya\OneDrive\Documents\auto-completion-of-texts\venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "c:\Users\tatya\OneDrive\Documents\auto-completion-of-texts\src\hyper_pram_optim.py", line 27, in objective
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tatya\OneDrive\Documents\auto-completion-of-texts\src\train_loop.py", line 35, in train_epoch
    n_tokens = mask_b.sum().item()
               ^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
[W 2025-11-09 13:28:51,854] Trial 0 failed with value None.


KeyboardInterrupt: 

In [44]:
print(study.best_params)

{'hidden': 256, 'emb': 128, 'lr': 0.002531905600554039, 'batch_size': 128}
