In [None]:
import os
import pandas as pd
from src.data_utils import clean_text
from src.next_token_dataset import CustomDataset, make_collate_fn
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from datasets import Dataset
from src.eval_lstm import compute_rouge

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw corpus: one sample per line
with open('data/raw_dataset.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Trim surrounding whitespace and skip lines that end up empty
texts = []
for line in lines:
    stripped = line.strip()
    if stripped:
        texts.append(stripped)

# Run every remaining sample through the project's clean_text helper
cleaned_dataset = list(map(clean_text, texts))


In [3]:
# Preview the first five cleaned samples
cleaned_dataset[:5]

[" user url awww that's a bummer you shoulda got david carr of third day to do it d",
 "is upset that he can't update his facebook by texting it and might cry as a result school today also blah ",
 ' user i dived many times for the ball managed to save 50 the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 " user no it's not behaving at all i'm mad why am i here because i can't see you all over there "]

In [4]:
# Save the cleaned text, one sample per line

output_path = 'data/cleaned_data.txt'

# Create the target directory if it does not exist yet
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned_dataset))

In [5]:
# Checkpoint name handed to from_pretrained to select the tokenizer
model_name = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [6]:
# Register a [PAD] token only when the tokenizer does not already have one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

pad_id = tokenizer.pad_token_id
sep_id = tokenizer.sep_token_id
unk_id = tokenizer.unk_token_id
# len(tokenizer) also counts tokens added via add_special_tokens, whereas
# tokenizer.vocab_size reports only the base vocabulary.
vocab_size = len(tokenizer)

print(f'pad_id - {pad_id}, sep_id - {sep_id}, unk_id - {unk_id}, vocab_size - {vocab_size}')

pad_id - 0, sep_id - 102, unk_id - 100, vocab_size - 30522


In [7]:
# Wrap the cleaned strings in a Dataset with a single "text" column
ds = Dataset.from_dict({"text": cleaned_dataset})

In [8]:
# Batched tokenization, without adding [CLS]/[SEP] to each line
def tokenize_fn(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook tokenizer.

    Special tokens are not added (``add_special_tokens=False``).
    """
    texts = batch["text"]
    encoded = tokenizer(texts, add_special_tokens=False)
    return encoded

In [9]:
# Tokenize the whole dataset in batches, reusing tokenize_fn instead of an
# inline lambda that duplicated it.
ds_tok = ds.map(
    tokenize_fn,
    batched=True,
    batch_size=1000,           # tune to the available RAM
    remove_columns=["text"]
)

# Return torch tensors on access and persist the tokenized dataset
ds_tok.set_format(type="torch")
ds_tok.save_to_disk("data/tokenized_dataset")

Map: 100%|██████████| 1600498/1600498 [00:40<00:00, 39262.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1600498/1600498 [00:00<00:00, 3213324.34 examples/s]


In [10]:
# Sanity check: first 20 token ids of sample 0 and the text they decode to
print(ds_tok[0]["input_ids"][:20])
print(tokenizer.decode(ds_tok[0]["input_ids"][:20]))

tensor([ 5310, 24471,  2140, 22091,  2860,  2860,  2008,  1005,  1055,  1037,
        26352,  5017,  2017,  2323,  2050,  2288,  2585, 12385,  1997,  2353])
user url awww that ' s a bummer you shoulda got david carr of third


In [11]:
# Pull out the whole "input_ids" column of the tokenized dataset
all_ids = ds_tok['input_ids']

In [12]:
# The dataset reports its own row count, so there is no need to probe the
# extracted column or to swallow arbitrary exceptions around len().
N = len(ds_tok)

print("Total sequences:", N)

Total sequences: 1600498


In [13]:
import numpy as np

In [14]:
# Hold out 10% of the rows for testing, then carve a validation set of the
# same size out of the remainder. An absolute count replaces the approximate
# 0.111111 (~1/9) ratio, so the val and test sizes match by construction.
indices = np.arange(N)
train_idx, test_idx = train_test_split(indices, test_size=0.10, random_state=42)
train_idx, val_idx = train_test_split(train_idx, test_size=len(test_idx), random_state=42)

In [15]:
# Sizes of the train / validation / test index arrays
print("Counts (indices):", len(train_idx), len(val_idx), len(test_idx))

Counts (indices): 1280398 160050 160050


In [16]:
# Build the three splits by selecting rows of the tokenized dataset by index
train_ds = ds_tok.select(train_idx)
val_ds   = ds_tok.select(val_idx)
test_ds  = ds_tok.select(test_idx)

In [17]:
# Make sure the output directory exists
os.makedirs("data", exist_ok=True)

# Persist each split to disk
train_ds.save_to_disk("data/train_ds")
val_ds.save_to_disk("data/val_ds")
test_ds.save_to_disk("data/test_ds")

Saving the dataset (1/1 shards): 100%|██████████| 1280398/1280398 [00:07<00:00, 170725.04 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 160050/160050 [00:00<00:00, 210315.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 160050/160050 [00:00<00:00, 169625.33 examples/s]


In [18]:
# Switch every split to torch format, exposing only the "input_ids" column

for split in (train_ds, val_ds, test_ds):
    split.set_format(type="torch", columns=["input_ids"])

In [19]:
# NOTE(review): none of these constants is referenced in the visible cells
# (the DataLoader cell hard-codes batch sizes 32/64) — confirm they are still needed.
SEQ_LEN = 64
STEP = 64
BATCH_SIZE = 8

In [20]:
# Wrap each split in the project's CustomDataset.
# NOTE(review): the visible DataLoader cell is built from train_ds / val_ds /
# test_ds rather than from these wrappers — confirm which one is intended.
train_dataset = CustomDataset(train_ds)
val_dataset   = CustomDataset(val_ds)
test_dataset   = CustomDataset(test_ds)

In [21]:
# Padding token id taken from the tokenizer
pad_id = tokenizer.pad_token_id

In [22]:
# Build the batch collate function, parameterised by the padding id
collate = make_collate_fn(pad_id=pad_id)

In [23]:
# Options shared by all three loaders
loader_kwargs = dict(collate_fn=collate, num_workers=0, pin_memory=True)

# Only the training split is shuffled; evaluation splits keep their order
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, **loader_kwargs)

In [24]:
# NOTE(review): the label says "batches", but len(train_ds) is the number of
# examples in the training split, not the number of batches.
print(
    f'Loader is ready. Example batches: {len(train_ds)}'
)

Loader is ready. Example batches: 1280398


In [25]:
from src.lstm_model import RNN
import torch

In [26]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `vacab_size` (sic) is presumably the keyword spelled that way in
# src.lstm_model.RNN — confirm before renaming.
model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=128, hidden=256, padding_idx=pad_id).to(device)
# Target positions equal to pad_id are excluded from the loss
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
def train_epochs(model, loader):
    """Run one training pass over ``loader`` and return the mean per-batch loss.

    Relies on the notebook-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: module called as ``model(inputs)`` to produce logits.
        loader: iterable of batches whose first two items are
            ``(inputs, targets)``; any further items are ignored, so both the
            3-item and the 4-item batch layouts used in this notebook work.

    Returns:
        Sum of the batch losses divided by ``len(loader)``.
    """
    model.train()
    total_loss = 0

    for batch in loader:
        inps_b = batch[0].to(device)
        tgts_b = batch[1].to(device)

        optimizer.zero_grad()
        # One forward pass per step (the previous extra pass was discarded).
        logits = model(inps_b)
        if logits.dim() == 3:
            # Per-token logits: CrossEntropyLoss wants (N, C) logits and (N,)
            # targets, so fold the batch and sequence dimensions together.
            loss = criterion(logits.reshape(-1, logits.size(-1)), tgts_b.reshape(-1))
        else:
            loss = criterion(logits, tgts_b)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(loader)


In [29]:
def evaluate(model, loader):
    """Return the accuracy of ``model``'s argmax predictions over ``loader``.

    Relies on the notebook-level ``device`` and on ``accuracy_score``.

    Args:
        model: module called as ``model(inputs)`` to produce logits whose last
            dimension indexes the classes.
        loader: iterable of batches whose first two items are
            ``(inputs, targets)``. For per-token (3-D) logits a third item, if
            present, is assumed to be a boolean mask of the target positions
            to score — TODO confirm against make_collate_fn.

    Returns:
        Accuracy as computed by ``accuracy_score`` on the flattened labels.
    """
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in loader:
            inps_b = batch[0].to(device)
            tgts_b = batch[1].to(device)
            logits = model(inps_b)

            # argmax over the class dimension (the last one), then flatten so
            # accuracy_score receives flat label lists.
            batch_preds = torch.argmax(logits, dim=-1).reshape(-1).cpu()
            batch_trues = tgts_b.reshape(-1).cpu()

            if logits.dim() == 3 and len(batch) > 2 and batch[2] is not None:
                keep = batch[2].reshape(-1).cpu().bool()
                batch_preds = batch_preds[keep]
                batch_trues = batch_trues[keep]

            preds += batch_preds.tolist()
            trues += batch_trues.tolist()
    return accuracy_score(trues, preds)

In [30]:
# for epoch in range(10):
#     loss = train_epochs(model, train_loader)
#     acc = evaluate(model, val_loader)
#     print(f'Epoch {epoch+1}: Loss = {loss:.4f}, Accuracy = {acc:,4}')

In [None]:
import torch
from sklearn.metrics import accuracy_score

# Assumes the following are already defined:
# model, optimizer, criterion, device, pad_id
# train_loader, val_loader — DataLoaders whose batches unpack as (inps, tgts, mask, extra);
# the loops below ignore the fourth item.
# inps: [B, L], tgts: [B, L], mask: [B, L] (bool) where mask = (tgts != pad_id)

def train_epoch(model, loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: module mapping inputs ``[B, L]`` to logits ``[B, L, V]``.
        loader: iterable of ``(inps, tgts, mask, extra)`` batches. ``mask``
            marks the target positions that count towards the loss; ``extra``
            is ignored.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss taking ``(N, V)`` logits and ``(N,)`` targets. It is
            assumed to average over exactly the positions selected by ``mask``
            (e.g. ``CrossEntropyLoss(ignore_index=pad_id)``).
        device: device the batch tensors are moved to.

    Returns:
        Loss averaged per counted token, or ``nan`` if no token was counted.

    Raises:
        RuntimeError: if the model output is not 3-dimensional.
    """
    model.train()
    total_loss = 0.0
    total_tokens = 0  # counted (non-padding) target tokens seen so far

    for inps_b, tgts_b, mask_b, _ in loader:
        inps_b = inps_b.to(device)        # [B, L]
        tgts_b = tgts_b.to(device)        # [B, L]
        mask_b = mask_b.to(device)        # [B, L] bool

        # A batch without any counted token would yield a NaN mean loss and
        # NaN gradients that corrupt the weights, so skip it entirely.
        n_tokens = int(mask_b.sum().item())
        if n_tokens == 0:
            continue

        optimizer.zero_grad()
        logits = model(inps_b)            # expected [B, L, V]
        if logits.dim() != 3:
            raise RuntimeError(f"Expected model output [B,L,V], got {logits.shape}")

        B, L, V = logits.size()

        # Fold batch and sequence dimensions: (B*L, V) logits, (B*L,) targets.
        # reshape also handles non-contiguous tensors, unlike view.
        logits_flat = logits.reshape(B * L, V)
        targets_flat = tgts_b.reshape(B * L)

        loss = criterion(logits_flat, targets_flat)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # The criterion returns a per-token mean; weight it by the token count
        # so the epoch average is per token rather than per batch.
        total_loss += loss.item() * n_tokens
        total_tokens += n_tokens

    avg_loss = total_loss / total_tokens if total_tokens > 0 else float('nan')
    return avg_loss


def evaluate(model, loader, criterion, device):
    """Compute per-token loss and accuracy of ``model`` over ``loader``.

    Args:
        model: module mapping inputs ``[B, L]`` to logits ``[B, L, V]``.
        loader: iterable of ``(inps, tgts, mask, extra)`` batches. ``mask``
            marks the target positions that are scored; ``extra`` is ignored.
        criterion: loss taking ``(N, V)`` logits and ``(N,)`` targets. It is
            assumed to average over exactly the positions selected by ``mask``.
        device: device the batch tensors are moved to.

    Returns:
        ``(avg_loss, accuracy)``, both computed over the masked positions only;
        each is ``nan`` when no position was scored.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    total_correct = 0

    with torch.no_grad():
        for inps_b, tgts_b, mask_b, _ in loader:
            inps_b = inps_b.to(device)
            tgts_b = tgts_b.to(device)
            mask_b = mask_b.to(device).bool()

            # A batch with nothing to score would give a NaN mean loss that
            # poisons the running total, so skip it.
            n_tokens = int(mask_b.sum().item())
            if n_tokens == 0:
                continue

            logits = model(inps_b)               # [B, L, V]
            B, L, V = logits.size()

            loss = criterion(logits.reshape(B * L, V), tgts_b.reshape(B * L))
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

            # argmax over the vocabulary dimension -> [B, L]; count hits on
            # scored positions with running counters instead of accumulating
            # every prediction in Python lists.
            preds = torch.argmax(logits, dim=2)
            total_correct += int(((preds == tgts_b) & mask_b).sum().item())

    if total_tokens == 0:
        return float('nan'), float('nan')
    return total_loss / total_tokens, total_correct / total_tokens


# Example training loop
num_epochs = 1
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    rouge_scores = compute_rouge(model, val_loader, tokenizer, device, pad_id)

    # Report validation accuracy alongside the ROUGE scores; it used to be
    # computed and then dropped.
    print(f"Epoch {epoch+1}/{num_epochs}  TrainLoss={train_loss:.6f}  ValLoss={val_loss:.6f}  ValAcc={val_acc:.4f} "
          f"ROUGE-1={rouge_scores['rouge1']:.4f}  ROUGE-2={rouge_scores['rouge2']:.4f}  ROUGE-L={rouge_scores['rougeL']:.4f}")

In [1]:
# Where the weights are stored
model_path = "models/rnn_lm.pt"

# torch.save does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the weights only (state_dict)
torch.save(model.state_dict(), model_path)

# To load them back:
# 1. build a model object with the same architecture
loaded_model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=128, hidden=256, padding_idx=pad_id).to(device)

# 2. restore the weights and switch to inference mode
loaded_model.load_state_dict(torch.load(model_path, map_location=device))
loaded_model.eval()

NameError: name 'torch' is not defined

In [None]:
# Seed: the first five token ids of the first training sample
seed = train_ds[0]["input_ids"][:5]  

# Generate with max_len=30 (presumably up to 30 new tokens — confirm in RNN.generate)
gen_tokens = loaded_model.generate(seed, max_len=30, temperature=1.0, pad_id=pad_id, device=device)

# Decode back to text; print the seed first, then the generated sequence
generated_text = tokenizer.decode(gen_tokens)
print(tokenizer.decode(seed))
print(generated_text)

user facebook iphone app is
user facebook iphone app is working 3 years ago still surviving ink you ' re making me see the companies more effort to even need 4 a life jealous not hear that u still pick


In [2]:
import optuna

ModuleNotFoundError: No module named 'optuna'

In [None]:
def objective(trial):
    """Optuna objective: briefly train a fresh RNN and return its validation loss.

    Samples the hidden size, embedding size, learning rate and batch size from
    ``trial``, trains for two epochs on ``train_ds`` and evaluates on ``val_ds``.
    Relies on the notebook-level ``tokenizer``, ``pad_id``, ``device``,
    ``criterion``, ``collate``, ``train_ds`` and ``val_ds``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The validation loss returned by ``evaluate`` (lower is better).
    """
    hidden = trial.suggest_categorical('hidden', [128, 256])
    emb = trial.suggest_categorical('emb', [64, 128])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    bs = trial.suggest_categorical('batch_size', [64, 128])

    model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=emb, hidden=hidden, padding_idx=pad_id).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # The notebook's collate function is named `collate`; `collate_fn` is not
    # defined in the visible cells and would raise NameError.
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True, collate_fn=collate)
    val_loader = DataLoader(val_ds, batch_size=bs, collate_fn=collate)

    # Two short epochs are enough for a rough comparison between trials
    for _ in range(2):
        train_epoch(model, train_loader, optimizer, criterion, device)

    val_loss, _ = evaluate(model, val_loader, criterion, device)
    return val_loss

# Minimise the objective's return value (validation loss) over 20 trials
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # 20 trials
print(study.best_params)