In [1]:
import os
import pandas as pd
from src.data_utils import clean_text
from src.next_token_dataset import CustomDataset, collate_fn
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read txt-file
with open('data/raw_dataset.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# delete \n
texts = [line.strip() for line in lines if line.strip()]

# clean data
cleaned_dataset = [clean_text(text) for text in texts]


In [3]:
cleaned_dataset[:5]

[' switchfoot http twitpic com 2y1zl awww that s a bummer you shoulda got david carr of third day to do it d',
 'is upset that he can t update his facebook by texting it and might cry as a result school today also blah ',
 ' kenichan i dived many times for the ball managed to save 50 the rest go out of bounds',
 'my whole body feels itchy and like its on fire',
 ' nationwideclass no it s not behaving at all i m mad why am i here because i can t see you all over there ']

In [4]:
# save clearned txt

output_path = 'data/cleaned_data.txt'

os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(cleaned_dataset))

In [5]:
model_name = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(model_name)

In [6]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': ['PAD']})

pad_id = tokenizer.pad_token_id
sep_id = tokenizer.sep_token_id
unk_id = tokenizer.unk_token_id
vocab_size = tokenizer.vocab_size

print(f'pad_id - {pad_id}, sep_id - {sep_id}, unk_id - {unk_id}, vocab_size - {vocab_size}')

pad_id - 0, sep_id - 102, unk_id - 100, vocab_size - 30522


In [7]:
# tokenization

tokenized_data = tokenizer(cleaned_dataset, add_special_tokens=False, return_attention_mask=False)
all_ids = tokenized_data['input_ids']

In [8]:
print(all_ids[0][:20])

[6942, 13064, 8299, 1056, 9148, 25856, 2594, 4012, 1016, 2100, 2487, 2480, 2140, 22091, 2860, 2860, 2008, 1055, 1037, 26352]


In [9]:
train_sents, test_sents = train_test_split(all_ids, test_size=0.1, random_state=42)
train_sents, val_sents = train_test_split(train_sents, test_size=0.1, random_state=42)

In [10]:
# output_paths = ['data/train_data.txt', 'data/val_data.txt', 'data/test_data.txt']
# datasets = [train_sents, val_sents, test_sents]

# for num, path in enumerate(output_paths):

#     os.makedirs(os.path.dirname(path), exist_ok=True)

#     with open(path, 'w', encoding='utf-8') as f:
#         sents = ['\n'.join(str(sent)) for sent in datasets[num]]
#         f.write('\n'.join(sents))

In [11]:
print(f'Train size {len(train_sents)}, val size {len(val_sents)}, test_size {len(test_sents)}')

Train size 1296403, val size 144045, test_size 160050


In [12]:
def build_blocks_from_stream(sequences: list[list[int]],
                             seq_len: int,
                             sep_id: int = None,
                             step: int = None
                             ) -> tuple[list[list[int]], list[list[int]]]:
    """ 

    """
    stream = []
    for seq in sequences:
        if len(seq) == 0:
            continue
        stream.extend(seq)
        if sep_id is not None:
            stream.append(sep_id)
    if len(stream) < 2:
        return [], []
    
    if step is None:
        step = seq_len

    inputs, targets = [], []
    for i in range(0, len(stream)-1, step):
        inp = stream[i:i+seq_len]
        tgt = stream[i+1:i+seq_len+1]
        if len(inp) == 0:
            continue
        inputs.append(inp) 
        targets.append(tgt)
    return inputs, targets

In [13]:
SEQ_LEN = 64
STEP = 64

train_inputs, train_targets = build_blocks_from_stream(train_sents, seq_len=SEQ_LEN, sep_id=sep_id, step=STEP)
val_inputs, val_targets = build_blocks_from_stream(val_sents, seq_len=SEQ_LEN, sep_id=sep_id, step=STEP)
test_inputs, test_targets = build_blocks_from_stream(test_sents, seq_len=SEQ_LEN, sep_id=sep_id, step=STEP)

print(f'Blocks {len(train_inputs)}, {len(val_inputs)}, {len(test_inputs)}')

Blocks 362389, 40228, 44662


In [14]:
BATCH_SIZE = 8

In [15]:
train_ds = CustomDataset(train_inputs, train_targets)
val_ds = CustomDataset(val_inputs, val_targets)
test_ds = CustomDataset(test_inputs, test_targets)

In [16]:
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [17]:
print(
    f'Loader is ready. Example batches: {len(train_ds)}'
)

Loader is ready. Example batches: 362389


In [18]:
from src.lstm_model import RNN
import torch

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(vacab_size=tokenizer.vocab_size, emb_dim=128, hidden=256, padding_idx=pad_id).to(device)
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
def train_epochs(model, loader):
    model.train()
    total_loss = 0

    for inps_b, tgts_b, mask_b in loader:
        print(type(inps_b))
        inps_b = inps_b.to(device)
        tgts_b = tgts_b.to(device)
        logits = model(inps_b)

        optimizer.zero_grad()
        logits = model(inps_b)
        loss = criterion(logits, tgts_b)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(loader)


In [22]:
def evaluate(model, loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for inps_b, tgts_b, _ in loader:
            inps_b = inps_b.to(device)
            tgts_b = tgts_b.to(device)
            logits = model(inps_b)
            preds += torch.argmax(logits, dim=1).cpu().tolist()
            trues += tgts_b.tolist()
    return accuracy_score(trues, preds)

In [23]:
# for epoch in range(10):
#     loss = train_epochs(model, train_loader)
#     acc = evaluate(model, val_loader)
#     print(f'Epoch {epoch+1}: Loss = {loss:.4f}, Accuracy = {acc:,4}')

In [24]:
import torch
from sklearn.metrics import accuracy_score

# предполагается, что:
# model, optimizer, criterion, device, pad_id уже определены
# train_loader, val_loader — DataLoader, возвращающие (inps, tgts, mask)
# inps: [B, L], tgts: [B, L], mask: [B, L] (bool) where mask = (tgts != pad_id)

def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    total_tokens = 0  # для усреднения по токенам (не включая паддинги)

    for inps_b, tgts_b, mask_b in loader:
        # Переводим на устройство
        inps_b = inps_b.to(device)        # [B, L]
        tgts_b = tgts_b.to(device)        # [B, L]
        mask_b = mask_b.to(device)        # [B, L] bool

        optimizer.zero_grad()
        logits = model(inps_b)            # ожидаем [B, L, V]
        if logits.dim() != 3:
            raise RuntimeError(f"Expected model output [B,L,V], got {logits.shape}")

        B, L, V = logits.size()

        # CrossEntropyLoss expects (N, C) and targets (N,)
        # Складываем первые два измерения: (B*L, V), targets (B*L,)
        logits_flat = logits.view(B * L, V)            # [B*L, V]
        targets_flat = tgts_b.view(B * L)              # [B*L]

        loss = criterion(logits_flat, targets_flat)    # ignore_index=pad_id учтёт паддинг
        loss.backward()
        optimizer.step()

        # Считаем метрики: суммируем loss (scale по батчам)
        # Для корректного усреднения берём количество ненулевых токенов (mask)
        n_tokens = mask_b.sum().item()
        total_loss += loss.item() * n_tokens           # loss суммирован по токенам
        total_tokens += n_tokens

    avg_loss = total_loss / total_tokens if total_tokens > 0 else float('nan')
    return avg_loss


def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    all_preds = []
    all_trues = []

    with torch.no_grad():
        for inps_b, tgts_b, mask_b in loader:
            inps_b = inps_b.to(device)
            tgts_b = tgts_b.to(device)
            mask_b = mask_b.to(device)

            logits = model(inps_b)               # [B, L, V]
            B, L, V = logits.size()

            logits_flat = logits.view(B * L, V)
            targets_flat = tgts_b.view(B * L)

            loss = criterion(logits_flat, targets_flat)
            n_tokens = mask_b.sum().item()
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

            # Предсказания: argmax по измерению словаря -> shape [B, L]
            preds = torch.argmax(logits, dim=2).cpu()   # [B, L]
            trues = tgts_b.cpu()                        # [B, L]

            # Добавляем только непаддинговые позиции в списки для accuracy
            mask_cpu = mask_b.cpu()
            # флаттеним и фильтруем
            preds_flat = preds.view(-1)
            trues_flat = trues.view(-1)
            mask_flat = mask_cpu.view(-1)
            if mask_flat.sum().item() > 0:
                all_preds.extend(preds_flat[mask_flat].tolist())
                all_trues.extend(trues_flat[mask_flat].tolist())

    avg_loss = total_loss / total_tokens if total_tokens > 0 else float('nan')
    accuracy = accuracy_score(all_trues, all_preds) if len(all_trues) > 0 else float('nan')
    return avg_loss, accuracy


# Пример использования в цикле обучения
num_epochs = 10
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}  TrainLoss={train_loss:.6f}  ValLoss={val_loss:.6f}  ValAcc={val_acc:.4f}")

  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 1/10  TrainLoss=5.251078  ValLoss=5.112060  ValAcc=0.2110


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 2/10  TrainLoss=5.063306  ValLoss=5.068559  ValAcc=0.2152


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 3/10  TrainLoss=5.022499  ValLoss=5.049219  ValAcc=0.2170


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 4/10  TrainLoss=5.003410  ValLoss=5.041829  ValAcc=0.2178


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 5/10  TrainLoss=4.992799  ValLoss=5.037931  ValAcc=0.2184


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 6/10  TrainLoss=4.986146  ValLoss=5.034297  ValAcc=0.2189


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 7/10  TrainLoss=4.982017  ValLoss=5.036502  ValAcc=0.2191


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 8/10  TrainLoss=4.978601  ValLoss=5.037305  ValAcc=0.2193


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 9/10  TrainLoss=4.975610  ValLoss=5.029607  ValAcc=0.2197


  inputs = [torch.tensor(x[0], dtype=torch.long) for x in batch]
  targets = [torch.tensor(x[1], dtype=torch.long) for x in batch]


Epoch 10/10  TrainLoss=4.972249  ValLoss=5.030882  ValAcc=0.2199
