In [None]:
import os
from abc import ABC, abstractmethod

import polars as pls
import pytorch_lightning as pl
import tokenizers
import torch
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sacrebleu import corpus_bleu, corpus_chrf
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, Dataset


# Загрузите обучающий и тестовый датасеты.

## Загрузка датасета и его анализ

In [3]:
df = pls.read_csv(
    "https://github.com/0xFEE1DEAD/tatoeba_rus_to_eng/raw/refs/heads/main/tatoeba_rus_to_eng.tsv",
    separator="\t",
    has_header=False,
    new_columns=["ru_id", "ru", "eng_id", "eng"],
    encoding="utf8",
    quote_char=None,
    ignore_errors=True,
    truncate_ragged_lines=True,
    n_rows=15000,
)

In [4]:
df.head()

ru_id,ru,eng_id,eng
i64,str,i64,str
243,"""Один раз в жизни я делаю хорош…",3257,"""For once in my life I'm doing …"
5409,"""Давайте что-нибудь попробуем!""",1276,"""Let's try something."""
5410,"""Мне пора идти спать.""",1277,"""I have to go to sleep."""
5411,"""Что ты делаешь?""",16492,"""What are you doing?"""
5411,"""Что ты делаешь?""",511884,"""What do you make?"""


In [5]:
df.shape

(15000, 4)

In [6]:
df.null_count()

ru_id,ru,eng_id,eng
u32,u32,u32,u32
0,0,0,0


## Подготовка токенайзера

In [7]:
tokenizer = tokenizers.SentencePieceBPETokenizer()
tokenizer.normalizer = tokenizers.normalizers.Sequence(
    [
        tokenizers.normalizers.NFD(),
        tokenizers.normalizers.Lowercase(),
        tokenizers.normalizers.StripAccents(),
    ]
)

tokenizer.train_from_iterator(
    df["ru"].to_list() + df["eng"].to_list(),
    special_tokens=["<s>", "</s>", "<unk>", "<pad>"],
)
tokenizer.save("seq2seq_tokenizer.json")

In [8]:
tokenizer = tokenizers.Tokenizer.from_file("seq2seq_tokenizer.json")

In [9]:
tokenizer.encode("Hello, Привет").ids

[6112, 2745]

In [10]:
class Tokenizer:
    def __init__(self, tokenizer: tokenizers.Tokenizer, max_len: int):
        self.tkz = tokenizer
        self.max_len = max_len

    def encode(self, seq: str) -> list[int]:
        encoded = self.tkz.encode(seq)
        encoded = encoded.ids[: self.max_len - 2]

        if len(encoded) < (self.max_len - 2):
            encoded += [self.tkz.token_to_id("<pad>")] * (self.max_len - len(encoded) - 2)

        return [self.tkz.token_to_id("<s>")] + encoded + [self.tkz.token_to_id("</s>")]

    def decode(self, seq: list[int]) -> str:
        return self.tkz.decode(seq, skip_special_tokens=True)

    def get_pad_token_id(self) -> int:
        return self.tkz.token_to_id("<pad>")

    def get_bos_token_id(self) -> int:
        return self.tkz.token_to_id("<s>")

    def get_eos_token_id(self) -> int:
        return self.tkz.token_to_id("</s>")

    def get_vocab_size(self) -> int:
        return self.tkz.get_vocab_size()

In [11]:
test_my_tokenizer = Tokenizer(tokenizer, 5)
print(test_my_tokenizer.encode("Hello, Привет"))
print(len(test_my_tokenizer.encode("Hello, Привет")))


[0, 6112, 2745, 3, 1]
5


In [12]:
print(test_my_tokenizer.encode("Hello, Привет Привет Привет Привет"))
print(len(test_my_tokenizer.encode("Hello, Привет Привет Привет Привет Привет")))

[0, 6112, 2745, 2745, 1]
5


In [13]:
print(test_my_tokenizer.decode(test_my_tokenizer.encode("Hello, Привет Привет Привет Привет Привет")))

hello, привет привет


In [14]:
complete_tokenizer = Tokenizer(tokenizer, 128)

## Определение класса датасета

In [15]:
class TextDataset(Dataset):
    def __init__(
        self,
        tokenizer: Tokenizer,
        ru_texts: list[str],
        en_texts: list[str],
    ):
        self.ru_texts = ru_texts
        self.en_texts = en_texts
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        return len(self.ru_texts)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        ru_text = self.ru_texts[idx]
        en_text = self.en_texts[idx]

        ru_tokenized = self.tokenizer.encode(ru_text)
        en_tokenized = self.tokenizer.encode(en_text)

        return torch.tensor(ru_tokenized, dtype=torch.long), torch.tensor(en_tokenized, dtype=torch.long)

## Определение датамодуля

In [16]:
class TextDataModule(pl.LightningDataModule):
    def __init__(
        self,
        df: pls.DataFrame,
        tokenizer: Tokenizer,
        batch_size: int = 32,
        num_workers: int = 4,
        val_test_size: float = 0.2,
        test_size: float = 0.5,
        random_state: int = 42,
    ):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.val_test_size = val_test_size
        self.test_size = test_size
        self.random_state = random_state
        self.tokenizer = tokenizer

        # Отключаем параллелизм tokenizers — предотвращает дедлоки
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    def setup(self, stage: str | None = None) -> None:
        # Выбираем колонки
        ru_texts = self.df["ru"].cast(pls.Utf8).to_list()
        en_texts = self.df["eng"].cast(pls.Utf8).to_list()

        train_ru, temp_ru, train_en, temp_en = train_test_split(
            ru_texts,
            en_texts,
            test_size=self.val_test_size,
            random_state=self.random_state,
            shuffle=True,
        )
        # Разделение temp на val и test
        val_ru, test_ru, val_en, test_en = train_test_split(
            temp_ru,
            temp_en,
            test_size=self.test_size,
            random_state=self.random_state,
            shuffle=True,
        )

        self.train_dataset = TextDataset(
            ru_texts=train_ru,
            en_texts=train_en,
            tokenizer=self.tokenizer,
        )
        self.val_dataset = TextDataset(
            ru_texts=val_ru,
            en_texts=val_en,
            tokenizer=self.tokenizer,
        )
        self.test_dataset = TextDataset(
            ru_texts=test_ru,
            en_texts=test_en,
            tokenizer=self.tokenizer,
        )

    def train_dataloader(self) -> DataLoader:
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
        )

    def val_dataloader(self) -> DataLoader:
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True,
        )

    def test_dataloader(self) -> DataLoader:
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True,
        )

## Проверка что все работает корректно

In [17]:
dm = TextDataModule(
    tokenizer=complete_tokenizer,
    df=df,
    batch_size=24,
    num_workers=0,  # На Windows >0 приводит к deadlock
)
dm.setup()

In [18]:
torch.manual_seed(42)
batch = next(iter(dm.train_dataloader()))
print("✅ from seq:", batch[0].shape)
print("✅ from seq:", batch[0][0])
print("✅ to seq:", batch[1].shape)
print("✅ to seq:", batch[1][0])

✅ from seq: torch.Size([24, 128])
✅ from seq: tensor([    0,   305,   116,   604,  4333,   604,   685, 19654,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,    

# Обучите модели seq2seq и seq2seq + attention на тренинговом датасете для перевода с одного языка на другой

## Определение класса модели

In [None]:
class DecoderInterface(ABC):
    @abstractmethod
    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hidden_size: int,
        padding_idx: int,
        num_layers: int = 1,
        dropout: float = 0.0,
    ) -> None:
        pass

    @abstractmethod
    def forward(
        self,
        x: torch.Tensor,
        hidden: torch.Tensor,
        encoder_outputs: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        pass

In [20]:
class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hidden_size: int,
        padding_idx: int,
        num_layers: int = 1,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        embedded = self.dropout(self.embedding(x))
        outputs, hidden = self.gru(embedded)
        return outputs, hidden


class Decoder(nn.Module, DecoderInterface):
    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hidden_size: int,
        padding_idx: int,
        num_layers: int = 1,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc_out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x: torch.Tensor,
        hidden: torch.Tensor,
        encoder_outputs: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.gru(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden

In [21]:
class Seq2SeqGRU(pl.LightningModule):
    def __init__(
        self,
        tokenizer: Tokenizer,
        emb_dim: int = 256,
        hidden_size: int = 512,
        num_layers: int = 4,
        dropout: float = 0.3,
        learning_rate: float = 1e-3,
        max_length: int = 128,
        decoder_class: type[DecoderInterface] = Decoder,
    ):
        super().__init__()

        self.encoder = Encoder(
            vocab_size=tokenizer.get_vocab_size(),
            emb_dim=emb_dim,
            hidden_size=hidden_size,
            padding_idx=tokenizer.get_pad_token_id(),
            num_layers=num_layers,
            dropout=dropout,
        )
        self.decoder = decoder_class(
            vocab_size=tokenizer.get_vocab_size(),
            emb_dim=emb_dim,
            hidden_size=hidden_size,
            padding_idx=tokenizer.get_pad_token_id(),
            num_layers=num_layers,
            dropout=dropout,
        )

        self.loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.get_pad_token_id(), label_smoothing=0.1)

        self.max_length = max_length
        self.learning_rate = learning_rate
        self.tokenizer = tokenizer
        self.val_predictions = []
        self.val_references = []

    def translate(self, src_text: str) -> str:
        self.eval()
        with torch.no_grad():
            src_ids = self.tokenizer.encode(src_text)

            src_tensor = torch.tensor(src_ids, dtype=torch.long, device=self.device).unsqueeze(0)

            assert (src_tensor[:, 0] == self.tokenizer.get_bos_token_id()).all(), "src must start with BOS!"
            assert (src_tensor[:, -1] == self.tokenizer.get_eos_token_id()).all(), "src must end with EOS!"

            logits = self(src_tensor)

            tokens = self._prepare_logits(logits)

            return self.tokenizer.decode(tokens[0].tolist())

    def forward(
        self,
        src: torch.Tensor,
        tgt: torch.Tensor | None = None,
        teacher_forcing_ratio: float = 0.4,
    ) -> torch.Tensor:
        batch_size = src.size(0)

        encoder_outputs, hidden = self.encoder(src)

        if tgt is not None:
            tgt_len = tgt.size(1)  # L
            outputs = torch.zeros(batch_size, tgt_len, self.tokenizer.get_vocab_size()).to(src.device)

            input_token = tgt[:, 0].unsqueeze(1)

            for t in range(1, tgt_len):
                # Декодер ожидает [B, 1]
                output, hidden = self.decoder(input_token, hidden, encoder_outputs)  # output: [B, V]
                outputs[:, t] = output

                # Решаем, использовать ли teacher forcing
                teacher_force = torch.rand(()).item() < teacher_forcing_ratio
                input_token = tgt[:, t].unsqueeze(1) if teacher_force else output.argmax(1, keepdim=True)

            return outputs

        outputs = []

        input_token = torch.full(
            (batch_size, 1),
            self.tokenizer.get_bos_token_id(),
            dtype=torch.long,
            device=src.device,
        )
        for _ in range(self.max_length):
            output, hidden = self.decoder(input_token, hidden, encoder_outputs)  # [B, V]
            outputs.append(output)

            input_token = output.argmax(1, keepdim=True)  # [B, 1]

        return torch.stack(outputs, dim=1)

    def training_step(self, batch, batch_idx) -> float:
        src, tgt = batch

        assert (tgt[:, 0] == self.tokenizer.get_bos_token_id()).all(), "tgt must start with BOS!"
        assert (tgt[:, -1] == self.tokenizer.get_eos_token_id()).all(), "tgt must end with EOS!"
        assert (src[:, 0] == self.tokenizer.get_bos_token_id()).all(), "src must start with BOS!"
        assert (src[:, -1] == self.tokenizer.get_eos_token_id()).all(), "src must end with EOS!"

        logits = self(src, tgt, teacher_forcing_ratio=0.65)
        out_to_loss = logits[:, 1:].reshape(-1, logits.size(2))
        tgt_to_loss = tgt[:, 1:].reshape(-1)

        loss = self.loss_fn(out_to_loss, tgt_to_loss)

        self.log("train_loss", loss, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx) -> float:
        src, tgt = batch

        assert (tgt[:, 0] == self.tokenizer.get_bos_token_id()).all(), "tgt must start with BOS!"
        assert (tgt[:, -1] == self.tokenizer.get_eos_token_id()).all(), "tgt must end with EOS!"
        assert (src[:, 0] == self.tokenizer.get_bos_token_id()).all(), "src must start with BOS!"
        assert (src[:, -1] == self.tokenizer.get_eos_token_id()).all(), "src must end with EOS!"

        logits = self(src, tgt)
        out_to_loss = logits[:, 1:].reshape(-1, logits.size(2))
        tgt_to_loss = tgt[:, 1:].reshape(-1)
        loss = self.loss_fn(out_to_loss, tgt_to_loss)
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        self._collect_translations(logits, tgt)
        return loss

    def _prepare_logits(self, logits: torch.Tensor) -> torch.Tensor:
        pred_tokens = logits.argmax(-1)
        # очищаем мусор
        for t in range(pred_tokens.size(0)):
            eos_mask = pred_tokens[t] == self.tokenizer.get_eos_token_id()
            cumsum_eos = eos_mask.cumsum(dim=0)
            pred_tokens[t][cumsum_eos > 0] = self.tokenizer.get_pad_token_id()

        return pred_tokens

    def _collect_translations(self, predictions: torch.Tensor, tgt_tokens: torch.Tensor) -> None:
        """Собирает предсказания и референсы для метрик."""
        pred_tokens = self._prepare_logits(predictions)

        for pred, tgt in zip(pred_tokens, tgt_tokens):
            pred_text = self.tokenizer.decode(pred.tolist())
            tgt_text = self.tokenizer.decode(tgt.tolist())

            self.val_predictions.append(pred_text)
            self.val_references.append(tgt_text)

    def on_validation_epoch_end(self):
        if not self.val_predictions and not self.val_references:
            return

        bleu = corpus_bleu(self.val_predictions, [self.val_references])
        chrf = corpus_chrf(self.val_predictions, [self.val_references])

        self.log("val_bleu", bleu.score, prog_bar=True)
        self.log("val_chrf", chrf.score, prog_bar=True)

        self.val_predictions.clear()
        self.val_references.clear()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.99)

### Проверка что все корректно работает

In [22]:
s2sgru = Seq2SeqGRU(complete_tokenizer)

In [26]:
with torch.no_grad():
    s2sgru.training_step(batch, 0)
    s2sgru.validation_step(batch, 0)
    s2sgru.on_validation_epoch_end()

Ошибок нет, можно переходить к обучению

## Обучение модели

In [27]:
def start_leaning(
    name: str,
    n_layers: int,
    embed_size: int,
    hidden_size: int,
    datamodule: TextDataModule,
    tokenizer: Tokenizer,
    decoder_class: type[DecoderInterface] = Decoder,
):
    """Запустить обучение модели."""
    torch.manual_seed(42)
    logger = TensorBoardLogger(
        save_dir="logs",
        name=name,
    )
    checkpoint_callback = ModelCheckpoint(
        dirpath=f"checkpoints/{name}",
        filename=f"best-[ls_{n_layers}-es_{embed_size}-hs_{hidden_size}]" + "-{epoch}-{val_loss:.6f}",
        save_top_k=3,
        monitor="val_loss",
        save_last=True,
        verbose=True,
    )
    early_stop_callback = EarlyStopping(monitor="val_loss", patience=5, mode="min", verbose=True)
    model = Seq2SeqGRU(
        tokenizer=tokenizer,
        num_layers=n_layers,
        emb_dim=embed_size,
        hidden_size=hidden_size,
        learning_rate=0.01,
        decoder_class=decoder_class,
    )
    trainer = pl.Trainer(
        max_epochs=50,
        accelerator="auto",
        logger=logger,
        callbacks=[checkpoint_callback, early_stop_callback],
        log_every_n_steps=10,
    )
    trainer.fit(model, datamodule=datamodule)

In [42]:
torch.set_float32_matmul_precision("medium")
start_leaning("simple_seq2seq_gru", 6, 300, 1024, dm, complete_tokenizer)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\dilst\Documents\uni_lessons_da_and_ml\.venv\Lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:701: Checkpoint directory C:\Users\dilst\Documents\uni_lessons_da_and_ml\src\torch_learning\checkpoints\simple_seq2seq_gru exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | encoder | Encoder          | 43.3 M | train
1 | decoder | Decoder          | 69.5 M | train
2 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
112 M     Trainable params
0         Non-trainable params
112 M     Total params
451.158   Total estimated model params size (MB)
10        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\dilst\Documents\uni_lessons_da_and_ml\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
c:\Users\dilst\Documents\uni_lessons_da_and_ml\.venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 6.872
Epoch 0, global step 500: 'val_loss' reached 6.87159 (best 6.87159), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=0-val_loss=6.871591.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.117 >= min_delta = 0.0. New best score: 6.754
Epoch 1, global step 1000: 'val_loss' reached 6.75428 (best 6.75428), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=1-val_loss=6.754280.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.024 >= min_delta = 0.0. New best score: 6.730
Epoch 2, global step 1500: 'val_loss' reached 6.73008 (best 6.73008), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=2-val_loss=6.730083.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.014 >= min_delta = 0.0. New best score: 6.716
Epoch 3, global step 2000: 'val_loss' reached 6.71601 (best 6.71601), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=3-val_loss=6.716013.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.017 >= min_delta = 0.0. New best score: 6.699
Epoch 4, global step 2500: 'val_loss' reached 6.69927 (best 6.69927), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=4-val_loss=6.699272.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.032 >= min_delta = 0.0. New best score: 6.668
Epoch 5, global step 3000: 'val_loss' reached 6.66751 (best 6.66751), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=5-val_loss=6.667512.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.011 >= min_delta = 0.0. New best score: 6.657
Epoch 6, global step 3500: 'val_loss' reached 6.65674 (best 6.65674), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=6-val_loss=6.656736.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 4000: 'val_loss' reached 6.68064 (best 6.65674), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=7-val_loss=6.680641.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 6.656
Epoch 8, global step 4500: 'val_loss' reached 6.65571 (best 6.65571), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=8-val_loss=6.655707.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.012 >= min_delta = 0.0. New best score: 6.643
Epoch 9, global step 5000: 'val_loss' reached 6.64322 (best 6.64322), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=9-val_loss=6.643221.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 6.637
Epoch 10, global step 5500: 'val_loss' reached 6.63727 (best 6.63727), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=10-val_loss=6.637270.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11, global step 6000: 'val_loss' reached 6.64345 (best 6.63727), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=11-val_loss=6.643453.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 6.636
Epoch 12, global step 6500: 'val_loss' reached 6.63616 (best 6.63616), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=12-val_loss=6.636163.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13, global step 7000: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 14, global step 7500: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 15, global step 8000: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.020 >= min_delta = 0.0. New best score: 6.617
Epoch 16, global step 8500: 'val_loss' reached 6.61659 (best 6.61659), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\simple_seq2seq_gru\\best-[ls_6-es_300-hs_1024]-epoch=16-val_loss=6.616592.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 17, global step 9000: 'val_loss' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 18, global step 9500: 'val_loss' was not in top 3

Detected KeyboardInterrupt, attempting graceful shutdown ...


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


С оптимайзером Adam модель не обучалась совсем, при изменениях learning rate происходил взрыв градиентов. С SGD и большим learning rate модель обучается очень медленно, поэтому процесс пришлось принудительно остановить.

In [43]:
simple_model = Seq2SeqGRU.load_from_checkpoint(
    "checkpoints/simple_seq2seq_gru/best-[ls_6-es_300-hs_1024]-epoch=16-val_loss=6.616592.ckpt",
    tokenizer=complete_tokenizer,
    num_layers=6,
    emb_dim=300,
    hidden_size=1024,
)

In [44]:
simple_model.translate("Мне пора идти спать.")

'he is to the the to the the the the the the to the the to the the to the the the the of the the the the the the of the the the the the the the of the the the the the the the of the the the the the the the the of the the the the the the the the of the the the the the the the the the of the the the the the the the the the of the the the the the the the the the the of the the the the the the the the the the of the the the the the the the the the the the of the the the the the the the the the'

Модель не удается обучить чтобы оценить какие либо результаты, она выдает самые повторяющиеся слова из английского языка которые встречаются почти во всех предложениях.

## + Attention

In [None]:
class DecoderWithAttention(nn.Module, DecoderInterface):
    def __init__(
        self,
        vocab_size: int,
        emb_dim: int,
        hidden_size: int,
        padding_idx: int,
        num_layers: int = 1,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        # GRU теперь принимает emb_dim + hidden_size (эмбеддинг + контекст)
        self.gru = nn.GRU(
            input_size=emb_dim + hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc_out = nn.Linear(hidden_size + hidden_size + emb_dim, vocab_size)  # [dec_hid, context, emb]
        self.dropout = nn.Dropout(dropout)

        # Bahdanau attention
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(
        self,
        x: torch.Tensor,
        hidden: torch.Tensor,
        encoder_outputs: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size = x.shape[0]
        src_len = encoder_outputs.shape[1]

        embedded = self.dropout(self.embedding(x))  # [batch_size, emb_dim]

        # 2. Получаем последний слой скрытого состояния (для attention)
        # hidden: [num_layers, batch_size, hidden_size] → last_layer: [batch_size, hidden_size]
        last_hidden = hidden[-1]  # [batch_size, hidden_size]

        # 3. Вычисляем attention weights
        # Повторяем last_hidden для каждого шага энкодера
        hidden_expanded = last_hidden.unsqueeze(1).repeat(1, src_len, 1)  # [batch_size, src_len, hidden_size]

        # energy: [batch_size, src_len, hidden_size]
        energy = torch.tanh(self.attn(torch.cat((hidden_expanded, encoder_outputs), dim=2)))
        # v: [hidden_size] → [batch_size, 1, hidden_size]
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        # attention: [batch_size, src_len]
        attention = torch.bmm(v, energy.transpose(1, 2)).squeeze(1)
        attention_weights = attention.softmax(dim=1)  # [batch_size, src_len]

        # 4. Контекстный вектор — взвешенная сумма encoder_outputs
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)  # [batch_size, 1, hidden_size]

        # 5. Подготавливаем вход для GRU: [emb + context]
        gru_input = torch.cat((embedded, context), dim=2)  # [batch_size, 1, emb_dim + hidden_size]

        # 6. Пропускаем через GRU
        output, hidden = self.gru(gru_input, hidden)  # output: [batch_size, 1, hidden_size]

        # 7. Финальный прогноз
        output = output.squeeze(1)  # [batch_size, hidden_size]
        context = context.squeeze(1)  # [batch_size, hidden_size]
        embedded = embedded.squeeze(1)  # [batch_size, emb_dim]

        # Объединяем всё для более выразительного прогноза
        prediction = self.fc_out(torch.cat((output, context, embedded), dim=1))  # [batch_size, vocab_size]

        return prediction, hidden

In [29]:
s2sgru_attention = Seq2SeqGRU(complete_tokenizer, decoder_class=DecoderWithAttention)

In [31]:
with torch.no_grad():
    s2sgru_attention.training_step(batch, 0)
    s2sgru_attention.validation_step(batch, 0)
    s2sgru.on_validation_epoch_end()

c:\Users\dilst\Documents\uni_lessons_da_and_ml\.venv\Lib\site-packages\pytorch_lightning\core\module.py:449: You are trying to `self.log()` but the `self.trainer` reference is not registered on the model yet. This is most likely because the model hasn't been passed to the `Trainer`


In [35]:
torch.set_float32_matmul_precision("medium")
start_leaning(
    "attention_seq2seq_gru",
    4,
    300,
    512,
    dm,
    complete_tokenizer,
    DecoderWithAttention,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                 | Params | Mode 
---------------------------------------------------------
0 | encoder | Encoder              | 13.7 M | train
1 | decoder | DecoderWithAttention | 49.0 M | train
2 | loss_fn | CrossEntropyLoss     | 0      | train
---------------------------------------------------------
62.6 M    Trainable params
0         Non-trainable params
62.6 M    Total params
250.508   Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 7.750
Epoch 0, global step 500: 'val_loss' reached 7.74985 (best 7.74985), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=0-val_loss=7.749851.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.319 >= min_delta = 0.0. New best score: 7.430
Epoch 1, global step 1000: 'val_loss' reached 7.43040 (best 7.43040), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=1-val_loss=7.430398.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.382 >= min_delta = 0.0. New best score: 7.049
Epoch 2, global step 1500: 'val_loss' reached 7.04859 (best 7.04859), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=2-val_loss=7.048591.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.138 >= min_delta = 0.0. New best score: 6.910
Epoch 3, global step 2000: 'val_loss' reached 6.91032 (best 6.91032), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=3-val_loss=6.910320.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.106 >= min_delta = 0.0. New best score: 6.804
Epoch 4, global step 2500: 'val_loss' reached 6.80441 (best 6.80441), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=4-val_loss=6.804408.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 3000: 'val_loss' reached 6.80967 (best 6.80441), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=5-val_loss=6.809674.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.109 >= min_delta = 0.0. New best score: 6.695
Epoch 6, global step 3500: 'val_loss' reached 6.69534 (best 6.69534), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=6-val_loss=6.695343.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.066 >= min_delta = 0.0. New best score: 6.630
Epoch 7, global step 4000: 'val_loss' reached 6.62961 (best 6.62961), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=7-val_loss=6.629610.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 8, global step 4500: 'val_loss' reached 6.64730 (best 6.62961), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=8-val_loss=6.647296.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 5000: 'val_loss' reached 6.65900 (best 6.62961), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=9-val_loss=6.659003.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10, global step 5500: 'val_loss' reached 6.64994 (best 6.62961), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=10-val_loss=6.649936.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.031 >= min_delta = 0.0. New best score: 6.598
Epoch 11, global step 6000: 'val_loss' reached 6.59843 (best 6.59843), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=11-val_loss=6.598429.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.015 >= min_delta = 0.0. New best score: 6.583
Epoch 12, global step 6500: 'val_loss' reached 6.58307 (best 6.58307), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=12-val_loss=6.583066.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.003 >= min_delta = 0.0. New best score: 6.580
Epoch 13, global step 7000: 'val_loss' reached 6.58006 (best 6.58006), saving model to 'C:\\Users\\dilst\\Documents\\uni_lessons_da_and_ml\\src\\torch_learning\\checkpoints\\attention_seq2seq_gru\\best-[ls_4-es_300-hs_512]-epoch=13-val_loss=6.580057.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss = nan is not finite. Previous best value was 6.580. Signaling Trainer to stop.
Epoch 14, global step 7500: 'val_loss' was not in top 3


In [41]:
attention_model = Seq2SeqGRU.load_from_checkpoint(
    "checkpoints/attention_seq2seq_gru/best-[ls_4-es_300-hs_512]-epoch=13-val_loss=6.580057.ckpt",
    tokenizer=complete_tokenizer,
    num_layers=4,
    emb_dim=300,
    hidden_size=512,
    decoder_class=DecoderWithAttention,
)

In [42]:
attention_model.translate("Мне пора идти спать.")

'i like the to the the the rain. day. time. time. time. time. time. time. time. time. time. time. time.ed. day.'

# Для тестового датасета сделайте перевод 30 фраз с одного языка на другой обеими моделями

In [65]:
def get_words_from_dataloader(dm: TextDataModule) -> tuple[list[str], list[str]]:
    dm.setup(stage="test")
    test_words_from = []
    test_words_to = []

    for i, words in enumerate(dm.test_dataloader()):
        seq_from, seq_to = words
        for i in range(seq_from.size(0)):
            test_words_from.append(complete_tokenizer.decode(seq_from[i].tolist()))
            test_words_to.append(complete_tokenizer.decode(seq_to[i].tolist()))
            if len(test_words_from) == 30:
                return test_words_from, test_words_to

    raise RuntimeError

In [67]:
from_s, to_s = get_words_from_dataloader(dm)

In [71]:
print("Обычная модель:")
for fw, tw in zip(from_s, to_s):
    result = simple_model.translate(fw)
    print("=" * 50)
    print(" " * 4 + f"Предложение: {fw}")
    print(" " * 4 + f"Результат предсказания: {result}")
    print(" " * 4 + f"Что должно было быть: {tw}")

Обычная модель:
    Предложение: я люблю виноград, но не могу съесть так много.
    Результат предсказания: he is to the the to the the the the the the to the the to the the to the the the the of the the the the the the of the the the the the the the of the the the the the the the of the the the the the the the the of the the the the the the the the of the the the the the the the the the of the the the the the the the the the of the the the the the the the the the the of the the the the the the the the the the of the the the the the the the the the the the of the the the the the the the the the
    Что должно было быть: i like grapes, but i can't eat so many.
    Предложение: горбатого могила исправит.
    Результат предсказания: he is to the the to the the the the the the to the the to the the to the the the the of the the the the the the of the the the the the the the of the the the the the the the of the the the the the the the the of the the the the the the the the of the the the t

In [72]:
print("Модель с вниманием:")
for fw, tw in zip(from_s, to_s):
    result = attention_model.translate(fw)
    print("=" * 50)
    print(" " * 4 + f"Предложение: {fw}")
    print(" " * 4 + f"Результат предсказания: {result}")
    print(" " * 4 + f"Что должно было быть: {tw}")

Модель с вниманием:
    Предложение: я люблю виноград, но не могу съесть так много.
    Результат предсказания: i don't know the to be but i the the of the of the of the of the of the the the the the the the the the the the the the the the the the the the the the the the the the the
    Что должно было быть: i like grapes, but i can't eat so many.
    Предложение: горбатого могила исправит.
    Результат предсказания: the is is the to the rain. day.e.al.s.
    Что должно было быть: only death cures all pain.
    Предложение: почему вы не пытаетесь произвести хорошее впечатление?
    Результат предсказания: the girl was very hard to the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
    Что должно было быть: why don't you put your best foot forward?
    Предложение: могу я здесь припарковаться?
    Результат предсказания: i am very much. teacher. him. i the time. time. time. time. time. time. time. time. ti

# Сделайте вывод о качестве полученных моделей

Модели не выполняют свою задачу, медленно обучаются. Модель без внимания генерирует всегда одну последовательность. Модель с вниманием начала обучатся, но во время обучения что-то произошло, возможно взрыв градиентов, нужно было поставить клиппинг градиентов, также для понимания проблемы нужно профилирование обучения.