In [1]:
pip install torch datasets transformers scikit-learn





[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


# Загрузка данных

In [2]:


import random
import torch
from datasets import load_dataset

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)

# AG News — быстрый, лёгкий, подходит под future image-модальность (topic → image class)
ds = load_dataset("ag_news")
print(ds)

print("train size:", len(ds["train"]))
print("test size:", len(ds["test"]))

print("\nSample:")
print(ds["train"][0])

  from .autonotebook import tqdm as notebook_tqdm


DEVICE: cpu
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
train size: 120000
test size: 7600

Sample:
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


Описание датасета (AG News)

In [None]:
NUM_CLASSES = 4
LABEL_NAMES = ["World", "Sports", "Business", "Sci/Tech"]

Токенизация и DataLoader

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

MAX_LEN = 128
BATCH_SIZE = 32

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )

ds_tok = ds.map(tokenize, batched=True)
ds_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

train_loader = DataLoader(ds_tok["train"], batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(ds_tok["test"], batch_size=BATCH_SIZE)

Map: 100%|██████████| 7600/7600 [00:00<00:00, 7805.16 examples/s]


PositionalEncoding (cosine)

In [None]:

import torch
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

MultiHeadAttention (FIXED)

In [None]:

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.h = num_heads
        self.d = d_model // num_heads

        self.qkv = nn.Linear(d_model, d_model * 3)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        B, T, C = x.shape

        qkv = self.qkv(x).reshape(B, T, 3, self.h, self.d)
        qkv = qkv.permute(2, 0, 3, 1, 4)   # [3, B, h, T, d]
        q, k, v = qkv[0], qkv[1], qkv[2]

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.d)
        # scores: [B, h, T, T]

        if mask is not None:
            # mask: [B, T] -> [B, 1, 1, T]
            mask = mask[:, None, None, :]
            scores = scores.masked_fill(mask == 0, -1e9)

        attn = scores.softmax(dim=-1)
        out = (attn @ v)                  # [B, h, T, d]
        out = out.transpose(1, 2).reshape(B, T, C)

        return self.out(out)

TransformerEncoderLayer

In [None]:

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dim_ff, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ff = nn.Sequential(
            nn.Linear(d_model, dim_ff),
            nn.ReLU(),
            nn.Linear(dim_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        x = self.norm1(x + self.dropout(self.attn(x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x

TransformerEncoder

In [None]:
class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dim_ff):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, dim_ff)
            for _ in range(num_layers)
        ])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return x

TransformerClassifier

In [None]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, dim_ff, num_classes):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model)
        self.encoder = TransformerEncoder(num_layers, d_model, num_heads, dim_ff)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, mask):
        x = self.embed(input_ids)
        x = self.pos(x)
        x = self.encoder(x, mask)
        x = x.mean(dim=1)
        return self.fc(x)

Наконец обучение

In [None]:
model = TransformerClassifier(
    vocab_size=tokenizer.vocab_size,
    d_model=128,
    num_heads=4,
    num_layers=2,
    dim_ff=256,
    num_classes=NUM_CLASSES
).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

Train loop

In [None]:

def train_epoch(loader):
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(DEVICE)
        mask = batch["attention_mask"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        logits = model(input_ids, mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

Evaluation

In [None]:
from sklearn.metrics import accuracy_score

def eval_model(loader):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            logits = model(input_ids, mask)
            preds.extend(logits.argmax(dim=1).cpu().tolist())
            targets.extend(labels.cpu().tolist())
    return accuracy_score(targets, preds)

Запуск

In [None]:

EPOCHS = 3
for epoch in range(EPOCHS):
    loss = train_epoch(train_loader)
    acc = eval_model(test_loader)
    print(f"Epoch {epoch+1}: loss={loss:.4f}, acc={acc:.4f}")

Epoch 1: loss=0.4814, acc=0.8789


Epoch 1: loss=0.2952, acc=0.8841


Epoch 1: loss=0.2712, acc=0.8914
