# импорты

In [1]:
import random
from datasets import load_dataset
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

import tqdm

# Seed every RNG the notebook has imported so that a fresh "Restart & Run All"
# is repeatable. `random` is imported at the top of this cell but was never
# seeded before, so anything drawing from the stdlib RNG was left unseeded.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Seeds torch's default generator (used by DataLoader shuffling and by the
# initialisation of the new classifier head further down).
torch.manual_seed(SEED)

# загрузка датасета

In [2]:
# Sizes of the subsets used for this experiment. Kept as named constants so the
# run can be scaled up or down from one place instead of editing inline literals.
TRAIN_SIZE = 10_000
TEST_SIZE = 2_000

dataset = load_dataset("ag_news")

# select(range(n)) keeps the FIRST n rows of each split, in stored order.
# NOTE(review): this is not a random sample -- if the stored order is not
# already shuffled, consider shuffling with a fixed seed before selecting.
train_dataset = dataset["train"].select(range(TRAIN_SIZE))
test_dataset = dataset["test"].select(range(TEST_SIZE))

print(f"Тренировочные данные: {len(train_dataset):,}")
print(f"Тестовые данные:  {len(test_dataset):,}")
# Plain string: there are no placeholders, so no f-prefix is needed.
print("Метки: 0=World, 1=Sports, 2=Business, 3=Sci/Tech")

Тренировочные данные: 10,000
Тестовые данные:  2,000
Метки: 0=World, 1=Sports, 2=Business, 3=Sci/Tech


# токенизация

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
MAX_LENGTH = 128  # every example is padded/truncated to exactly this many tokens


def tokenize_batch(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Intended as the callback for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``"text"`` entry that is handed to the
            tokenizer unchanged (with ``batched=True`` this is presumably a
            list of strings -- confirm against the ``datasets`` docs).

    Returns:
        Whatever the tokenizer returns for that text, with every sequence
        padded up to, or truncated down to, ``MAX_LENGTH`` tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Former, uninformative name of the callback; kept as an alias so that any
# cell still referring to `function` keeps working.
function = tokenize_batch

train_encodings = train_dataset.map(tokenize_batch, batched=True)
test_encodings = test_dataset.map(tokenize_batch, batched=True)

# подготовка к обучению

In [4]:
# Columns the model actually consumes; everything else (e.g. the raw text)
# is hidden from the DataLoader. Defined once so both splits stay in sync.
MODEL_COLUMNS = ["input_ids", "attention_mask", "label"]

# set_format changes the dataset objects in place: indexing them now yields
# torch tensors restricted to MODEL_COLUMNS.
train_encodings.set_format("torch", columns=MODEL_COLUMNS)
test_encodings.set_format("torch", columns=MODEL_COLUMNS)

BATCH_SIZE = 16
# Shuffle only the training data; the test order is kept fixed.
train_loader = DataLoader(train_encodings, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_encodings, batch_size=BATCH_SIZE, shuffle=False)

# Header line for the indented counts below. The recorded cell output contains
# this line, but the print that produced it was missing from the code.
print("✅ DataLoaders ready:")
print(f"   Train batches: {len(train_loader):,}")
print(f"   Test batches:  {len(test_loader):,}")

✅ DataLoaders ready:
   Train batches: 625
   Test batches:  125


# модель и выбор устройства для обучения

In [5]:
# Pick the compute device first: GPU when CUDA is available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained BERT encoder with a 4-way classification head on top.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
model.to(device)

# Training hyper-parameters: number of passes over the training set, and an
# AdamW optimizer over all model parameters.
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# обучение

In [6]:
# Put the model in training mode once, before the first epoch.
model.train()

for epoch in range(EPOCHS):
    print(f"\n EPOCH {epoch+1}/{EPOCHS}")

    # Sum of per-batch losses for this epoch; averaged after the inner loop.
    total_loss = 0
    progress_bar = tqdm.tqdm(train_loader, desc="Training")

    for batch in progress_bar:
        # Move the three tensors the model needs onto the training device.
        on_device = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "label")
        }

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss alongside its logits.
        outputs = model(
            input_ids=on_device["input_ids"],
            attention_mask=on_device["attention_mask"],
            labels=on_device["label"],
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running sum and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Avg_loss: {avg_loss:.4f}")


 EPOCH 1/5


Training: 100%|█████████████████████████████████████████████████████████| 625/625 [03:02<00:00,  3.42it/s, loss=0.1204]


Epoch 1 Avg_loss: 0.3896

 EPOCH 2/5


Training: 100%|█████████████████████████████████████████████████████████| 625/625 [02:10<00:00,  4.80it/s, loss=0.1634]


Epoch 2 Avg_loss: 0.1689

 EPOCH 3/5


Training: 100%|█████████████████████████████████████████████████████████| 625/625 [02:12<00:00,  4.73it/s, loss=0.0105]


Epoch 3 Avg_loss: 0.1025

 EPOCH 4/5


Training: 100%|█████████████████████████████████████████████████████████| 625/625 [02:13<00:00,  4.68it/s, loss=0.0148]


Epoch 4 Avg_loss: 0.0696

 EPOCH 5/5


Training: 100%|█████████████████████████████████████████████████████████| 625/625 [02:10<00:00,  4.79it/s, loss=0.1700]

Epoch 5 Avg_loss: 0.0457





# Оценка модели

In [8]:
# Switch to evaluation mode before predicting on the test set.
model.eval()
all_preds = []
all_labels = []

# No gradients are needed for evaluation.
with torch.no_grad():
    progress_bar = tqdm.tqdm(test_loader, desc="Testing")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # No `labels` argument here: only the logits are needed.
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().tolist())
        # The labels are only compared on the host, so they are read straight
        # from the batch instead of being copied to the device and back.
        all_labels.extend(batch["label"].tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Точность на тесте: {accuracy:.4f} ({accuracy*100:.2f}%)")

Testing: 100%|███████████████████████████████████████████████████████████████████████| 125/125 [00:07<00:00, 16.21it/s]

Точность на тесте: 0.9245 (92.45%)



