In [8]:
pip install kagglehub torch


Collecting torch
  Downloading torch-2.9.1-cp313-cp313-win_amd64.whl.metadata (30 kB)
Downloading torch-2.9.1-cp313-cp313-win_amd64.whl (110.9 MB)
   ---------------------------------------- 0.0/110.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/110.9 MB ? eta -:--:--
    --------------------------------------- 1.6/110.9 MB 7.1 MB/s eta 0:00:16
   - -------------------------------------- 3.4/110.9 MB 7.8 MB/s eta 0:00:14
   -- ------------------------------------- 6.3/110.9 MB 9.2 MB/s eta 0:00:12
   --- ------------------------------------ 8.7/110.9 MB 9.7 MB/s eta 0:00:11
   --- ------------------------------------ 11.0/110.9 MB 10.0 MB/s eta 0:00:10
   ---- ----------------------------------- 13.4/110.9 MB 10.2 MB/s eta 0:00:10
   ----- ---------------------------------- 15.7/110.9 MB 10.4 MB/s eta 0:00:10
   ------ --------------------------------- 18.1/110.9 MB 10.5 MB/s eta 0:00:09
   ------- -------------------------------- 20.4/110.9 MB 10.6 MB/s eta 0:00:0

In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mar1mba/russian-sentiment-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mar1mba/russian-sentiment-dataset?dataset_version_number=2...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 53.9M/53.9M [00:05<00:00, 9.78MB/s]

Extracting files...





Path to dataset files: C:\Users\balot\.cache\kagglehub\datasets\mar1mba\russian-sentiment-dataset\versions\2


In [39]:
import argparse
import os
import random
import re
import json
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


# Set random seed to reproduct experimentations
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [40]:
def simple_tokenize(text):
    """
    Токенизируем текст. Убрать знаки, в один регистр и другая шляпа.
    """
    text = str(text).lower()
    text = re.sub(r'[^0-9a-zа-яё\s]', ' ', text)
    tokens = text.split()
    return tokens

def build_vocab(tokenized_texts, max_vocab_size=20000, min_freq=1):
    counter = Counter([tok for doc in tokenized_texts for tok in doc])  # в токенизированном тексте выбираем фрагменты doc и берем по токену
    # убираем те что меньше минимальной частоты и сортируем по убыванию
    if min_freq > 1:
        items = [(w,c) for w,c in counter.items() if c >= min_freq]
        items.sort(key=lambda x: x[1], reverse=True)
    else:
        items = counter.most_common()
    items = items[:max_vocab_size-2]  # убираем последние чтобы поместить служебные токены
    itos = ['<PAD>', '<UNK>'] + [w for w,c in items]  # берём слова из словаря и помещаем к ним служебные токены
    stoi = {w:i for i,w in enumerate(itos)}  # слово - индекс словарь
    return stoi, itos

def encode_doc(tokens, stoi, max_len):
    """
    Кодируем документ из словаря stoi
    """
    seq = [stoi.get(t, 1) for t in tokens]  # Возращаем индекс, если есть в словаре. Иначе 1 --> <UNK>
    if len(seq) >= max_len:  # режем слишком длинные последовательности
        return seq[:max_len]
    else:
        return seq + [0]*(max_len - len(seq))  # дополняем нулями короткие. 0 --> <PAD>

class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.X = torch.tensor(sequences, dtype=torch.long)
        self.y = torch.tensor(labels, dtype=torch.long)  
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [41]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden=128, bidir=False, num_classes=3):  # num_classes=3 для трёхклассовой классификации
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)  # слой эмбедингов
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden,
            bidirectional=bidir,  # используем параметр bidir
            batch_first=True
        )
        self.hidden_size = hidden * (2 if bidir else 1)  # скрытый слой
        self.fc = nn.Linear(self.hidden_size, 128)  # промежуточный линейный слой преобразует размер входного вектора в 128
        self.dropout = nn.Dropout(0.4)  # Выклюлючает 40% всех входов случайно. Нужен для предотвращения переобучения
        self.out = nn.Linear(128, num_classes)  # финальный слой предсказания - 3 класса
    
    def forward(self, x):
        e = self.embed(x)
        _, (h_n, _) = self.lstm(e)  # нам нужны только финальные состояния слоёв (причём только скрытые), промежуточные в топку
        if self.lstm.bidirectional:
            h = torch.cat((h_n[-2], h_n[-1]), dim=1)  # конкатенируем forward и backward
        else:
            h = h_n[-1]  # последний слой
        h = torch.relu(self.fc(h))  # линейно выравниваем и используем функцию активации relu для каждого ветора
        h = self.dropout(h)  # to be Tanos
        return self.out(h)

In [42]:
def train_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)

def evaluate(model, loader, device):
    model.eval()  # Переводим модель в режим оценки (отключаем dropout, batch norm и т.д.)
    ys = []
    yps = []
    with torch.no_grad():  # Отключаем вычисление градиентов для ускорения и экономии памяти
        for xb, yb in loader:  # Проходим по всем батчам. Распаковываем батч в xb - признаки, yb - метки
            xb = xb.to(device)
            out = model(xb).cpu().numpy()
            yps.extend(out.tolist())
            ys.extend(yb.numpy().tolist())
    
    y_true = np.array(ys)
    y_prob = np.array(yps)
    y_pred = np.argmax(y_prob, axis=1)  # для многоклассовой классификации используем argmax
    
    acc = accuracy_score(y_true, y_pred)
    # для 3 классов используем macro averaging для получения усредненных метрик по всем классам
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    
    return {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'cm': cm,
        'y_true': y_true,
        'y_pred': y_pred,
        'y_prob': y_prob
    }

# Основной пайплайн

In [45]:
def main(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)
    
    if not os.path.exists(args.data_path):
        raise FileNotFoundError(f"Data file not found: {args.data_path}")
    
    df = pd.read_csv(args.data_path)
    if 'text' not in df.columns or 'label' not in df.columns:
        raise ValueError("CSV must contain 'text' and 'label' columns.")
    
    df = df.dropna(subset=['text']).reset_index(drop=True)
    texts = df['text'].astype(str).tolist()
    labels = df['label'].astype(int).values
    
    # Проверяем что метки находятся в диапазоне [0, 1, 2] для 3 классов
    unique_labels = np.unique(labels)
    print(f"Unique labels in dataset: {unique_labels}")
    if not all(label in [0, 1, 2] for label in unique_labels):
        print("Warning: Labels should be 0, 1, 2 for 3-class classification")
    
    tokenized = [simple_tokenize(t) for t in texts]
    stoi, itos = build_vocab(tokenized, max_vocab_size=args.max_vocab, min_freq=args.min_freq)
    vocab_size = len(itos)
    print("Vocab size:", vocab_size)
    
    sequences = np.array([encode_doc(toks, stoi, args.max_len) for toks in tokenized], dtype=np.int64)
    
    X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=args.test_size, random_state=args.seed, stratify=labels)
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=args.val_size, random_state=args.seed, stratify=y_train)
    
    print(f"Train: {len(X_tr)}, Val: {len(X_val)}, Test: {len(X_test)}")
    
    # Выводим распределение классов
    print(f"Train distribution: {np.bincount(y_tr)}")
    print(f"Val distribution: {np.bincount(y_val)}")
    print(f"Test distribution: {np.bincount(y_test)}")
    
    train_loader = DataLoader(TextDataset(X_tr, y_tr), batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(TextDataset(X_val, y_val), batch_size=args.batch_size, shuffle=False)
    test_loader = DataLoader(TextDataset(X_test, y_test), batch_size=args.batch_size, shuffle=False)
    
    model = LSTMModel(vocab_size, emb_dim=args.emb_dim, hidden=args.lstm_hidden, bidir=args.bidir, num_classes=3).to(device)  # num_classes=3
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    
    best_val_f1 = -1.0
    epochs_without_improve = 0
    history = {'train_loss': [], 'val_f1': [], 'val_acc': []}
    
    for epoch in range(1, args.epochs + 1):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_metrics = evaluate(model, val_loader, device)
        val_f1 = val_metrics['f1']
        val_acc = val_metrics['accuracy']
        
        history['train_loss'].append(train_loss)
        history['val_f1'].append(val_f1)
        history['val_acc'].append(val_acc)
        
        print(f"Epoch {epoch}/{args.epochs} — train_loss: {train_loss:.4f} | val_acc: {val_acc:.4f} | val_f1: {val_f1:.4f}")
        
        if val_f1 > best_val_f1:  # лучшая модель
            best_val_f1 = val_f1
            epochs_without_improve = 0
            best_state = {
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'args': vars(args)
            }
            torch.save(best_state, args.output_model_path)
            print(f"  Saved best model (val_f1={val_f1:.4f}) -> {args.output_model_path}")
        else:
            epochs_without_improve += 1
            if epochs_without_improve >= args.patience:
                print(f"Early stopping: no improvement for {args.patience} epochs.")
                break
    
    saved = torch.load(args.output_model_path, map_location=device, weights_only=False)
    model.load_state_dict(saved['model_state'])
    
    test_metrics = evaluate(model, test_loader, device)
    print("\nTest results:")
    print(f"  Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"  Precision (macro): {test_metrics['precision']:.4f}")
    print(f"  Recall (macro): {test_metrics['recall']:.4f}")
    print(f"  F1 (macro): {test_metrics['f1']:.4f}")
    print("  Confusion matrix:\n", test_metrics['cm'])
    print("\nClassification report:\n", classification_report(test_metrics['y_true'], test_metrics['y_pred'], target_names=['class_0','class_1','class_2'], zero_division=0))
    
    # Сохраняем результаты в JSON
    results = {
        'accuracy': float(test_metrics['accuracy']),
        'precision': float(test_metrics['precision']),
        'recall': float(test_metrics['recall']),
        'f1': float(test_metrics['f1']),
        'confusion_matrix': test_metrics['cm'].tolist(),
        'history': history
    }
    with open(args.results_path, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {args.results_path}")

class Args:
    def __init__(self):
        self.data_path = f'{path}\sentiment_dataset.csv'
        self.max_vocab = 20000
        self.min_freq = 1
        self.max_len = 80
        self.emb_dim = 128
        self.lstm_hidden = 128
        self.bidir = False
        self.epochs = 10
        self.batch_size = 64
        self.lr = 1e-3
        self.test_size = 0.2
        self.val_size = 0.125
        self.patience = 3
        self.seed = 42
        self.output_model_path = 'best_model.pth'
        self.results_path = 'results.json'

if __name__ == "__main__":
    args = Args()
    main(args)

Device: cpu
Unique labels in dataset: [0 1 2]
Vocab size: 20000
Train: 203320, Val: 29046, Test: 58092
Train distribution: [67612 67814 67894]
Val distribution: [9659 9688 9699]
Test distribution: [19318 19375 19399]
Epoch 1/10 — train_loss: 0.9895 | val_acc: 0.6230 | val_f1: 0.6124
  Saved best model (val_f1=0.6124) -> best_model.pth
Epoch 2/10 — train_loss: 0.7302 | val_acc: 0.6700 | val_f1: 0.6726
  Saved best model (val_f1=0.6726) -> best_model.pth
Epoch 3/10 — train_loss: 0.6576 | val_acc: 0.6896 | val_f1: 0.6908
  Saved best model (val_f1=0.6908) -> best_model.pth
Epoch 4/10 — train_loss: 0.6017 | val_acc: 0.6900 | val_f1: 0.6908
  Saved best model (val_f1=0.6908) -> best_model.pth
Epoch 5/10 — train_loss: 0.5442 | val_acc: 0.6900 | val_f1: 0.6910
  Saved best model (val_f1=0.6910) -> best_model.pth
Epoch 6/10 — train_loss: 0.4847 | val_acc: 0.6876 | val_f1: 0.6889
Epoch 7/10 — train_loss: 0.4217 | val_acc: 0.6830 | val_f1: 0.6840
Epoch 8/10 — train_loss: 0.3636 | val_acc: 0.6768