### Единый ноутбук для классификации эмоций (EN -> RU дообучение)
Этот ноутбук обучает и экспортирует модель распознавания эмоций по тексту. Он рассчитан на запуск в Google Colab и полностью автоматизирует:
- установку зависимостей;
- загрузку датасетов (EN и RU) и модели;
- обучение на английском языке;
- дообучение на русском языке;
- валидацию и расчёт F1 (micro/macro), ROC-AUC;
- сохранение артефактов (PyTorch + ONNX) и маппинг меток;
- пайплайн инференса с постпроцессингом (порог/`top_k`);
- визуализацию метрик.

Настройки по умолчанию:
- **Датасет**: `go_emotions` в конфигурации `raw` (около 211k размеченных строк для ~58k комментариев; 28 меток, включая `neutral`) для английского и `seara/ru_go_emotions` для русского.
- **Модель**: `xlm-roberta-base` (многоязычная, хорошо переносится на русский).
- **Задача**: multi-label (одновременно несколько эмоций).


In [2]:
! pip install torch transformers>=4.41.0 datasets>=2.19.0 evaluate>=0.4.2 accelerate>=0.33.0 scikit-learn>=1.3.0 onnx>=1.15.0 onnxruntime>=1.17.0
# Setup
import os, sys, subprocess, json, math
from dataclasses import dataclass
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, set_seed
from datasets import load_dataset
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve
from transformers.onnx import export
from transformers.onnx.features import FeaturesManager
from pathlib import Path
import onnxruntime as ort
import matplotlib.pyplot as plt

# Requirement specifiers (package names with minimum versions) that are
# handed to pip_install() when the import check below fails.
reqs = [
    'torch', 'transformers>=4.41.0', 'datasets>=2.19.0', 'evaluate>=0.4.2',
    'accelerate>=0.33.0', 'scikit-learn>=1.3.0', 'onnx>=1.15.0', 'onnxruntime>=1.17.0'
]

def pip_install(pkgs):
    """Quietly install the given requirement specifiers with pip.

    pip is run as a module of the current interpreter (``sys.executable``).
    ``subprocess.check_call`` raises ``CalledProcessError`` when pip exits
    with a non-zero status.

    Args:
        pkgs: list of requirement strings, e.g. ``['numpy>=1.26']``.
    """
    pip_cmd = [sys.executable, '-m', 'pip', 'install', '-q']
    subprocess.check_call(pip_cmd + pkgs)

# Import the core libraries; if any import fails, install the listed
# requirements and retry the imports once.
# NOTE(review): the import lines at the top of this cell already import
# transformers / datasets / onnxruntime unconditionally, so on a clean runtime
# a missing package would fail there before this fallback is reached —
# confirm whether the fallback is still needed.
try:
    import transformers, datasets, evaluate, sklearn, onnxruntime
except Exception:
    pip_install(reqs)
    import transformers, datasets, evaluate, sklearn, onnxruntime

# Report the library versions actually in use.
print('Transformers:', transformers.__version__)
print('Datasets:', datasets.__version__)

@dataclass
class Config:
    """Hyper-parameters and paths shared by all cells of the notebook."""

    # Checkpoint name used for both the tokenizer and the classifier.
    model_name: str = "xlm-roberta-base"
    # Directory that receives checkpoints, label names and exported artifacts.
    out_dir: str = "outputs/emotion_xlmr"
    # Maximum sequence length passed to the tokenizer (longer inputs are truncated).
    max_length: int = 128
    # Per-device batch sizes for training and evaluation.
    train_bs: int = 32
    eval_bs: int = 32
    # Optimisation settings forwarded to TrainingArguments.
    lr: float = 2e-5
    wd: float = 0.01
    warmup_ratio: float = 0.06
    # Number of epochs for the EN stage and for the RU fine-tuning stage.
    epochs_en: int = 3
    epochs_ru: int = 2
    # Seed used for dataset splitting and set_seed().
    seed: int = 42
    # Whether to train with fp16 mixed precision.
    fp16: bool = True

# Instantiate the shared configuration, make sure the output directory
# exists, and echo the effective settings.
CFG = Config()
os.makedirs(CFG.out_dir, exist_ok=True)
print(CFG)


Transformers: 4.55.2
Datasets: 4.0.0
Config(model_name='xlm-roberta-base', out_dir='outputs/emotion_xlmr', max_length=128, train_bs=32, eval_bs=32, lr=2e-05, wd=0.01, warmup_ratio=0.06, epochs_en=3, epochs_ru=2, seed=42, fp16=True)


In [22]:
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, load_dataset
# Legacy alias: some code reads `CFG.max_len`. Mirror the canonical
# `CFG.max_length` instead of hard-coding a second copy of the value, so the
# two can never drift apart.
if not hasattr(CFG, 'max_len'):
    CFG.max_len = CFG.max_length
# Load EN GoEmotions (raw, multilabel)
en_ds = load_dataset('go_emotions', 'raw')

# Split the single `train` split 80/10/10 into train / validation / test.
# The intermediate results use their own names (mirroring `ru_split` /
# `ru_test_valid` used for the RU data) so they do not shadow the
# `train_test_split` function imported from scikit-learn above.
en_split = en_ds['train'].train_test_split(test_size=0.2, seed=CFG.seed)
en_test_valid = en_split['test'].train_test_split(test_size=0.5, seed=CFG.seed)

en_ds = DatasetDict({
    'train': en_split['train'],
    'validation': en_test_valid['train'],
    'test': en_test_valid['test']
})

# Define non-emotion columns (metadata); every other column of the raw
# dataset is treated as a binary emotion indicator.
non_emotion_cols = [
    'text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
    'created_utc', 'rater_id', 'example_very_unclear'
]

# Get emotion label names, in dataset column order. This order defines the
# label index used everywhere else (multi-hot vectors, thresholds, outputs).
label_names = [col for col in en_ds['train'].column_names if col not in non_emotion_cols]
num_labels = len(label_names)

# Save label names. The file is opened as UTF-8 explicitly because
# `ensure_ascii=False` writes non-ASCII characters verbatim and the platform
# default encoding is not guaranteed to handle them.
with open(os.path.join(CFG.out_dir, 'label_names.json'), 'w', encoding='utf-8') as f:
    json.dump(label_names, f, ensure_ascii=False, indent=2)

# Convert binary columns to list of label indices
def extract_labels(example):
    """Return ``{'labels': [...]}`` with the indices of the active emotions.

    Index ``i`` is included when the column ``label_names[i]`` of ``example``
    equals 1; indices follow the order of the module-level ``label_names``.
    """
    active = []
    for position, column in enumerate(label_names):
        if example[column] == 1:
            active.append(position)
    return {'labels': active}

# Apply to all splits (one example at a time); adds a `labels` column holding
# the list of active label indices.
en_ds = en_ds.map(extract_labels, batched=False)

# Remove original binary columns now that `labels` carries the same information.
en_ds = en_ds.remove_columns(label_names)

# Preprocess function for tokenization
def preprocess_function(examples):
    """Tokenize a batch of texts to a fixed length and carry the labels along.

    Uses the declared ``CFG.max_length`` field rather than an attribute that
    only exists when it has been patched onto ``CFG`` at runtime.

    Args:
        examples: batch dict with a ``'text'`` list and a ``'labels'`` list.

    Returns:
        The tokenizer output (every sequence truncated/padded to
        ``CFG.max_length``) with ``'labels'`` copied over unchanged.
    """
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=CFG.max_length
    )
    tokenized['labels'] = examples['labels']
    return tokenized

# `preprocess_function` reads the module-level `tokenizer`, which is otherwise
# only created in a later cell; build it here so a clean top-to-bottom run
# does not fail with a NameError.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, use_fast=True)

# Tokenize all splits, dropping every original column (text, metadata, labels
# are re-added by preprocess_function).
en_proc = en_ds.map(preprocess_function, batched=True, remove_columns=en_ds['train'].column_names)

print('EN splits:', en_proc)
print('Labels:', label_names)


Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/21122 [00:00<?, ? examples/s]

Map:   0%|          | 0/21123 [00:00<?, ? examples/s]

Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/21122 [00:00<?, ? examples/s]

Map:   0%|          | 0/21123 [00:00<?, ? examples/s]

EN splits: DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 168980
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 21122
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 21123
    })
})
Labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [23]:
# Tokenizer & preprocess (shared)
# One tokenizer instance is shared by the EN and RU preprocessing, the
# trainers, and the ONNX export/validation cells.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, use_fast=True)

def preprocess(ds):
    """Tokenize every split of ``ds`` and encode labels as multi-hot vectors.

    Each batch's ``'text'`` column is tokenized with truncation to
    ``CFG.max_length`` (no ``padding`` argument is passed here), and each
    entry of the ``'labels'`` column (a list of label indices) becomes a
    float32 vector of length ``num_labels`` with ones at those indices.
    All columns present in ``ds['train']`` are removed from the result.

    Args:
        ds: mapping of splits that has a ``'train'`` split and supports
            ``.map(...)``.

    Returns:
        The mapped dataset with the tokenizer outputs plus ``'labels'``.
    """
    def _multi_hot(label_ids):
        # Start from all zeros and switch on the listed label positions.
        vec = np.zeros(num_labels, dtype=np.float32)
        for label_id in label_ids:
            vec[label_id] = 1.0
        return vec

    def _encode(batch):
        encoded = tokenizer(batch['text'], max_length=CFG.max_length, truncation=True)
        encoded['labels'] = [_multi_hot(label_ids) for label_ids in batch['labels']]
        return encoded

    original_columns = ds['train'].column_names
    return ds.map(_encode, batched=True, remove_columns=original_columns)

# Rebuild `en_proc` with the shared preprocessing (multi-hot float labels,
# no fixed-length padding) and return torch tensors on access. This replaces
# any `en_proc` created by an earlier cell.
en_proc = preprocess(en_ds).with_format('torch')


Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/21122 [00:00<?, ? examples/s]

Map:   0%|          | 0/21123 [00:00<?, ? examples/s]

In [24]:
# Model & collator
# Sequence-classification head with one output per emotion label, configured
# for multi-label targets.
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.model_name,
    num_labels=num_labels,
    problem_type='multi_label_classification'
)
# Gradient checkpointing: presumably enabled to reduce activation memory at
# the cost of extra compute — confirm it is needed for the target GPU.
model.gradient_checkpointing_enable()

# Pads each batch dynamically; when fp16 is on, lengths are rounded up to a
# multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if CFG.fp16 else None)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Metrics functions
sigmoid = torch.nn.Sigmoid()

def compute_metrics(eval_pred):
    """Compute macro ROC-AUC and micro/macro F1 for multi-label predictions.

    Args:
        eval_pred: pair ``(logits, labels)`` as passed by the Trainer.

    Returns:
        Dict with keys ``'roc_auc_macro'``, ``'f1_micro'`` and ``'f1_macro'``.
        F1 uses a fixed 0.5 probability threshold; ROC-AUC is NaN when
        ``roc_auc_score`` raises.
    """
    logits, labels = eval_pred
    probs = sigmoid(torch.tensor(logits)).numpy()
    # Default threshold for the in-training evaluation.
    preds = (probs >= 0.5).astype(int)

    try:
        auc_macro = roc_auc_score(labels, probs, average='macro')
    except Exception:
        auc_macro = float('nan')

    return {
        'roc_auc_macro': auc_macro,
        'f1_micro': f1_score(labels, preds, average='micro', zero_division=0),
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
    }

def tune_thresholds(val_logits: np.ndarray, val_labels: np.ndarray) -> dict:
    """Pick decision thresholds that maximise F1 on a validation set.

    Args:
        val_logits: 2-D array of scores, one column per label. Despite the
            name, the callers in this notebook pass sigmoid probabilities,
            and the global sweep over [0.2, 0.8] assumes that scale.
        val_labels: 2-D binary indicator array with the same layout.

    Returns:
        ``{'global': float, 'per_class': list[float]}`` where ``'global'`` is
        the single threshold with the best macro-F1 among 25 candidates in
        [0.2, 0.8], and ``'per_class'`` holds, for each label, the threshold
        that maximises that label's F1 (0.5 when no threshold can be chosen).
    """
    # Per-class threshold: maximise F1 along the precision-recall curve.
    per_class = []
    for c in range(val_labels.shape[1]):
        y_true = val_labels[:, c]
        y_scores = val_logits[:, c]
        # With no positive example F1 is undefined at every threshold, so
        # fall back to the neutral default instead of an argmax over NaN/0.
        if not np.any(y_true):
            per_class.append(0.5)
            continue
        try:
            prec, rec, thresh = precision_recall_curve(y_true, y_scores)
        except ValueError:
            per_class.append(0.5)
            continue
        if len(thresh) == 0:
            per_class.append(0.5)
            continue
        # precision_recall_curve returns one more (precision, recall) point
        # than thresholds; the final point has no threshold, so drop it to
        # keep the F1 values aligned with `thresh`.
        prec, rec = prec[:-1], rec[:-1]
        f1 = (2 * prec * rec) / (prec + rec + 1e-8)
        per_class.append(float(thresh[int(np.argmax(f1))]))
    # Global threshold sweep (first best candidate wins on ties).
    candidate = np.linspace(0.2, 0.8, 25)
    best_thr, best_f1 = 0.5, -1
    for t in candidate:
        preds = (val_logits >= t).astype(int)
        f1m = f1_score(val_labels, preds, average='macro', zero_division=0)
        if f1m > best_f1:
            best_f1, best_thr = f1m, float(t)
    return {
        'global': best_thr,
        'per_class': per_class
    }


In [29]:
# Train on EN
# Fix the random seed before building the trainer.
set_seed(CFG.seed)

# Training configuration for the EN stage: evaluate and checkpoint once per
# epoch, keep at most two checkpoints, and reload the checkpoint with the
# highest `f1_macro` when training ends.
args_en = TrainingArguments(
    output_dir=CFG.out_dir,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=100,
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.train_bs,
    per_device_eval_batch_size=CFG.eval_bs,
    num_train_epochs=CFG.epochs_en,
    weight_decay=CFG.wd,
    warmup_ratio=CFG.warmup_ratio,
    fp16=CFG.fp16,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    report_to=['none'],
    save_total_limit=2,
)

# Create a custom trainer to handle type casting
class MultiLabelTrainer(Trainer):
    """Trainer that computes BCE-with-logits loss on float-cast labels."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the multi-label loss (and the model outputs if requested).

        ``"labels"`` is removed from ``inputs`` before the forward pass, so
        the model itself never sees the targets. Extra keyword arguments
        passed by the Trainer are accepted and ignored.
        """
        # Convert labels to float for BCE loss
        targets = inputs.pop("labels").float()
        outputs = model(**inputs)
        criterion = torch.nn.BCEWithLogitsLoss()
        loss = criterion(outputs.logits, targets)
        if return_outputs:
            return (loss, outputs)
        return loss

# Trainer for the EN stage: trains on the EN train split and evaluates on the
# EN validation split with `compute_metrics`.
trainer = MultiLabelTrainer(
    model=model,
    args=args_en,
    train_dataset=en_proc['train'],
    eval_dataset=en_proc['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# NOTE(review): passing `tokenizer=` emits a deprecation warning on recent
# transformers releases; `processing_class=` is the suggested replacement, but
# it is presumably unavailable on the oldest version allowed by `reqs` —
# confirm the minimum supported version before switching.

trainer.train()

  trainer = MultiLabelTrainer(


Epoch,Training Loss,Validation Loss,Roc Auc Macro,F1 Micro,F1 Macro
1,0.1183,0.116661,0.845688,0.28567,0.156071
2,0.1113,0.111748,0.868487,0.33088,0.216771
3,0.1087,0.111096,0.871137,0.358228,0.259499


TrainOutput(global_step=15843, training_loss=0.12719010045670925, metrics={'train_runtime': 3422.3457, 'train_samples_per_second': 148.126, 'train_steps_per_second': 4.629, 'total_flos': 1.1044378408159488e+16, 'train_loss': 0.12719010045670925, 'epoch': 3.0})

In [30]:
# Tune thresholds on EN validation
# Predictions are logits; convert them to probabilities before tuning, since
# tune_thresholds sweeps thresholds on the probability scale.
val_outputs_en = trainer.predict(en_proc['validation'])
val_probs_en = sigmoid(torch.tensor(val_outputs_en.predictions)).numpy()
val_labels_en = val_outputs_en.label_ids

thr_en = tune_thresholds(val_probs_en, val_labels_en)
print('EN thresholds:', thr_en)


EN thresholds: {'global': 0.225, 'per_class': [0.3407045006752014, 0.36931508779525757, 0.24762336909770966, 0.13916224241256714, 0.19528697431087494, 0.2485342025756836, 0.16885694861412048, 0.2590751051902771, 0.19744445383548737, 0.14928317070007324, 0.14804720878601074, 0.17695589363574982, 0.22644133865833282, 0.15765489637851715, 0.38237205147743225, 0.42071861028671265, 0.025858774781227112, 0.18922126293182373, 0.4617144763469696, 0.08570928126573563, 0.29088014364242554, 0.04795810207724571, 0.12304913252592087, 0.07094482332468033, 0.280419260263443, 0.290477454662323, 0.21766994893550873, 0.2312668114900589]}


In [31]:
# EN test metrics BEFORE RU fine-tuning
# Kept in `metrics_en_test_pre` so the plotting cell can compare it with the
# post-fine-tuning value.
metrics_en_test_pre = trainer.evaluate(en_proc['test'])
print('EN test metrics (pre-RU):', metrics_en_test_pre)


EN test metrics (pre-RU): {'eval_loss': 0.1113714799284935, 'eval_roc_auc_macro': 0.872065329415298, 'eval_f1_micro': 0.3601308118868193, 'eval_f1_macro': 0.2585286866108206, 'eval_runtime': 14.959, 'eval_samples_per_second': 1412.059, 'eval_steps_per_second': 44.187, 'epoch': 3.0}


In [39]:
# Load RU GoEmotions translation
ru_ds = load_dataset('seara/ru_go_emotions', 'raw')

# Define non-emotion columns for Russian dataset (including ru_text)
non_emotion_cols_ru = [
    'text', 'ru_text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
    'created_utc', 'rater_id', 'example_very_unclear'
]

# Get emotion label names (all columns excluding metadata)
ru_label_names = [col for col in ru_ds['train'].column_names if col not in non_emotion_cols_ru]

# Verify label schema matches EN. Only the set of names is compared; label
# indices are always taken from the EN `label_names` order.
assert set(ru_label_names) == set(label_names), f'RU label schema mismatch with EN GoEmotions: {set(ru_label_names) ^ set(label_names)}'

# Convert binary columns to list of label indices
def extract_labels(example):
    """Collect the indices of the emotions that are switched on in ``example``.

    The index of a label is its position in the module-level ``label_names``;
    a label counts as active when its column value equals 1.

    Returns:
        ``{'labels': [idx, ...]}`` in ascending index order.
    """
    active = []
    for position, column in enumerate(label_names):
        if example[column] == 1:
            active.append(position)
    return {'labels': active}

# Apply to all splits
ru_ds = ru_ds.map(extract_labels, batched=False)

# Drop the binary label columns and the original `text` column, then expose
# the Russian translation (`ru_text`) under the name `text`, which is the
# column the shared `preprocess` tokenizes. Dropping `ru_text` instead would
# make the "RU" stage fine-tune on the untranslated text again.
ru_ds = ru_ds.remove_columns(ru_label_names + ['text']).rename_column('ru_text', 'text')

# Split the dataset into train, validation, and test sets (similar to EN dataset):
# 80/10/10 with the same seed as the EN split.
ru_split = ru_ds['train'].train_test_split(test_size=0.2, seed=CFG.seed)
ru_test_valid = ru_split['test'].train_test_split(test_size=0.5, seed=CFG.seed)

ru_ds = DatasetDict({
    'train': ru_split['train'],
    'validation': ru_test_valid['train'],
    'test': ru_test_valid['test']
})

# Now preprocess the data with the shared tokenizer / multi-hot encoding;
# `preprocess` tokenizes the column named `text`.
ru_proc = preprocess(ru_ds).with_format('torch')
print('RU splits:', ru_proc)

Map:   0%|          | 0/211225 [00:00<?, ? examples/s]

Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/21122 [00:00<?, ? examples/s]

Map:   0%|          | 0/21123 [00:00<?, ? examples/s]

RU splits: DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 168980
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 21122
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 21123
    })
})


In [1]:
# Continue fine-tuning on RU
# Same settings as the EN stage except for the epoch count and a separate
# `ru_ft` output sub-directory, so RU checkpoints do not mix with EN ones.
args_ru = TrainingArguments(
    output_dir=os.path.join(CFG.out_dir, 'ru_ft'),
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=100,
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.train_bs,
    per_device_eval_batch_size=CFG.eval_bs,
    num_train_epochs=CFG.epochs_ru,
    weight_decay=CFG.wd,
    warmup_ratio=CFG.warmup_ratio,
    fp16=CFG.fp16,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    report_to=['none'],
    save_total_limit=2,
)

# Create a custom trainer to handle type casting (same as for EN)
class MultiLabelTrainer(Trainer):
    """Trainer that computes BCE-with-logits loss on float-cast labels."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the multi-label loss (and the model outputs if requested).

        ``"labels"`` is removed from ``inputs`` before the forward pass, so
        the model itself never sees the targets. Extra keyword arguments
        passed by the Trainer are accepted and ignored.
        """
        # Convert labels to float for BCE loss
        targets = inputs.pop("labels").float()
        outputs = model(**inputs)
        criterion = torch.nn.BCEWithLogitsLoss()
        loss = criterion(outputs.logits, targets)
        if return_outputs:
            return (loss, outputs)
        return loss

# Trainer for the RU stage. It receives the same `model` object that was
# trained in the EN stage, so training continues from those weights.
trainer_ru = MultiLabelTrainer(
    model=model,
    args=args_ru,
    train_dataset=ru_proc['train'],
    eval_dataset=ru_proc['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer_ru.train()

NameError: name 'TrainingArguments' is not defined

In [None]:
# Re-tune thresholds on combined EN+RU validation
val_outputs_ru = trainer_ru.predict(ru_proc['validation'])
val_probs_ru = sigmoid(torch.tensor(val_outputs_ru.predictions)).numpy()
val_labels_ru = val_outputs_ru.label_ids

# Score EN validation again with the RU-fine-tuned model. Probabilities
# computed during the EN stage come from different weights, and thresholds
# for the final model must be tuned on the final model's own outputs.
val_outputs_en_post = trainer_ru.predict(en_proc['validation'])
val_probs_en_post = sigmoid(torch.tensor(val_outputs_en_post.predictions)).numpy()
val_labels_en_post = val_outputs_en_post.label_ids

probs_all = np.concatenate([val_probs_en_post, val_probs_ru], axis=0)
labels_all = np.concatenate([val_labels_en_post, val_labels_ru], axis=0)

thr_all = tune_thresholds(probs_all, labels_all)
# Written as UTF-8 explicitly because ensure_ascii=False emits text verbatim.
with open(os.path.join(CFG.out_dir, 'thresholds.json'), 'w', encoding='utf-8') as f:
    json.dump({'global': thr_all['global'], 'per_class': thr_all['per_class'], 'label_names': label_names}, f, ensure_ascii=False, indent=2)
print('Combined thresholds saved:', thr_all)


In [None]:
# Evaluate on RU test
# Uses `compute_metrics`, i.e. the fixed 0.5 threshold rather than the tuned ones.
metrics_ru = trainer_ru.evaluate(ru_proc['test'])
print('RU test metrics:', metrics_ru)


In [None]:
# EN test metrics AFTER RU fine-tuning
# Same EN test split as before, now scored through `trainer_ru`, to measure
# how much EN quality changed after the RU stage.
metrics_en_test_post = trainer_ru.evaluate(en_proc['test'])
print('EN test metrics (post-RU):', metrics_en_test_post)


In [None]:
# Per-label F1 on RU test with tuned thresholds
ru_test = ru_proc['test']
ru_pred_out = trainer_ru.predict(ru_test)
ru_logits = ru_pred_out.predictions
ru_probs = sigmoid(torch.tensor(ru_logits)).numpy()
# Take the references from the prediction output: they are already a NumPy
# array aligned row-for-row with the predictions, and this does not depend on
# what type `ru_test['labels']` returns in the installed `datasets` version.
ru_labels = ru_pred_out.label_ids

# Use per-class thresholds if available
per_class_thr = np.array(thr_all['per_class']) if 'thr_all' in globals() else np.full(num_labels, 0.5)
ru_preds = (ru_probs >= per_class_thr).astype(int)

# average=None yields one F1 value per label column.
f1_per_label = [
    float(score)
    for score in f1_score(ru_labels, ru_preds, average=None, zero_division=0)
]

print('RU per-label F1 (len=', len(f1_per_label), '):')
for name, val in zip(label_names, f1_per_label):
    print(f'{name}: {val:.3f}')


In [None]:
# Plots: F1 per label bar chart (RU), and F1 macro pre vs post (EN)
plt.figure(figsize=(12,5))
plt.bar(range(len(f1_per_label)), f1_per_label)
plt.xticks(range(len(f1_per_label)), label_names, rotation=90)
plt.title('RU per-label F1 (threshold-tuned)')
plt.tight_layout()
plt.show()

# Read macro-F1 from either the `eval_`-prefixed or the plain key; a missing
# (or zero) value falls through to 0.
pre = metrics_en_test_pre.get('eval_f1_macro', None) or metrics_en_test_pre.get('f1_macro', None) or 0
post = metrics_en_test_post.get('eval_f1_macro', None) or metrics_en_test_post.get('f1_macro', None) or 0
plt.figure(figsize=(4,4))
plt.bar(['EN pre', 'EN post'], [pre, post], color=['gray','green'])
plt.title('EN F1 macro: before vs after RU fine-tuning')
plt.ylim(0,1)
plt.show()


In [None]:
# Save model/tokenizer and export ONNX
save_dir = CFG.out_dir
trainer_ru.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

onnx_dir = Path(save_dir) / 'onnx'
onnx_feature = 'sequence-classification'
onnx_dir.mkdir(parents=True, exist_ok=True)

# Export the model held by the RU trainer, moved to CPU and put in eval mode.
# After training the weights may still live on the GPU, while the exporter
# presumably traces with CPU dummy inputs by default (TODO confirm); moving
# the model first avoids a device mismatch and is harmless otherwise.
export_model = trainer_ru.model.to('cpu').eval()

_, onnx_cfg_cls = FeaturesManager.check_supported_model_or_raise(export_model, feature=onnx_feature)
onnx_cfg = onnx_cfg_cls(export_model.config)

export(preprocessor=tokenizer, model=export_model, config=onnx_cfg, opset=17, output=onnx_dir / 'model.onnx')
print('ONNX saved to', onnx_dir / 'model.onnx')


In [None]:
# Inference helper
class EmotionClassifier:
    """Load a saved model directory and predict emotions for a single text.

    The directory must contain the saved model/tokenizer and
    ``label_names.json``; ``thresholds.json`` is optional.
    """

    def __init__(self, model_dir, max_length=None):
        """Load tokenizer, model, label names and (if present) thresholds.

        Args:
            model_dir: directory written by ``save_model`` /
                ``save_pretrained``.
            max_length: truncation length used at inference; defaults to
                ``CFG.max_length`` so existing callers behave as before.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        self.sigmoid = torch.nn.Sigmoid()
        # Resolve the length once so predict() does not depend on a global.
        self.max_length = CFG.max_length if max_length is None else max_length
        with open(os.path.join(model_dir, 'label_names.json'), encoding='utf-8') as f:
            self.labels = json.load(f)
        thr_path = os.path.join(model_dir, 'thresholds.json')
        self.thresholds = None
        if os.path.exists(thr_path):
            # Context manager so the file handle is closed deterministically.
            with open(thr_path, encoding='utf-8') as f:
                self.thresholds = json.load(f)

    @torch.inference_mode()
    def predict(self, text, top_k=None, threshold=None):
        """Return ``(label, probability)`` pairs for ``text``.

        Args:
            text: input text. Only the first row of the model output is
                used, so a batch of texts yields results for its first item.
            top_k: when given, return the ``top_k`` most probable labels in
                descending order and ignore any threshold.
            threshold: probability cut-off; when omitted, the ``'global'``
                value from ``thresholds.json`` is used if available,
                otherwise 0.5.

        Returns:
            List of ``(label_name, probability)`` tuples. In threshold mode
            the entries follow label index order.
        """
        batch = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=self.max_length)
        probs = self.sigmoid(self.model(**batch).logits)[0].cpu().numpy()
        if top_k is not None:
            idx = probs.argsort()[::-1][:top_k]
            return [(self.labels[i], float(probs[i])) for i in idx]
        thr = threshold
        if thr is None and self.thresholds is not None:
            thr = float(self.thresholds.get('global', 0.5))
        if thr is None:
            thr = 0.5
        idx = np.where(probs >= thr)[0]
        return [(self.labels[i], float(probs[i])) for i in idx]

# Smoke test: load the artifacts saved in CFG.out_dir and print the five most
# probable emotions for one English and one Russian sentence.
clf = EmotionClassifier(CFG.out_dir)
print(clf.predict('I am so happy and grateful today!', top_k=5))
print(clf.predict('Сегодня я чувствую лёгкую тревогу и сомнение, но также надежду.', top_k=5))


In [None]:
# Validate ONNX with ONNX Runtime
onnx_model_path = os.path.join(CFG.out_dir, 'onnx', 'model.onnx')
sess = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])

sample_text_en = 'Today I feel a little anxious and doubtful, but also hopeful.'
sample_text_ru = 'Сегодня я чувствую лёгкую тревогу и сомнение, но также надежду.'

# English sample: tokenize to fixed-length NumPy arrays, run the session with
# every tokenizer output as a named input, and keep the first output (logits).
inputs_en = tokenizer(
    sample_text_en,
    return_tensors='np',
    max_length=CFG.max_length,
    truncation=True,
    padding='max_length',
)
ort_inputs_en = {name: array for name, array in inputs_en.items()}
ort_outs_en = sess.run(None, ort_inputs_en)
onnx_logits_en = ort_outs_en[0]
print('ONNX logits (EN) shape:', onnx_logits_en.shape)

# Russian sample: same procedure.
inputs_ru = tokenizer(
    sample_text_ru,
    return_tensors='np',
    max_length=CFG.max_length,
    truncation=True,
    padding='max_length',
)
ort_inputs_ru = {name: array for name, array in inputs_ru.items()}
ort_outs_ru = sess.run(None, ort_inputs_ru)
onnx_logits_ru = ort_outs_ru[0]
print('ONNX logits (RU) shape:', onnx_logits_ru.shape)
