# 1. Детерминизм + путь к проекту

In [1]:
#!/usr/bin/env python
# coding: utf-8

# --- FULL DETERMINISM BLOCK ---
# Environment variables are exported before torch is imported so they are
# already in place when the CUDA libraries initialise.

import os

SEED = 42

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE: the running interpreter reads PYTHONHASHSEED only at start-up, so this
# assignment affects child processes, not hashing in the current kernel.
os.environ["PYTHONHASHSEED"] = str(SEED)
# NOTE(review): confirm the installed attention backend honours this variable.
os.environ["FLASH_ATTENTION_USE_DETERMINISTIC"] = "1"

import random
import sys

import numpy as np
import torch

# Seed every RNG the experiment may draw from.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# warn_only=True: ops without a deterministic implementation warn instead of raising.
torch.use_deterministic_algorithms(True, warn_only=True)

# Make the project's src directory importable. The location can be overridden
# with the PROJECT_SRC environment variable; the default is the original path.
PROJECT_SRC = os.environ.get("PROJECT_SRC", "/home/onbaev.baurzhan/source/project/src")
if PROJECT_SRC not in sys.path:  # keep the cell idempotent when re-run
    sys.path.append(PROJECT_SRC)

print("Deterministic init done.")


Deterministic init done.


# 2. Загружаем SST-2

In [2]:
from datasets import load_dataset

# Fetch the SST-2 configuration of GLUE and keep handles to the two splits
# used in this notebook.
ds = load_dataset("glue", "sst2")
train_raw, val_raw = ds["train"], ds["validation"]

# Show the split sizes as the cell output.
(len(train_raw), len(val_raw))


  from .autonotebook import tqdm as notebook_tqdm


(67349, 872)

# 3. Токенизация полного train


In [3]:
from transformers import AutoTokenizer

# Hugging Face Hub id of the checkpoint used for every model in this notebook.
model_name = "answerdotai/ModernBERT-base"
# trust_remote_code is intentionally not enabled: the ModernBERT implementation
# is part of the installed transformers package, so there is no need to allow
# execution of code downloaded from the Hub repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    """Tokenize a batch of examples and carry their labels along.

    Args:
        batch: mapping with a "sentence" entry (the texts handed to the
            tokenizer) and a "label" entry.

    Returns:
        The tokenizer output with an added "label" entry copied from ``batch``.
        Truncation is enabled and padding is disabled, so padding is left to
        whoever batches the examples later.
    """
    encoded = tokenizer(batch["sentence"], padding=False, truncation=True)
    encoded["label"] = batch["label"]
    return encoded

# Tokenize both splits the same way (train first, then validation), dropping
# each split's original columns so only the tokenizer output and "label" remain.
train_tok_full, val_tok = (
    split.map(tokenize_batch, batched=True, remove_columns=split.column_names)
    for split in (train_raw, val_raw)
)

# Show the tokenized split sizes as the cell output.
(len(train_tok_full), len(val_tok))


(67349, 872)

# 4. Формируем Random 10% для базовой модели


In [4]:
# Fraction of the training split used for the random baseline subset.
RANDOM_FRACTION = 0.1
n_random = int(RANDOM_FRACTION * len(train_raw))

# Shuffle with the notebook-wide SEED (instead of a repeated literal) and keep
# the first n_random rows as the random subset.
train_10 = train_raw.shuffle(seed=SEED).select(range(n_random))

# Tokenize the subset; the columns to drop are taken from the dataset that is
# actually being mapped.
train_tok_10 = train_10.map(
    tokenize_batch,
    batched=True,
    remove_columns=train_10.column_names,
)

len(train_tok_10)


6734

# 5. Обучаем базовую модель на Random 10%


In [5]:
from train_utils import train_model

# Train the baseline on the random 10% subset and evaluate on the validation
# split. Two epochs: fast, and considered sufficient for the confidence/loss
# scores derived from this model.
base_model, base_metrics = train_model(
    model_name=model_name,
    train_dataset=train_tok_10,
    val_dataset=val_tok,
    batch_size=32,
    lr=2e-5,
    epochs=2,
)

# Display the metrics returned by train_model.
base_metrics


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



Epoch 1
train_loss: 0.3929419008511785
{'val_loss': 0.22405235016984598, 'accuracy': 0.9059633027522935, 'f1': 0.9076576576576577}

Epoch 2
train_loss: 0.16062389586950648
{'val_loss': 0.22300875539492285, 'accuracy': 0.9208715596330275, 'f1': 0.9198606271777003}


{'val_loss': 0.22300875539492285,
 'accuracy': 0.9208715596330275,
 'f1': 0.9198606271777003}

# 6. DataLoader для скоринга (используем tokenizer.pad)


In [6]:
import torch
from torch.utils.data import DataLoader

def make_loader_for_scoring(dataset, batch_size=64):
    """Build a non-shuffling DataLoader that pads each batch for scoring.

    Args:
        dataset: indexable collection of examples, each providing
            "input_ids", "attention_mask" and "label".
        batch_size: number of examples per batch.

    Returns:
        A DataLoader that yields the output of ``tokenizer.pad`` (as PyTorch
        tensors) with an added "labels" tensor of dtype ``torch.long``.
        Iteration order follows ``dataset`` because shuffling is disabled, so
        batch positions can be mapped back to dataset indices.
    """

    def _pad_and_label(examples):
        # Gather the variable-length fields and let the tokenizer pad them.
        features = {
            field: [example[field] for example in examples]
            for field in ("input_ids", "attention_mask")
        }
        padded = tokenizer.pad(features, return_tensors="pt")
        padded["labels"] = torch.tensor(
            [example["label"] for example in examples], dtype=torch.long
        )
        return padded

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=_pad_and_label,
    )
    return loader

# Loader over the full tokenized training split, in dataset order.
scoring_loader = make_loader_for_scoring(dataset=train_tok_full, batch_size=64)


# 7. Считаем p_gold (вероятность истинного класса) для каждого примера


In [7]:
import torch.nn.functional as F

# Score every training example with the base model.
# p_gold_list collects (dataset index, p_gold) pairs, where p_gold is the
# softmax probability the model assigns to the example's gold label.
p_gold_list = []

# Use the GPU when one is available instead of assuming "cuda" exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model.eval()
base_model.to(device)

with torch.no_grad():
    # Running position in the dataset; valid because the loader does not shuffle.
    global_idx = 0
    for batch in scoring_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        logits = base_model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        ).logits

        probs = F.softmax(logits, dim=-1)
        labels = batch["labels"]

        # Pick the gold-class probability of every row in one tensor op and
        # transfer the whole batch to Python at once, rather than calling
        # .item() twice per example.
        gold_probs = probs.gather(1, labels.unsqueeze(1)).squeeze(1).tolist()
        p_gold_list.extend(
            zip(range(global_idx, global_idx + len(gold_probs)), gold_probs)
        )
        global_idx += len(gold_probs)

# Every example of the scored dataset must have received exactly one score.
assert len(p_gold_list) == len(scoring_loader.dataset), "scoring skipped or duplicated examples"

len(p_gold_list)


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


67349

# 8. Берём bottom-10% по p_gold


In [8]:
# Order the (index, p_gold) pairs by ascending confidence, so the examples
# with the lowest gold-label probability come first.
# NOTE(review): the lowest-p_gold examples are the ones the base model
# disagrees with most; they may include label noise — verify before relying
# on this subset.
p_gold_sorted = list(p_gold_list)
p_gold_sorted.sort(key=lambda pair: pair[1])

# Keep the bottom 10% of the ranking.
M = int(0.1 * len(p_gold_sorted))
low_conf_indices = [pair[0] for pair in p_gold_sorted[:M]]

# Show the subset size and the first ten selected indices.
(len(low_conf_indices), low_conf_indices[:10])


(6734, [12086, 23459, 41765, 37621, 46803, 56987, 33677, 13418, 25401, 7250])

# 9. Собираем low-confidence subset


In [9]:
# Materialise the low-confidence subset from the tokenized training split,
# in ranking order (least confident first).
train_tok_lowconf = train_tok_full.select(indices=low_conf_indices)

len(train_tok_lowconf)


6734

# 10. Обучаем модель на Low-confidence 10%


In [10]:
# Train a fresh model from the same checkpoint on the low-confidence subset and
# evaluate it on the same validation split as the baseline.
# NOTE(review): this run uses 3 epochs while the random-10% baseline uses 2 —
# confirm the difference is intended before comparing the two results.
model_lowconf, metrics_lowconf = train_model(
    model_name=model_name,
    train_dataset=train_tok_lowconf,
    val_dataset=val_tok,
    batch_size=32,
    lr=2e-5,
    epochs=3,
)

# Display the metrics returned by train_model.
metrics_lowconf


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
W1122 06:23:08.074000 140014858579008 torch/_dynamo/convert_frame.py:357] torch._dynamo hit config.accumulated_cache_size_limit (64)
W1122 06:23:08.074000 140014858579008 torch/_dynamo/convert_frame.py:357]    function: 'compiled_mlp' (/home/onbaev.baurzhan/source/project/.venv/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:528)
W1122 06:23:08.074000 140014858579008 torch/_dynamo/convert_frame.py:357]    last reason: __


Epoch 1
train_loss: 0.6134442184208694
{'val_loss': 1.3443306173597063, 'accuracy': 0.4518348623853211, 'f1': 0.6218354430379747}

Epoch 2
train_loss: 0.4624087393142601
{'val_loss': 1.9485717288085393, 'accuracy': 0.20756880733944955, 'f1': 0.252972972972973}

Epoch 3
train_loss: 0.2629198776955288
{'val_loss': 2.3707713527338847, 'accuracy': 0.2396788990825688, 'f1': 0.3480825958702065}


{'val_loss': 2.3707713527338847,
 'accuracy': 0.2396788990825688,
 'f1': 0.3480825958702065}