# 1 — Full determinism + sys.path

In [1]:
# --- FULL DETERMINISM BLOCK ---

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["PYTHONHASHSEED"] = "42"
os.environ["FLASH_ATTENTION_USE_DETERMINISTIC"] = "1"

import random
import numpy as np
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True, warn_only=True)

import sys
sys.path.append("/home/onbaev.baurzhan/source/project/src")

print("Deterministic init done.")


Deterministic init done.


# 2 — Загрузка SST-2

In [2]:
from datasets import load_dataset

ds = load_dataset("glue", "sst2")
train_raw = ds["train"]
val_raw   = ds["validation"]

len(train_raw), len(val_raw)


  from .autonotebook import tqdm as notebook_tqdm


(67349, 872)

# 3 — Токенизация (нужна полная)

In [3]:
from transformers import AutoTokenizer

model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

def tokenize_batch(batch):
    enc = tokenizer(
        batch["sentence"],
        truncation=True,
        padding=False
    )
    enc["label"] = batch["label"]
    return enc

train_tok_full = train_raw.map(
    tokenize_batch,
    batched=True,
    remove_columns=train_raw.column_names
)

val_tok = val_raw.map(
    tokenize_batch,
    batched=True,
    remove_columns=val_raw.column_names
)

len(train_tok_full), len(val_tok)


(67349, 872)

# 4 — Формируем начальное seed-подмножество 5% Random

In [4]:
N_total = len(train_raw)
k5 = int(0.05 * N_total)

random_5 = train_raw.shuffle(seed=42).select(range(k5))

train_tok_5 = random_5.map(
    tokenize_batch,
    batched=True,
    remove_columns=random_5.column_names
)

len(train_tok_5)


3367

# 5 — Обучаем seed-модель на 5%

In [5]:
from train_utils import train_model

seed_model, seed_metrics = train_model(
    model_name=model_name,
    train_dataset=train_tok_5,
    val_dataset=val_tok,
    epochs=2,
    lr=2e-5,
    batch_size=32
)

seed_metrics


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



Epoch 1
train_loss: 0.4792222325813095
{'val_loss': 0.2882754536611693, 'accuracy': 0.8727064220183486, 'f1': 0.8762541806020067}

Epoch 2
train_loss: 0.21002090205702018
{'val_loss': 0.24003498894827707, 'accuracy': 0.9128440366972477, 'f1': 0.9138321995464853}


{'val_loss': 0.24003498894827707,
 'accuracy': 0.9128440366972477,
 'f1': 0.9138321995464853}

# 6 — DataLoader для скоринга

In [6]:
from torch.utils.data import DataLoader
import torch

def make_loader_for_scoring(dataset, batch_size=64):
    def collate_fn(batch):
        enc = tokenizer.pad(
            {
                "input_ids": [x["input_ids"] for x in batch],
                "attention_mask": [x["attention_mask"] for x in batch],
            },
            return_tensors="pt",
        )
        labels = torch.tensor([x["label"] for x in batch], dtype=torch.long)
        enc["labels"] = labels
        return enc
    return DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

scoring_loader = make_loader_for_scoring(train_tok_full, batch_size=64)


# 7 — Считаем p_gold для каждого примера полного train

In [7]:
import torch.nn.functional as F

p_gold_list = []   # (idx, p_gold)

seed_model.eval()
seed_model.to("cuda")

with torch.no_grad():
    global_idx = 0
    for batch in scoring_loader:
        batch = {k: v.to("cuda") for k, v in batch.items()}

        logits = seed_model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        ).logits

        probs = F.softmax(logits, dim=-1)
        labels = batch["labels"]

        for i in range(len(labels)):
            y = labels[i].item()
            p = probs[i, y].item()
            p_gold_list.append((global_idx, p))
            global_idx += 1

len(p_gold_list)


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


67349

# 8 — Убираем уже выбранные random 5% из поиска hard-примеров

In [8]:
random_5_indices = set(random_5["idx"] if "idx" in random_5.column_names else random_5.indices)
# datasets не всегда хранят индексы → используем позиционные
random_5_indices = set(range(k5))

# оставшиеся 95%
remaining = [(i, p) for (i, p) in p_gold_list if i not in random_5_indices]
len(remaining)


63982

# 9 — Выбираем bottom-5% hard примеров

In [9]:
k5_hard = k5  # ещё 5%

remaining_sorted = sorted(remaining, key=lambda x: x[1])  # low → first
hard_indices = [idx for idx, _ in remaining_sorted[:k5_hard]]

len(hard_indices), hard_indices[:10]


(3367, [41765, 23459, 37621, 46803, 36193, 7250, 60289, 52758, 53313, 63212])

# 10 — Формируем финальные 10% (5% Random + 5% Hard)

In [10]:
final_indices = list(random_5_indices) + hard_indices
len(final_indices)


6734

# 11 — Собираем финальный токенизированный датасет

In [11]:
train_tok_final = train_tok_full.select(final_indices)
len(train_tok_final)


6734

# 12 — Финальное обучение модели на гибридном поднаборе

In [12]:
model_final, metrics_final = train_model(
    model_name=model_name,
    train_dataset=train_tok_final,
    val_dataset=val_tok,
    epochs=4,
    lr=2e-5,
    batch_size=32
)

metrics_final


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
W1122 08:09:51.314000 140462576725056 torch/_dynamo/convert_frame.py:357] torch._dynamo hit config.accumulated_cache_size_limit (64)
W1122 08:09:51.314000 140462576725056 torch/_dynamo/convert_frame.py:357]    function: 'compiled_mlp' (/home/onbaev.baurzhan/source/project/.venv/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:528)
W1122 08:09:51.314000 140462576725056 torch/_dynamo/convert_frame.py:357]    last reason: __


Epoch 1
train_loss: 0.6760260202873375
{'val_loss': 0.5957657120057515, 'accuracy': 0.6857798165137615, 'f1': 0.5823170731707317}

Epoch 2
train_loss: 0.533841029445142
{'val_loss': 0.4595627263188362, 'accuracy': 0.7786697247706422, 'f1': 0.8149568552253116}

Epoch 3
train_loss: 0.38090297276092366
{'val_loss': 0.35970193839498926, 'accuracy': 0.8509174311926605, 'f1': 0.865979381443299}

Epoch 4
train_loss: 0.19470196716946447
{'val_loss': 0.4552712818341596, 'accuracy': 0.8291284403669725, 'f1': 0.8484231943031536}


{'val_loss': 0.4552712818341596,
 'accuracy': 0.8291284403669725,
 'f1': 0.8484231943031536}