In [1]:
#!/usr/bin/env python
# coding: utf-8

# --- FULL DETERMINISM BLOCK ---
# Environment variables are set before `torch` is imported so that they are
# already in place when the CUDA libraries are first loaded.

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only affects child processes, not this kernel -- confirm.
os.environ["PYTHONHASHSEED"] = "42"
os.environ["FLASH_ATTENTION_USE_DETERMINISTIC"] = "1"

import random
import numpy as np
import torch

# Single seed shared by every RNG used in the notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Fix: this used to be `torch.cuda_manual_seed_all = SEED`, which only created
# an unused attribute on the `torch` module and seeded nothing.
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# warn_only=True: ops without a deterministic implementation emit a warning
# instead of raising.
torch.use_deterministic_algorithms(True, warn_only=True)

# Make the project's `src` directory importable (used later for `train_utils`).
import sys
sys.path.append("/home/onbaev.baurzhan/source/project/src")

print("Deterministic init done.")


Deterministic init done.


# 2. Загрузка SST-2

In [2]:
from datasets import load_dataset

# Load the GLUE SST-2 dataset and keep handles to the two splits used below.
ds = load_dataset("glue", "sst2")
train_raw, val_raw = ds["train"], ds["validation"]

# Cell output: (number of train examples, number of validation examples).
len(train_raw), len(val_raw)


  from .autonotebook import tqdm as notebook_tqdm


(67349, 872)

# 3. Токенизация полного train (нужна для скоринга)

In [3]:
from transformers import AutoTokenizer

# Checkpoint identifier reused by the tokenizer and by every train_model call.
model_name = "answerdotai/ModernBERT-base"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally -- confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

def tokenize_batch(batch):
    """Tokenize the "sentence" column of a batch and carry its labels through.

    Uses the module-level ``tokenizer``. Sequences are truncated
    (``truncation=True``) and left unpadded (``padding=False``).

    Args:
        batch: mapping with a "sentence" entry and a "label" entry.

    Returns:
        The tokenizer output with an added "label" entry copied from ``batch``.
    """
    encoded = tokenizer(batch["sentence"], padding=False, truncation=True)
    encoded["label"] = batch["label"]
    return encoded

# Tokenize both splits in batches. The raw columns are dropped, so each result
# contains only what tokenize_batch returns.
train_tok_full = train_raw.map(
    tokenize_batch,
    batched=True,
    remove_columns=train_raw.column_names,
)
val_tok = val_raw.map(
    tokenize_batch,
    batched=True,
    remove_columns=val_raw.column_names,
)

# Cell output: sizes of the tokenized train and validation splits.
len(train_tok_full), len(val_tok)


(67349, 872)

# 4. Обучим базовую модель на случайных 10%, чтобы получить скоринг-оценки

In [4]:
# Random 10% of the raw train split, used to fit the scoring (base) model.
# The shuffle reuses the notebook-wide SEED instead of a second hard-coded 42,
# so changing SEED in the determinism cell also changes this sample.
train_10 = train_raw.shuffle(seed=SEED).select(range(int(0.1 * len(train_raw))))
train_tok_10 = train_10.map(tokenize_batch, batched=True, remove_columns=train_10.column_names)

from train_utils import train_model

# Fit the base (scoring) model on the random 10% subset and evaluate it on the
# validation split. train_model comes from the project's train_utils module
# and returns a (model, metrics) pair.
base_model, base_metrics = train_model(
    model_name=model_name,
    train_dataset=train_tok_10,
    val_dataset=val_tok,
    batch_size=32,
    lr=2e-5,
    epochs=2,
)

# Cell output: metrics returned for the base model.
base_metrics


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



Epoch 1
train_loss: 0.3929419008511785
{'val_loss': 0.22405235016984598, 'accuracy': 0.9059633027522935, 'f1': 0.9076576576576577}

Epoch 2
train_loss: 0.16062389586950648
{'val_loss': 0.22300875539492285, 'accuracy': 0.9208715596330275, 'f1': 0.9198606271777003}


{'val_loss': 0.22300875539492285,
 'accuracy': 0.9208715596330275,
 'f1': 0.9198606271777003}

# 5. Получаем p_gold для каждого примера полного train

In [5]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Re-creates the tokenizer with the same arguments as the tokenization cell,
# so this cell does not depend on that earlier instance still being alive.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

def make_loader_for_scoring(dataset, batch_size=64):
    """Build a non-shuffling DataLoader that pads each batch for scoring.

    The order of ``dataset`` is preserved (``shuffle=False``), so the position
    of an example in the loader's output equals its index in ``dataset``.

    Args:
        dataset: items exposing "input_ids", "attention_mask" and "label".
        batch_size: number of examples per batch.

    Returns:
        A DataLoader whose batches are the output of ``tokenizer.pad`` (as
        PyTorch tensors) plus a "labels" tensor of dtype ``torch.long``.
    """

    def collate_fn(batch):
        # Only the two model inputs are handed to the tokenizer for padding.
        features = {
            key: [example[key] for example in batch]
            for key in ("input_ids", "attention_mask")
        }
        padded = tokenizer.pad(features, return_tensors="pt")
        padded["labels"] = torch.tensor(
            [example["label"] for example in batch],
            dtype=torch.long,
        )
        return padded

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )
    return loader

# Fixed-order loader over the full tokenized train split, used for scoring.
scoring_loader = make_loader_for_scoring(
    train_tok_full,
    batch_size=64,
)


In [6]:
# Score every example served by scoring_loader with the base model.
# p_gold is the softmax probability the model assigns to the gold label.
p_gold_list = []   # (global_idx, p_gold)

# Prefer the GPU, but fall back to CPU so the cell also runs on CPU-only hosts.
scoring_device = "cuda" if torch.cuda.is_available() else "cpu"

base_model.eval()
base_model.to(scoring_device)

with torch.no_grad():
    # Running position in the loader; the loader does not shuffle, so this is
    # also the example's index in the dataset it was built from.
    global_idx = 0
    for batch in scoring_loader:
        batch = {k: v.to(scoring_device) for k, v in batch.items()}  # move to device

        logits = base_model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        ).logits

        probs = F.softmax(logits, dim=-1)
        labels = batch["labels"]

        # Select each row's gold-label probability in one vectorized op and
        # copy the batch to the host once, instead of two .item() calls
        # (two device syncs) per example.
        gold_probs = probs.gather(1, labels.unsqueeze(1)).squeeze(1).tolist()

        p_gold_list.extend(enumerate(gold_probs, start=global_idx))
        global_idx += len(gold_probs)

# Cell output: number of scored examples.
len(p_gold_list)


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


67349

# 6. Отбор top-10% по p_gold


In [7]:
# Rank the (index, p_gold) pairs from most to least confident. The sort is
# stable, so pairs with equal p_gold keep their original relative order.
p_gold_sorted = list(p_gold_list)
p_gold_sorted.sort(key=lambda pair: pair[1], reverse=True)

# Keep the top 10% of examples (rounded down) and retain only their indices.
M = int(0.1 * len(p_gold_sorted))
high_conf_indices = [pair[0] for pair in p_gold_sorted[:M]]

# Cell output: subset size and the ten most confident indices.
len(high_conf_indices), high_conf_indices[:10]


(6734, [4898, 45342, 9332, 11138, 20730, 47914, 6606, 56020, 35233, 31117])

# 7. Формируем high-confidence subset


In [8]:
# Materialize the high-confidence subset from the tokenized train split, in
# descending-confidence order (the order of high_conf_indices).
train_tok_highconf = train_tok_full.select(
    high_conf_indices
)

# Cell output: subset size.
len(train_tok_highconf)


6734

# 8. Обучение финальной модели на High-confidence 10%

In [9]:
# Train on the high-confidence 10% subset and evaluate on the same validation
# split as the base model. Learning rate and batch size match the base run;
# only the number of epochs differs (6 instead of 2).
model_highconf, metrics_highconf = train_model(
    model_name=model_name,
    train_dataset=train_tok_highconf,
    val_dataset=val_tok,
    batch_size=32,
    lr=2e-5,
    epochs=6,
)

# Cell output: metrics returned for the high-confidence model.
metrics_highconf


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
W1122 06:43:48.091000 140380777315392 torch/_dynamo/convert_frame.py:357] torch._dynamo hit config.accumulated_cache_size_limit (64)
W1122 06:43:48.091000 140380777315392 torch/_dynamo/convert_frame.py:357]    function: 'compiled_mlp' (/home/onbaev.baurzhan/source/project/.venv/lib/python3.11/site-packages/transformers/models/modernbert/modeling_modernbert.py:528)
W1122 06:43:48.091000 140380777315392 torch/_dynamo/convert_frame.py:357]    last reason: __


Epoch 1
train_loss: 0.09469464158386066
{'val_loss': 0.4517337448362793, 'accuracy': 0.838302752293578, 'f1': 0.833530106257379}

Epoch 2
train_loss: 0.0014963871627682996
{'val_loss': 0.728474582146321, 'accuracy': 0.8165137614678899, 'f1': 0.7927461139896373}

Epoch 3
train_loss: 5.144302978323316e-05
{'val_loss': 0.7771362116826432, 'accuracy': 0.819954128440367, 'f1': 0.7974193548387096}

Epoch 4
train_loss: 2.7135464264040203e-05
{'val_loss': 0.8158349139349801, 'accuracy': 0.8222477064220184, 'f1': 0.8010269576379975}

Epoch 5
train_loss: 1.7762077936391567e-05
{'val_loss': 0.8417911894087281, 'accuracy': 0.8245412844036697, 'f1': 0.8040973111395646}

Epoch 6
train_loss: 1.2602813183578451e-05
{'val_loss': 0.86384023912251, 'accuracy': 0.8268348623853211, 'f1': 0.8071519795657727}


{'val_loss': 0.86384023912251,
 'accuracy': 0.8268348623853211,
 'f1': 0.8071519795657727}