# 1. Детерминизм + путь к проекту


In [1]:
# --- FULL DETERMINISM BLOCK ---
# Seeds every RNG the notebook uses, switches PyTorch/cuDNN to deterministic
# mode, and makes the project's source directory importable.

import os

# Set before `torch` is imported so the libraries see these values when they
# initialise.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
# does not change hashing in the already-running kernel; it is only inherited
# by child processes started from here.
os.environ["PYTHONHASHSEED"] = "42"
os.environ["FLASH_ATTENTION_USE_DETERMINISTIC"] = "1"

import random
import sys

import numpy as np
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# warn_only=True: ops without a deterministic implementation emit a warning
# instead of raising.
torch.use_deterministic_algorithms(True, warn_only=True)

# Location of the project sources. The PROJECT_SRC environment variable
# overrides the default so the notebook is not tied to a single machine.
PROJECT_SRC = os.environ.get("PROJECT_SRC", "/home/onbaev.baurzhan/source/project/src")
# Guarded append: re-running this cell must not add duplicate sys.path entries.
if PROJECT_SRC not in sys.path:
    sys.path.append(PROJECT_SRC)

print("Deterministic init done.")


Deterministic init done.


# 2. Загружаем SST-2


In [2]:
from datasets import load_dataset

# SST-2 from the GLUE collection; keep handles to the two splits used below.
ds = load_dataset("glue", "sst2")
train_raw, val_raw = ds["train"], ds["validation"]

# Cell output: number of rows in each split.
(len(train_raw), len(val_raw))


  from .autonotebook import tqdm as notebook_tqdm


(67349, 872)

# 3. Загружаем MiniLM encoder


In [3]:
import torch
from sentence_transformers import SentenceTransformer

# Sentence encoder used below to embed the training sentences.
# Fall back to CPU when no GPU is visible instead of failing on .to("cuda").
embed_device = "cuda" if torch.cuda.is_available() else "cpu"

embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embedder = embedder.to(embed_device)


# 4. Считаем эмбеддинги (384-d, матрица ≈ 100 MB)


In [4]:
import numpy as np

sentences = train_raw["sentence"]
batch_size = 256

# Encode the training sentences chunk by chunk; every call returns one
# NumPy array for its chunk.
embeddings = [
    embedder.encode(
        sentences[start:start + batch_size],
        convert_to_numpy=True,
        show_progress_bar=False,
    )
    for start in range(0, len(sentences), batch_size)
]

# Stack the per-chunk arrays row-wise into a single matrix.
emb_matrix = np.vstack(embeddings)
emb_matrix.shape


(67349, 384)

# 5. Random Projection: 384-d → 32-d


In [5]:
from sklearn.random_projection import GaussianRandomProjection

# Fixed-seed Gaussian random projection of the embedding matrix down to
# 32 components: fit the projector first, then apply it to the same matrix.
rp = GaussianRandomProjection(random_state=42, n_components=32)
rp.fit(emb_matrix)
emb_rp = rp.transform(emb_matrix)

emb_rp.shape


(67349, 32)

# 6. KMeans на 32-d


In [6]:
from sklearn.cluster import KMeans

# Number of clusters is 10% of the training-set size (rounded down).
N = len(train_raw)
M = int(0.1 * N)
print("Clusters =", M)

# Build the estimator, then fit it on the projected embeddings.
kmeans = KMeans(n_clusters=M, n_init="auto", max_iter=20, random_state=42)
kmeans.fit(emb_rp)


Clusters = 6734


# 7. Выбор центроидных примеров


In [7]:
cluster_ids = kmeans.labels_
centroids = kmeans.cluster_centers_

# For every cluster keep the index of the training example whose projected
# embedding lies closest (Euclidean distance) to the cluster centre.
selected_indices = []
empty_clusters = 0
for c in range(M):
    members = np.flatnonzero(cluster_ids == c)
    if members.size == 0:
        # np.argmin raises on an empty array; a cluster without members has
        # no representative, so it is skipped and counted.
        empty_clusters += 1
        continue

    dists = np.linalg.norm(emb_rp[members] - centroids[c], axis=1)
    # argmin indexes into `members`; map it back to a dataset row index and
    # store it as a plain Python int rather than np.int64.
    selected_indices.append(int(members[np.argmin(dists)]))

if empty_clusters:
    print(f"Skipped {empty_clusters} empty cluster(s)")

len(selected_indices)


6734

# 8. Токенизация выбранного поднабора


In [8]:
from transformers import AutoTokenizer

# Checkpoint identifier; also passed to train_model further down.
model_name = "answerdotai/ModernBERT-base"

# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository — confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

def tokenize_batch(batch):
    """Tokenize the "sentence" field of a batch and carry its labels along.

    Args:
        batch: mapping with a "sentence" entry (texts handed to the global
            ``tokenizer``) and a "label" entry.

    Returns:
        The tokenizer's output with the batch's "label" values stored under
        the "label" key. Truncation is enabled and padding is disabled.
    """
    encoded = tokenizer(batch["sentence"], padding=False, truncation=True)
    encoded["label"] = batch["label"]
    return encoded

# Keep only the rows chosen as cluster representatives.
train_subset = train_raw.select(selected_indices)

# Tokenize both splits batch-wise; the original columns are removed so the
# result holds the fields returned by tokenize_batch.
train_tok = train_subset.map(tokenize_batch, batched=True, remove_columns=train_raw.column_names)
val_tok = val_raw.map(tokenize_batch, batched=True, remove_columns=val_raw.column_names)

len(train_tok)


Map: 100%|██████████| 6734/6734 [00:00<00:00, 16456.99 examples/s]


6734

# 9. Обучение финальной модели на поднаборе, выбранном RandomProjection + KMeans


In [9]:
from train_utils import train_model

# Fine-tune on the selected subset and evaluate on the validation split.
# train_model is a project helper; its result is unpacked as (model, metrics).
model_kmeans, metrics_kmeans = train_model(
    train_dataset=train_tok,
    val_dataset=val_tok,
    model_name=model_name,
    batch_size=32,
    epochs=4,
    lr=2e-5,
)

metrics_kmeans


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



Epoch 1
train_loss: 0.34199140083168356
{'val_loss': 0.2386914092515196, 'accuracy': 0.9036697247706422, 'f1': 0.9066666666666666}

Epoch 2
train_loss: 0.12593385326936488
{'val_loss': 0.2304244933683159, 'accuracy': 0.9288990825688074, 'f1': 0.927570093457944}

Epoch 3
train_loss: 0.03962071552090303
{'val_loss': 0.2414191860921814, 'accuracy': 0.9208715596330275, 'f1': 0.9220338983050848}

Epoch 4
train_loss: 0.01447246634728006
{'val_loss': 0.33866861652183744, 'accuracy': 0.926605504587156, 'f1': 0.927437641723356}


{'val_loss': 0.33866861652183744,
 'accuracy': 0.926605504587156,
 'f1': 0.927437641723356}