In [1]:
# --- FULL DETERMINISM BLOCK ---
# Pins the sources of randomness that can be controlled from inside the kernel.
# Run this cell first on a fresh kernel, before anything initialises CUDA.

import os
import sys

# Environment switches are set before torch is imported so the libraries see
# them when they initialise.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# cuBLAS needs a fixed workspace size to be deterministic under
# torch.use_deterministic_algorithms (see the PyTorch reproducibility notes).
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Synchronous kernel launches; this slows training down.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE: hash randomisation of the running interpreter was fixed at startup;
# setting this here only affects Python processes started afterwards.
os.environ["PYTHONHASHSEED"] = "42"
# Transformers' flash-attention path reads FLASH_ATTENTION_DETERMINISTIC (the
# name set by transformers.enable_full_determinism). The original spelling is
# kept as well in case other code reads it.
os.environ["FLASH_ATTENTION_DETERMINISTIC"] = "1"
os.environ["FLASH_ATTENTION_USE_DETERMINISTIC"] = "1"

import random

import numpy as np
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# warn_only=True: operations without a deterministic implementation emit a
# warning instead of raising, so a run can still be non-deterministic.
torch.use_deterministic_algorithms(True, warn_only=True)

# Make the project's src/ directory importable. The location can be overridden
# with the PROJECT_SRC environment variable; the default is the original path.
PROJECT_SRC = os.environ.get("PROJECT_SRC", "/home/onbaev.baurzhan/source/project/src")
# Guarded so that re-running this cell does not pile up duplicate entries.
if PROJECT_SRC not in sys.path:
    sys.path.append(PROJECT_SRC)

print("Deterministic init done.")


Deterministic init done.


In [2]:
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset and keep handles to the
# two splits used in the following cells.
ds = load_dataset("glue", "sst2")
train_raw, val_raw = ds["train"], ds["validation"]

# Number of examples in each split (train, validation).
tuple(len(split) for split in (train_raw, val_raw))


  from .autonotebook import tqdm as notebook_tqdm


(67349, 872)

In [3]:
from transformers import AutoTokenizer

# Hub checkpoint id; the same name is passed to the training call further down.
model_name = "answerdotai/ModernBERT-base"
# trust_remote_code is intentionally left at its default (off): the recorded
# run uses the stock PreTrainedTokenizerFast and the built-in
# ModernBertForSequenceClassification, so no repository-supplied Python has to
# execute in this kernel.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    """Tokenize a batch of sentences and carry the labels along.

    Args:
        batch: Mapping with a "sentence" entry (the texts handed to the
            module-level ``tokenizer``) and a "label" entry.

    Returns:
        The tokenizer's output with ``batch["label"]`` stored under the
        "label" key. Sequences are truncated but not padded here.
    """
    encoded = tokenizer(batch["sentence"], padding=False, truncation=True)
    encoded["label"] = batch["label"]
    return encoded

# Tokenize both splits in batches, dropping every original column so that only
# the fields returned by tokenize_batch remain.
train_tok, val_tok = (
    split.map(tokenize_batch, batched=True, remove_columns=split.column_names)
    for split in (train_raw, val_raw)
)

# Row counts after tokenization (train, validation).
len(train_tok), len(val_tok)


(67349, 872)

In [4]:
from train_utils import train_model

# Fine-tuning hyperparameters, collected in one place.
hyperparams = dict(epochs=4, lr=2e-5, batch_size=32)

model, metrics = train_model(
    model_name=model_name,
    train_dataset=train_tok,
    val_dataset=val_tok,
    **hyperparams,
)

# NOTE(review): in the recorded run the returned metrics equal the last
# epoch's values, not the best epoch's — confirm whether that is intended.
metrics


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



Epoch 1
train_loss: 0.19739927762230047
{'val_loss': 0.18448906331988318, 'accuracy': 0.9415137614678899, 'f1': 0.9435215946843853}

Epoch 2
train_loss: 0.1042055811290574
{'val_loss': 0.15761764773300715, 'accuracy': 0.9438073394495413, 'f1': 0.9437428243398392}

Epoch 3
train_loss: 0.0653693499902458
{'val_loss': 0.16280603259136633, 'accuracy': 0.9506880733944955, 'f1': 0.9516310461192351}

Epoch 4
train_loss: 0.04414849133782267
{'val_loss': 0.20327113156339952, 'accuracy': 0.9426605504587156, 'f1': 0.9449339207048458}


{'val_loss': 0.20327113156339952,
 'accuracy': 0.9426605504587156,
 'f1': 0.9449339207048458}