In [1]:
# --- FULL DETERMINISM BLOCK ---
# Best-effort reproducibility setup. Run this cell first on a fresh kernel.
# The environment variables are set BEFORE `torch` is imported so that they are
# already in place when the CUDA libraries are initialised.

import os

# Turn off the internal parallelism of the Hugging Face `tokenizers` library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Needed for deterministic cuBLAS behaviour when
# torch.use_deterministic_algorithms(True) is active (see PyTorch
# reproducibility notes).
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Makes CUDA kernel launches synchronous. This is a debugging aid and slows
# training down; NOTE(review): confirm it is really wanted for this run.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# NOTE: hash randomisation is fixed when the interpreter starts, so this only
# affects child processes spawned later, not the already-running kernel.
os.environ["PYTHONHASHSEED"] = "42"
# NOTE(review): confirm that the installed flash-attention / transformers
# version actually reads this variable.
os.environ["FLASH_ATTENTION_USE_DETERMINISTIC"] = "1"

import random
import numpy as np
import torch

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seeds every visible GPU; the former single-device torch.cuda.manual_seed
# call was redundant with this one and has been dropped.
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# warn_only=True: ops without a deterministic implementation emit a warning
# instead of raising.
torch.use_deterministic_algorithms(True, warn_only=True)

# Path to the project sources. The default is the original location; it can be
# overridden with the PROJECT_SRC environment variable on other machines.
import sys

PROJECT_SRC = os.environ.get("PROJECT_SRC", "/home/onbaev.baurzhan/source/project/src")
# Guard keeps the cell idempotent: re-running it does not add duplicates.
if PROJECT_SRC not in sys.path:
    sys.path.append(PROJECT_SRC)

print("Deterministic init done.")


Deterministic init done.


In [2]:
from datasets import load_dataset

# Load the "sst2" configuration of the "glue" dataset and keep handles to the
# two splits that the following cells use.
ds = load_dataset("glue", "sst2")
train_raw, val_raw = ds["train"], ds["validation"]

# Cell output: (number of training rows, number of validation rows).
tuple(len(split) for split in (train_raw, val_raw))


  from .autonotebook import tqdm as notebook_tqdm


(67349, 872)

In [3]:
# Reproducible random 10% subset of the training split: shuffle with a fixed
# seed, then keep the first tenth of the rows (size rounded down by int()).
subset_size = int(0.1 * len(train_raw))
train_shuffled = train_raw.shuffle(seed=42)
train_10 = train_shuffled.select(range(subset_size))
len(train_10)


6734

In [4]:
from transformers import AutoTokenizer

# `trust_remote_code=True` was removed: it allows Python code from the Hub
# repository to be executed locally. The recorded training output shows the
# native `ModernBertForSequenceClassification` class, so the installed
# transformers appears to support this checkpoint without remote code.
# NOTE(review): re-add the flag only if loading fails on an older transformers.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

def tokenize_batch(batch):
    """Tokenize the ``"sentence"`` column of a batch and carry over ``"label"``.

    Uses the module-level ``tokenizer`` with truncation enabled and padding
    disabled (padding is left to a later stage).

    Args:
        batch: Mapping with ``"sentence"`` and ``"label"`` entries.

    Returns:
        The tokenizer's output with an added ``"label"`` entry copied from
        ``batch``.
    """
    tokenized = tokenizer(batch["sentence"], truncation=True, padding=False)
    # Labels are re-attached here because the caller drops the original columns.
    tokenized["label"] = batch["label"]
    return tokenized

def _tokenize_split(split):
    """Apply tokenize_batch to a split in batches, dropping its original columns."""
    return split.map(tokenize_batch, batched=True, remove_columns=split.column_names)


train_tok = _tokenize_split(train_10)
val_tok = _tokenize_split(val_raw)


In [5]:
from train_utils import train_model

model_name = "answerdotai/ModernBERT-base"

# Hyper-parameters of this run, grouped so they can be read at a glance.
train_kwargs = dict(epochs=3, lr=2e-5, batch_size=32)

# `train_model` is a project helper; as called here it receives the checkpoint
# name plus the tokenized train/validation splits and returns (model, metrics).
# NOTE(review): in the recorded run the validation loss rose in epoch 3 while
# the train loss kept falling, and the returned metrics match the epoch-3 line.
# The returned model is presumably the last-epoch one; verify in train_utils.
model, metrics = train_model(
    model_name=model_name,
    train_dataset=train_tok,
    val_dataset=val_tok,
    **train_kwargs,
)

metrics


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



Epoch 1
train_loss: 0.3929419008511785
{'val_loss': 0.22405235016984598, 'accuracy': 0.9059633027522935, 'f1': 0.9076576576576577}

Epoch 2
train_loss: 0.16062389586950648
{'val_loss': 0.22300875539492285, 'accuracy': 0.9208715596330275, 'f1': 0.9198606271777003}

Epoch 3
train_loss: 0.05928887555682458
{'val_loss': 0.3417216074386878, 'accuracy': 0.9139908256880734, 'f1': 0.9146757679180887}


{'val_loss': 0.3417216074386878,
 'accuracy': 0.9139908256880734,
 'f1': 0.9146757679180887}

In [6]:
# Write the fine-tuned model and its tokenizer into the same directory so the
# pair can be reloaded together from one path.
save_dir = "../models/random10_scorer"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('../models/random10_scorer/tokenizer_config.json',
 '../models/random10_scorer/special_tokens_map.json',
 '../models/random10_scorer/tokenizer.json')