In [None]:
# Clone the GLUE dataset repository and the llmtf_open repository,
# then make llmtf_open the working directory.
!git clone https://huggingface.co/datasets/nyu-mll/glue
!git clone https://github.com/RefalMachine/llmtf_open
%cd llmtf_open

In [None]:
# List the contents of the current working directory.
!ls

conversation_configs	      README.md
Dockerfile		      requirements.txt
eval_grammar.py		      run_evaluate_multinode_multigpu.py
evaluate_model.py	      run_evaluate_multinode_multigpu.sh
examples		      run_evaluate_singlenode_multigpu.sh
huggingface_tokenizers_cache  run_llm_as_a_judge.py
instruct_unsloth	      todo.txt
llm_as_a_judge_baselines      trained_qwen_model_lora
llmtf			      unsloth_compiled_cache


In [None]:
import random
import codecs
import torch
import json
import re
import copy
import numpy as np
import os
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForTokenClassification,
    AutoConfig,
)
from transformers import (
    Trainer,
    TrainingArguments,
    logging,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    BitsAndBytesConfig,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from unsloth import FastLanguageModel, UnslothTrainingArguments, UnslothTrainer

from peft import get_peft_model, LoraConfig
from peft import prepare_model_for_kbit_training


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel, UnslothTrainingArguments, UnslothTrainer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
from typing import List, Dict

from torch.utils.data import Dataset


class ChatDataset(Dataset):
    """Pre-tokenized single-turn chat dataset for acceptability classification.

    Every record (a mapping with ``'sentence'`` and ``'label'`` keys) is
    rendered as a user instruction followed by an assistant answer
    ``"Answer: <label>"`` through the tokenizer's chat template, and stored as
    a dict of ``input_ids`` / ``labels`` / ``attention_mask`` tensors.

    Args:
        original_records: iterable of records to convert.
        tokenizer: tokenizer exposing ``apply_chat_template``, ``decode``,
            ``bos_token_id`` and ``eos_token_id``.
        max_tokens_count: maximum sequence length; records whose rendered chat
            is longer than ``max_tokens_count - 2`` tokens are skipped (two
            positions are reserved for the optional global BOS/EOS).
        sample_rate: stored for interface compatibility only; sub-sampling is
            not applied.
        only_target_loss: when True, prompt positions in ``labels`` are set to
            ``labels_pad_token_id`` so that only the assistant answer
            contributes to the loss. When False, ``labels`` mirror
            ``input_ids``.
        add_global_bos: prepend ``bos_token_id`` when the tokenizer defines
            one and the sequence does not already start with it.
        add_global_eos: append ``eos_token_id`` when the tokenizer defines one
            and the sequence does not already end with it.
        labels_pad_token_id: value used for ignored label positions.
    """

    def __init__(
        self,
        original_records: List[Dict],
        tokenizer: "AutoTokenizer",
        max_tokens_count: int,
        sample_rate: float = 1.0,
        only_target_loss: bool = True,
        add_global_bos: bool = True,
        add_global_eos: bool = True,
        labels_pad_token_id: int = -100
    ):
        self.original_records = original_records
        self.sample_rate = sample_rate
        self.tokenizer = tokenizer
        self.max_tokens_count = max_tokens_count
        self.only_target_loss = only_target_loss
        self.labels_pad_token_id = labels_pad_token_id
        self.add_global_bos = add_global_bos
        self.add_global_eos = add_global_eos
        # Guards the one-time debug print of a converted sample.
        self.is_printed = False

        self.records = []
        for record in tqdm(original_records):
            # Random sub-sampling by `sample_rate` is intentionally not
            # applied: it is not needed for this task.
            tensors = self.convert_record(record)
            if tensors is None:
                continue
            self.records.append(tensors)

    def __len__(self):
        """Return the number of successfully converted records."""
        return len(self.records)

    def __getitem__(self, index):
        """Return the tensor dict stored at position ``index``."""
        return self.records[index]

    def get_tokens(self, messages, add_generation_prompt=False):
        """Render ``messages`` with the chat template and return token ids.

        A leading BOS token is removed here because ``convert_record`` decides
        on its own (via ``add_global_bos``) whether the sequence gets one.

        Args:
            messages: list of ``{'role': ..., 'content': ...}`` dicts.
            add_generation_prompt: forwarded to ``apply_chat_template``; used
                to measure the prompt length up to the start of the answer.

        Returns:
            A new list of token ids (possibly empty).
        """
        tokens = self.tokenizer.apply_chat_template(
            messages,
            add_special_tokens=False,
            tokenize=True,
            add_generation_prompt=add_generation_prompt,
        )
        tokens = list(tokens)
        bos_id = self.tokenizer.bos_token_id
        # `tokens` may be empty, so check before indexing.
        if tokens and bos_id is not None and tokens[0] == bos_id:
            tokens = tokens[1:]
        return tokens

    def convert_record(self, record):
        """Convert one record into training tensors.

        Returns:
            A dict with ``input_ids``, ``labels`` and ``attention_mask``
            LongTensors of equal length, or ``None`` when the rendered chat is
            empty or too long.
        """
        message_user = f"Your task is to determine the acceptability of the text for the English language in terms of syntax, morphology and semantics. The answer should be one number: 0 or 1, where 0 means the sentence is not acceptable from the point of view of the English language, 1 means it is acceptable.\nText:{record['sentence']}"
        message_bot_train = f"Answer: {record['label']}"

        prompt_messages = [{'role': 'user', 'content': message_user}]
        # The answer turn must use the standard 'assistant' role: with the
        # previous 'bot' role the printed sample ended right after the user
        # turn, i.e. the answer was missing from the training sequence.
        messages = prompt_messages + [
            {'role': 'assistant', 'content': message_bot_train}
        ]

        input_ids = self.get_tokens(messages)
        if not input_ids:
            return None

        # Leave room for the optional global BOS/EOS tokens.
        if len(input_ids) > self.max_tokens_count - 2:
            return None

        pad_id = self.labels_pad_token_id
        if self.only_target_loss:
            # Mask everything up to the start of the assistant answer. The
            # prompt length is the common prefix between the full chat and the
            # prompt rendered with a generation prompt.
            prompt_ids = self.get_tokens(
                prompt_messages, add_generation_prompt=True
            )
            prompt_len = 0
            for prompt_token, full_token in zip(prompt_ids, input_ids):
                if prompt_token != full_token:
                    break
                prompt_len += 1
            labels = [pad_id] * prompt_len + input_ids[prompt_len:]
        else:
            # Independent copy: `labels` must never alias `input_ids`,
            # otherwise the inserts/appends below would be applied twice.
            labels = list(input_ids)

        # Add the global BOS (only if the tokenizer actually defines one).
        bos_id = self.tokenizer.bos_token_id
        if self.add_global_bos and bos_id is not None and input_ids[0] != bos_id:
            input_ids.insert(0, bos_id)
            labels.insert(0, pad_id)

        # Drop the single trailing token the template emits after the final EOS.
        eos_id = self.tokenizer.eos_token_id
        if len(input_ids) >= 2 and input_ids[-2] == eos_id:
            input_ids = input_ids[:-1]
            labels = labels[:-1]

        # Add the global EOS (only if the tokenizer actually defines one).
        if self.add_global_eos and eos_id is not None and input_ids[-1] != eos_id:
            input_ids.append(eos_id)
            labels.append(eos_id)

        # Print one converted sample so the dataset contents can be verified.
        if not self.is_printed:
            decoded = self.tokenizer.decode(input_ids, skip_special_tokens=False)
            print(input_ids)
            print(labels)
            print("Full prompt:" + decoded)
            # Sanity check that the chat template was applied.
            assert '\n' in decoded
            self.is_printed = True

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.new_ones(input_ids.size())
        assert (
            input_ids.size(0)
            == labels.size(0)
            == attention_mask.size(0)
            <= self.max_tokens_count
        )
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

In [None]:
# Load the CoLA configuration of GLUE; the result has train / validation / test splits.
from datasets import load_dataset
dataset = load_dataset('nyu-mll/glue','cola')
dataset


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [None]:
# Keep the first 1000 training examples and split them 90/10 into train/test.
# A fixed seed makes the split reproducible across runs.
# NOTE: this cell overwrites `dataset`, so re-run the loading cell before
# executing it a second time.
dataset = dataset['train'].select(range(1000))
dataset = dataset.train_test_split(test_size=0.1, seed=1337)
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 900
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 100
    })
})

In [None]:
# Disable the WandB service so that it does not collect statistics.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Use the chat template from the Ruadapt model, because it does not inject
# a system part into the prompt.
tokenizer = AutoTokenizer.from_pretrained('RefalMachine/RuadaptQwen2.5-7B-Lite-Beta')
chat_template = tokenizer.chat_template
chat_template

'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n    {%- endif %}\n{%- en

In [None]:
# Base model to fine-tune, and its tokenizer.
model_name = 'Qwen/Qwen2.5-1.5B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the instruct model through Unsloth (4-bit weights, fp16 compute).
# `max_tokens_count` is reused below as the dataset's maximum sequence length.
max_tokens_count = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_tokens_count,
    dtype=torch.float16,
    load_in_4bit=True,
    attn_implementation="sdpa",
)

==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Reload the base tokenizer (replacing the one returned by Unsloth), then set
# the Ruadapt chat template and left-side padding on it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.chat_template = chat_template
tokenizer.padding_side = 'left'

In [None]:
# Build one ChatDataset per split ('train', then 'test'); the second one is
# used as the validation set during training.
only_target_loss = True

datasets = []
for split in ('train','test'):
    datasets.append(
        ChatDataset(
            dataset[split],
            tokenizer,
            max_tokens_count=max_tokens_count,
            sample_rate=1.0,
            only_target_loss=only_target_loss,
            # No extra global BOS/EOS tokens are added around the rendered chat.
            add_global_eos=False,
            add_global_bos=False
        )
    )
train_dataset, val_dataset = datasets

 29%|██▉       | 263/900 [00:00<00:00, 1334.70it/s]

[151644, 872, 198, 7771, 3383, 374, 311, 8253, 279, 4193, 2897, 315, 279, 1467, 369, 279, 6364, 4128, 304, 3793, 315, 19482, 11, 78512, 323, 52694, 13, 576, 4226, 1265, 387, 825, 1372, 25, 220, 15, 476, 220, 16, 11, 1380, 220, 15, 3363, 279, 11652, 374, 537, 21555, 504, 279, 1459, 315, 1651, 315, 279, 6364, 4128, 11, 220, 16, 3363, 432, 374, 21555, 624, 1178, 57155, 2908, 1052, 311, 387, 264, 883, 304, 429, 13551, 13, 151645]
[151644, 872, 198, 7771, 3383, 374, 311, 8253, 279, 4193, 2897, 315, 279, 1467, 369, 279, 6364, 4128, 304, 3793, 315, 19482, 11, 78512, 323, 52694, 13, 576, 4226, 1265, 387, 825, 1372, 25, 220, 15, 476, 220, 16, 11, 1380, 220, 15, 3363, 279, 11652, 374, 537, 21555, 504, 279, 1459, 315, 1651, 315, 279, 6364, 4128, 11, 220, 16, 3363, 432, 374, 21555, 624, 1178, 57155, 2908, 1052, 311, 387, 264, 883, 304, 429, 13551, 13, 151645]
Full prompt:<|im_start|>user
Your task is to determine the acceptability of the text for the English language in terms of syntax, morphology

100%|██████████| 900/900 [00:00<00:00, 1349.03it/s]
100%|██████████| 100/100 [00:00<00:00, 1429.45it/s]

[151644, 872, 198, 7771, 3383, 374, 311, 8253, 279, 4193, 2897, 315, 279, 1467, 369, 279, 6364, 4128, 304, 3793, 315, 19482, 11, 78512, 323, 52694, 13, 576, 4226, 1265, 387, 825, 1372, 25, 220, 15, 476, 220, 16, 11, 1380, 220, 15, 3363, 279, 11652, 374, 537, 21555, 504, 279, 1459, 315, 1651, 315, 279, 6364, 4128, 11, 220, 16, 3363, 432, 374, 21555, 624, 1178, 57155, 4961, 3432, 58234, 10244, 13, 151645]
[151644, 872, 198, 7771, 3383, 374, 311, 8253, 279, 4193, 2897, 315, 279, 1467, 369, 279, 6364, 4128, 304, 3793, 315, 19482, 11, 78512, 323, 52694, 13, 576, 4226, 1265, 387, 825, 1372, 25, 220, 15, 476, 220, 16, 11, 1380, 220, 15, 3363, 279, 11652, 374, 537, 21555, 504, 279, 1459, 315, 1651, 315, 279, 6364, 4128, 11, 220, 16, 3363, 432, 374, 21555, 624, 1178, 57155, 4961, 3432, 58234, 10244, 13, 151645]
Full prompt:<|im_start|>user
Your task is to determine the acceptability of the text for the English language in terms of syntax, morphology and semantics. The answer should be one numbe




In [None]:
from transformers import GenerationConfig

# def generate(messages, model, tokenizer, generation_config):
#     print(tokenizer.apply_chat_template(messages, add_special_tokens=False, add_generation_prompt=True, tokenize=False))
#     input_ids = tokenizer.apply_chat_template(messages, return_tensors='pt', add_special_tokens=False, add_generation_prompt=True)
#     input_ids = input_ids.to(model.device)
#     with torch.no_grad():
#         output_ids = model.generate(
#             input_ids,
#             generation_config=generation_config
#         )
#     outputs = []
#     for sample_output_ids, sample_input_ids in zip(output_ids, input_ids):
#         sample_output_ids = sample_output_ids[len(sample_input_ids):]
#         sample_output = tokenizer.decode(sample_output_ids, skip_special_tokens=True)
#         outputs.append(sample_output)

#     if len(outputs) == 1:
#         outputs = outputs[0]
#     return outputs



# Sampling configuration with special-token ids taken from the tokenizer.
# NOTE(review): `generation_config` is not referenced by any other cell visible
# in this notebook — confirm whether it is still needed.
generation_config = GenerationConfig.from_dict(
    {
        'top_k': 20,
        'top_p': 0.8,
        'temperature': 0.1,
        'repetition_penalty': 1.0,
        'max_new_tokens': 64,
        'do_sample': True,
        'pad_token_id': tokenizer.pad_token_id,
        'bos_token_id': tokenizer.bos_token_id,
        'eos_token_id': tokenizer.eos_token_id
    }
)
generation_config

GenerationConfig {
  "do_sample": true,
  "eos_token_id": 151645,
  "max_new_tokens": 64,
  "pad_token_id": 151643,
  "temperature": 0.1,
  "top_k": 20,
  "top_p": 0.8
}

In [None]:
# LoRA settings passed to FastLanguageModel.get_peft_model below:
# rank 32, alpha 16, no dropout, adapters on the attention projections only.
lora_config = {
    "r": 32,
    "lora_alpha": 16,
    "lora_dropout": 0.0,
    "bias": "none",
    "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
    "use_gradient_checkpointing": "unsloth"
}

In [None]:
# Wrap the loaded model with LoRA adapters using `lora_config`.
# NOTE: this rebinds `model`, so the cell is not meant to be run twice
# without reloading the base model first.
model = FastLanguageModel.get_peft_model(
    model, **lora_config, max_seq_length=max_tokens_count
)

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 0 MLP layers.


In [None]:
# Training hyper-parameters.
# - "eval_strategy": "steps" is required for "eval_steps" to have any effect;
#   without it the Trainer never evaluates on the validation set.
# - "report_to": "none" disables logging integrations explicitly, replacing
#   the deprecated WANDB_DISABLED environment variable.
# The dict has its own name so that re-running this cell keeps working.
training_kwargs = {
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "eval_strategy": "steps",
    "eval_steps": 16,
    "save_steps": 128,
    "logging_steps": 16,
    "learning_rate": 0.00005,
    "num_train_epochs": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 16,
    "bf16": False,
    "fp16": True,
    "optim": "paged_adamw_8bit",
    "save_total_limit": 1,
    "seed": 1337,
    "max_grad_norm": 1.0,
    "weight_decay": 0.05,
    "report_to": "none",
}
training_args = UnslothTrainingArguments(output_dir='./instruct_unsloth', **training_kwargs)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from unsloth.trainer import _create_unsloth_optimizer
class CustomTrainer(Trainer):
    """Trainer that can build an optimizer with a separate embedding learning rate."""

    def create_optimizer(self):
        """Create the optimizer on first use and return it.

        When ``self.args`` carries a non-None ``embedding_learning_rate``, the
        optimizer is built by ``_create_unsloth_optimizer``; otherwise the
        stock ``Trainer.create_optimizer`` is used.
        """
        emb_lr = getattr(self.args, "embedding_learning_rate", None)
        if emb_lr is not None:
            # Build lazily: reuse an optimizer that already exists.
            if self.optimizer is None:
                opt_cls, opt_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
                self.optimizer = _create_unsloth_optimizer(
                    self.model, opt_cls, opt_kwargs, emb_lr
                )
            return self.optimizer
        return super().create_optimizer()

In [None]:
# Training takes only about 3.5 GB of memory for the 1.5B model!
# The token-classification collator pads `labels` together with `input_ids`.
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 900 | Num Epochs = 1 | Total steps = 112
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 8,716,288/5,000,000,000 (0.17% trained)


Step,Training Loss
16,2.8784
32,2.3801
48,1.7423
64,1.1623
80,0.8846
96,0.8336
112,0.8443


TrainOutput(global_step=112, training_loss=1.532220951148442, metrics={'train_runtime': 528.6191, 'train_samples_per_second': 1.703, 'train_steps_per_second': 0.212, 'total_flos': 585597947092992.0, 'train_loss': 1.532220951148442, 'epoch': 0.9955555555555555})

In [None]:
# Save the trained model (the listing below shows LoRA adapter files) to ./trained_qwen_model_lora.
model.save_pretrained('./trained_qwen_model_lora')

In [None]:
# Save the tokenizer (including the replaced chat template) next to the adapter.
tokenizer.save_pretrained('./trained_qwen_model_lora')

('./trained_qwen_model_lora/tokenizer_config.json',
 './trained_qwen_model_lora/special_tokens_map.json',
 './trained_qwen_model_lora/vocab.json',
 './trained_qwen_model_lora/merges.txt',
 './trained_qwen_model_lora/added_tokens.json',
 './trained_qwen_model_lora/tokenizer.json')

In [None]:
# Check which files were written to the output directory.
!ls trained_qwen_model_lora

adapter_config.json	   merges.txt		    tokenizer_config.json
adapter_model.safetensors  README.md		    tokenizer.json
added_tokens.json	   special_tokens_map.json  vocab.json


Перед оценкой обязательно перезапустите сеанс, а затем выполняйте ячейки ниже.


In [None]:
# After the session restart, switch back into the llmtf_open working directory.
%cd llmtf_open

/content/llmtf_open


In [None]:
from llmtf.model import HFModel
from typing import Dict, List, Tuple
from llmtf.metrics import mean
from llmtf.base import SimpleFewShotHFTask
from sklearn.metrics import matthews_corrcoef

In [None]:
class GlueColaTask(SimpleFewShotHFTask):
    """GLUE/CoLA acceptability task scored via the probabilities of "0" and "1"."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Score the candidate answer tokens instead of generating free text.
        self.method = 'calculate_tokens_proba'
        self._max_new_tokens = 1

    @classmethod
    def name(cls):
        """Return the task identifier."""
        return 'glue/cola'

    @property
    def choices(self):
        """Candidate answers whose probabilities are compared."""
        return ["0", "1"]

    def aggregation(self) -> Dict:
        """Map each metric name to the function that aggregates per-sample values.

        "acc" is averaged; "mcc" receives the list of ``[y_true, y_pred]``
        pairs produced by ``evaluate`` and computes the Matthews correlation.
        """
        return {"acc": mean, "mcc": lambda data: matthews_corrcoef([d[0] for d in data],[d[1] for d in data])}

    def dataset_args(self) -> Dict:
        """Dataset location, relative to the current working directory."""
        return {'path': '../glue/cola'}


    def evaluate(self, sample, y_pred) -> Dict:
        """Compare the most probable choice with the gold label.

        Args:
            sample: dataset record with a ``'label'`` field.
            y_pred: mapping from choice string to its probability.
        """
        y_true = str(sample['label'])
        # Highest-scoring choice; on ties the first one in iteration order wins.
        y_pred = max(y_pred.items(), key=lambda pair: pair[1])[0]
        return {"acc": y_true == y_pred, "mcc": [y_true, y_pred]}

    def test_split_name(self) -> str:
        """Split used for evaluation."""
        return 'validation'

    def prompt_split_name(self) -> str:
        """Split used as the source of few-shot examples."""
        return 'train'

    def create_messages(self, sample, with_answer=False) -> List[Dict]:
        """Build the user/bot message pair for one sample.

        Args:
            sample: dataset record with ``'sentence'`` (and ``'label'`` when
                ``with_answer`` is True).
            with_answer: when True the bot message also contains the gold
                label ("Answer: <label>"), as needed for few-shot
                demonstrations; otherwise it stops at "Answer:".
        """
        instruction_user = "Your task is to determine the acceptability of the text for the English language in terms of syntax, morphology and semantics. The answer should be one number: 0 or 1, where 0 means the sentence is not acceptable from the point of view of the English language, 1 means it is acceptable.\nText:{sentence}"
        instruction_bot = "Answer:"

        bot_content = instruction_bot
        if with_answer:
            bot_content = f"{instruction_bot} {sample['label']}"

        return [
            {'role': 'user', 'content': instruction_user.format(**sample)},
            {'role': 'bot', 'content': bot_content},
        ]

In [None]:
# Instantiate the CoLA task used by the evaluation cells below.
task = GlueColaTask()

In [None]:
# Load the fine-tuned LoRA model saved earlier.
# NOTE: this cell and the following one both assign `model`; whichever ran
# last determines which model the evaluation cells use.
model_name = "./trained_qwen_model_lora"
model = HFModel(device_map="cuda",attn_implementation="sdpa")
model.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
100%|██████████| 151665/151665 [00:00<00:00, 284132.59it/s]
INFO: 2025-04-18 00:47:22,104: llmtf.base.hfmodel: Set eos_token_id in generation_config to [151645]
INFO:llmtf.base.hfmodel:Set eos_token_id in generation_config to [151645]
INFO: 2025-04-18 00:47:22,114: llmtf.base.hfmodel: Model id: ./trained_qwen_model_lora
INFO:llmtf.base.hfmodel:Model id: ./trained_qwen_model_lora
INFO: 2025-04-18 00:47:22,118: llmtf.base

In [None]:
# Load the original (not fine-tuned) model as the baseline; this rebinds `model`.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model = HFModel(device_map="cuda",attn_implementation="sdpa")
model.from_pretrained(model_name)

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

FORCES SYSTEM PROMPT AT START=<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>



100%|██████████| 151665/151665 [00:00<00:00, 283161.50it/s]
INFO: 2025-04-18 00:30:04,257: llmtf.base.hfmodel: Set eos_token_id in generation_config to [151645]
INFO:llmtf.base.hfmodel:Set eos_token_id in generation_config to [151645]
INFO: 2025-04-18 00:30:04,268: llmtf.base.hfmodel: Model id: Qwen/Qwen2.5-1.5B-Instruct
INFO:llmtf.base.hfmodel:Model id: Qwen/Qwen2.5-1.5B-Instruct
INFO: 2025-04-18 00:30:04,272: llmtf.base.hfmodel: Leading space: False
INFO:llmtf.base.hfmodel:Leading space: False


In [None]:
# Zero-shot evaluation of the currently loaded `model` on up to 200 samples.
# NOTE(review): the metrics logged below (acc 0.765, mcc 0.5142...) are identical
# to the ../cola-qwen-lora-32 report, while output_dir says "no_finetune" —
# confirm that output_dir matches the model that is actually loaded.
from llmtf.evaluator import Evaluator
evaluator = Evaluator()

evaluator.evaluate_dataset(
    task=task,
    model=model,
    output_dir='../cola-qwen-no_finetune',
    max_len=4000,
    few_shot_count=0,
    generation_config=None, # will use model.generation_config by default
    batch_size=4,
    max_sample_per_dataset=200
)

INFO: 2025-04-18 00:47:22,171: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [151645]
INFO:llmtf.base.hfmodel:Updated generation_config.eos_token_id: [151645]
INFO: 2025-04-18 00:47:22,174: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
INFO:llmtf.base.hfmodel:Updated generation_config.stop_strings: ['<|im_end|>']
100%|██████████| 200/200 [00:00<00:00, 1315.18it/s]
INFO: 2025-04-18 00:47:22,398: llmtf.base.glue/cola: Loading Dataset: 0.22s
INFO:llmtf.base.glue/cola:Loading Dataset: 0.22s
100%|██████████| 50/50 [00:25<00:00,  1.94it/s]
INFO: 2025-04-18 00:47:48,168: llmtf.base.glue/cola: Processing Dataset: 25.77s
INFO:llmtf.base.glue/cola:Processing Dataset: 25.77s
INFO: 2025-04-18 00:47:48,170: llmtf.base.glue/cola: Results for glue/cola:
INFO:llmtf.base.glue/cola:Results for glue/cola:
INFO: 2025-04-18 00:47:48,190: llmtf.base.glue/cola: {'acc': 0.765, 'mcc': 0.5142362160741519}
INFO:llmtf.base.glue/cola:{'acc': 0.765, 'mcc': 0.51423621607415

In [None]:
# Show the aggregated glue/cola metrics stored in ../cola-qwen-clear.
# NOTE(review): no visible cell writes to this directory — presumably produced by an earlier run; confirm.
!cat ../cola-qwen-clear/glue_cola_total.jsonl

{
    "task_name": "glue/cola",
    "results": {
        "acc": 0.745,
        "mcc": 0.4985588860372235
    },
    "leaderboard_result": 0.6217794430186118
}


In [None]:
# Show the aggregated glue/cola metrics stored in ../cola-qwen-lora-16.
# NOTE(review): no visible cell writes to this directory — presumably produced by an earlier run; confirm.
!cat ../cola-qwen-lora-16/glue_cola_total.jsonl

{
    "task_name": "glue/cola",
    "results": {
        "acc": 0.76,
        "mcc": 0.5110628565334214
    },
    "leaderboard_result": 0.6355314282667107
}


In [None]:
# Show the aggregated glue/cola metrics stored in ../cola-qwen-lora-32.
# NOTE(review): no visible cell writes to this directory — presumably produced by an earlier run; confirm.
!cat ../cola-qwen-lora-32/glue_cola_total.jsonl

{
    "task_name": "glue/cola",
    "results": {
        "acc": 0.765,
        "mcc": 0.5142362160741519
    },
    "leaderboard_result": 0.639618108037076
}


TODO: проблему с AssertionError (см. вывод ячейки ниже) нужно решить — пока я не понимаю, почему она возникает.
В качестве временного решения я закомментировал assert. После этого метрики на исходной модели совпали с действительными.

In [None]:
# Same evaluation call as the earlier evaluation cell, writing to a different
# output directory. The captured output of this run ends with an AssertionError.
from llmtf.evaluator import Evaluator
evaluator = Evaluator()

evaluator.evaluate_dataset(
    task=task,
    model=model,
    output_dir='../cola-qwen-no_l',
    max_len=4000,
    few_shot_count=0,
    generation_config=None, # will use model.generation_config by default
    batch_size=4,
    max_sample_per_dataset=200
)

INFO: 2025-04-18 00:24:48,840: llmtf.base.hfmodel: Updated generation_config.eos_token_id: [151645]
INFO:llmtf.base.hfmodel:Updated generation_config.eos_token_id: [151645]
INFO: 2025-04-18 00:24:48,842: llmtf.base.hfmodel: Updated generation_config.stop_strings: ['<|im_end|>']
INFO:llmtf.base.hfmodel:Updated generation_config.stop_strings: ['<|im_end|>']
100%|██████████| 200/200 [00:00<00:00, 1269.37it/s]
INFO: 2025-04-18 00:24:49,059: llmtf.base.glue/cola: Loading Dataset: 0.21s
INFO:llmtf.base.glue/cola:Loading Dataset: 0.21s
 14%|█▍        | 7/50 [00:04<00:26,  1.63it/s]
INFO: 2025-04-18 00:24:53,350: llmtf.base.glue/cola: Processing Dataset: 4.29s
INFO:llmtf.base.glue/cola:Processing Dataset: 4.29s


AssertionError: 