In [None]:
!pip install accelerate
!pip install transformers
!pip install bitsandbytes
!pip install datasets
!pip install rouge-score
!pip install pymorphy3
!pip install peft
!pip install unsloth
!pip install flash_attn

^C
^C
^C
^C


In [None]:
# Clone the GLUE dataset repository and the llmtf_open evaluation framework,
# then make llmtf_open the working directory. Relative paths used later in the
# notebook (for example '../glue/cola') are resolved from inside llmtf_open.
!git clone https://huggingface.co/datasets/nyu-mll/glue
!git clone https://github.com/RefalMachine/llmtf_open
%cd llmtf_open

In [None]:
import random
import codecs
import torch
import json
import re
import copy
import numpy as np
import os
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForTokenClassification,
    AutoConfig,
)
from transformers import (
    Trainer,
    TrainingArguments,
    logging,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    BitsAndBytesConfig,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from unsloth import FastLanguageModel, UnslothTrainingArguments, UnslothTrainer

from peft import get_peft_model, LoraConfig
from peft import prepare_model_for_kbit_training

In [None]:
from typing import List, Dict

from torch.utils.data import Dataset


class ChatDataset(Dataset):
    """Pre-tokenised chat dataset for the CoLA acceptability task.

    Each record is rendered as a two-turn chat (a ``user`` instruction holding
    the sentence and a ``bot`` turn holding the label), run through the
    tokenizer's chat template and stored as a dict of tensors with the keys
    ``input_ids``, ``labels`` and ``attention_mask``.

    Args:
        original_records: Iterable of records exposing the ``'sentence'`` and
            ``'label'`` keys.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``decode``,
            ``bos_token_id`` and ``eos_token_id``.
        max_tokens_count: Maximum length of one example. Records whose
            rendered chat is longer than ``max_tokens_count - 2`` are dropped
            (two positions are reserved for the optional global BOS/EOS).
        sample_rate: Stored on the instance only; no sub-sampling is applied.
        only_target_loss: When true, the labels of the prompt tokens (every
            message except the last one) are replaced by
            ``labels_pad_token_id`` so that only the answer is learned.
        add_global_bos: Prepend the tokenizer's BOS token if it is missing.
        add_global_eos: Append the tokenizer's EOS token if it is missing.
        labels_pad_token_id: Value written at label positions that must not
            contribute to the loss (``-100`` is the default ignore index of
            PyTorch's cross-entropy loss).
    """

    def __init__(
        self,
        original_records: List[Dict],
        tokenizer: "AutoTokenizer",
        max_tokens_count: int,
        sample_rate: float = 1.0,
        only_target_loss: bool = True,
        add_global_bos: bool = True,
        add_global_eos: bool = True,
        labels_pad_token_id: int = -100
    ):
        self.original_records = original_records
        self.sample_rate = sample_rate
        self.tokenizer = tokenizer
        self.max_tokens_count = max_tokens_count
        self.only_target_loss = only_target_loss
        self.labels_pad_token_id = labels_pad_token_id
        self.add_global_bos = add_global_bos
        self.add_global_eos = add_global_eos
        # Becomes True once the first converted example has been printed.
        self.is_printed = False

        # Every record is converted eagerly; records rejected by
        # convert_record (empty or too long) are skipped.
        self.records = []
        for record in tqdm(original_records):
            tensors = self.convert_record(record)
            if tensors is None:
                continue
            self.records.append(tensors)

    def __len__(self):
        """Return the number of successfully converted records."""
        return len(self.records)

    def __getitem__(self, index):
        """Return the tensor dict stored at position ``index``."""
        return self.records[index]

    def get_tokens(self, messages):
        """Render ``messages`` with the chat template and return token ids.

        A leading BOS token emitted by the template is stripped;
        ``convert_record`` adds a single BOS back when ``add_global_bos`` is
        enabled.

        Returns:
            A new list of token ids (possibly empty).
        """
        tokens = list(
            self.tokenizer.apply_chat_template(
                messages,
                add_special_tokens=False,
                tokenize=True,
                add_generation_prompt=False,
            )
        )
        # Guard against an empty rendering before looking at the first token.
        if tokens and tokens[0] == self.tokenizer.bos_token_id:
            tokens = tokens[1:]
        return tokens

    def _prompt_token_count(self, messages, input_ids):
        """Return how many leading tokens of ``input_ids`` form the prompt.

        The prompt is every message except the last (target) one. It is
        rendered on its own and must be an exact prefix of the full chat;
        otherwise ``0`` is returned (nothing is masked) and a warning is
        issued.
        """
        import warnings

        prompt_ids = self.get_tokens(messages[:-1])
        if prompt_ids and input_ids[:len(prompt_ids)] == prompt_ids:
            return len(prompt_ids)
        warnings.warn(
            "ChatDataset: the rendered prompt is not a prefix of the rendered "
            "chat; the loss will be computed over the whole sequence."
        )
        return 0

    def convert_record(self, record):
        """Convert one record into model-ready tensors.

        Args:
            record: Mapping with the ``'sentence'`` and ``'label'`` keys.

        Returns:
            A dict with ``input_ids``, ``labels`` and ``attention_mask``
            (1-D ``torch.LongTensor`` objects of equal length), or ``None``
            when the rendered chat is empty or too long.
        """
        message_user = f"Your task is to determine the acceptability of the text for the English language in terms of syntax, morphology and semantics. The answer should be one number: 0 or 1, where 0 means the sentence is not acceptable from the point of view of the English language, 1 means it is acceptable.\nText:{record['sentence']}"
        message_bot_train = f"Answer: {record['label']}"

        messages = [
            {'role': 'user', 'content': message_user},
            {'role': 'bot', 'content': message_bot_train},
        ]

        # Tokenise the whole chat through the chat template.
        input_ids = self.get_tokens(messages)
        if not input_ids:
            return None

        # Drop chats that would not fit once the global BOS/EOS are added.
        if len(input_ids) > self.max_tokens_count - 2:
            return None

        # Labels must be an independent copy: the BOS/EOS handling below
        # modifies input_ids and labels differently.
        labels = list(input_ids)

        # Restrict the loss to the answer by masking the prompt tokens.
        if self.only_target_loss:
            prompt_len = self._prompt_token_count(messages, input_ids)
            labels[:prompt_len] = [self.labels_pad_token_id] * prompt_len

        # Add the global BOS (skipped for tokenizers that define no BOS).
        bos_token_id = self.tokenizer.bos_token_id
        if (
            self.add_global_bos
            and bos_token_id is not None
            and input_ids[0] != bos_token_id
        ):
            input_ids.insert(0, bos_token_id)
            labels.insert(0, self.labels_pad_token_id)

        # Cut the single token that the template emits after the final EOS.
        eos_token_id = self.tokenizer.eos_token_id
        if len(input_ids) >= 2 and input_ids[-2] == eos_token_id:
            input_ids = input_ids[:-1]
            labels = labels[:-1]

        # Add the global EOS (skipped for tokenizers that define no EOS).
        if (
            self.add_global_eos
            and eos_token_id is not None
            and input_ids[-1] != eos_token_id
        ):
            input_ids.append(eos_token_id)
            labels.append(eos_token_id)

        # Print the first converted example so the rendered chat can be
        # checked by eye.
        if not self.is_printed:
            print(input_ids)
            print(labels)
            print(
                "Full prompt:" +
                self.tokenizer.decode(input_ids, skip_special_tokens=False)
            )
            assert '\n' in self.tokenizer.decode(input_ids, skip_special_tokens=False)
            self.is_printed = True

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.new_ones(input_ids.size())
        assert (
            input_ids.size(0)
            == labels.size(0)
            == attention_mask.size(0)
            <= self.max_tokens_count
        )
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

In [None]:
from datasets import load_dataset
# Load the 'cola' configuration of the 'nyu-mll/glue' dataset and display it.
dataset = load_dataset('nyu-mll/glue','cola')
dataset


In [None]:
# Keep the first 1000 training sentences and hold out 10% of them.
# The split is seeded so that re-running the notebook yields the same
# train/test partition (and therefore comparable metrics).
# NOTE: this cell overwrites `dataset`; re-run the loading cell before
# executing it a second time.
dataset = dataset['train'].select(range(1000))
dataset = dataset.train_test_split(test_size=0.1, seed=1337)
dataset

In [None]:
os.environ["WANDB_DISABLED"] = "true"
# Disable the WandB service so that it does not collect run statistics.

In [None]:
# Use the chat template of the Ruadapt model because it does not inject a
# system part into the prompt. Only the template string is kept; the
# tokenizer itself is replaced in later cells.
tokenizer = AutoTokenizer.from_pretrained('RefalMachine/RuadaptQwen2.5-7B-Lite-Beta')
chat_template = tokenizer.chat_template
chat_template

In [None]:
# Checkpoint that is fine-tuned below.
model_name = 'Qwen/Qwen2.5-1.5B-Instruct'

# NOTE(review): this tokenizer is overwritten by the next two cells before it
# is used anywhere in the visible notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the instruct model through unsloth: 4-bit weights, float16 dtype,
# SDPA attention, sequences of at most `max_tokens_count` tokens.
max_tokens_count = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_tokens_count,
    dtype=torch.float16,
    load_in_4bit=True,
    attn_implementation="sdpa",
)

In [None]:
# Replace the tokenizer returned by FastLanguageModel.from_pretrained with a
# freshly loaded one, then set the chat template and the padding side.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.chat_template = chat_template
tokenizer.padding_side = 'left'
# Set the tokens and the chat template.

In [None]:
only_target_loss = True

# One ChatDataset per split; the order ('train', 'test') fixes which element
# becomes the training set and which the validation set.
datasets = [
    ChatDataset(
        dataset[split],
        tokenizer,
        max_tokens_count=max_tokens_count,
        sample_rate=1.0,
        only_target_loss=only_target_loss,
        add_global_eos=False,
        add_global_bos=False,
    )
    for split in ('train', 'test')
]
train_dataset, val_dataset = datasets

In [None]:
from transformers import GenerationConfig

# def generate(messages, model, tokenizer, generation_config):
#     print(tokenizer.apply_chat_template(messages, add_special_tokens=False, add_generation_prompt=True, tokenize=False))
#     input_ids = tokenizer.apply_chat_template(messages, return_tensors='pt', add_special_tokens=False, add_generation_prompt=True)
#     input_ids = input_ids.to(model.device)
#     with torch.no_grad():
#         output_ids = model.generate(
#             input_ids,
#             generation_config=generation_config
#         )
#     outputs = []
#     for sample_output_ids, sample_input_ids in zip(output_ids, input_ids):
#         sample_output_ids = sample_output_ids[len(sample_input_ids):]
#         sample_output = tokenizer.decode(sample_output_ids, skip_special_tokens=True)
#         outputs.append(sample_output)

#     if len(outputs) == 1:
#         outputs = outputs[0]
#     return outputs



# Sampling settings for text generation. The pad/bos/eos ids are taken from
# the tokenizer configured above.
# NOTE(review): no later cell visible in this notebook references
# `generation_config` (the only consumer, `generate`, is commented out above).
generation_config = GenerationConfig.from_dict(
    {
        'top_k': 20,
        'top_p': 0.8,
        'temperature': 0.1,
        'repetition_penalty': 1.0,
        'max_new_tokens': 64,
        'do_sample': True,
        'pad_token_id': tokenizer.pad_token_id,
        'bos_token_id': tokenizer.bos_token_id,
        'eos_token_id': tokenizer.eos_token_id
    }
)
generation_config

In [None]:
# Keyword arguments forwarded to FastLanguageModel.get_peft_model in the next
# cell. Adapters are attached to the four attention projection modules listed
# in "target_modules".
lora_config = {
    "r": 32,
    "lora_alpha": 16,
    "lora_dropout": 0.0,
    "bias": "none",
    "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
    "use_gradient_checkpointing": "unsloth"
}

In [None]:
# Wrap the loaded model with adapters configured by `lora_config`.
# NOTE: `model` is rebound here, so this cell should be run once per freshly
# loaded model.
model = FastLanguageModel.get_peft_model(
    model, **lora_config, max_seq_length=max_tokens_count
)

In [None]:
# Hyper-parameters of the fine-tuning run. The plain dict gets its own name so
# that `training_args` always refers to the arguments object.
training_kwargs = {
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    # The evaluation strategy defaults to "no", in which case `eval_steps` is
    # ignored and the eval dataset is never used. Enable step-based
    # evaluation explicitly. (Transformers releases older than 4.41 spell
    # this argument `evaluation_strategy`.)
    "eval_strategy": "steps",
    "eval_steps": 16,
    "save_steps": 128,
    "logging_steps": 16,
    "learning_rate": 0.00005,
    "num_train_epochs": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 16,
    # fp16 matches the float16 dtype the model was loaded with.
    "bf16": False,
    "fp16": True,
    "optim": "paged_adamw_8bit",
    "save_total_limit": 1,
    "seed": 1337,
    "max_grad_norm": 1.0,
    "weight_decay": 0.05
}
training_args = UnslothTrainingArguments(output_dir='./instruct_unsloth', **training_kwargs)

In [None]:
from unsloth.trainer import _create_unsloth_optimizer
class CustomTrainer(Trainer):
    """Trainer that can build the optimizer with `_create_unsloth_optimizer`.

    The unsloth optimizer is used only when the training arguments carry a
    non-None ``embedding_learning_rate``; otherwise the stock ``Trainer``
    optimizer is created.
    """

    def create_optimizer(self):
        """Create the optimizer (once) and return it."""
        emb_lr = getattr(self.args, "embedding_learning_rate", None)

        if emb_lr is not None:
            # Build the optimizer lazily; an already existing one is reused.
            if self.optimizer is None:
                opt_cls, opt_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
                self.optimizer = _create_unsloth_optimizer(
                    self.model,
                    opt_cls,
                    opt_kwargs,
                    emb_lr,
                )
            return self.optimizer

        # No embedding learning rate configured: defer to the base class.
        return super().create_optimizer()

In [None]:
# Training takes only about 3.5GB of memory for the 1.5B model!
# The collator pads input_ids, attention_mask and labels of a batch to a
# common length that is a multiple of 8.
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
trainer.train()

In [None]:
# Persist the trained model; the evaluation section loads it from this path.
model.save_pretrained('./trained_qwen_model_lora')

In [None]:
# Store the tokenizer (with the overridden chat template) next to the model.
tokenizer.save_pretrained('./trained_qwen_model_lora')

In [None]:
!ls trained_qwen_model_lora

Перед выполнением следующих ячеек обязательно перезапустите сеанс.


In [None]:
%cd llmtf_open

In [None]:
from llmtf.model import HFModel
from typing import Dict, List, Tuple
from llmtf.metrics import mean
from llmtf.base import SimpleFewShotHFTask
from sklearn.metrics import matthews_corrcoef

In [None]:
class GlueColaTask(SimpleFewShotHFTask):
    """llmtf task for GLUE/CoLA: is an English sentence linguistically acceptable?

    The task is scored with the 'calculate_tokens_proba' method over the
    choices "0" and "1"; accuracy and the Matthews correlation coefficient
    are reported.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.method = 'calculate_tokens_proba'
        self._max_new_tokens = 1

    @classmethod
    def name(cls):
        """Return the task identifier."""
        return 'glue/cola'

    @property
    def choices(self):
        """Return the answer options whose probabilities are compared."""
        return ["0", "1"]

    def aggregation(self) -> Dict:
        """Return the aggregation function for every metric key of ``evaluate``."""
        return {"acc": mean, "mcc": lambda data: matthews_corrcoef([d[0] for d in data],[d[1] for d in data])}

    def dataset_args(self) -> Dict:
        """Return the dataset location (the locally cloned GLUE repository)."""
        return {'path': '../glue/cola'}

    def evaluate(self, sample, y_pred) -> Dict:
        """Score one sample.

        Args:
            sample: Record with a ``'label'`` field.
            y_pred: Mapping from choice to its score; the choice with the
                highest score is taken as the prediction.

        Returns:
            ``{"acc": bool, "mcc": [y_true, y_pred]}``.
        """
        y_true = str(sample['label'])
        y_pred = sorted([pair for pair in y_pred.items()], key=lambda x: -x[1])[0][0]
        return {"acc": y_true == y_pred, "mcc": [y_true, y_pred]}

    def test_split_name(self) -> str:
        """Return the split that is evaluated."""
        return 'validation'

    def prompt_split_name(self) -> str:
        """Return the split that prompt examples are taken from."""
        return 'train'

    def create_messages(self, sample, with_answer=False) -> List[Dict]:
        """Build the chat messages for one sample.

        Args:
            sample: Record with a ``'sentence'`` field (and ``'label'`` when
                ``with_answer`` is true).
            with_answer: When true, the bot turn contains the gold label in
                the same ``"Answer: <label>"`` form used for fine-tuning;
                otherwise the bot turn is the open prefix ``"Answer:"``.
                Presumably the framework passes ``True`` for few-shot
                demonstrations; confirm against llmtf.

        Returns:
            A list with one ``user`` and one ``bot`` message.
        """
        instruction_user = "Your task is to determine the acceptability of the text for the English language in terms of syntax, morphology and semantics. The answer should be one number: 0 or 1, where 0 means the sentence is not acceptable from the point of view of the English language, 1 means it is acceptable.\nText:{sentence}"
        instruction_bot = "Answer:"

        bot_content = instruction_bot
        if with_answer:
            # Previously the flag was ignored, so demonstrations never showed
            # the expected answer.
            bot_content = f"{instruction_bot} {sample['label']}"

        return [
            {'role': 'user', 'content': instruction_user.format(**sample)},
            {'role': 'bot', 'content': bot_content},
        ]

In [None]:
task = GlueColaTask()

In [None]:
# Load the fine-tuned model saved by the training section.
# NOTE(review): the next cell rebinds `model` to the base checkpoint; run only
# the cell for the model that should be evaluated.
model_name = "./trained_qwen_model_lora"
model = HFModel(device_map="cuda",attn_implementation="sdpa")
model.from_pretrained(model_name)

In [None]:
# Load the base checkpoint (no fine-tuning) for comparison.
# NOTE(review): this rebinds `model`; if the previous cell was run, the
# fine-tuned model is replaced by the base one.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model = HFModel(device_map="cuda",attn_implementation="sdpa")
model.from_pretrained(model_name)

In [None]:
from llmtf.evaluator import Evaluator
# Evaluate whichever model is currently bound to `model` on the CoLA task
# with no few-shot examples.
# NOTE(review): results are written to '../cola-qwen-no_finetune', while the
# `!cat` cells below read '../cola-qwen-clear', '../cola-qwen-lora-16' and
# '../cola-qwen-lora-32'; set `output_dir` to match the model being evaluated.
evaluator = Evaluator()

evaluator.evaluate_dataset(
    task=task,
    model=model,
    output_dir='../cola-qwen-no_finetune',
    max_len=4000,
    few_shot_count=0,
    generation_config=None, # will use model.generation_config by default
    batch_size=4,
    max_sample_per_dataset=200
)

In [None]:
!cat ../cola-qwen-clear/glue_cola_total.jsonl

In [None]:
!cat ../cola-qwen-lora-16/glue_cola_total.jsonl

In [None]:
!cat ../cola-qwen-lora-32/glue_cola_total.jsonl

Эту проблему нужно решить: я пока не понимаю, почему она возникает.
В качестве обходного решения я закомментировал assert. Метрики на обычной модели при этом совпали с действительными.

In [None]:
from llmtf.evaluator import Evaluator
# Same evaluation call as the earlier evaluation cell, except that results go
# to '../cola-qwen-no_l'. It evaluates whichever model is currently bound to
# `model`.
evaluator = Evaluator()

evaluator.evaluate_dataset(
    task=task,
    model=model,
    output_dir='../cola-qwen-no_l',
    max_len=4000,
    few_shot_count=0,
    generation_config=None, # will use model.generation_config by default
    batch_size=4,
    max_sample_per_dataset=200
)