### Imports & Definitions

In [8]:
import sys
sys.path.append("../../utils")
from definitions import *
from path_helpers import get_dataset_path, get_metric_dir_path
from mera_helpers import construct_prompt
from llm_helpers import get_answer

### Try to run Saiga 

In [3]:
%pip install --upgrade accelerate==0.21.0 \
  bitsandbytes==0.40.2 \
  peft==0.5.0 \
  transformers==4.34.0 \
  sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.34.0)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.22.2
    Uninstalling huggingface-hub-0.22.2:
      Successfully uninstalled huggingface-hub-0.22.2
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.19.0 requires huggingface-hub>=0.21.2, but you have huggingface-hub 0.17.3 which is incompatible.
diffusers 0.27.2 requires huggingface-hub>=0.20

In [3]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

MODEL_NAME = "IlyaGusev/saiga_mistral_7b"
DEFAULT_MESSAGE_TEMPLATE = "<s>{role}\n{content}</s>"
DEFAULT_RESPONSE_TEMPLATE = "<s>bot\n"
DEFAULT_SYSTEM_PROMPT = "Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им."

class Conversation:
    def __init__(
        self,
        message_template=DEFAULT_MESSAGE_TEMPLATE,
        system_prompt=DEFAULT_SYSTEM_PROMPT,
        response_template=DEFAULT_RESPONSE_TEMPLATE
    ):
        self.message_template = message_template
        self.response_template = response_template
        self.messages = [{
            "role": "system",
            "content": system_prompt
        }]

    def add_user_message(self, message):
        self.messages.append({
            "role": "user",
            "content": message
        })

    def add_bot_message(self, message):
        self.messages.append({
            "role": "bot",
            "content": message
        })

    def get_prompt(self, tokenizer):
        final_text = ""
        for message in self.messages:
            message_text = self.message_template.format(**message)
            final_text += message_text
        final_text += DEFAULT_RESPONSE_TEMPLATE
        return final_text.strip()


def generate(model, tokenizer, prompt, generation_config):
    data = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    data = {k: v.to(model.device) for k, v in data.items()}
    output_ids = model.generate(
        **data,
        generation_config=generation_config
    )[0]
    output_ids = output_ids[len(data["input_ids"][0]):]
    output = tokenizer.decode(output_ids, skip_special_tokens=True)
    return output.strip()

config = PeftConfig.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
model = PeftModel.from_pretrained(
    model,
    MODEL_NAME,
    torch_dtype=torch.float16
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
print(generation_config)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.ut

GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_new_tokens": 1536,
  "no_repeat_ngram_size": 15,
  "pad_token_id": 0,
  "repetition_penalty": 1.1,
  "temperature": 0.2,
  "top_k": 40,
  "top_p": 0.9
}



In [11]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization",
)

inputs = ["Почему трава зеленая?", "Сочини длинный рассказ, обязательно упоминая следующие объекты. Дано: Таня, мяч"]
for inp in inputs:
    conversation = Conversation()
    conversation.add_user_message(inp)
    prompt = conversation.get_prompt(tokenizer)
    
    # print(model(prompt))

    output = generate(model, tokenizer, prompt, generation_config)
    print(inp)
    print(output)
    print()
    print("==============================")
    print()


Почему трава зеленая?
Вопрос о цвете травы зависит от многих факторов, включая вид растения, условия окружающей среды, время года и т. д. Однако основной принцип заключается в том, что трава обычно зелёная из-за наличия хлорофилла.

Хлорофилл - это пигмент, который находится в клетках растений и используется для фотосинтеза. Фотосинтез - это процесс, благодаря которому растения превращают солнечный свет в энергию, используя углекислый газ из воздуха и воду из почвы. Хлорофилл поглощает световые волны, которые имеют длину 430-450 нм (синий цвет) и 680-720 нм (красный цвет). Эти волны поглощаются и перерабатываются в энергию, которая используется для синтеза глюкозы - основного источника энергии для растений.

Зеленый цвет травы обусловлен тем, что хлорофилл имеет максимальную поглощенную длину волны около 510-560 нм, что соответствует зеленому цвету. В то же время, другие пигменты, такие как каротин и антоцианы, также могут присутствовать в растениях и влиять на их цвет.

Таким образом,

In [6]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization",
)

In [4]:
%%time
prompt = """Задание содержит вопрос по теме Математика и 4 варианта ответа A, B, C, D, из которых только один правильный. Выберите букву правильного ответа:
Чему равен корень из 144?
A 14
B 12
C 4
D 44
Ответ:"""
generate(model, tokenizer, prompt, generation_config)

CPU times: user 3.82 s, sys: 51 ms, total: 3.87 s
Wall time: 3.8 s


'C 4'

In [9]:
generation_config

GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_new_tokens": 1536,
  "no_repeat_ngram_size": 15,
  "pad_token_id": 0,
  "repetition_penalty": 1.1,
  "temperature": 0.2,
  "top_k": 40,
  "top_p": 0.9
}

In [6]:
%%time
prompt = """Задание содержит вопрос по теме Математика и 4 варианта ответа A, B, C, D, из которых только один правильный. Выберите букву правильного ответа:
Чему равен корень из 144?
A 14
B 12
C 45
D 44
Ответ:"""
data = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
data = {k: v.to(model.device) for k, v in data.items()}
with torch.no_grad():
    output = model.generate(
        **data,
        # generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True
    )

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


CPU times: user 774 ms, sys: 5.22 ms, total: 779 ms
Wall time: 777 ms


In [7]:
output.keys()

odict_keys(['sequences', 'scores'])

In [12]:
output.sequences[0].shape

torch.Size([94])

In [8]:
output.scores[0][0][330]

tensor(12.8750, device='cuda:0')

In [27]:
 tokens_of_interest = [
    tokenizer("A", add_special_tokens=False).input_ids[-1],
    tokenizer("B", add_special_tokens=False).input_ids[-1],
    tokenizer("C", add_special_tokens=False).input_ids[-1],
    tokenizer("D", add_special_tokens=False).input_ids[-1],
]
print(tokens_of_interest)

probs = [output.scores[0][0][token_id].item() for token_id in tokens_of_interest]
probs
res = dict(zip(["A", "B", "C", "D"], probs))
res

[330, 365, 334, 384]


{'A': 12.875, 'B': 12.875, 'C': 12.5, 'D': 14.125}

In [99]:
output.scores.shape

AttributeError: 'tuple' object has no attribute 'shape'

In [70]:
len(data["input_ids"][0])

93

In [26]:
print(len(output), output)
output_ids = output.sequences[0][len(data["input_ids"][0]):]
output_str = tokenizer.decode(output_ids, skip_special_tokens=True)
output_str

2 GreedySearchDecoderOnlyOutput(sequences=tensor([[ 6369,  1225,  2676,  2573, 11078,  3139, 28786,  2890,  8378, 28788,
          1051,  3882,  2084,  5564,  1078, 15481,   917,   839, 28705, 28781,
         13524,   892,  2239,   946,  2433, 17835,   330, 28725,   365, 28725,
           334, 28725,   384, 28725,  2879, 28202, 24125, 24193, 18534,  1049,
          4086, 28723, 24311,  5744,   892,  1078,  5213, 28795,  6725, 18534,
         22821,  2433, 17835, 28747,    13, 28909, 28773,  2953,  2101, 16227,
          1619,   800,  7934,  2879, 28705, 28740, 28781, 28781, 28804,    13,
         28741, 28705, 28740, 28781,    13, 28760, 28705, 28740, 28750,    13,
         28743, 28705, 28781, 28782,    13, 28757, 28705, 28781, 28781,    13,
         28874, 28786,  8496, 28747,   384]], device='cuda:0'), scores=(tensor([[ -7.8125,  -7.2188,   8.4375,  ...,  -2.1250, -10.0000, -11.8750]],
       device='cuda:0'),), attentions=None, hidden_states=None)


'D'

In [34]:
predictions = torch.nn.functional.softmax(transition_scores[0])
predictions

  predictions = torch.nn.functional.softmax(transition_scores[0])


tensor([1.], device='cuda:0')

### Helper inference functions

In [4]:
def calculate_next_token_probs(q, tokenizer, model):
    data = tokenizer(q, return_tensors="pt", add_special_tokens=False)
    data = {k: v.to(model.device) for k, v in data.items()}
    with torch.no_grad():
        output = model.generate(
            **data,
            return_dict_in_generate=True,
            output_scores=True
        )
    
    tokens_of_interest = [
        tokenizer("A", add_special_tokens=False).input_ids[-1],
        tokenizer("B", add_special_tokens=False).input_ids[-1],
        tokenizer("C", add_special_tokens=False).input_ids[-1],
        tokenizer("D", add_special_tokens=False).input_ids[-1],
    ]

    probs = [output.scores[0][0][token_id].item() for token_id in tokens_of_interest]
    res = dict(zip(["A", "B", "C", "D"], probs))
    return res

In [9]:
probs = calculate_next_token_probs("A B ", tokenizer, model)
print(probs)
get_answer(probs)

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


{'A': 5.15625, 'B': 3.953125, 'C': 7.40625, 'D': 4.3125}


'C'

### Evalutate ru metrics on Saiga

In [11]:
from datasets import load_from_disk

In [None]:
for name, dataset_meta in tqdm(HUGGINGFACE_NAME_TO_DATASET.items(), desc="Datasets..."):
        for subset, split in tqdm(zip(dataset_meta["subsets"], dataset_meta["splits"]), desc="Splits..."):
            path = get_dataset_path(subset, name, split)
            dataset = load_from_disk(path)
            probs_list = []
            a_list = []
            for row in tqdm(dataset, desc="Rows..."):
                q = construct_prompt(row)
                probs = calculate_next_token_probs(q, tokenizer, model)
                probs_list.append({
                    "probs": probs,
                    "meta": row["meta"],
                })
                a = get_answer(probs)
                a_list.append({
                    "answer": a,
                    "meta": row["meta"],
                })
            metric_dir_path = get_metric_dir_path(SAIGA_MISTRAL_7B_LORA, subset, name, split)
            metric_dir_path.mkdir(exist_ok=True, parents=True)
            with open(metric_dir_path.joinpath("probs.jsonl"), "w") as f:
                json.dump(probs_list, f, ensure_ascii=False, indent=2)
            with open(metric_dir_path.joinpath("answers.jsonl"), "w") as f:
                json.dump(a_list, f, ensure_ascii=False, indent=2)

Datasets...:   0%|          | 0/1 [00:00<?, ?it/s]
Splits...: 0it [00:00, ?it/s][A

Rows...:   0%|          | 0/10033 [00:00<?, ?it/s][A[ASetting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


Rows...:   0%|          | 1/10033 [00:01<2:57:24,  1.06s/it][A[ASetting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


Rows...:   0%|          | 2/10033 [00:02<2:52:46,  1.03s/it][A[ASetting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


Rows...:   0%|          | 3/10033 [00:03<2:58:14,  1.07s/it][A[ASetting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


Rows...:   0%|          | 4/10033 [00:04<2:59:04,  1.07s/it][A[ASetting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


Rows...:   0%|          | 5/10033 [00:05<2:54:55,  1.05s/it][A[ASetting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


Rows...:   0%|          | 6/10033 [00:06<2:52:13,  1.03s/it][A[ASetting `pad_token_