In [1]:
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from trl import setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
path_to_source_model = "C:\\Users\\USER_ELISEY\\gemma"
path_to_finetuned_model = "C:\\Users\\USER_ELISEY\\hakaton\\russia_chad\\checkpoint-500"

In [3]:
@dataclass
class Config:
#     model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#     model_name = "AnatoliiPotapov/T-lite-instruct-0.1"
    model_name = "google/gemma-2-9b-it"
    dataset_name = "ruslanmv/ai-medical-chatbot"
    new_model = "llama-3.1-8b-chat-doctor"
    torch_dtype = torch.float16
    attn_implementation = "eager"
cfg = Config()

# Compare models

In [4]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
casual_model = AutoModelForCausalLM.from_pretrained(
    path_to_source_model,
    quantization_config=bnb_config,
#     device_map="auto",
    attn_implementation="eager"
)

tokenizer = tokenizer = AutoTokenizer.from_pretrained(path_to_source_model)
tokenizer.padding_side = 'right'
tokenizer.padding_token = '<|pad_token|>'

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.06s/it]


In [5]:
casual_model, tokenizer = setup_chat_format(casual_model, tokenizer)

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    path_to_finetuned_model,
    quantization_config=bnb_config,
#     device_map="auto",
    attn_implementation="eager"
)

tokenizer = tokenizer = AutoTokenizer.from_pretrained(path_to_finetuned_model)
tokenizer.padding_side = 'right'
tokenizer.padding_token = '<|pad_token|>'

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.04s/it]


In [7]:
model, tokenizer = setup_chat_format(model, tokenizer)

## Get answers

In [8]:
def generate_answer(model, prompt):
    chat = [
        { "role": "user", "content": prompt },
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)

    return(tokenizer.decode(outputs[0]))

# Comprasion

In [9]:
q1 = "in what year did Alexander Nevsky go to the Horde?"
q2 = "Who is Stalin son?"
q3 = "Кто такой Брежнев?"

In [10]:
generate_answer(model, q1)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


'<|im_start|>user\nin what year did Alexander Nevsky go to the Horde?<|im_end|>\n<|im_start|>assistant\n<end_of_turn>\n<start_of_turn><start_of_turn><eos>\n<eos>\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n'

In [11]:
generate_answer(model, q2)

'<|im_start|>user\nWho is Stalin son?<|im_end|>\n<|im_start|>assistant\n<end_of_turn>\n<end_of_turn><eos>\n<eos>\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.'

In [12]:
generate_answer(model, q3)

'<|im_start|>user\nКто такой Брежнев?<|im_end|>\n<|im_start|>assistant\n<end_of_turn>\n<start_of_turn><start_of_turn><eos>\n<end_of_turn><eos>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<eos>\n<end_of_turn>\n<eos>\n<eos>\n<end_of_turn>\n<eos>\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•\n•'

In [13]:
# # Free gpu memory
# import numba
# numba.cuda.close()

In [14]:
print(generate_answer(casual_model, q1))

<|im_start|>user
in what year did Alexander Nevsky go to the Horde?<|im_end|>
<|im_start|>assistant
<end_of_turn>







<end_of_turn><eos><eos>



<end_of_turn><eos>



<end_of_turn><eos>



<end_of_turn><eos>



<end_of_turn><eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>



<end_of_turn>
<eos>


In [15]:
generate_answer(casual_model, q2)

'<|im_start|>user\nWho is Stalin son?<|im_end|>\n<|im_start|>assistant\n<end_of_turn><end_of_turn><eos><end_of_turn><eos><end_of_turn>.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n.\n\n\n\n'

In [16]:
generate_answer(casual_model, q3)

'<|im_start|>user\nКто такой Брежнев?<|im_end|>\n<|im_start|>assistant\n<end_of_turn>\n\n\n\n\n\n\n\n\n\n\n\n<end_of_turn><eos>\n\n\n\n<eos>\n<end_of_turn><eos>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n<end_of_turn>\n<eos>\n'