In [1]:
!pip install -r requirements.txt

Processing /wheels/flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl (from -r requirements.txt (line 36))
flash_attn is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0m

In [2]:
import os
if "TRANSFORMERS_CACHE" in os.environ and "HF_HOME" not in os.environ:
    os.environ["HF_HOME"] = os.environ["TRANSFORMERS_CACHE"]
    del os.environ["TRANSFORMERS_CACHE"]

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import peft
from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training
import torch
from datasets import load_dataset
import os
from transformers import TrainingArguments, BitsAndBytesConfig
import transformers
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig



In [4]:
def print_memory():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        print(f'VRAM allocated {allocated}gb, reserved {reserved}gb')
    else:
        print('No cuda')

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

In [6]:
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# гарантируем eos_token_id
if tokenizer.eos_token_id is None and tokenizer.eos_token is not None:
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

foundation_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                        quantization_config=bnb_config,
                                                        device_map="auto",
                                                        attn_implementation="flash_attention_2")
foundation_model = prepare_model_for_kbit_training(foundation_model)
foundation_model.gradient_checkpointing_enable()
foundation_model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
print("chat_template exists:", bool(getattr(tokenizer, "chat_template", None)))
print("chat_template preview:\n", tokenizer.chat_template)

chat_template exists: True
chat_template preview:
 {%- if messages[0]["role"] == "system" %}
    {%- set system_message = messages[0]["content"] %}
    {%- set loop_messages = messages[1:] %}
{%- else %}
    {%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}
{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}

{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}
{%- set ns = namespace() %}
{%- set ns.index = 0 %}
{%- for message in loop_messages %}
    {%- if not (message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}
        {%- if (message["role"] == "user") != (ns.index % 2 == 0) %}
            {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}
        {%- endif %}
        {%- set ns.

In [8]:
def get_outputs(model, inputs, tokenizer, max_new_tokens=1000):
    model.eval()
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            repetition_penalty=1.5,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
        return outputs

In [9]:
print("bf16 supported:", torch.cuda.is_available() and torch.cuda.is_bf16_supported())

bf16 supported: True


In [10]:
print_memory()

VRAM allocated 4.355072975158691gb, reserved 4.859375gb


In [11]:
input_simple_sentences_text = 'I have high temperature. Which pills should I take? Answer with pills, no additional info. Pills:'
input_simple_sentences = tokenizer(input_simple_sentences_text, return_tensors='pt').to(foundation_model.device)
foundational_outputs_sentence = get_outputs(foundation_model, input_simple_sentences, tokenizer)
output = tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True)

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


In [12]:
output[0]

'I have high temperature. Which pills should I take? Answer with pills, no additional info. Pills: Tylenol (Acetaminophen) or Advil/Motrin (Ibuprofen). Both can help reduce fever and relieve pain associated with it. Always consult a doctor for proper diagnosis before self-medicating.'

In [13]:
input_json_sentences_text = 'I have high temperature. Which pills should I take? Answer with json, contining pills. Example ["carbamazepine", "lamotrigine"]. Your answer in json: '
input_json_sentences = tokenizer(input_json_sentences_text, return_tensors='pt').to(foundation_model.device)
foundational_outputs_sentence = get_outputs(foundation_model, input_json_sentences, tokenizer)
output = tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True)

In [14]:
output[0]

'I have high temperature. Which pills should I take? Answer with json, contining pills. Example ["carbamazepine", "lamotrigine"]. Your answer in json:  {\n"pills": [   "ibuprofen",    "acetaminophen"] }'

# Начинаем готовить LORA.

In [15]:
dataset_name = 'mlabonne/FineTome-100k'

In [16]:
data = load_dataset(dataset_name)
data

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score'],
        num_rows: 100000
    })
})

In [17]:
def convert_to_openai_format(sample):
    messages=[]

    conversations = sample['conversations']

    for speech in conversations:
        if speech['from'] == 'human':
            messages.append({'role': 'user', 'content': speech['value']})
        elif speech['from'] == 'gpt':
            messages.append({'role': 'assistant', 'content': speech['value']})

    return {'openai_dialog': messages}
    

In [18]:
data = data.map(convert_to_openai_format)

In [19]:
data

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score', 'openai_dialog'],
        num_rows: 100000
    })
})

In [20]:
data['train']['openai_dialog'][4]

[{'content': 'Print the reverse of a string using a for loop.',
  'role': 'user'},
 {'content': 'Here is an example code using a for loop to print the reverse of a string along with a detailed docstring:\n\n```python\ndef print_reverse_string(string):\n    """\n    Prints the reverse of a given string using a for loop.\n\n    Parameters:\n    string (str): The string to be reversed and printed.\n\n    Returns:\n    None.\n    """\n\n    # Iterate through the characters of the string in reverse order\n    for i in range(len(string)-1, -1, -1):\n        print(string[i], end=\'\')\n\n    # Print a new line after printing the whole reversed string\n    print()\n\n# Example usage\ninput_string = input("Enter a string: ")\nprint_reverse_string(input_string)\n```\n\nIn this code, the `print_reverse_string` function takes a string as input. It uses a for loop to iterate through the characters of the string in reverse order. The for loop starts from the index of the last character (`len(string)

In [21]:
def render_chat(sample):
    tokenized = tokenizer.apply_chat_template(sample['openai_dialog'], tokenize=False, add_generation_prompt=False, add_special_tokens=False)
    return {'text': tokenized}

In [22]:
data = data.map(render_chat)

In [23]:
data

DatasetDict({
    train: Dataset({
        features: ['conversations', 'source', 'score', 'openai_dialog', 'text'],
        num_rows: 100000
    })
})

In [24]:
data['train']['text'][5]

'<s>[INST] How do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?[/INST] Astronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.</s>'

In [25]:
lora_config = LoraConfig(
    r=8, # As bigger the R bigger the parameters to train.
    lora_alpha=16, # a scaling factor that adjusts the magnitude of the weight matrix. Usually set to 1
    target_modules=["q_proj","k_proj","v_proj","o_proj", 'gate_proj', 'up_proj', 'down_proj'],
    #target_modules=["q_proj","k_proj","v_proj","o_proj"],
    #modules_to_save=["embed_tokens", "lm_head"],
    lora_dropout=0.05, # Helps to avoid Overfitting.
    #bias="lora_only", # this specifies if the bias parameter should be trained.
    bias="none",
    task_type="CAUSAL_LM",
)

In [26]:
peft_model = get_peft_model(foundation_model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 7,268,995,072 || trainable%: 0.2885


In [27]:
working_dir = './'

output_directory = os.path.join(working_dir, 'peft_lab_outputs')

In [28]:
response_ids = tokenizer.encode("[/INST]", add_special_tokens=False)

collator = DataCollatorForCompletionOnlyLM(
    response_template=response_ids,
    tokenizer=tokenizer,
)

In [29]:
ds = data["train"]  # исходный Dataset

# 1) отделяем test (1%)
tmp = ds.train_test_split(test_size=0.01, seed=42, shuffle=True)
train_val = tmp["train"]
test_dataset = tmp["test"]

# 2) отделяем val (1% от исходного => 1% / 99% от оставшегося)
val_ratio_of_train_val = 0.01 / 0.99
tmp2 = train_val.train_test_split(test_size=val_ratio_of_train_val, seed=42, shuffle=True)
train_dataset = tmp2["train"]
eval_dataset = tmp2["test"]   # validation

print(len(train_dataset), len(eval_dataset), len(test_dataset))

97999 1001 1000


In [30]:
train_dataset = train_dataset.shuffle(seed=42).select(range(5000))

In [31]:
train_dataset

Dataset({
    features: ['conversations', 'source', 'score', 'openai_dialog', 'text'],
    num_rows: 5000
})

In [32]:
eval_dataset = eval_dataset.select(range(250))
eval_dataset

Dataset({
    features: ['conversations', 'source', 'score', 'openai_dialog', 'text'],
    num_rows: 250
})

In [33]:
test_dataset = test_dataset.select(range(250))
test_dataset

Dataset({
    features: ['conversations', 'source', 'score', 'openai_dialog', 'text'],
    num_rows: 250
})

In [43]:
sft_config = SFTConfig(
    output_dir=output_directory,
    dataset_text_field="text",
    max_seq_length=2048,

    auto_find_batch_size=True,
    learning_rate=5e-5,
    num_train_epochs=2,
    bf16=True,
    logging_steps=10,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
    save_total_limit=2,
    report_to="none",
)

In [44]:
trainer = SFTTrainer(
    model = peft_model,
    train_dataset = train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collator,
    tokenizer=tokenizer,
    args=sft_config,
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [36]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
200,0.6106,0.610538
400,0.66,0.599811
600,0.5977,0.592914
800,0.6549,0.589346
1000,0.5823,0.585565
1200,0.5584,0.583499
1400,0.5837,0.589304
1600,0.4866,0.587857
1800,0.5112,0.588534
2000,0.5097,0.588607


TrainOutput(global_step=2500, training_loss=0.546628126335144, metrics={'train_runtime': 20808.5952, 'train_samples_per_second': 0.481, 'train_steps_per_second': 0.12, 'total_flos': 4.3589237244542976e+17, 'train_loss': 0.546628126335144, 'epoch': 2.0})

In [45]:
test_trainer = SFTTrainer(
    model=trainer.model,
    train_dataset=None,
    eval_dataset=test_dataset,     # сырой с text
    data_collator=collator,
    tokenizer=tokenizer,
    args=sft_config,
)

test_metrics = test_trainer.evaluate()
print(test_metrics)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

{'eval_loss': 0.619405210018158, 'eval_model_preparation_time': 0.0031, 'eval_runtime': 170.0675, 'eval_samples_per_second': 1.47, 'eval_steps_per_second': 0.188}


In [47]:
adapter_dir = "./peft_lab_outputs/lora_adapter_1"

peft_model.save_pretrained(adapter_dir)   # сохраняет только веса адаптера + config
tokenizer.save_pretrained(adapter_dir)    # важно, если добавляли спец-токены

('./peft_lab_outputs/lora_adapter_1/tokenizer_config.json',
 './peft_lab_outputs/lora_adapter_1/special_tokens_map.json',
 './peft_lab_outputs/lora_adapter_1/tokenizer.model',
 './peft_lab_outputs/lora_adapter_1/added_tokens.json',
 './peft_lab_outputs/lora_adapter_1/tokenizer.json')

In [48]:
peft_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralFlashAttention2(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k

In [49]:
test_dataset.save_to_disk('test_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]