# Load dependencies

In [1]:
import json
import os
import torch

from datasets import Dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging,
    MistralConfig,
    pipeline,
    TrainingArguments
)
from trl import SFTTrainer

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
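# NOTE: unexecuted scratch cell. The same token-length check runs in the configuration cell below, where its result becomes max_seq_length.
# with open(train_data_path, 'r') as file: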
#     training_data = json.load(file)
# dataset = Dataset.from_dict(training_data)
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# sample_lengths = []
# for sample in dataset:
#     sample_lengths.append(len(tokenizer(sample['text'])['input_ids']))
# print(max(sample_lengths))

# Configure fine-tuning parameters

In [2]:
# Quantization settings handed to the model loader via `quantization_config`:
# 4-bit weights in the "nf4" format, no double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    # Same dtype the model is loaded with (torch_dtype=torch.bfloat16).
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter settings; passed to SFTTrainer as `peft_config`.
# NOTE(review): lora_alpha (2) is far smaller than r (16) -- confirm this
# ratio is intentional.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=2,
    lora_dropout=0.001,
    bias="none",
)

# Hyper-parameters for the fine-tuning run; passed to SFTTrainer as `args`.
training_arguments = TrainingArguments(
    # --- Output, checkpointing and logging ---
    # NOTE(review): this directory is reused as-is between runs; confirm that
    # writing over checkpoints from an earlier run is acceptable.
    output_dir=os.path.join("fine_tuning", "output"),
    report_to="tensorboard",
    save_steps=1000,
    logging_steps=100,
    # --- Run length and batch shape ---
    num_train_epochs=100,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    # --- Optimizer and learning-rate schedule ---
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    warmup_ratio=0.02,
    # --- Precision: bf16 on, fp16 off ---
    bf16=True,
    fp16=False,
)

# Read the fine-tuning samples from disk and wrap them in a `datasets.Dataset`.
# The length scan further down reads a 'text' field from every sample; the rest
# of the JSON layout is not visible from this cell.
train_data_path = os.path.join('data', 'fine_tuning', 'data.json')

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so the file is read the same way on every machine.
# NOTE(review): assumes data.json is UTF-8 encoded -- confirm.
with open(train_data_path, 'r', encoding='utf-8') as data_file:
    training_data = json.load(data_file)

dataset = Dataset.from_dict(training_data)

# One identifier for both the model and the tokenizer so the two can never
# point at different checkpoints.
BASE_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the base model with the quantization settings from `bnb_config`.
# `config` is intentionally not passed: the checkpoint's own configuration is
# used. (Handing over the MistralConfig *class* rather than an instance is not
# a valid config object.)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    # NOTE(review): presumably maps the whole model onto device 0 -- confirm
    # on multi-GPU hosts.
    device_map={"": 0},
)
# Turn off the config's `use_cache` flag before training.
model.config.use_cache = False
# Wrap the quantized model for k-bit training.
model = prepare_model_for_kbit_training(model)

# Tokenizer for the same checkpoint; the EOS token doubles as the padding
# token and padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def _token_count(text):
    """Return how many input ids the tokenizer produces for `text`."""
    return len(tokenizer(text)['input_ids'])


# Tokenize every sample once; the longest one sets the trainer's max_seq_length.
sample_lengths = [_token_count(sample['text']) for sample in dataset]
max_seq_length = max(sample_lengths)
print("max_seq_length: {}".format(max_seq_length))

# Assemble the supervised fine-tuning trainer from the pieces built above.
trainer = SFTTrainer(
    # What to train and how.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    # What to train on: the 'text' field of each sample, with packing enabled
    # and the sequence length set to the longest tokenized sample.
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=True,
)
    

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

max_seq_length: 5048


Generating train split: 0 examples [00:00, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


# Fine-tune the model

In [3]:
# Destination for the model saved after training (separate from the
# trainer's checkpoint directory).
fine_tuned_dir = os.path.join('fine_tuning', 'fine_tuned_models')

# Run the fine-tuning loop, then save the trainer's model to disk.
trainer.train()
trainer.model.save_pretrained(fine_tuned_dir)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,2.2248
200,2.132
300,2.0095
400,1.9773
500,1.9581
600,1.9164
700,1.9351
800,1.905
900,1.8953
1000,1.9006


Checkpoint destination directory fine_tuning/output/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuning/output/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuning/output/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuning/output/checkpoint-4000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory fine_tuning/output/checkpoint-5000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


# Display fine-tuning results in TensorBoard

In [1]:
# Load the TensorBoard notebook extension and start it on the directory the
# trainer uses as its output_dir ('fine_tuning/output').
# Comments are kept on their own lines: a line magic receives the whole rest
# of its line as arguments.
import tensorboard
%load_ext tensorboard
%tensorboard --logdir 'fine_tuning/output'