### Libraries Installation

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Model Initialization

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options (the defaults suggested by Unsloth).
max_seq_length = 2048  # Maximum number of tokens (words/subwords) processed at once; any value works, RoPE scaling is handled internally
dtype = None           # None = auto-detect; Float16 for Tesla T4 / V100, Bfloat16 for Ampere+
load_in_4bit = False   # Set to True to use 4-bit quantization and reduce memory usage

# Everything passed to the loader, gathered in one place.
# "unsloth/Llama-3.2-1B-Instruct" is a smaller alternative checkpoint.
# For gated models such as meta-llama/Llama-2-7b-hf, add an entry: token = "hf_..."
load_options = dict(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Returns the language model together with its matching tokenizer.
llm, tokenizer = FastLanguageModel.from_pretrained(**load_options)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

"*LoRA (Low-Rank Adaptation of Large Language Models) is a popular and lightweight training technique that significantly reduces the number of trainable parameters. It works by inserting a smaller number of new weights into the model and only these are trained. This makes training with LoRA much faster, memory-efficient, and produces smaller model weights (a few hundred MBs), which are easier to store and share.*" - HuggingFace

We can modify the following numbers to increase accuracy, but also counteract over-fitting.

## Some parameters definition:
- **r**: The rank of the finetuning process. A larger number uses more memory and will be slower, but can increase accuracy on harder tasks. We normally suggest numbers like 8 (for fast finetunes), and up to 128. Numbers that are too large can cause over-fitting, damaging your model's quality.
- **target_modules**: Select which parts of the models should be modified by Lora. We select the most important and sensitive modules in transformer models because by updating only these, we can adapt the model to new tasks without changing everything (making it much lighter!).
- **lora_alpha**: The scaling factor for finetuning. A larger number will make the finetune learn more about your dataset, but can promote over-fitting. We suggest setting this equal to the rank `r`, or to double it.

In [None]:
# Attach LoRA adapters to the loaded model with the default settings
# (PEFT = Parameter-Efficient Fine-Tuning).

# Projection layers that receive adapters, grouped by where they sit in a block.
attention_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
feed_forward_modules = ["gate_proj", "up_proj", "down_proj"]

model = FastLanguageModel.get_peft_model(
    llm,
    r = 16,                                  # Rank; any number > 0, suggested 8, 16, 32, 64, 128
    lora_alpha = 16,                         # Scaling factor, here equal to the rank
    target_modules = attention_modules + feed_forward_modules,
    lora_dropout = 0,                        # Any value is supported, but 0 is the optimized path
    bias = "none",                           # Any value is supported, but "none" is the optimized path
    use_gradient_checkpointing = "unsloth",  # True or "unsloth"; "unsloth" is the variant advertised as using ~30% less VRAM
    use_rslora = False,                      # Rank-stabilized LoRA is available but not used here
    loftq_config = None,                     # LoftQ is available but not used here
    random_state = 3407,
)

In [None]:
# `model_ft` is the handle used for fine-tuning in the rest of the notebook.
#
# It reuses the LoRA-wrapped model built in the previous cell instead of calling
# `FastLanguageModel.get_peft_model(llm, ...)` a second time with the same
# settings. PEFT injects the adapter layers into the base model in place, so a
# second call on the same `llm` would not create an independent model: it would
# only wrap the very same weights again.
#
# Consequence: `model` and `model_ft` refer to the same underlying network, so
# once training has run `model` no longer behaves like the untouched base model.
# If a frozen baseline is needed for an after-training comparison, load a second
# copy with `FastLanguageModel.from_pretrained(...)`.
model_ft = model

### Test base Llama model with a generic question

In [None]:
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template

# Give the tokenizer the Llama 3.2 chat template used to render the messages below.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.2")

# Put the model in Unsloth's inference mode (advertised as native 2x faster inference).
FastLanguageModel.for_inference(model)

# A single-turn conversation containing only the user's question.
messages = [{"role": "user", "content": "What is fibonacci serie?"}]

# Render and tokenize the conversation; the generation prompt must be appended
# so the model continues with the assistant's turn.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Stream the answer to the cell output as it is generated, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
generation_options = dict(max_new_tokens = 1024, use_cache = True, temperature = 0.7, min_p = 0.1)
_ = model.generate(input_ids = inputs, streamer = text_streamer, **generation_options)

### Test base Llama model with a company-internal question

In [None]:
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template

# Make sure the tokenizer carries the Llama 3.2 chat template.
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.2")

# Put the to-be-fine-tuned model in Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model_ft)

# A company-specific question, asked before any training has been run.
messages = [{"role": "user", "content": "How can I request a new company laptop?"}]

# Render and tokenize the conversation, ending with the assistant generation prompt.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Stream the generated answer to the cell output, skipping the prompt itself.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
generation_options = dict(max_new_tokens = 1024, use_cache = True, temperature = 0.7, min_p = 0.1)
_ = model_ft.generate(input_ids = inputs, streamer = text_streamer, **generation_options)

### Load dataset

In [None]:
import json
import pandas as pd

# Path to the JSONL file (one JSON object per line).
jsonl_file_path = 'company_internal_processes_en.jsonl'

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and skip blank lines (for example a trailing empty line),
# which json.loads would otherwise reject with a JSONDecodeError.
with open(jsonl_file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# One row per JSON record.
df = pd.DataFrame(data)

In [None]:
# Preview the first 10 rows of the loaded data.
df.head(10)

### Format it for fine-tuning

In [None]:
from datasets import Dataset


def to_conversation(row):
    """Turn one question/answer row into a two-message chat: user first, then assistant."""
    user_message = {"content": row["question"], "role": "user"}
    assistant_message = {"content": row["answer"], "role": "assistant"}
    return [user_message, assistant_message]


# New column holding, for every row, the chat built from its question and answer.
df["conversations"] = df.apply(to_conversation, axis=1)

# The question and answer now live inside "conversations", so the two source
# columns are dropped before converting to a Hugging Face Dataset.
conversations_only = df.drop(columns=["question", "answer"])
dataset = Dataset.from_pandas(conversations_only)

In [None]:
# Show the first three entries of the "conversations" column.
dataset["conversations"][:3] # list of lists of dictionaries (one {"content", "role"} dict per message)

### Format the conversations column by adding the tags used to train the Llama model

In [None]:
def format_conversations(example):
    """Render one example's chat messages as a single string.

    Reads the list of message dicts stored under example["conversations"],
    runs it through the tokenizer's chat template (no tokenization, no
    trailing generation prompt) and stores the resulting text under
    example["formatted_conversations"].

    Returns the same example, with the new key added.
    """
    rendered_chat = tokenizer.apply_chat_template(
        example["conversations"],
        tokenize=False,
        add_generation_prompt=False,
    )
    example["formatted_conversations"] = rendered_chat
    return example

# Run the formatter over every row of the dataset, using two worker processes.
dataset = dataset.map(format_conversations, num_proc=2)

In [None]:
# First conversation in its structured form (list of message dicts), for comparison with the formatted string.
dataset["conversations"][0]

In [None]:
# The same first conversation after the chat template has been applied.
dataset["formatted_conversations"][0] # string with Llama 3.2 tags separators

### Train the model!
We will use HuggingFace **TRL's SFTTrainer** (Transformer Reinforcement Learning - Supervised Fine Tuning).
- TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT) and others.
- Supervised fine-tuning (SFT) is the most common step in post-training foundation models, and also one of the most effective.

We do 40 steps to speed things up, but we can set `num_train_epochs=1` for a full run and remove the `max_steps=40` cap.

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the helper reports support for it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation schedule and bookkeeping.
training_args = TrainingArguments(
    output_dir = "outputs",
    report_to = "none",               # Use this for WandB etc
    seed = 3407,
    # Batch: 2 sequences per device, gradients accumulated over 4 steps.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Length of the run.
    # num_train_epochs = 1,           # Set this for 1 full training run.
    max_steps = 40,                   # Short run for faster training
    warmup_steps = 5,
    # Optimizer and learning-rate schedule.
    optim = "adamw_8bit",
    learning_rate = 2e-4,
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    # Precision.
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,
)

# Supervised fine-tuning on the pre-formatted chat strings.
trainer = SFTTrainer(
    model = model_ft,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "formatted_conversations",
    dataset_num_proc = 2,
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = False,                  # Can make training 5x faster for short sequences.
    args = training_args,
)

In [None]:
#@title Show current memory stats
BYTES_PER_GB = 1024 ** 3  # bytes -> GB conversion used for both figures below

# Properties of GPU 0 and its total memory, in GB rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)

# Peak memory reserved so far, kept as the baseline measured before training.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

### Let's train our model to focus on answers only, so that it learns our company's knowledge regardless of how a question is phrased.


The dataset consists of question-answer pairs related to company procedures (e.g., requesting a laptop, reporting GDPR violations).

- **Focus on Assistant Responses**: The dataset’s responses hold key procedural knowledge (e.g., forms, emails, approvals). Training only on responses ensures the model learns the output style and content, rather than learning to reproduce the questions.

- **Computational Efficiency**: Excluding questions reduces memory usage and speeds up fine-tuning, leveraging Unsloth’s optimizations for up to 2x faster training.




In [None]:
from unsloth.chat_templates import train_on_responses_only

# Header tags that open a user turn and an assistant turn in the formatted text.
user_turn_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_turn_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Wrap the trainer, telling the helper which part is the instruction and
# which part is the response.
trainer_on_responses_only = train_on_responses_only(
    trainer,
    instruction_part = user_turn_header,
    response_part = assistant_turn_header,
)

In [None]:
# Run the fine-tuning and keep the value returned by train() for later inspection.
trainer_on_responses_only_stats = trainer_on_responses_only.train()

### Let's test the new fine-tuned model!

In [None]:
from transformers import TextStreamer

# Switch the trained model to Unsloth's inference mode (advertised as native 2x faster inference).
FastLanguageModel.for_inference(model_ft)

# The same company-specific question asked before training, for comparison.
messages = [{"role": "user", "content": "How can I request a new company laptop?"}]

# Render and tokenize the conversation, ending with the assistant generation prompt.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True,
    return_tensors = "pt",
)
inputs = inputs.to("cuda")

# Stream the generated answer to the cell output, skipping the prompt itself.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
generation_options = dict(max_new_tokens = 1024, use_cache = True, temperature = 0.7, min_p = 0.1)
_ = model_ft.generate(input_ids = inputs, streamer = text_streamer, **generation_options)

### Export the fine-tuned model!

In [None]:
# Export the fine-tuned model together with its tokenizer to GGUF in the
# "./model" folder, using the "f16" quantization method.
model_ft.save_pretrained_gguf("./model", tokenizer, quantization_method = "f16")

# Alternative: save to q4_k_m GGUF
# model_ft.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")

# Alternative: save to 8bit Q8_0
# model_ft.save_pretrained_gguf("model", tokenizer,)

In [None]:
import os
from google.colab import files

folder_path = 'model'

# Offer a browser download for every regular file in the export folder whose
# path ends with "unsloth.F16.gguf"; anything else is skipped.
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue
    if file_path.endswith("unsloth.F16.gguf"):
        files.download(file_path)
