# Dependencies

In [None]:
# Mount Google Drive at /content/drive so the cells below can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Project folder on the mounted Drive; the data file, checkpoints and loss history all live under it.
ROOT_DIR = "/content/drive/My Drive/AlzterBot"
# JSON training data, loaded in the "Dataset" section below.
DATA_FILE = os.path.join(ROOT_DIR, "data_combined.json")
# Shown as the cell output (True/False): quick check that the data file is reachable.
os.path.exists(DATA_FILE)

In [None]:
# Fully pinned install (bitsandbytes had to be bumped to 0.45.2 to avoid errors in the Colab env).
# Uncomment to use these exact versions instead of the unpinned install in the next cell:
# !pip install transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 bitsandbytes==0.45.2 datasets==3.1.0 huggingface-hub==0.26.2 safetensors==0.4.5 pandas==2.2.2 matplotlib==3.8.0 numpy==1.26.4

In [None]:
# If you're running on Colab
# NOTE(review): these packages are installed without version pins, so each run
# picks up whatever releases are current. The SFTConfig / SFTTrainer keyword
# arguments used further down (e.g. `max_seq_length`, `processing_class`) may
# only be accepted by certain trl releases — confirm, and consider pinning.
!pip install datasets bitsandbytes trl

In [None]:
import os
import torch
from datasets import load_dataset, DatasetDict
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

# Setup Training
## Model

In [None]:
# Base checkpoint that will be fine-tuned.
repo_id = 'microsoft/Phi-3-mini-4k-instruct'

# Quantization settings: 4-bit NF4 weights with double quantization,
# using float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized model onto the first CUDA device, then the matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    quantization_config=bnb_config,
    device_map="cuda:0",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

## Dataset

In [None]:
# Read the JSON data file. Requesting split="train" up front means the result
# does not need to be unwrapped from a DatasetDict afterwards.
# Keep only the first of 24 shards to obtain approx. 500 samples to train on.
dataset = load_dataset(
    "json",
    data_files=DATA_FILE,
    split="train",
).shard(num_shards=24, index=0)

In [None]:
# Show the sharded dataset's summary as the cell output to sanity-check what will be trained on.
dataset

## Training Configuration

In [None]:
# Passed to SFTConfig as `output_dir` below.
FINETUNED_LLM_PATH = os.path.join(ROOT_DIR, "models/Test1")

# --- LoRA adapter hyper-parameters ---
# Rank of the adapter: the lower it is, the fewer parameters need to be
# trained (smaller = more compression).
LORA_RANK_DIMENSION = 6
# Scaling factor for the LoRA layers (higher = stronger adaptation).
LORA_ALPHA = 8
# Dropout probability for the LoRA layers (helps prevent overfitting).
LORA_DROPOUT = 0.05

# --- Sequence length and optimisation settings ---
MAX_SEQ_LENGTH = 64
EPOCHS = 1
LEARNING_RATE = 2e-4

In [None]:
# LoRA adapter definition, built from the hyper-parameters set in the cell above.
# Adapters are attached to the listed projection modules; bias terms are not trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
    r=LORA_RANK_DIMENSION,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

In [None]:
# Supervised fine-tuning settings, grouped by concern.
# (SFTConfig is already imported in the imports cell at the top of the notebook.)
sft_config = SFTConfig(
    # --- Memory ---
    # Gradient checkpointing, using the non-reentrant implementation.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    gradient_accumulation_steps=1,
    # Starting batch size; auto_find_batch_size lets the trainer adjust it.
    per_device_train_batch_size=16,
    auto_find_batch_size=True,

    # --- Data ---
    # Examples are packed into sequences of at most MAX_SEQ_LENGTH tokens.
    max_seq_length=MAX_SEQ_LENGTH,
    packing=True,

    # --- Optimisation ---
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    optim='adamw_torch_fused',
    warmup_ratio=0.03,
    # The plain "constant" schedule takes no warmup steps, so `warmup_ratio`
    # above would be silently ignored; "constant_with_warmup" ramps the
    # learning rate up over the warmup steps and then holds it constant.
    lr_scheduler_type="constant_with_warmup",

    # --- Logging / output ---
    logging_steps=10,
    logging_dir='./logs',
    output_dir=FINETUNED_LLM_PATH,
    report_to='none'
)

# Prepare for Training

In [None]:
# Two steps in one expression: first prepare the quantized base model for
# k-bit training, then wrap the result with the LoRA adapters described by
# `lora_config`.
model = get_peft_model(
    prepare_model_for_kbit_training(model),
    lora_config,
)

In [None]:
# Assemble the trainer from the pieces built above: the SFT settings, the
# PEFT-wrapped model, the tokenizer, and the sharded training set.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
)

# Train

In [None]:
# Run fine-tuning. The return value of trainer.train() is kept in `history`;
# the per-step log used for the loss plot further down is read separately
# from trainer.state.log_history.
history = trainer.train()

In [None]:
from matplotlib import pyplot as plt
from pandas import DataFrame
import math

def save_training_history(history : DataFrame, output_dir : str):
    """Export the training history of a fine-tuned LLM as a CSV file and a loss plot.

    Writes ``loss_history.csv`` (the full history table) and
    ``loss_history.png`` (training loss versus step) into ``output_dir``.

    Args:
        history (DataFrame) : Training log table with one row per log entry,
            e.g. ``pd.DataFrame(trainer.state.log_history)``. The plot uses
            its ``step`` and ``loss`` columns; rows without a loss value are
            left out of the plot. If either column is missing, or no loss was
            logged at all, an empty plot is saved instead of raising.
        output_dir (str) : Directory to store results. Created if it does not
            exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Save the complete training history, including rows that carry no loss.
    history.to_csv(os.path.join(output_dir, "loss_history.csv"), index=False)

    # Use a dedicated figure so repeated calls never draw on top of each other.
    fig, ax = plt.subplots()
    # The x values come from the "step" column, so label the axis accordingly.
    ax.set_xlabel("Step")
    ax.set_ylabel("Training Loss")
    ax.set_title("Fine-tuning Training History")

    has_loss_columns = "step" in history.columns and "loss" in history.columns
    if has_loss_columns:
        # Log entries without a loss (e.g. the end-of-training summary row)
        # show up as NaN; drop them so they cannot reach the plot or the
        # axis-limit computation.
        losses = history.dropna(subset=["loss"]).set_index("step")["loss"]
        if not losses.empty:
            ax.plot(losses)
            loss_max = math.ceil(losses.max())
            # A zero upper limit would collapse the axis; keep autoscaling then.
            if loss_max > 0:
                ax.set_ylim([0, loss_max])

    path = os.path.join(output_dir, "loss_history.png")
    fig.savefig(path, dpi=200, bbox_inches='tight')

In [None]:
import pandas as pd

# Turn the trainer's accumulated log entries into a table, then export the
# CSV and the loss plot into the project folder.
result = pd.DataFrame(data=trainer.state.log_history)
save_training_history(history=result, output_dir=ROOT_DIR)

In [None]:
# Save the model held by the trainer to <ROOT_DIR>/model_test.
# NOTE(review): this is a different folder from FINETUNED_LLM_PATH (the
# trainer's output_dir) — confirm that is intended.
trainer.save_model(os.path.join(ROOT_DIR, "model_test"))

In [None]:
# Final cell: prints a closing message once the notebook has run through.
print("THANK YOU.")