### Fine Tuning - Evaluation : Cycle 3
- Using Unsloth for faster training and resource saving
- The code is a slightly modified version of the example code provided in the Unsloth GitHub repository
- Evaluating perplexity (Automated evaluation metrics)
- Perplexity is calculated using Hugging Face tooling
- Hallucination is measured using SelfCheckGPT score(s)

##### Installation and GDrive Setup

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset CSVs read later in this notebook live under
# /content/drive/MyDrive.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
%%capture
# `%%capture` (must stay the first line of the cell) hides this cell's pip output.
# Unsloth and its training stack are installed with --no-deps, so pip installs only the
# named packages and does not pull in or replace their dependencies.
# NOTE(review): presumably this protects the runtime's preinstalled torch build - confirm.
# xformers is pinned to 0.0.29; the other packages float to their latest release.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
# These helper packages are installed normally, with dependency resolution.
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

##### Fine-Tuning Setup
- Set model
- Set dataset

In [None]:
from unsloth import FastLanguageModel
import torch
# ------------------------------------------------------------------------------------------
## Setup
# These three names are module-level on purpose: later cells read them again.
max_seq_length = 2048  # Maximum sequence length handed to the loader and the trainer.
dtype = None           # None lets the loader pick; float16 for T4/V100, bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization to cut memory usage; may be set to False.
# ------------------------------------------------------------------------------------------
## Load Llama 3.1
BASE_MODEL_NAME = "unsloth/Meta-Llama-3.1-8B"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_NAME,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # only needed for gated models such as meta-llama/Llama-2-7b-hf
)
# ------------------------------------------------------------------------------------------
## Default Unsloth Settings
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_settings = dict(
    r = 16,                                  # LoRA rank; any value > 0 (8, 16, 32, 64, 128 suggested)
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                        # any value is supported, 0 is the optimized path
    bias = "none",                           # any value is supported, "none" is the optimized path
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,                      # rank-stabilized LoRA is available but off
    loftq_config = None,                     # LoftQ is available but off
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)
# ------------------------------------------------------------------------------------------
## Data Loading and Alpaca Prompt Setup (Default Prompt Setup for Llama 3.1)
# Alpaca-style template with three positional `{}` slots. `str.format` fills them in order
# with (instruction, input, output); see formatting_prompts_func below.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; appended to every training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into Alpaca-formatted training texts.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" columns (one list per column).

    Returns:
        dict with a single "text" key holding one formatted string per row.
        Each string is ``alpaca_prompt`` filled with the row's fields followed
        by ``EOS_TOKEN``.
    """
    def _as_text(value):
        # Missing CSV cells arrive as None; render them as an empty field
        # instead of the literal word "None" inside the prompt.
        return "" if value is None else value

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    texts = [
        # Must add EOS_TOKEN, otherwise generation will go on forever!
        alpaca_prompt.format(_as_text(instruction), _as_text(context), _as_text(response))
        + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": texts}

from datasets import load_dataset
# Training data: read the reformatted CSV from Drive as the "train" split, then add the
# "text" column produced by formatting_prompts_func.
dataset_location = "/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases/Cycle2_dataset - train reformatted.csv"
raw_train_split = load_dataset("csv", data_files = dataset_location, split = "train")
dataset = raw_train_split.map(formatting_prompts_func, batched = True)
# ------------------------------------------------------------------------------------------
## Perplexity Test Setups
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm

import pandas as pd
from datasets import Dataset
# Test Dataset
test_df = pd.read_csv("/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases/cycle2_test_reformatted.csv")
# Combine the columns into a single text input for the model, one prompt PER ROW.
# (Interpolating the Series objects into one f-string would embed the printed repr of the
# whole columns and assign that same string to every row.)
# Empty cells are read as NaN; turn them into empty strings so they do not show up as the
# text "nan" inside the prompt.
prompt_fields = test_df[["instruction", "input", "output"]].fillna("").astype(str)
test_df["text"] = [
    alpaca_prompt.format(instruction, context, response)
    for instruction, context, response in prompt_fields.itertuples(index=False, name=None)
]
# Create Test Dataset (only the formatted "text" column is kept)
test_dataset = Dataset.from_pandas(test_df[["text"]])
def preprocess_function(examples):
    """Tokenize the "text" values of a batch of rows, with truncation enabled."""
    encoded = tokenizer(examples["text"], truncation=True)
    return encoded


# Tokenized view of the test set: the raw "text" column is dropped and only the
# tokenizer's output columns remain.
tokenized_datasets = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text"],
)
# Evaluate Perplexity using Torch
def compute_perplexity(model, dataset, tokenizer, batch_size=8):
    """Return the token-level perplexity of ``model`` over ``dataset``.

    Args:
        model: Causal language model whose forward pass returns an object with
            a ``.loss`` attribute when ``labels`` are supplied.
        dataset: ``datasets.Dataset`` to score. It may already be tokenized
            (has an "input_ids" column) or still be raw, in which case its
            "text" column is tokenized here with truncation enabled.
        tokenizer: Tokenizer used for optional tokenization and for padding.
        batch_size: Number of sequences per evaluation batch.

    Returns:
        float: ``exp`` of the average loss per scored token.

    Raises:
        ValueError: If the dataset yields no tokens to score.
    """
    # Score the dataset that was passed in, not a module-level global. Raw text
    # datasets are tokenized on the fly so both kinds of caller work.
    if "input_ids" not in dataset.column_names:
        dataset = dataset.map(
            lambda rows: tokenizer(rows["text"], truncation=True),
            batched=True,
            remove_columns=dataset.column_names,
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # With mlm=False the collator pads each batch and builds `labels` from
    # `input_ids`, with padding positions set to -100 so the loss ignores them.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    dataloader = DataLoader(dataset, collate_fn=data_collator, batch_size=batch_size)
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            # Pass the collator's attention_mask and labels so padding is neither
            # attended to nor scored.
            outputs = model(**batch)
            # The loss is a mean over scored positions. Labels are shifted by one
            # inside the model, so the first position of each row is never scored.
            scored_tokens = int((batch["labels"][:, 1:] != -100).sum().item())
            if scored_tokens == 0:
                continue
            total_loss += outputs.loss.item() * scored_tokens
            total_tokens += scored_tokens

    if total_tokens == 0:
        raise ValueError("compute_perplexity: dataset contains no tokens to score")

    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

##### Training -> Evaluation -> Saving Sets
- Set Trainer Parameter Permutations
- For each permutation: run training -> evaluate perplexity -> save the model under a name that records the parameters and the perplexity

In [None]:
## Trainer Setup Imports
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

def _bytes_to_gb(num_bytes):
    """Convert a byte count to gigabytes (1024**3 bytes), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


def train_evaluate_sets(used_parameters):
    """Train with one hyper-parameter permutation, evaluate perplexity, save the adapter.

    Args:
        used_parameters: Sequence whose first five items are, in order:
            per_device_train_batch_size, gradient_accumulation_steps,
            warmup_steps, max_steps, learning_rate.

    Returns:
        float: Perplexity on the tokenized test set after training.

    Side effects:
        Prints GPU memory and timing statistics, and writes the model and
        tokenizer to ``"<parameters>_<rounded perplexity>_lora_model"`` in the
        current working directory.

    NOTE(review): this uses the module-level ``model``, ``tokenizer``,
    ``dataset`` and ``tokenized_datasets``. The same ``model`` object is handed
    to every trainer and is never reloaded here, so each call appears to
    continue from the weights left by the previous call rather than from a
    fresh base model. Confirm this is intended when comparing permutations.
    """
    import math

    (
        batch_size,
        grad_accum_steps,
        warmup_steps,
        max_steps,
        learning_rate,
    ) = used_parameters[:5]
    # Tag used in the log line and in the save directory name.
    parameters = f"{batch_size}_{grad_accum_steps}_{warmup_steps}_{max_steps}_{learning_rate}"
    #------------------------------------------------------------------------------------------
    ###################
    ## Trainer Setup ##
    ###################
    #------------------------------------------------------------------------------------------
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        args = TrainingArguments(
            per_device_train_batch_size = batch_size,
            gradient_accumulation_steps = grad_accum_steps,
            warmup_steps = warmup_steps,
            # num_train_epochs = 1, # Set this for 1 full training run.
            max_steps = max_steps,
            learning_rate = learning_rate,
            # Use bf16 when the GPU supports it, otherwise fall back to fp16.
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
            report_to = "none", # Use this for WandB etc
        ),
    )
    #------------------------------------------------------------------------------------------
    #####################
    ## Training Proper ##
    #####################
    #------------------------------------------------------------------------------------------
    # Show current memory stats
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
    max_memory = _bytes_to_gb(gpu_stats.total_memory)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    # Show final memory and time stats
    train_runtime = trainer_stats.metrics['train_runtime']
    used_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"{train_runtime} seconds used for training.")
    print(
        f"{round(train_runtime/60, 2)} minutes used for training."
    )
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
    #------------------------------------------------------------------------------------------
    ########################
    ## Perplexity Testing ##
    ########################
    #------------------------------------------------------------------------------------------
    # Pass the tokenized test set: the evaluation batches are built from token ids,
    # not from the raw "text" column of test_dataset.
    perplexity = compute_perplexity(model, tokenized_datasets, tokenizer)
    #------------------------------------------------------------------------------------------
    print(f"Parameters: {parameters} Perplexity: {perplexity}")
    # round() raises on inf/nan; keep such values as text so the model is still saved.
    rounded = f"{round(perplexity)}" if math.isfinite(perplexity) else f"{perplexity}"
    ############
    ## Saving ##
    ############
    #------------------------------------------------------------------------------------------
    save_name = f"{parameters}_{rounded}"
    model.save_pretrained(f"{save_name}_lora_model")
    tokenizer.save_pretrained(f"{save_name}_lora_model")
    return perplexity

In [None]:
## Train several llama3.1 models based on parameter settings

# Index of the parameter set (within parameter_sets) to run.
SETTING = 0

# Trainer Parameters SET 0:
#   learning rate: 2e-4 | 1e-4
#   per_device_train_batch_size : 2
#   gradient_accumulation_steps : 4 | 8
#   warmup_steps : 5
#   max_steps : 30 | 60
#   total permutations: 2 x 1 x 2 x 1 x 2 = 8 -> 15 x 8 -> about 2 hrs of training time for 8 models
#
# Each row is read by train_evaluate_sets as:
#   [per_device_train_batch_size, gradient_accumulation_steps, warmup_steps, max_steps, learning_rate]
_SET_0 = [
    # learning rate 2e-4
    [2, 4, 5, 30, 2e-4],
    [2, 8, 5, 30, 2e-4],
    [2, 8, 5, 60, 2e-4],
    [2, 4, 5, 60, 2e-4],
    # learning rate 1e-4
    [2, 4, 5, 30, 1e-4],
    [2, 8, 5, 30, 1e-4],
    [2, 8, 5, 60, 1e-4],
    [2, 4, 5, 60, 1e-4],
]

## Parameter Sets
parameter_sets = [_SET_0]

## Pick the selected set and train/evaluate/save one model per row, in listed order
used_parameters = parameter_sets[SETTING]
for parameters in used_parameters:
    train_evaluate_sets(parameters)

##### Saving and Loading Model

In [None]:
## LORA Saving
# Manual switch: set to True to write the current model and tokenizer to ./lora_model.
SAVE_LORA_LOCALLY = False
if SAVE_LORA_LOCALLY:
    for artifact in (model, tokenizer):
        artifact.save_pretrained("lora_model")  # Local saving
    # model.push_to_hub("your_name/lora_model", token = "...") # Online saving
    # tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

In [None]:
## Loading LORA Model
# Manual switch: set to False to skip reloading a saved adapter from Drive.
LOAD_SAVED_LORA = True
if LOAD_SAVED_LORA:
    from unsloth import FastLanguageModel

    # Same loader settings as the fine-tuning setup cell.
    max_seq_length = 2048  # Maximum sequence length handed to the loader.
    dtype = None           # None lets the loader pick; float16 for T4/V100, bfloat16 for Ampere+.
    load_in_4bit = True    # 4-bit quantization to cut memory usage; may be set to False.

    # Directory holding the saved model that was used for training.
    lora_checkpoint = "/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases/lora"
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = lora_checkpoint,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    # Switch the loaded model to Unsloth's faster inference mode.
    FastLanguageModel.for_inference(model)

In [None]:
# Download without losing runtime
import time

# Keep the cell busy for 100 ticks of 3 seconds each (5 minutes in total).
KEEP_ALIVE_TICKS = 100
TICK_SECONDS = 3

i = 0
for i in range(1, KEEP_ALIVE_TICKS + 1):
    time.sleep(TICK_SECONDS)

##### Initial Testing

In [None]:
## Alpaca Prompt Copy
# Re-declared here so this testing cell can run without the fine-tuning setup cell.
# It has the same three positional `{}` slots: instruction, input, response.
# Keep the wording identical to the template used to build the training texts.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Alpaca Prompt -> Output
# Generate from a fully formatted Alpaca prompt and print the decoded sequence.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

instruction_text = "Generate a test case for the following UI Element: Link Element 'Euskara' With URL https://eu.wikipedia.org/wiki/ from the website: https://en.wikipedia.org/wiki/Main_Page"
alpaca_query = alpaca_prompt.format(
    instruction_text,  # instruction
    "",                # input
    "",                # output - left blank so the model generates it
)
inputs = tokenizer([alpaca_query], return_tensors = "pt").to("cuda")
output = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

print(tokenizer.batch_decode(output))

# Expected Prompt -> Output
# Generate from the bare UI-element description (no Alpaca wrapper). Tokens are streamed
# as they are produced, then the full decoded sequence is printed.
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

raw_query = "Link Element 'Euskara' With URL https://eu.wikipedia.org/wiki/ from the website: https://en.wikipedia.org/wiki/Main_Page"
inputs = tokenizer([raw_query], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
output = model.generate(
    **inputs,
    streamer = text_streamer,
    max_new_tokens = 128,
)

print(tokenizer.batch_decode(output))