### Fine Tuning - Evaluation : Cycle 3
- Using Unsloth for faster training and resource saving
- Code is a slightly modified version of the code provided in the Unsloth Github Repository
- Evaluating perplexity (Automated evaluation metrics)
- Perplexity is calculated using Huggingface
- Hallucination is calculated using SelfCheckGPT score/s

##### NOTE: Update the folder paths, depending on the location of the datasets in your Google Drive

##### Installation and GDrive Setup

In [None]:
# Mount Google Drive at /content/drive; the dataset and LoRA paths used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install Unsloth and its dependencies. %%capture must stay on the first line of the cell.
import os
# Any environment variable name containing "COLAB_" is treated as the sign of a Colab runtime.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps keeps pip from pulling in dependencies of the listed packages; xformers is pinned.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

##### Fine-Tuning Setup
- Set model
- Set dataset
- Set perplexity evaluation procedure

In [None]:
from unsloth import FastLanguageModel
import torch
# ------------------------------------------------------------------------------------------
## Setup
# These three settings are notebook-wide: max_seq_length is also read by the trainer setup further down.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# ------------------------------------------------------------------------------------------
## Load Llama 3.1
# main_model / main_tokenizer are the base model and tokenizer handed to every training run in this notebook.
main_model, main_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
# ------------------------------------------------------------------------------------------

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.10: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
## Data Loading and Alpaca Prompt Setup (Default Prompt Setup for Llama 3.1)
# The three {} placeholders are filled, in order, with: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = main_tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Build the full Alpaca-style training text for every row of a batch.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences (the form `Dataset.map(..., batched=True)`
            passes to its callback).

    Returns:
        A dict with a single "text" key holding one formatted prompt per row,
        each terminated with EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    texts = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": texts}

print(f"EOS TOKEN : {EOS_TOKEN}")

EOS TOKEN : <|end_of_text|>


In [None]:
# ------------------------------------------------------------------------------------------
## Perplexity Test Setups
from transformers import LlamaTokenizer, LlamaForCausalLM
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm

import pandas as pd
from datasets import Dataset

def preprocess_function(examples):
    """Tokenize the "text" column of a batch with the notebook-wide `main_tokenizer`, truncation enabled."""
    batch_texts = examples["text"]
    encoded = main_tokenizer(batch_texts, truncation=True)
    return encoded

# Evaluate Perplexity using Torch
def compute_perplexity(model, dataset, tokenizer):
    """Return the token-level perplexity of `model` over the "text" column of `dataset`.

    Perplexity is exp(mean negative log-likelihood per scored token). Padding
    positions are excluded: the collator marks them with -100 in `labels`, and
    each batch loss is weighted by the number of tokens it actually scored.

    Args:
        model: Causal language model; called with `input_ids`, `attention_mask`
            and `labels`, and expected to expose `.loss` on its output.
        dataset: Dataset with a "text" column, supporting `.map` and
            `.column_names`.
        tokenizer: Tokenizer used both to encode the text and to pad batches.

    Returns:
        float: the perplexity.

    Raises:
        ValueError: if the dataset yields no scorable tokens.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    def tokenize_batch(examples):
        # Encode with the tokenizer that was passed in rather than a global one,
        # so the text is always encoded by the same tokenizer that pads it.
        return tokenizer(examples["text"], truncation=True)

    # Drop every original column so only the tokenizer outputs reach the collator.
    tokenized_dataset = dataset.map(tokenize_batch, batched=True, remove_columns=dataset.column_names)
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    dataloader = DataLoader(tokenized_dataset, collate_fn=data_collator, batch_size=8)

    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=True):
            batch = {key: value.to(device) for key, value in batch.items()}
            # A causal LM predicts token t+1 from token t, so the scored targets are
            # the labels from position 1 onward that are not masked with -100.
            num_targets = int((batch["labels"][:, 1:] != -100).sum().item())
            if num_targets == 0:
                continue
            # NOTE(review): assumes `.loss` is the mean over non-ignored target tokens
            # (the Hugging Face causal-LM convention) - confirm for patched models.
            outputs = model(**batch)
            total_loss += outputs.loss.item() * num_targets
            total_tokens += num_targets

    if total_tokens == 0:
        raise ValueError("compute_perplexity: dataset produced no tokens to score")

    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

##### Training -> Evaluation -> Saving Sets
- Set Trainer Parameter Permutations
- Run Training -> Evaluation (Perplexity) -> Save Model with Parameter Permutations Note

In [None]:
## Trainer Setup Imports
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from transformers import TrainerCallback

def train_evaluate_sets(used_parameters, main_model, main_tokenizer, dataset, test_dataset):
    """Fine-tune a LoRA adapter with one hyperparameter set, score it, and save it.

    Args:
        used_parameters: Sequence whose first five items are, in order,
            per_device_train_batch_size, gradient_accumulation_steps,
            warmup_steps, max_steps and learning_rate.
        main_model: Base model handed to `FastLanguageModel.get_peft_model`.
        main_tokenizer: Tokenizer used for training, evaluation and saving.
        dataset: Training dataset; its "text" column is the training text.
        test_dataset: Dataset passed to `compute_perplexity` after training.

    Returns:
        None. The function prints GPU/timing statistics and the perplexity, and
        saves the adapter and tokenizer to the relative directory
        "<parameters>_<last step loss>_<perplexity>_lora_model" (the loss part
        is omitted when the trainer logged no step loss).
    """
    # Only the first five entries are read; naming them once replaces repeated indexing.
    batch_size, grad_accum_steps, warmup_steps, max_steps, learning_rate = used_parameters[:5]
    parameters = f"{batch_size}_{grad_accum_steps}_{warmup_steps}_{max_steps}_{learning_rate}"

    ## Default Unsloth Settings
    model = FastLanguageModel.get_peft_model(
        main_model,
        r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )
    tokenizer = main_tokenizer
    #------------------------------------------------------------------------------------------
    ###################
    ## Trainer Setup ##
    ###################
    #------------------------------------------------------------------------------------------
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        # eval_dataset = eval_dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        # compute_metrics=compute_metrics,
        args = TrainingArguments(
            per_device_train_batch_size = batch_size,
            gradient_accumulation_steps = grad_accum_steps,
            warmup_steps = warmup_steps,
            # num_train_epochs = 1, # Set this for 1 full training run.
            max_steps = max_steps,
            learning_rate = learning_rate,
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
            report_to = "none", # Use this for WandB etc
        ),
    )
    #------------------------------------------------------------------------------------------
    #####################
    ## Training Proper ##
    #####################
    #------------------------------------------------------------------------------------------
    # Show current memory stats
    gib = 1024 ** 3
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / gib, 3)
    max_memory = round(gpu_stats.total_memory / gib, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    # logging_steps = 1, so the log history holds one "loss" entry per step;
    # the last one is used to label the saved model. None means no step loss was logged.
    step_losses = [entry["loss"] for entry in trainer.state.log_history if "loss" in entry]
    training_loss = round(step_losses[-1], 4) if step_losses else None

    # Show final memory and time stats
    train_runtime = trainer_stats.metrics['train_runtime']
    used_memory = round(torch.cuda.max_memory_reserved() / gib, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
    print(f"{train_runtime} seconds used for training.")
    print(f"{round(train_runtime/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
    #------------------------------------------------------------------------------------------
    ########################
    ## Perplexity Testing ##
    ########################
    #------------------------------------------------------------------------------------------
    perplexity = compute_perplexity(model, test_dataset, tokenizer)
    print(f"Parameters: {parameters} Perplexity: {perplexity}")
    rounded = f"{round(perplexity, 3)}"
    #------------------------------------------------------------------------------------------
    ############
    ## Saving ##
    ############
    #------------------------------------------------------------------------------------------
    if training_loss is None:
        save_name = f"{parameters}_{rounded}"
    else:
        save_name = f"{parameters}_{training_loss}_{rounded}"
    model.save_pretrained(f"{save_name}_lora_model")
    tokenizer.save_pretrained(f"{save_name}_lora_model")

##### Final Cycle 3 Training: Full Model Training + Evaluation Cycles
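- 1st Cycle : 1000-row dataset (800 train / 200 test)
  - Parameters:
    - 20 steps (`max_steps`), learning rate 2e-4, batch size 2, gradient accumulation 4
    - 30 steps (`max_steps`), learning rate 1e-4, batch size 4, gradient accumulation 8
- 2nd Cycle : 2000-row dataset (1600 train / 400 test)
  - Parameters:
    - 20 steps (`max_steps`), learning rate 2e-4, batch size 2, gradient accumulation 4
    - 30 steps (`max_steps`), learning rate 1e-4, batch size 4, gradient accumulation 8
- 3rd Cycle : 3000-row dataset (2400 train / 600 test)
  - Parameters:
    - 20 steps (`max_steps`), learning rate 2e-4, batch size 2, gradient accumulation 4
    - 30 steps (`max_steps`), learning rate 1e-4, batch size 4, gradient accumulation 8
- Final Cycle : full dataset (80% train / 20% test)
  - Parameters:
    - 20 steps (`max_steps`), learning rate 2e-4, batch size 2, gradient accumulation 4
    - 30 steps (`max_steps`), learning rate 1e-4, batch size 4, gradient accumulation 8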

##### Individual Training Procedure

In [None]:
from datasets import load_dataset
# FOLDER_SETTING picks which Google Drive folder holds the CSV files.
FOLDER_SETTING = 0
folders = ["/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases", "/content/drive/MyDrive/Fine Tuning (Alt)"]
main_folder = folders[FOLDER_SETTING]

#############
## DATASET ##
#############
# - Filled Input
# Both CSVs are loaded as a single "train" split and given a "text" column by formatting_prompts_func.
dataset_location = f"{main_folder}/cycle3_trainformatted.csv"
test_dataset_location = f"{main_folder}/cycle3_testformatted.csv"
dataset = load_dataset('csv', data_files=dataset_location, split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
test_dataset = load_dataset('csv', data_files=test_dataset_location, split = "train")
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

##############
## TRAINING ##
##############
# SETTING selects one group of rows from parameter_sets; each row is one training run.
SETTING = 0
parameter_sets = [
        [
            ################
            ## PARAMETERS ##
            ################
            # Row layout: [per_device_train_batch_size, gradient_accumulation_steps, warmup_steps, max_steps, learning_rate]
            # - learning rate: 2e-4 | 1e-4
            # - per_device_train_batch_size : 2
            # - gradient_accumulation_steps : 4
            # - warmup_steps : 5
            # - max_steps : 20 / 30
            ################
            # Learning Rate: change 2e-4 (which is 0.0002) to a lower or higher value for slower or faster learning
            # Steps: change 20 (max_steps) to a lower or higher value to train for fewer or more steps
            # Warmup Steps: no need to change 5, but feel free to adjust
            # The other two values are batch-related and may affect training speed when adjusted
           [2, 4, 5, 20, 2e-4],
        ],
    ]

## Set used parameters and start set training
used_parameters = parameter_sets[SETTING]
for parameters in used_parameters:
    train_evaluate_sets(parameters, main_model, main_tokenizer, dataset, test_dataset)

In [None]:
## In case you want to do more individual training while saving the previous outputs, copypaste the above code here

In [None]:
## In case you want to do more individual training while saving the previous outputs, copypaste the above code here

In [None]:
## In case you want to do more individual training while saving the previous outputs, copypaste the above code here

##### Batch Training
- My Objective : Train and compare models of different sizes

In [None]:
## Train several llama3.1 models based on parameter settings

from datasets import load_dataset
FOLDER_SETTING = 0
# NOTE(review): folders[0] is "", so with FOLDER_SETTING = 0 every path below starts with "/"
# (filesystem root). Pick index 1 or 2 to read from Google Drive - confirm which is intended.
folders = ["", "/content/drive/MyDrive/Fine-Tuning Llama 3.1 For Test Cases", "/content/drive/MyDrive/Fine Tuning (Alt)"]
main_folder = folders[FOLDER_SETTING]

# Row layout: [per_device_train_batch_size, gradient_accumulation_steps, warmup_steps, max_steps, learning_rate]
default_parameters = [
    [2, 4, 5, 30, 1e-4],
    [2, 4, 5, 20, 2e-4],
]

# One entry per dataset, in execution order: (train CSV, test CSV, parameter rows).
# "*formatted" files are the "Filled Input" variant; the others are the "Clear Input" variant.
batch_runs = [
    ## SIZE : 1000 ##
    ("cycle3_1000_trainformatted.csv", "cycle3_1000_testformatted.csv", default_parameters),
    # NOTE(review): this is the only group whose second row uses batch size 4 / accumulation 8 - confirm it is intended.
    ("cycle3_1000_train.csv", "cycle3_1000_test.csv", [[2, 4, 5, 30, 1e-4], [4, 8, 5, 20, 2e-4]]),
    ## SIZE : 2000 ##
    ("cycle3_2000_trainformatted.csv", "cycle3_2000_testformatted.csv", default_parameters),
    ("cycle3_2000_train.csv", "cycle3_2000_test.csv", default_parameters),
    ## SIZE : 3000 ##
    ("cycle3_3000_trainformatted.csv", "cycle3_3000_testformatted.csv", default_parameters),
    ("cycle3_3000_train.csv", "cycle3_3000_test.csv", default_parameters),
    ## SIZE : MAX ##
    ("cycle3_trainformatted.csv", "cycle3_testformatted.csv", default_parameters),
    ("cycle3_train.csv", "cycle3_test.csv", default_parameters),
]

SETTING = 0
for train_file, test_file, parameter_rows in batch_runs:
    # Load the train/test CSV pair and add the "text" column used for training and evaluation.
    dataset_location = f"{main_folder}/{train_file}"
    test_dataset_location = f"{main_folder}/{test_file}"
    dataset = load_dataset('csv', data_files=dataset_location, split = "train")
    dataset = dataset.map(formatting_prompts_func, batched = True,)
    test_dataset = load_dataset('csv', data_files=test_dataset_location, split = "train")
    test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

    ## Set used parameters and start set training
    parameter_sets = [parameter_rows]
    used_parameters = parameter_sets[SETTING]
    for parameters in used_parameters:
        train_evaluate_sets(parameters, main_model, main_tokenizer, dataset, test_dataset)


##### Previous Cycle 3 Fine Tuning Section

##### Model Training + Evaluation Section (Part 1 of Cycle 3)

In [None]:
## Train several llama3.1 models based on parameter settings

SETTING = 0
# Trainer Parameters SET 0:
#   learning rate: 2e-4 | 1e-4
#   per_device_train_batch_size : 2
#   gradient_accumulation_steps : 4 | 8
#   warmup_steps : 5
#   max_steps : 30 | 45
#   total permutations: 2 x 1 x 2 x 1 x 2 = 8 permutations -> 15 x 8 -> about 2 hrs of training time for 8 models

# 1e-4 requires more steps, 2e-4 shows significant jumps early on, gradient accumulation 8 makes the loss much lower

## Parameter Sets
# Row layout: [per_device_train_batch_size, gradient_accumulation_steps, warmup_steps, max_steps, learning_rate]
parameter_sets = [
        [
           [2, 4, 5, 30, 2e-4], # Training Loss 0.490000 | Perplexity 9.076
           [2, 8, 5, 30, 2e-4], # Training Loss 0.511300 | Perplexity 8.901

           [2, 4, 5, 30, 1e-4], # Training Loss 0.586900 | Perplexity 9.763
           [2, 8, 5, 30, 1e-4], # Training Loss 0.604800 | Perplexity 9.470

          #  [2, 8, 5, 45, 2e-4],
          #  [2, 4, 5, 45, 2e-4],

          #  [2, 8, 5, 45, 1e-4],
          #  [2, 4, 5, 45, 1e-4],
        ],
    ]

## Set used parameters and start set training
# train_evaluate_sets requires the train and test datasets as arguments; the ones
# loaded in an earlier cell (`dataset`, `test_dataset`) are passed explicitly.
used_parameters = parameter_sets[SETTING]
for parameters in used_parameters:
    train_evaluate_sets(parameters, main_model, main_tokenizer, dataset, test_dataset)

GPU = Tesla T4. Max memory = 14.741 GB.
9.24 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.7097
2,1.6263
3,1.6482
4,1.5672
5,1.4366
6,1.3298
7,1.2496
8,1.0576
9,0.904
10,0.7618


279.4277 seconds used for training.
4.66 minutes used for training.
Peak reserved memory = 9.729 GB.
Peak reserved memory for training = 0.489 GB.
Peak reserved memory % of max memory = 66.0 %.
Peak reserved memory for training % of max memory = 3.317 %.


Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

100%|██████████| 135/135 [06:01<00:00,  2.68s/it]


Parameters: 2_4_5_30_0.0002 Perplexity: 9.076018333435059
GPU = Tesla T4. Max memory = 14.741 GB.
11.604 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.667
2,1.6598
3,1.6387
4,1.5845
5,1.4385
6,1.2865
7,1.1639
8,1.0352
9,0.9341
10,0.7362


554.9453 seconds used for training.
9.25 minutes used for training.
Peak reserved memory = 11.604 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 78.719 %.
Peak reserved memory for training % of max memory = 0.0 %.


Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

100%|██████████| 135/135 [05:58<00:00,  2.65s/it]


Parameters: 2_8_5_30_0.0002 Perplexity: 8.901280403137207
GPU = Tesla T4. Max memory = 14.741 GB.
11.604 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.7097
2,1.6263
3,1.6599
4,1.6194
5,1.5352
6,1.518
7,1.4903
8,1.2851
9,1.2045
10,1.1275


278.6319 seconds used for training.
4.64 minutes used for training.
Peak reserved memory = 11.604 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 78.719 %.
Peak reserved memory for training % of max memory = 0.0 %.


Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

100%|██████████| 135/135 [05:58<00:00,  2.65s/it]


Parameters: 2_4_5_30_0.0001 Perplexity: 9.763179779052734
GPU = Tesla T4. Max memory = 14.741 GB.
11.604 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 30
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.667
2,1.6598
3,1.6519
4,1.6371
5,1.5444
6,1.4508
7,1.3899
8,1.301
9,1.2285
10,1.0705


558.7003 seconds used for training.
9.31 minutes used for training.
Peak reserved memory = 11.604 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 78.719 %.
Peak reserved memory for training % of max memory = 0.0 %.


Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

100%|██████████| 135/135 [05:58<00:00,  2.66s/it]


Parameters: 2_8_5_30_0.0001 Perplexity: 9.470096588134766


In [None]:
## Train several llama3.1 models based on parameter settings

SETTING = 0
# Trainer Parameters SET 0:
#   learning rate: 2e-4 | 1e-4
#   per_device_train_batch_size : 2
#   gradient_accumulation_steps : 4 | 8
#   warmup_steps : 5
#   max_steps : 30 | 45
#   total permutations: 2 x 1 x 2 x 1 x 2 = 8 permutations -> 15 x 8 -> about 2 hrs of training time for 8 models

# 1e-4 requires more steps, 2e-4 shows significant jumps early on, gradient accumulation 8 makes the loss much lower

## Parameter Sets
# Row layout: [per_device_train_batch_size, gradient_accumulation_steps, warmup_steps, max_steps, learning_rate]
parameter_sets = [
        [
          #  [2, 4, 5, 30, 2e-4], # Training Loss 0.490000 | Perplexity 9.076
          #  [2, 8, 5, 30, 2e-4], # Training Loss 0.511300 | Perplexity 8.901

          #  [2, 4, 5, 30, 1e-4], # Training Loss 0.586900 | Perplexity 9.763
          #  [2, 8, 5, 30, 1e-4], # Training Loss 0.604800 | Perplexity 9.470

           [2, 8, 5, 45, 2e-4], # Training Loss: 0.428600 | Perplexity 8.487
           [2, 4, 5, 45, 2e-4], # Training Loss: 0.348300 | Perplexity 8.476

          #  [2, 8, 5, 45, 1e-4],
          #  [2, 4, 5, 45, 1e-4],
        ],
    ]

## Set used parameters and start set training
# train_evaluate_sets requires the train and test datasets as arguments; the ones
# loaded in an earlier cell (`dataset`, `test_dataset`) are passed explicitly.
used_parameters = parameter_sets[SETTING]
for parameters in used_parameters:
    train_evaluate_sets(parameters, main_model, main_tokenizer, dataset, test_dataset)

GPU = Tesla T4. Max memory = 14.741 GB.
11.604 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 45
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.667
2,1.6598
3,1.6387
4,1.5845
5,1.4385
6,1.2864
7,1.1639
8,1.0335
9,0.9305
10,0.7317


826.4144 seconds used for training.
13.77 minutes used for training.
Peak reserved memory = 11.604 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 78.719 %.
Peak reserved memory for training % of max memory = 0.0 %.


Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

100%|██████████| 135/135 [05:58<00:00,  2.66s/it]


Parameters: 2_8_5_45_0.0002 Perplexity: 8.487053871154785
GPU = Tesla T4. Max memory = 14.741 GB.
11.604 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 45
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.7097
2,1.6263
3,1.6482
4,1.5673
5,1.4366
6,1.3298
7,1.2495
8,1.0566
9,0.8999
10,0.7583


420.708 seconds used for training.
7.01 minutes used for training.
Peak reserved memory = 11.604 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 78.719 %.
Peak reserved memory for training % of max memory = 0.0 %.


Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

100%|██████████| 135/135 [05:58<00:00,  2.66s/it]


Parameters: 2_4_5_45_0.0002 Perplexity: 8.4759521484375


##### Part 2 of Cycle 3

In [None]:
## Train several llama3.1 models based on parameter settings

# Changes in Part 2
# - Input is re-adjusted as a potential solution to the high perplexity, high training loss, non-terminating results
SETTING = 1
# Trainer Parameters SET 1:
#   learning rate: 2e-4 | 1e-4
#   per_device_train_batch_size : 2 | 4
#   gradient_accumulation_steps : 4 | 8
#   warmup_steps : 5
#   max_steps : 45

# 1e-4 requires more steps, 2e-4 shows significant jumps early on, gradient accumulation 8 makes the loss much lower

## Parameter Sets
# Row layout: [per_device_train_batch_size, gradient_accumulation_steps, warmup_steps, max_steps, learning_rate]
parameter_sets = [
        [
          #  [2, 4, 5, 30, 2e-4], # Training Loss 0.490000 | Perplexity 9.076
          #  [2, 8, 5, 30, 2e-4], # Training Loss 0.511300 | Perplexity 8.901

          #  [2, 4, 5, 30, 1e-4], # Training Loss 0.586900 | Perplexity 9.763
          #  [2, 8, 5, 30, 1e-4], # Training Loss 0.604800 | Perplexity 9.470

          #  [2, 8, 5, 45, 2e-4], # Training Loss: 0.428600 | Perplexity 8.487
          #  [2, 4, 5, 45, 2e-4], # Training Loss: 0.348300 | Perplexity 8.476

          #  [2, 8, 5, 45, 1e-4], # No more resources, but expected to be weaker than the best parameters
          #  [2, 4, 5, 45, 1e-4], # No more resources, but expected to be weaker than the best parameters
        ],
        [
          #  [2, 4, 5, 45, 2e-4], # Training Loss 0.208100 | Perplexity 20.196
          #  [2, 8, 5, 45, 2e-4], # Training Loss 0.255200 | Perplexity 20.316
           [4, 8, 5, 45, 2e-4], # Training Loss  | Perplexity
           [4, 4, 5, 45, 2e-4], # Training Loss  | Perplexity

           [2, 4, 5, 45, 1e-4], # Training Loss  | Perplexity
           [2, 8, 5, 45, 1e-4], # Training Loss  | Perplexity
           [4, 8, 5, 45, 1e-4], # Training Loss  | Perplexity
           [4, 4, 5, 45, 1e-4], # Training Loss  | Perplexity
        ],
    ]

## Set used parameters and start set training
# train_evaluate_sets requires the train and test datasets as arguments; the ones
# loaded in an earlier cell (`dataset`, `test_dataset`) are passed explicitly.
used_parameters = parameter_sets[SETTING]
for parameters in used_parameters:
    train_evaluate_sets(parameters, main_model, main_tokenizer, dataset, test_dataset)

GPU = Tesla T4. Max memory = 14.741 GB.
11.268 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 45
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.7749
2,1.7306
3,1.7457
4,1.6517
5,1.5214
6,1.378
7,1.1996
8,0.9776
9,0.8091
10,0.631


711.9926 seconds used for training.
11.87 minutes used for training.
Peak reserved memory = 11.268 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 76.44 %.
Peak reserved memory for training % of max memory = 0.0 %.


Evaluating: 100%|██████████| 135/135 [12:21<00:00,  5.49s/it]


Parameters: 2_4_5_45_0.0002 Perplexity: 20.196958541870117
GPU = Tesla T4. Max memory = 14.741 GB.
11.287 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 45
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.7524
2,1.752
3,1.7287
4,1.6561
5,1.5277
6,1.348
7,1.1575
8,0.9618
9,0.8133
10,0.6137


1413.6282 seconds used for training.
23.56 minutes used for training.
Peak reserved memory = 11.287 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 76.569 %.
Peak reserved memory for training % of max memory = 0.0 %.


Evaluating: 100%|██████████| 135/135 [12:21<00:00,  5.49s/it]


Parameters: 2_8_5_45_0.0002 Perplexity: 20.316688537597656
GPU = Tesla T4. Max memory = 14.741 GB.
11.287 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,876 | Num Epochs = 1 | Total steps = 45
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,1.7522
2,1.7462
3,1.7146
4,1.6529
5,1.5182
6,1.3555
7,1.1676
8,0.965
9,0.7985
10,0.6522


##### Other Utilities

In [None]:
## LORA Saving
# Disabled by default: change False to True to run. Requires `model` and `tokenizer`
# to already exist in the notebook session.
if False:
    model.save_pretrained("lora_model")  # Local saving
    tokenizer.save_pretrained("lora_model")
    # model.push_to_hub("your_name/lora_model", token = "...") # Online saving
    # tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

In [None]:
## Loading LORA Model
# Update this path to the saved LoRA folder you want to load.
lora_model = "/content/drive/MyDrive/Fine Tuning (Alt)/2_4_5_60_0.0002_13.407_lora_model" # Lora Model to be loaded

# Toggle: change True to False to skip loading.
if True:
    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    from unsloth import FastLanguageModel
    # Defines the notebook-level `model` / `tokenizer` that the testing cell below uses.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = lora_model,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Download without losing runtime
import time

# 5 minutes in total: 100 pauses of 3 seconds each keep this cell running
for i in range(1, 101):
  time.sleep(3)

##### Initial Testing

In [None]:
## Alpaca Prompt Copy
# Redefined here so this cell can run without the training cells.
# The three {} placeholders are filled, in order, with: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Alpaca Prompt -> Output
# Test 1: wrap the request in the Alpaca template with an empty response slot and generate up to 64 new tokens.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Generate a test case for the following UI Element: Link Element 'Euskara' With URL https://eu.wikipedia.org/wiki/ from the website: https://en.wikipedia.org/wiki/Main_Page", # instruction #
        "", # input
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")
output = model.generate(**inputs, max_new_tokens = 64, use_cache = True)

print(tokenizer.batch_decode(output))

# Expected Prompt -> Output
# Test 2: send the bare element description without the Alpaca template and stream up to 128 new tokens.
from transformers import TextStreamer
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    "Link Element 'Euskara' With URL https://eu.wikipedia.org/wiki/ from the website: https://en.wikipedia.org/wiki/Main_Page"
], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
output = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

print(tokenizer.batch_decode(output))