In [1]:
%%capture
#!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
#!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#!pip install unsloth==2025.3.6 unsloth_zoo==2025.3.4

Getting the model and the tokenizer

In [None]:
from unsloth import FastLanguageModel
import torch
import json
import datetime
import os

### General settings ###
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
r = 256 # Rank of the LoRA adapters.

task_name = "task_name" # Choose from ["reporting_fire", "reporting_flooding", "reporting_collision", "reporting_grounding",
                            #              "reporting_list-danger_of_capsizing", "reporting_sinking", "reporting_attack",
                            #              "reporting_person_overboard", "reporting_drift", "reporting_undesignated_distress"]

load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
chatter_path = './experiments/./...._new_outputs.json' # Synthetic finetuning data to be used as finetuning dataset. Generated by generate_instances.py

### Training hyperparameters ###
num_epoch = 10
learning_rate = 2e-4
lr_scheduler_type = "linear"
warmup_steps = 30
use_lora_adapter = True
is_wandb = True # When True, a Weights & Biases run is started below with `config` attached.

### Model variables ###
model_dir = "path-to-base-llm"
used_model = "Llama-3.1-8B" # model_name_for_documentation, change if you use a different LLM

### BELOW VARIABLES CAN STAY THE SAME ###
model_path = f"./models/{task_name}"
results_path = f"{model_path}/{used_model}"


# Everything a reader needs to reproduce the run. `task_name` is included so that
# the logged run config matches what is later written to hyperparameters.json.
config = {
        "task_name": task_name,
        "max_seq_length": max_seq_length,
        "dtype": dtype,
        "load_in_4bit": load_in_4bit,
        "model_dir": model_dir,
        "used_model": used_model,
        "model_path": model_path,
        "r": r,
        "chatter_path": chatter_path,
        "results_path": results_path,
        "num_epoch": num_epoch,
        "learning_rate": learning_rate,
        "lr_scheduler_type": lr_scheduler_type,
        "warmup_steps": warmup_steps,
        "use_lora_adapter": use_lora_adapter
    }

# wandb is imported lazily so the notebook does not need it when is_wandb is False.
if is_wandb:
    import wandb
    wandb.init(project="llama_finetuning", reinit=True, config= config)

# Load the base model and its tokenizer from model_dir.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_dir,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mguersel-akdeniz[0m ([33mcml-marfm[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


==((====))==  Unsloth 2025.3.6: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA RTX A6000. Num GPUs = 1. Max memory: 47.319 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Getting the LoRA adapter

In [None]:
# Attach LoRA adapters to the loaded model. When use_lora_adapter is False the
# model object is left exactly as it was loaded.
# NOTE(review): lora_alpha is fixed at 16 while r comes from the configuration
# cell (256 there); confirm the resulting alpha-to-rank ratio is intended.
if use_lora_adapter:
    model = FastLanguageModel.get_peft_model(
        model,
        # Modules that receive an adapter.
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        # Adapter size and scaling.
        r = r, # Any number > 0; 8, 16, 32, 64, 128 are the suggested values
        lora_alpha = 16,
        use_rslora = False,  # Rank stabilized LoRA is available but not used here
        loftq_config = None, # LoftQ is available but not used here
        # Settings for which the library documents an optimized path.
        lora_dropout = 0, # Any value is supported, but 0 is optimized
        bias = "none",    # Any value is supported, but "none" is optimized
        # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
        # True or "unsloth" are the options for very long context.
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
    )

Unsloth 2025.3.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Preparing the dataset and the prompt

In [3]:
from datasets import Dataset
# Load the synthetic fine-tuning data. The file is decoded as UTF-8 explicitly
# because the inputs contain non-ASCII text (place and port names), so the
# result must not depend on the platform's default encoding.
with open(chatter_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Prompt template with three positional slots, filled in this order:
# instruction, input (JSON text), output (the chatter).
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

Input:
{}

### Output:
{}"""

# End-of-sequence marker appended to every training example. It is taken from
# the loaded tokenizer so it stays correct when a different base LLM is used;
# the original Llama 3.1 literal is kept as a fallback if the tokenizer defines none.
EOS_TOKEN = tokenizer.eos_token or "<|end_of_text|>"

def _format_instances(instruction, instances):
    """Turn one task's instances into training records.

    Args:
        instruction: Instruction text shared by every instance of the task.
        instances: Iterable of dicts, each with an "input" (JSON-serializable)
            and an "output" (list of strings, or a single string).

    Returns:
        A list of dicts with the keys "instruction", "input", "output" and
        "text", where "text" is the filled-in prompt followed by EOS_TOKEN.
    """
    records = []
    for instance in instances:
        # Serialize the input dictionary as pretty-printed JSON, keeping
        # non-ASCII characters readable instead of \u-escaping them.
        input_text = json.dumps(instance["input"], ensure_ascii=False, indent=4)

        output_data = instance["output"]
        # A plain string is used as-is; joining it would put a newline
        # between every character.
        if isinstance(output_data, str):
            output_text = output_data
        else:
            output_text = '\n'.join(output_data)

        records.append({
            "instruction": instruction,
            "input": input_text,
            "output": output_text,
            "text": prompt.format(instruction, input_text, output_text) + EOS_TOKEN,
        })
    return records


def formatting_prompts(data):
    """Format the instances of the globally selected task.

    Args:
        data: Parsed chatter JSON; data[task_name] must hold "instruction"
            and "instances".

    Returns:
        A list of training records (see _format_instances).
    """
    task = data[task_name]
    return _format_instances(task["instruction"], task["instances"])


def formatting_prompts_all(files):
    """Format the instances of several chatter files into one list.

    Unlike formatting_prompts, the task is read from each file's own
    "task_name" entry rather than from the global task_name.

    Args:
        files: Iterable of paths to chatter JSON files.

    Returns:
        A list of training records from all files, in file order.
    """
    formatted_data = []
    for chatter_file in files:
        # Explicit UTF-8 so non-ASCII text decodes the same on every platform.
        with open(chatter_file, 'r', encoding='utf-8') as file:
            data = json.load(file)
        task = data[data["task_name"]]
        formatted_data.extend(_format_instances(task["instruction"], task["instances"]))

    return formatted_data

# Build the training records for the task selected by task_name from the
# JSON loaded into `data`.
formatted_dataset = formatting_prompts(data)

# Wrap the list of record dicts in a Hugging Face Dataset; the trainer cell
# passes this object as train_dataset.
dataset = Dataset.from_list(formatted_dataset)

# Shuffling is left disabled; uncomment to shuffle with a fixed seed.
#dataset = dataset.shuffle(seed=42)

In [4]:
# Spot-check the last formatted example. Indexing from the dataset's length
# avoids an IndexError when the file holds fewer than 500 instances.
dataset[len(dataset) - 1]

{'instruction': 'Generate a maritime radio chatter. A vessel makes a distress call and reports armed attack/piracy.',
 'input': '{\n    "vessel_name": "HERUN ZHEJIANG",\n    "vessel_MMSI": "four seven seven four three nine five zero zero",\n    "vessel_call_sign": null,\n    "vessel_type": "Motor Vessel",\n    "vessel_coordinate_dms": "three two degrees five six point eight three minutes South, five one degrees two four minutes West",\n    "compass_direction": "south east",\n    "closest_place_name": "El Aduar",\n    "distance_to_nearest_place": "five nine",\n    "closest_place_country": "Brazil",\n    "distance_to_nearest_port": "six zero",\n    "nearest_port": "Estação Naval do Rio Grande",\n    "distance_to_nearest_harbor": "one six eight",\n    "nearest_harbor": "Puerto de la Paloma",\n    "digit_by_digit": true,\n    "can_have_cargo": null,\n    "closest_water_body": null\n}',
 'output': 'Mayday, Mayday, Mayday. This is motor vessel HERUN ZHEJIANG, MMSI four seven seven four three

Setting up the trainer

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer over the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = warmup_steps,
        num_train_epochs = num_epoch, # Set this for 1 full training run.
        #max_steps = None,
        learning_rate = learning_rate,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = lr_scheduler_type,
        seed = 3407,
        output_dir = "outputs",
        # Follow the is_wandb switch from the configuration cell: when it is
        # False no wandb run was started, so reporting is turned off.
        report_to = "wandb" if is_wandb else "none",
    ),
)

Tokenizing to ["text"] (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

In [6]:
#@title Show current memory stats
# Baseline taken before training: start_gpu_memory and max_memory are read
# again by the final-stats cell to compute the training-only share.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)  # bytes -> GB
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)  # bytes -> GB
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA RTX A6000. Max memory = 47.319 GB.
15.08 GB of memory reserved.


Training

In [7]:
# Run fine-tuning. The result is kept because the following cells read
# trainer_stats.metrics['train_runtime'] and store trainer_stats with the
# saved hyperparameters.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 15 | Total steps = 930
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 20,971,520/8,051,232,768 (0.26% trained)


Unsloth: Will smartly offload gradients to save VRAM!


KeyboardInterrupt: 

In [8]:
#@title Show final memory and time stats
# Peak reserved memory after training, and the part of it beyond the
# pre-training baseline (start_gpu_memory), both in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

2563.3863 seconds used for training.
42.72 minutes used for training.
Peak reserved memory = 22.6 GB.
Peak reserved memory for training = 5.094 GB.
Peak reserved memory % of max memory = 47.761 %.
Peak reserved memory for training % of max memory = 10.765 %.


Inference with the adapter only

Save the adapter and the tokenizer

In [None]:
# Minute-resolution timestamp that makes each saved run's directory unique.
current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M")

# Suffixes that encode how the model was trained in the directory name.
quantization = '_4bit' if load_in_4bit else ''
lora = '_lora' if use_lora_adapter else ''

# Single source of truth for the output location; it is used for the model,
# the tokenizer, the recorded "model_path" and the hyperparameters file.
save_dir = f"{results_path}{quantization}{lora}_{current_time}"
# Ensure the directory exists so the JSON write below does not depend on
# save_pretrained having created it.
os.makedirs(save_dir, exist_ok=True)

model.save_pretrained(save_dir) # Local saving
tokenizer.save_pretrained(save_dir)

# NOTE(review): trainer_stats is handed to json.dump as-is; confirm that it
# serializes in the shape downstream readers expect.
data_to_save = {
        "task_name": task_name,
        "load_in_4bit": load_in_4bit,
        "is_lora": use_lora_adapter,
        "model_name": used_model,
        "model_path": save_dir,
        "r": r,
        "chatter_path": chatter_path,
        "warmup_steps": warmup_steps,
        "num_epoch": num_epoch,
        "learning_rate": learning_rate,
        "lr_scheduler_type": lr_scheduler_type,
        "trainer_stats": trainer_stats,
}

# Save results to JSON. ensure_ascii=False writes non-ASCII characters
# verbatim, so the file is opened as UTF-8 explicitly.
with open(f'{save_dir}/hyperparameters.json', 'w', encoding='utf-8') as outfile:
    json.dump(data_to_save, outfile, ensure_ascii=False, indent=4)