# Fine-tuning Llama

## Installing dependencies and setting up the environment

In [None]:
%%capture
# Environment setup. %%capture hides all output from this cell, including pip
# errors; drop it temporarily when an install needs to be debugged.
import torch
# Query the GPU's CUDA compute capability. Neither value is used by any later
# cell visible in this notebook.
major_version, minor_version = torch.cuda.get_device_capability()

# 1. Pin sympy to 1.12, removing whatever version is preinstalled first.
#    Per the original note, this must happen before Unsloth is installed.
!pip uninstall sympy -y
!pip install "sympy==1.12"

# 2. Install Unsloth straight from its GitHub repository. No tag or commit is
#    pinned, so the installed version depends on when this cell is run.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# 3. Install the remaining training dependencies. --no-deps stops pip from
#    resolving the requirements of these packages. Only xformers and trl carry
#    version caps; peft, accelerate, bitsandbytes and wandb are unpinned.
#    NOTE(review): the recorded model-loading output in this notebook reports
#    "Xformers = None", so the capped xformers build may not be usable with the
#    installed Torch — confirm whether this pin is still wanted.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes wandb

In [None]:
from huggingface_hub import login
from huggingface_hub import whoami

from google.colab import userdata
# Read the Hugging Face token from Colab's secret store and log in
# non-interactively: no prompt appears because the token is passed explicitly.
HF_TOKEN = userdata.get("HF_TOKEN")
login(token=HF_TOKEN)

# Verify the token by asking the Hub which account it belongs to.
# `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
# kernel shutdown propagate, and the reason for the failure is reported instead
# of being silently discarded.
try:
    print(f"✅ Success! You are logged in as: {whoami()['name']}")
except Exception as err:
    print(f"❌ Login failed. Please try again. ({type(err).__name__}: {err})")

✅ Success! You are logged in as: AhmetYusufOzturk


In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 2048
dtype = None  # NOTE(review): presumably lets Unsloth choose the dtype — confirm
load_in_4bit = True

# Load the base checkpoint together with its tokenizer.
base_model_options = dict(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# Wrap the model with LoRA adapters on both the attention and the MLP
# projection layers (the recorded output reports QKV, O and MLP layers patched).
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]
lora_options = dict(
    r=16,
    target_modules=attention_projections + mlp_projections,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(model, **lora_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2026.1.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import json
from datasets import load_dataset

# Load the JSON-lines training file as a single "train" split. The path is
# relative, so the file must be present in the notebook's working directory.
# Each record must provide the "input_json", "lang_en" and "lang_tr" fields that
# formatting_prompts_func reads.
dataset = load_dataset("json", data_files="llama3_multilingual_data.jsonl", split="train")

def _render_training_chat(language, input_data, response):
    """Render one complete Llama-3 chat transcript: system, user and assistant turns."""
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"You are a basketball commentator. Language: {language}.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"Input Data: {input_data}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{response}<|eot_id|>"
    )


def formatting_prompts_func(examples):
    """Turn a batch of bilingual records into chat-formatted training texts.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping of column name to a list of values. The columns
            "input_json", "lang_en" and "lang_tr" are read and paired up
            position by position.

    Returns:
        ``{"text": [...]}`` with two entries per input record, in the order
        English then Turkish. The output therefore has twice as many rows as
        the input batch.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    texts = []
    # Both languages share a single template so the two prompts cannot drift apart.
    for input_data, english, turkish in zip(
        examples["input_json"], examples["lang_en"], examples["lang_tr"]
    ):
        texts.append(_render_training_chat("English", input_data, english))
        texts.append(_render_training_chat("Turkish", input_data, turkish))
    return {"text": texts}

# formatting_prompts_func emits two rows (English + Turkish) per source row, so
# the original columns have to be dropped: they would no longer line up with the
# longer "text" column.
source_columns = dataset.column_names
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=source_columns,
)

# Quick sanity check: total row count and the first rendered example.
print(f"Dataset Size: {len(dataset)}")
print(dataset[0]["text"])

Map:   0%|          | 0/369 [00:00<?, ? examples/s]

Dataset Size: 738
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a basketball commentator. Language: English.<|eot_id|><|start_header_id|>user<|end_header_id|>

Input Data: {"time":"10:00","team":"Fenerbahçe","player":"Wilbekin","action":"3pt_shot","result":"miss"}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Wilbekin opens the game with a deep three, but it rattles out.<|eot_id|>


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # 2 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Training length is set in optimizer steps, not epochs.
    max_steps=240,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
    report_to="wandb",
)

# Supervised fine-tuning on the pre-rendered "text" column, one example per
# sequence (no packing).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/738 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 738 | Num Epochs = 3 | Total steps = 240
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
wandb: Currently logged in as: bandan161 (ahmet-yusuf-ozturk) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
1,5.0428
2,5.3037
3,4.8768
4,4.9395
5,4.8287
6,4.4597
7,4.1696
8,3.5091
9,3.0025
10,2.4268


0,1
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/grad_norm,‚ñà‚ñÑ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ
train/learning_rate,‚ñÇ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñá‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
total_flos,7448848994156544.0
train/epoch,2.58537
train/global_step,240.0
train/grad_norm,0.84791
train/learning_rate,0.0
train/loss,1.0522
train_loss,1.3372
train_runtime,660.3414
train_samples_per_second,2.908
train_steps_per_second,0.363


TrainOutput(global_step=240, training_loss=1.3372016628583272, metrics={'train_runtime': 660.3414, 'train_samples_per_second': 2.908, 'train_steps_per_second': 0.363, 'total_flos': 7448848994156544.0, 'train_loss': 1.3372016628583272, 'epoch': 2.5853658536585367})

In [None]:
# Switch the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

def test_multilingual_robust(input_json_str, language):
    """Generate one commentary line for a play-by-play event.

    Args:
        input_json_str: The event as a JSON string, inserted verbatim into the
            user turn after "Input Data: ".
        language: Language name inserted into the system prompt, e.g.
            "English" or "Turkish".

    Returns:
        The generated assistant turn with special tokens removed and
        surrounding whitespace stripped. Sampling is enabled, so repeated
        calls can return different text.
    """
    # Same layout as the training examples, ending right after the assistant
    # header so the model writes the assistant turn.
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"You are a basketball commentator. Language: {language}.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"Input Data: {input_json_str}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # Training examples end every assistant turn with <|eot_id|>. The recorded
    # runs of this cell kept producing further "assistant" turns until the token
    # budget ran out, i.e. generation did not stop there. Registering <|eot_id|>
    # as an additional end-of-sequence id makes decoding stop after the first
    # completed turn, so no text-level clean-up (which could also cut legitimate
    # words such as "user") is needed afterwards.
    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

    outputs = model.generate(
        **inputs,
        max_new_tokens=128,      # Increased from 64 to prevent cut-offs
        min_new_tokens=10,       # Require at least 10 new tokens before stopping
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=[tokenizer.eos_token_id, eot_token_id],
        temperature=0.8,         # Increased slightly for more creativity
        do_sample=True,
    )

    # Decode only the newly generated tokens (everything after the prompt).
    prompt_length = inputs.input_ids.shape[1]
    decoded = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0]

    return decoded.strip()

# --- RE-RUN TEST ---
# Last-second made three-pointer; the team name and emoji are written as proper
# UTF-8 text instead of mis-decoded byte sequences.
test_data = '{"time": "00:02", "team": "Fenerbahçe", "player": "Nigel Hayes-Davis", "action": "3pt_shot", "result": "make"}'

print(f"🏀 INPUT: {test_data}\n")
print(f"🇺🇸 EN: {test_multilingual_robust(test_data, 'English')}")
print(f"🇹🇷 TR: {test_multilingual_robust(test_data, 'Turkish')}")

🏀 INPUT: {"time": "00:02", "team": "Fenerbahçe", "player": "Nigel Hayes-Davis", "action": "3pt_shot", "result": "make"}

🇺🇸 EN: Hayes-Davis hits the game-winner at the buzzer! Unbelievable!assistant

Hayes-Davis sends the crowd into a frenzy with a clutch shot at the horn!assistant

Hayes-Davis rises above the defense and drains the three to win it!assistant

Hayes-Davis delivers in the clutch once again, this time with a deep three!assistant

Hayes-Davis' deep heave at the buzzer... GOT IT!assistant

Hayes-Davis hits the half-court shot to win it! Unbelievable!
üáπüá∑ TR: Hayes-Davis √ßok uzaklardan! Pota dibinde isabeti buldu!assistant

Hayes-Davis m√ºthi≈ü bir ≈üut! Pota dibinde isabet!assistant

Hayes-Davis‚Äôden m√ºthi≈ü bir √º√ßl√ºk! Pota dibinde isabet!assistant

Hayes-Davis pota dibinde bombo≈ü! √ú√ßl√ºk isabet!assistant

Hayes-Davis‚Äôden m√ºthi≈ü bir ≈üut! Pota dibinde isabet!assistant

Hayes-Davis √ßok uzaklardan!


In [None]:

# Switch the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

def generate_clean_commentary(input_json_str, language):
    """Generate a single-line commentary for a play-by-play event.

    Args:
        input_json_str: The event as a JSON string, inserted verbatim into the
            user turn after "Input Data: ".
        language: Language name inserted into the system prompt, e.g.
            "English" or "Turkish".

    Returns:
        The first line of the generated assistant turn, with special tokens
        removed and surrounding whitespace stripped. Sampling is enabled, so
        repeated calls can return different text.
    """
    # 1. Prompt: same layout as the training examples, ending right after the
    #    assistant header so the model writes the assistant turn.
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"You are a basketball commentator. Language: {language}.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"Input Data: {input_json_str}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    # 2. Generate. Training examples end every assistant turn with <|eot_id|>,
    #    so it is registered as an additional end-of-sequence id. Generation
    #    then stops after the first completed turn instead of running on into
    #    further "assistant" turns, and the output no longer has to be cut at
    #    the word "assistant" (which would also truncate legitimate commentary
    #    containing that word).
    eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=[tokenizer.eos_token_id, eot_token_id],
        temperature=0.6,  # Lower temp helps reduce hallucinations like "Pota dibinde"
        do_sample=True,
    )

    # 3. Decode only the newly generated tokens (everything after the prompt).
    prompt_length = inputs.input_ids.shape[1]
    raw_output = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0]

    # 4. Keep the first line only, so the result is always a single sentence block.
    return raw_output.strip().split("\n")[0].strip()

# --- FINAL TEST ---
# Last-second made three-pointer; the team name and emoji are written as proper
# UTF-8 text instead of mis-decoded byte sequences.
test_data = '{"time": "00:02", "team": "Fenerbahçe", "player": "Nigel Hayes-Davis", "action": "3pt_shot", "result": "make"}'

print(f"🏀 INPUT: {test_data}\n")
print(f"🇺🇸 EN: {generate_clean_commentary(test_data, 'English')}")
print(f"🇹🇷 TR: {generate_clean_commentary(test_data, 'Turkish')}")

🏀 INPUT: {"time": "00:02", "team": "Fenerbahçe", "player": "Nigel Hayes-Davis", "action": "3pt_shot", "result": "make"}

🇺🇸 EN: Hayes-Davis at the buzzer... YES! What a way to win!
🇹🇷 TR: Hayes-Davis'ın eli ısındı! Panyalı sahada bombayı patlatıyor!


In [None]:
import shutil
from google.colab import drive

# Single source of truth for the artifact name, so the saved folder, the zip
# archive and the copy on Drive cannot drift apart.
ADAPTER_DIR = "llama3_basketball_adapter"
DRIVE_MOUNT_POINT = "/content/drive"

# Save the fine-tuned model and the tokenizer to the Colab working directory.
model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

# Mount Google Drive, zip the saved folder and move the archive to Drive.
drive.mount(DRIVE_MOUNT_POINT)

# make_archive returns the path of the archive it created; reuse it rather than
# re-deriving the file name by hand.
archive_path = shutil.make_archive(ADAPTER_DIR, "zip", ADAPTER_DIR)
drive_archive_path = f"{DRIVE_MOUNT_POINT}/MyDrive/{ADAPTER_DIR}.zip"
shutil.move(archive_path, drive_archive_path)
print("Saved to Google Drive successfully!")

Mounted at /content/drive
Saved to Google Drive successfully!
