# Setting the enviroment

In [None]:
%%capture
# 1. Install Unsloth & Dependencies
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes pandasy


In [10]:
!unzip llama3_basketball_adapter.zip -d llama3_basketball_adapter

Archive:  llama3_basketball_adapter.zip
replace llama3_basketball_adapter/README.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: llama3_basketball_adapter/README.md  
replace llama3_basketball_adapter/special_tokens_map.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: llama3_basketball_adapter/special_tokens_map.json  
replace llama3_basketball_adapter/adapter_config.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: llama3_basketball_adapter/adapter_config.json  
replace llama3_basketball_adapter/tokenizer.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: llama3_basketball_adapter/tokenizer.json  
replace llama3_basketball_adapter/adapter_model.safetensors? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: llama3_basketball_adapter/adapter_model.safetensors  
  inflating: llama3_basketball_adapter/tokenizer_config.json  


# Setting the models

In [1]:
import torch
import pandas as pd
print(torch.cuda.is_available() and torch.cuda.device_count() > 0)

True


In [11]:
import torch
import pandas as pd
from unsloth import FastLanguageModel

# --- CONFIGURATION ---
ADAPTER_PATH = "llama3_basketball_adapter"
BASE_MODEL_NAME = "unsloth/llama-3-8b-bnb-4bit"

# Test Cases

In [20]:
test_cases = [
    # --- CLUTCH MOMENTS ---
    {
        "id": 1,
        "input_json": '{"time": "00:01", "team": "Fenerbah√ße", "player": "Nigel Hayes-Davis", "action": "3pt_shot", "result": "make"}'
    },
    {
        "id": 2,
        "input_json": '{"time": "00:00", "team": "Anadolu Efes", "player": "Shane Larkin", "action": "drive_layup", "result": "make"}'
    },

    # --- REGULAR PLAY (TURNOVERS & BLOCKS) ---
    {
        "id": 3,
        "input_json": '{"time": "09:45", "team": "Fenerbah√ße", "player": "Scottie Wilbekin", "action": "turnover", "result": "steal"}'
    },
    {
        "id": 4,
        "input_json": '{"time": "08:12", "team": "Fenerbah√ße", "player": "Serta√ß ≈ûanlƒ±", "action": "block", "result": "success"}'
    },

    # --- REBOUNDS (OFFENSIVE vs DEFENSIVE) ---
    {
        "id": 5,
        "input_json": '{"time": "05:14", "team": "Fenerbah√ße", "player": "Dyshawn Pierre", "action": "rebound", "result": "offensive"}'
    },
    {
        "id": 6,
        "input_json": '{"time": "06:30", "team": "Anadolu Efes", "player": "Tyrique Jones", "action": "rebound", "result": "defensive"}'
    },

    # --- FOULS ---
    {
        "id": 7,
        "input_json": '{"time": "01:50", "team": "Fenerbah√ße", "player": "Johnathan Motley", "action": "shooting_foul", "result": "foul_out"}'
    },
    {
        "id": 8,
        "input_json": '{"time": "02:30", "team": "Fenerbah√ße", "player": "Melih Mahmutoƒülu", "action": "technical_foul", "result": "penalty"}'
    },

    # --- FLOW & ASSISTS ---
    {
        "id": 9,
        "input_json": '{"time": "06:05", "team": "Anadolu Efes", "player": "Elijah Bryant", "action": "fast_break_layup", "result": "make"}'
    },
    {
        "id": 10,
        "input_json": '{"time": "03:15", "team": "Fenerbah√ße", "player": "Tyler Dorsey", "action": "drive_layup", "result": "miss"}'
    }
]

# Test Function

In [16]:
def run_tests_bilingual(model, tokenizer, test_data, label):
    results = []
    FastLanguageModel.for_inference(model)
    print(f"\nüöÄ Running Bilingual Tests for: {label}")

    for item in test_data:
        # We run the test TWICE for each row: Once for EN, Once for TR
        languages = ["English", "Turkish"]

        for lang_name in languages:

            # 1. Construct Prompt (Using your proven format)
            prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a basketball commentator. Language: {lang_name}.<|eot_id|><|start_header_id|>user<|end_header_id|>

Input Data: {item['input_json']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
            inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

            # 2. Generate
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                temperature=0.6
            )

            # 3. Decode & Clean
            raw_output = tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]

            if "assistant" in raw_output:
                cleaned = raw_output.split("assistant")[-1]
            else:
                cleaned = raw_output

            cleaned = cleaned.strip().split('\n')[0]

            results.append({
                "ID": item['id'],
                "Target_Language": lang_name,
                "Input_JSON": item['input_json'],
                "Model_Output": cleaned,
                "Model_Version": label
            })

            print(f"[ID {item['id']} - {lang_name}] {cleaned[:50]}...")

    return results

# Test the base model

In [18]:
print("üîµ Loading BASE Model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_NAME,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

base_results = run_tests_bilingual(model, tokenizer, test_cases, "Base Model")

üîµ Loading BASE Model...
==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

üöÄ Running Bilingual Tests for: Base Model
[ID 1 - English] Output Data: {"time": "00:01", "team": "Fenerbah√ße...
[ID 1 - Turkish] Output: Fenerbah√ße'nin Nigel Hayes-Davis, 3 sayƒ±lƒ±...
[ID 2 - English] ": "Sergio Llull"}–ø—Ä–∏–∫–ª–∞–¥–ø—Ä–∏–∫–ª–∞–¥–ø—Ä–∏–∫–ª–∞–¥–ø—Ä–∏–∫–ª–∞–¥–ø—Ä–∏–∫...
[ID 2 - Turkish] Output Data: {"time": "00:00", "team": "Anadolu Ef...
[ID 3 - English] Output Data: {"time": "09:45", "team": "Fenerbah√ße...
[ID 3 - Turkish] Output Data: "09:45 - Fenerbah√ße'li Scottie Wilbek...
[ID

# Test fine-tuned model

In [19]:
print("\nüü¢ Loading ADAPTER...")
model.load_adapter(ADAPTER_PATH)

ft_results = run_tests_bilingual(model, tokenizer, test_cases, "Fine-Tuned")


üü¢ Loading ADAPTER...

üöÄ Running Bilingual Tests for: Fine-Tuned
[ID 1 - English] Hayes-Davis is a superstar! He hits the game-winne...
[ID 1 - Turkish] Hayes-Davis'ten m√ºthi≈ü bir ≈üut! Ma√ßta son isabet!...
[ID 2 - English] Larkin's clutch play secures the win for Ef...
[ID 2 - Turkish] Efes trib√ºnleri ayaƒüa kalktƒ±! Ma√ßƒ±n son dakikalarƒ±...
[ID 3 - English] ...
[ID 3 - Turkish] Wilbekin savunmayƒ±...
[ID 4 - English] Not in his house! ≈ûannie sends it back with a mons...
[ID 4 - Turkish] Devasa bir...
[ID 5 - English] Pierre secures the rebound and puts it back in...
[ID 5 - Turkish] Pierre...
[ID 6 - English] ...
[ID 6 - Turkish] Jones boyalƒ± alanda g√º√ßl√º bir pozisyonda, savunma ...
[ID 7 - English] Motley picks up his fifth foul and is fouled out....
[ID 7 - Turkish] Motley fa...
[ID 8 - English] Paris steps to the line and calmly sinks the techn...
[ID 8 - Turkish] Mahmutoƒülu faul yaptƒ±,...
[ID 9 - English] Bryant draws...
[ID 9 - Turkish] Bry...
[ID 10 - English

# Saving the results

In [31]:
all_results = base_results + ft_results
df = pd.DataFrame(all_results)
df.to_csv("comparison_results.csv", index=False)

print("\n‚úÖ DONE! Download 'comparison_results.csv' from the sidebar.")
print(df[["Target_Language","Model_Version", "Model_Output"]].head(4)) # Show comparison preview


‚úÖ DONE! Download 'comparison_results.csv' from the sidebar.
  Target_Language Model_Version  \
0         English    Base Model   
1         Turkish    Base Model   
2         English    Base Model   
3         Turkish    Base Model   

                                        Model_Output  
0  Output Data: {"time": "00:01", "team": "Fenerb...  
1  Output: Fenerbah√ße'nin Nigel Hayes-Davis, 3 sa...  
2  ": "Sergio Llull"}–ø—Ä–∏–∫–ª–∞–¥–ø—Ä–∏–∫–ª–∞–¥–ø—Ä–∏–∫–ª–∞–¥–ø—Ä–∏–∫–ª–∞–¥...  
3  Output Data: {"time": "00:00", "team": "Anadol...  


# LLM as a Judge

### Evaluation Table

| ID | Language | Winner | FT Score (1-10) | Reasoning (Brief) |
| :--- | :--- | :--- | :--- | :--- |
| 1 | English | **Fine-Tuned** | 10 | FT captured the excitement and specific "buzzer-beater" context perfectly. Base failed by outputting JSON. |
| 1 | Turkish | **Fine-Tuned** | 9 | FT used excellent local phrasing ("son isabet"). Base failed grammar ("maalesef bir fail"). |
| 2 | English | **Fine-Tuned** | 8 | FT logic is good, though slightly cut off at the end. Base hallucinated "Sergio Llull" and outputted gibberish/Russian. |
| 2 | Turkish | **Fine-Tuned** | 8 | FT tone is enthusiastic ("trib√ºnleri ayaƒüa kalktƒ±"), though cut off. Base outputted JSON/nonsense. |
| 3 | English | **Fine-Tuned** | 1 | **Note:** FT output appears empty/missing in CSV. However, Base outputted JSON + Cyrillic ("–ø—Ä–∏–∫–ª–∞–¥"). FT wins by technicality of not outputting garbage, but both failed. |
| 3 | Turkish | **Fine-Tuned** | 5 | FT started a correct sentence but was cut off. Base failed instruction (JSON dump). |
| 4 | English | **Fine-Tuned** | 9 | Great idiomatic expression ("Not in his house!"). Base gave a meta-description of the data. |
| 4 | Turkish | **Fine-Tuned** | 5 | Tone was good ("Devasa bir") but the output was severely truncated. Base failed instruction. |
| 5 | English | **Fine-Tuned** | 9 | FT correctly inferred the "put back" from an offensive rebound. Base was robotic/meta. |
| 5 | Turkish | **Fine-Tuned** | 4 | FT output was only one word ("Pierre"). Base outputted English text for the Turkish row. FT wins on language adherence. |
| 6 | English | **Fine-Tuned** | 1 | FT output appears empty. Base outputted "Output Data" wrapper. Both failed, but Base violated negative constraints explicitly. |
| 6 | Turkish | **Fine-Tuned** | 6 | FT context was correct ("boyalƒ± alanda") but sentence was incomplete. Base outputted JSON. |
| 7 | English | **Fine-Tuned** | 10 | Perfect translation of "foul_out" to commentary. Base dumped JSON. |
| 7 | Turkish | **Fine-Tuned** | 5 | FT started correctly but cut off. Base went into an infinite repetition loop. |
| 8 | English | **Fine-Tuned** | 3 | **Hallucination Alert:** FT mentions "Paris" (Paris Lee?), but the JSON player is Melih Mahmutoƒülu. Base failed with JSON/Cyrillic loop. |
| 8 | Turkish | **Fine-Tuned** | 7 | Accurate start to the sentence. Base failed with JSON/Cyrillic loop. |
| 9 | English | **Fine-Tuned** | 6 | Accurate logic, slightly cut off. Base outputted a Python docstring explanation. |
| 9 | Turkish | **Fine-Tuned** | 2 | FT output "Bry" (incomplete). Base outputted gibberish/code artifacts. |
| 10 | English | **Fine-Tuned** | 9 | Good interpretation of the drive and miss. Base used "The Fenerbah√ße's..." (poor grammar) and meta-style. |
| 10 | Turkish | **Fine-Tuned** | 8 | Natural phrasing ("sert gitti ama"). Base failed instruction (JSON dump). |

---

### Summary Report

#### 1. Hallucination Rate
*   **Base Model:** **Critical Failure.** The Base model hallucinated significantly, often generating Cyrillic text (Russian word "–ø—Ä–∏–∫–ª–∞–¥" meaning "example"), infinite loops (ID 7, 8), and phantom entities ("Sergio Llull" in ID 2). It also hallucinated English text when prompted for Turkish (ID 5).
*   **Fine-Tuned Model:** **Low.** The Fine-Tuned model generally stuck to the entities provided in the JSON. However, it did exhibit one specific hallucination in **ID 8 (English)**, where it introduced a player named "Paris" who was not in the source JSON (likely confusing the context with an opponent player, Paris Lee).

#### 2. Adherence to JSON Constraints & Instructions
*   **Base Model:** **Fail.** The Base model consistently failed the negative constraint: "Do not output JSON/Output Data." It almost exclusively outputted `Output Data: {JSON}` or meta-descriptions like "The program takes an input dictionary."
*   **Fine-Tuned Model:** **High.** The Fine-Tuned model successfully learned to strip the artifacts and output *only* the commentary. It successfully adopted the persona of a commentator rather than a data parser.

#### 3. Quality of Turkish Translations & Formatting
*   **Base Model:** Poor. It often treated Turkish prompts as English or outputted grammatically broken Turkish (e.g., "maalesef bir fail").
*   **Fine-Tuned Model:** High Potential, but **Truncated**. The phrasing and idioms in Turkish were excellent and sounded like a real commentator (e.g., "son saniyede g√∂nderiyor," "isabet"). However, the Fine-Tuned model suffers from a **Stop Token/Length issue**, as nearly 50% of its outputs were cut off mid-sentence (e.g., "Motley fa", "Devasa bir").

**Conclusion:** The **Fine-Tuned Model** is the clear winner, transforming the raw data into exciting, idiomatic sports commentary. However, the model requires adjustment to its `max_tokens` or stop-sequence settings to prevent the frequent sentence truncation observed in the test set.