In [1]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes scipy

In [1]:
import torch
import os
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# **Load Small Dataset**

In [None]:
model_id = "google/gemma-3-1b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
eos_token = tokenizer.eos_token

print(f"EOS Token: {eos_token}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


EOS Token: <eos>


In [2]:
from datasets import load_dataset
import json

dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
split_name = "train"

print(f"Loading dataset '{dataset_id}' (split: '{split_name}')...")
dataset = load_dataset(dataset_id, split=split_name)

columns_to_keep = ["transcribed_text", "items", "speaker"]
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])

dataset = dataset.select(range(min(1200, len(dataset))))

print(f"Dataset loaded: {dataset}")

prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""

print("Splitting dataset into train and validation sets...")
train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data = train_val_split['train']
validation_data = train_val_split['test']

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")

def format_data_for_sft(example):
    text = example["transcribed_text"]
    items_data = example["items"]
    json_string = json.dumps(items_data, separators=(',', ':'))
    if 'tokenizer' not in globals():
        raise NameError("Tokenizer not found. Please run the tokenizer loading cell (Cell 5) first.")
    eos = tokenizer.eos_token
    formatted_string = f"{prefix}{text}\n{json_string}{eos}"
    return {"formatted_text": formatted_string}

if 'tokenizer' in globals():
    print("Applying formatting function to the datasets...")
    train_dataset = train_data.map(format_data_for_sft, remove_columns=train_data.column_names)
    validation_dataset = validation_data.map(format_data_for_sft, remove_columns=validation_data.column_names) # Process validation data too!
    print("Dataset formatting complete.")
    print(f"Train dataset features: {train_dataset.features}")
    print(f"Validation dataset features: {validation_dataset.features}")
else:
    print("WARNING: Tokenizer not loaded yet. Re-run Cell 5 and this cell's mapping part.")
    train_dataset = None
    validation_dataset = None

Loading dataset 'iTzMiNOS/voice-orders-small-clean-12k' (split: 'train')...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset loaded: Dataset({
    features: ['transcribed_text', 'speaker', 'items'],
    num_rows: 1200
})
Splitting dataset into train and validation sets...
Train set size: 1080
Validation set size: 120


In [4]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-small-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-1b-it...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) in torch.bfloat16


config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-small-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/104M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [00:18<00:00,  4.70s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9866
✅ Exact Match Accuracy: 86.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                         transcribed_text                                                                                                                                                                                                                                                                                                       items                                                                                                                                                                                                                                                                                                            predicted_items  similarity
9   Hey, 




### **Medium 1b Model, Small Dataset**


In [5]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-medium-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) in torch.bfloat16
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-medium-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/104M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [00:40<00:00, 10.18s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.7439
✅ Exact Match Accuracy: 15.83%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                      transcribed_text                                                                                                                                                                                                                                                                                            items                                                                                                                                                                                                                                                                                                                                               




### **Large 1b Model, Small Dataset**

In [6]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-large-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) in torch.bfloat16
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-large-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/104M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:17<00:00, 19.38s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.4812
✅ Exact Match Accuracy: 0.00%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                             transcribed_text                                                                                                                                                                                                                                                                                                       items                                                                                                                                                                                                                                                                                                              

### **Medium 4b Model, Small Dataset**


In [3]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-medium-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-medium-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:04<00:00, 16.10s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.7324
✅ Exact Match Accuracy: 13.33%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                            transcribed_text                                                                                                                                                                                                                                                                               items                                                                                                                                                                                                                                                                                                                                                                      




### **Large 4b Model, Small Dataset**

In [4]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-large-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-large-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:30<00:00, 22.72s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.5000
✅ Exact Match Accuracy: 0.83%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                             transcribed_text                                                                                                                                                                                                                                                                                                       items                                                                                                                                                                                                                                                                                                              

### **Medium 12b Model, Small Dataset**


In [3]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-medium-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-medium-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:22<00:00, 20.74s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.7303
✅ Exact Match Accuracy: 20.00%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                          transcribed_text                                                                                                                                                                                                                                                                                            items                                                                                                                                                                                                                                                                                                                                           




### **Large 12b Model, Small Dataset**

In [3]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-large-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-large-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [02:16<00:00, 34.01s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.4823
✅ Exact Match Accuracy: 0.00%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                             transcribed_text                                                                                                                                                                                                                                                                                                       items                                                                                                                                                                                                                                                                                                              

# **Load Medium Dataset**

In [2]:
from datasets import load_dataset
import json

dataset_id = "iTzMiNOS/voice-orders-medium-clean-18k"
split_name = "train"

print(f"Loading dataset '{dataset_id}' (split: '{split_name}')...")
dataset = load_dataset(dataset_id, split=split_name)

columns_to_keep = ["transcribed_text", "items", "speaker"]
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])

dataset = dataset.select(range(min(1800, len(dataset))))

print(f"Dataset loaded: {dataset}")

prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""

print("Splitting dataset into train and validation sets...")
train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data = train_val_split['train']
validation_data = train_val_split['test']

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")

def format_data_for_sft(example):
    text = example["transcribed_text"]
    items_data = example["items"]
    json_string = json.dumps(items_data, separators=(',', ':'))
    if 'tokenizer' not in globals():
        raise NameError("Tokenizer not found. Please run the tokenizer loading cell (Cell 5) first.")
    eos = tokenizer.eos_token
    formatted_string = f"{prefix}{text}\n{json_string}{eos}"
    return {"formatted_text": formatted_string}

if 'tokenizer' in globals():
    print("Applying formatting function to the datasets...")
    train_dataset = train_data.map(format_data_for_sft, remove_columns=train_data.column_names)
    validation_dataset = validation_data.map(format_data_for_sft, remove_columns=validation_data.column_names) # Process validation data too!
    print("Dataset formatting complete.")
    print(f"Train dataset features: {train_dataset.features}")
    print(f"Validation dataset features: {validation_dataset.features}")
else:
    print("WARNING: Tokenizer not loaded yet. Re-run Cell 5 and this cell's mapping part.")
    train_dataset = None
    validation_dataset = None

Loading dataset 'iTzMiNOS/voice-orders-medium-clean-18k' (split: 'train')...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Dataset loaded: Dataset({
    features: ['transcribed_text', 'speaker', 'items'],
    num_rows: 1800
})
Splitting dataset into train and validation sets...
Train set size: 1620
Validation set size: 180


### **Small 1b Model, Medium Dataset**

In [10]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-small-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) in torch.bfloat16
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-small-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 6/6 [00:47<00:00,  7.95s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9310
✅ Exact Match Accuracy: 52.78%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                      




### **Large 1b Model, Medium Dataset**

In [11]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-large-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) in torch.bfloat16
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-large-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 6/6 [01:31<00:00, 15.21s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.8234
✅ Exact Match Accuracy: 21.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                    transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                   items                          

### **Small 4b Model, Medium Dataset**

In [6]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-small-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-small-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 6/6 [00:59<00:00,  9.93s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9639
✅ Exact Match Accuracy: 70.56%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                      transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                        items                                                                   




### **Large 4b Model, Medium Dataset**

In [7]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-large-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-large-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 6/6 [02:00<00:00, 20.05s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.8296
✅ Exact Match Accuracy: 26.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                           transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         it

### **Small 12b Model, Medium Dataset**

In [3]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-small-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-small-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 6/6 [01:34<00:00, 15.80s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9637
✅ Exact Match Accuracy: 78.89%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                       transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 items                         




### **Large 12b Model, Medium Dataset**

In [3]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-large-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-large-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 6/6 [03:04<00:00, 30.80s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.8233
✅ Exact Match Accuracy: 19.44%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                    transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

# **Load Large Dataset**

In [2]:
from datasets import load_dataset
import json

dataset_id = "iTzMiNOS/voice-orders-large-clean-12k"
split_name = "train"

print(f"Loading dataset '{dataset_id}' (split: '{split_name}')...")
dataset = load_dataset(dataset_id, split=split_name)

columns_to_keep = ["transcribed_text", "items", "speaker"]
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])

dataset = dataset.select(range(min(1200, len(dataset))))

print(f"Dataset loaded: {dataset}")

prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""

print("Splitting dataset into train and validation sets...")
train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data = train_val_split['train']
validation_data = train_val_split['test']

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")

def format_data_for_sft(example):
    text = example["transcribed_text"]
    items_data = example["items"]
    json_string = json.dumps(items_data, separators=(',', ':'))
    if 'tokenizer' not in globals():
        raise NameError("Tokenizer not found. Please run the tokenizer loading cell (Cell 5) first.")
    eos = tokenizer.eos_token
    formatted_string = f"{prefix}{text}\n{json_string}{eos}"
    return {"formatted_text": formatted_string}

if 'tokenizer' in globals():
    print("Applying formatting function to the datasets...")
    train_dataset = train_data.map(format_data_for_sft, remove_columns=train_data.column_names)
    validation_dataset = validation_data.map(format_data_for_sft, remove_columns=validation_data.column_names) # Process validation data too!
    print("Dataset formatting complete.")
    print(f"Train dataset features: {train_dataset.features}")
    print(f"Validation dataset features: {validation_dataset.features}")
else:
    print("WARNING: Tokenizer not loaded yet. Re-run Cell 5 and this cell's mapping part.")
    train_dataset = None
    validation_dataset = None

Loading dataset 'iTzMiNOS/voice-orders-large-clean-12k' (split: 'train')...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

Dataset loaded: Dataset({
    features: ['transcribed_text', 'speaker', 'items'],
    num_rows: 1200
})
Splitting dataset into train and validation sets...
Train set size: 1080
Validation set size: 120


### **Small 1b Model, Large Dataset**

In [15]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-small-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 16

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) in torch.bfloat16
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-small-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches:  12%|█▎        | 1/8 [00:10<01:12, 10.31s/it]

Attempted to parse (after extraction): '[{"customizations":[],"name":"Apple Crumble"},{"customizations":[],"name":"Lemonade"},{"customizations":["Ketchup","Vegan Mayo"],"name":"Vegan Burger"},{"customizations":[],"name":"Fries"},{"customizations":[],"name":"Extra Sauce (Pesto, BBQ Sauce, Gravy)","count":5"}]'


Inference Batches:  50%|█████     | 4/8 [00:38<00:38,  9.60s/it]

Attempted to parse (after extraction): '[{"customizations":[],"name":"Extra Bun or Bread Roll"},{"customizations":["Coke","Sprite","Fanta"],"name":"Soft Drinks"},{"customizations":[],"name":"Extra Whipped Cream"},{"customizations":["Vegetarian"],"name":"Spring Rolls"},{"customizations":[],"name":"Lemon Meringue Pie"},{"customizations":[],"name":"Extra Sauce (Pesto","Gravy"},{"customizations":[],"name":"Strawberry"},{"customizations":[],"name":"Cream of Mushroom Soup with Truffle Oil","Chopped Parsley"},{"customizations":["Vegan Cheese","Vegan Mayo"],"name":"Vegetarian Burger"},{"customizations":["Garlic Soy","Peanut Sauce"],"name":"Extra Side"}]'


Inference Batches: 100%|██████████| 8/8 [01:15<00:00,  9.41s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.7438
✅ Exact Match Accuracy: 8.33%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               transcribed_text                                                                                                                                                                                                                                                        

### **Medium 1b Model, Large Dataset**


In [16]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-medium-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 16

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) in torch.bfloat16
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-medium-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches:  88%|████████▊ | 7/8 [01:19<00:11, 11.73s/it]

Attempted to parse (after extraction): '[{"customizations":["Lemon Dill"],"name":"Grilled Salmon"},{"customizations":[],"name":"Extra Ice Cream Scoop"},{"customizations":[],"name":"Strawberry"},{"customizations":["BBQ"],"name":"Grilled Chicken Breast"},{"customizations":["Chili Oil","Toasted Sesame Seeds"],"name":"Miso Soup"},{"customizations":[],"name":"Mint Chocolate Chip"},{"customizations":[],"name":"Extra Topping":["Berries","Nuts"],"name":"Extra Ice Cream Scoop"},{"customizations":[],"name":"Lemon Meringue Pie"},{"customizations":[],"name":"Vanilla"},{"customizations":["Vegetarian","Chicken"],"name":"Spring Rolls"}]'


Inference Batches: 100%|██████████| 8/8 [01:30<00:00, 11.33s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.8831
✅ Exact Match Accuracy: 32.50%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               transcribed_text                                                                                                                                                                                                                                                       




### **Small 4b Model, Large Dataset**

In [9]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-small-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-small-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [00:57<00:00, 14.44s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9248
✅ Exact Match Accuracy: 40.83%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                            transcribed_text                                                                                                                                                                                                                                                                                                                                                          




### **Medium 4b Model, Large Dataset**


In [10]:
# ISSUE - RESOLVED

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-medium-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-medium-json-16bit) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:04<00:00, 16.06s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9212
✅ Exact Match Accuracy: 45.83%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  transcribed_text                                                                                                                                                                                                                                                                                                                                                    




### **Small 12b Model, Large Dataset**

In [3]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-small-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-small-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:30<00:00, 22.51s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9314
✅ Exact Match Accuracy: 49.17%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                            transcribed_text                                                                                                                                                                                                                                                                                                                                                          




### **Medium 12b Model, Large Dataset**


In [3]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-medium-json-16bit"
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

if not torch.cuda.is_available():
     raise SystemError("CUDA is not available. This script requires a GPU.")
else:
     device_name = torch.cuda.get_device_name(0)
     print(f"CUDA is available. Using GPU: {device_name}")
     if torch.cuda.get_device_capability(0)[0] >= 8:
         print("GPU supports bfloat16, using torch.bfloat16 for inference.")
         model_dtype_inference = torch.bfloat16
     else:
         print("GPU does not support bfloat16, using torch.float16 for inference.")
         model_dtype_inference = torch.float16
     device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model ---
print(f"Loading base model ({base_model_id}) in {model_dtype_inference}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=model_dtype_inference,
    device_map="auto",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [deep_convert(v) for v in obj]
    elif isinstance(obj, np.ndarray):
        return deep_convert(obj.tolist())
    else:
        return obj

df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Attempts to extract and parse JSON, handling optional markdown fences."""
    json_str = None
    try:
        # Find the start of the first list or object
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')

        start_index = -1
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index = first_bracket
            start_char = '['
            end_char = ']'
        elif first_brace != -1:
            start_index = first_brace
            start_char = '{'
            end_char = '}'
        else:
             # Neither bracket nor brace found - maybe it's ONLY fences?
             # Try finding fences directly if no brace/bracket
             fence_match = re.search(r"`{3}(json)?\s*([\[\{])", generated_text)
             if fence_match:
                 start_index = fence_match.end() -1 # Start at the brace/bracket
                 start_char = fence_match.group(2)
                 end_char = ']' if start_char == '[' else '}'
             else:
                # Give up if no structure found
                # print(f"Debug: No JSON start ('[' or '{{') found. Output: {generated_text}")
                return None

        # Find the corresponding closing character using balancing
        open_count = 0
        end_index = -1
        # Check if start_index is valid before proceeding
        if start_index >= 0 and start_index < len(generated_text):
            for i in range(start_index, len(generated_text)):
                if generated_text[i] == start_char:
                    open_count += 1
                elif generated_text[i] == end_char:
                    open_count -= 1
                if open_count == 0:
                    end_index = i
                    break
        else:
             # Handle invalid start_index if fence logic above failed unusually
             print(f"Debug: Invalid start_index {start_index}. Output: {generated_text}")
             return None


        if end_index == -1:
            # print(f"Debug: No matching closing bracket/brace. Output: {generated_text}")
            return None

        # Extract the potential JSON substring
        json_str = generated_text[start_index : end_index + 1]

        # Clean leading/trailing whitespace that might remain
        json_str = json_str.strip()

        # --- Attempt to parse the extracted & cleaned string ---
        json_data = json.loads(json_str)
        return json_data

    except json.JSONDecodeError as e:
        # Add logging for parse failures, include the string attempted
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
    except Exception as e:
        # Catch any other unexpected errors during parsing
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        # print(f"Original Generated Text: {generated_text}") # Uncomment for deeper debugging
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=500, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                 generated_text = output_list[0]["generated_text"].strip()
                 parsed_json = parse_json_robustly(generated_text) # Use updated parser
                 all_results.append(parsed_json)
            else: print(f"Warning: Unexpected output format: {output_list}"); all_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        all_results.extend([None] * len(batch_prompts))
if len(all_results) != len(all_prompts):
     print(f"Warning: Result count mismatch! Padding with None.")
     all_results.extend([None] * (len(all_prompts) - len(all_results)))
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    if isinstance(obj, str):
        return obj.lower()  # Convert strings to lowercase
    elif isinstance(obj, dict):
        return {k: to_lower(v) for k, v in obj.items()}  # Apply recursively for dictionaries
    elif isinstance(obj, list):
        return [to_lower(v) for v in obj]  # Apply recursively for lists
    else:
        return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    if pred is None or target is None:
        return 0.0
    try:
        # Convert both the prediction and target to lowercase
        pred = to_lower(pred)
        target = to_lower(target)

        # Convert the structures into strings
        pred_str = json.dumps(pred, sort_keys=True, separators=(',', ':'))
        target_str = json.dumps(target, sort_keys=True, separators=(',', ':'))

        return SequenceMatcher(None, pred_str, target_str).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
GPU supports bfloat16, using torch.bfloat16 for inference.
Cleaning up memory before loading...
CUDA cache cleared.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) in torch.bfloat16


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-medium-json-16bit) onto the base model...


adapter_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:33<00:00, 23.41s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9455
✅ Exact Match Accuracy: 53.33%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                               transcribed_text                                                                                                                                                                                                                                                                                                                                                       


