In [None]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes scipy

In [None]:
import torch
import os
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

from huggingface_hub import notebook_login
# Prompt for Hugging Face Hub credentials through the notebook login widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# **Load Small Dataset**

In [None]:
model_id = "google/gemma-3-1b-it"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Fall back to EOS as the padding token only when the tokenizer ships without
# one. Aliasing pad to EOS unconditionally makes padding indistinguishable from
# the end-of-sequence marker, so any collator that masks padding positions in
# the labels masks EOS as well and the model is never trained to stop.
# NOTE(review): the Gemma tokenizer is expected to define its own <pad> token — confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
eos_token = tokenizer.eos_token

print(f"EOS Token: {eos_token}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


EOS Token: <eos>


In [None]:
from datasets import load_dataset
import json

# Hub dataset to evaluate on and the split pulled from it.
dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
split_name = "train"

print(f"Loading dataset '{dataset_id}' (split: '{split_name}')...")
dataset = load_dataset(dataset_id, split=split_name)

# Drop every column that is not explicitly listed.
columns_to_keep = ["transcribed_text", "items", "speaker"]
dataset = dataset.remove_columns(
    [name for name in dataset.column_names if name not in columns_to_keep]
)

# Keep at most the first 1200 rows.
dataset = dataset.select(range(min(len(dataset), 1200)))

print(f"Dataset loaded: {dataset}")

# Instruction text placed in front of every transcribed order.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""

# Seeded 90/10 shuffle split.
print("Splitting dataset into train and validation sets...")
train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data, validation_data = train_val_split['train'], train_val_split['test']

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")

def format_data_for_sft(example):
    """Build the single training string for one dataset record.

    The record's ``items`` are serialized as compact JSON and appended, after a
    newline, to the module-level ``prefix`` followed by the record's
    ``transcribed_text``. The tokenizer's EOS token terminates the string.

    Args:
        example: Mapping with ``transcribed_text`` and ``items`` entries.

    Returns:
        A dict with one key, ``formatted_text``, holding the assembled string.

    Raises:
        NameError: If no module-level ``tokenizer`` has been defined yet.
    """
    order_text = example["transcribed_text"]
    compact_json = json.dumps(example["items"], separators=(',', ':'))
    # The tokenizer is created in another cell; fail with a clear hint if that
    # cell has not been run.
    if 'tokenizer' not in globals():
        raise NameError("Tokenizer not found. Please run the tokenizer loading cell (Cell 5) first.")
    return {"formatted_text": f"{prefix}{order_text}\n{compact_json}{tokenizer.eos_token}"}

# Formatting needs the module-level tokenizer; without it both datasets are
# left as None and a warning is printed instead.
if 'tokenizer' not in globals():
    print("WARNING: Tokenizer not loaded yet. Re-run Cell 5 and this cell's mapping part.")
    train_dataset = validation_dataset = None
else:
    print("Applying formatting function to the datasets...")
    # Each split keeps only the new `formatted_text` column.
    train_dataset = train_data.map(
        format_data_for_sft,
        remove_columns=train_data.column_names,
    )
    validation_dataset = validation_data.map(
        format_data_for_sft,
        remove_columns=validation_data.column_names,
    )
    print("Dataset formatting complete.")
    print(f"Train dataset features: {train_dataset.features}")
    print(f"Validation dataset features: {validation_dataset.features}")

Loading dataset 'iTzMiNOS/voice-orders-small-clean-12k' (split: 'train')...
Dataset loaded: Dataset({
    features: ['transcribed_text', 'speaker', 'items'],
    num_rows: 1200
})
Splitting dataset into train and validation sets...
Train set size: 1080
Validation set size: 120
Applying formatting function to the datasets...
Dataset formatting complete.
Train dataset features: {'formatted_text': Value(dtype='string', id=None)}
Validation dataset features: {'formatted_text': Value(dtype='string', id=None)}


In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-small-json-LoRA-4bit-128-16"
# Instruction text placed in front of every transcribed order.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 16

# --- Require a CUDA device (warn when it is not an L4) ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
device_name = torch.cuda.get_device_name(0)
print(f"CUDA is available. Using GPU: {device_name}")
if "L4" not in device_name:
    print("Warning: The detected GPU is not an L4. Performance may vary.")
device = 0

# --- Garbage-collect and empty the CUDA cache before loading ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- 4-bit NF4 quantization settings ---
# bfloat16 when the device's major compute capability is >= 8, float16 otherwise.
if torch.cuda.get_device_capability(device)[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Tokenizer: left-padded, with EOS doubling as the pad token ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
tokenizer.pad_token = eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Quantized base model ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    torch_dtype=compute_dtype,
    device_map="auto",
    quantization_config=bnb_config,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
# Only the adapter load itself is guarded here: a failure is reported and then
# re-raised unchanged (bare `raise`), because nothing below can run without it.
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
except Exception as e:
    print(f"❌ Failed to load LoRA adapter: {e}")
    raise
print("LoRA adapter loaded successfully.")

# Merging is best-effort: when it fails the un-merged PEFT model is used as-is.
# NOTE(review): the base model is loaded in 4-bit; merging LoRA weights into
# quantized layers can change generations through rounding — confirm that the
# merged model is really what should be evaluated.
print("Attempting to merge LoRA adapter...")
try:
    model = model.merge_and_unload()
except Exception as e:
    print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
else:
    print("LoRA adapter merged successfully.")

# --- Build the Inference Pipeline ---
# Only CRITICAL messages from the transformers logger are shown from here on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'):
    print(f"Pipeline device: {pipe.device}")
else:
    print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
    # Nothing named validation_data exists yet: load the dataset and recreate
    # the seeded split, keeping its 'test' part.
    print("validation_data not found, attempting reload...")
    from datasets import load_dataset
    dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
    split_name = "train"
    columns_to_keep = ["transcribed_text", "items", "speaker"]
    dataset = load_dataset(dataset_id, split=split_name)
    dataset = dataset.remove_columns(
        [name for name in dataset.column_names if name not in columns_to_keep]
    )
    dataset = dataset.select(range(min(len(dataset), 1200)))
    train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    validation_data = train_val_split['test']
    print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split from here on.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays inside ``obj`` with plain Python lists.

    Arrays are turned into lists via ``tolist()`` and then processed like any
    other list; lists and dict values are converted element by element; every
    other value is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(item) for item in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Convert every entry of the 'items' column to plain Python containers.
df['items'] = df['items'].map(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in ``generated_text``.

    The text may carry leading prose or markdown fences before the JSON and
    arbitrary text after it. Decoding starts at the first ``[`` or ``{``
    (whichever comes first) and stops at the end of that JSON value.

    The standard-library decoder is used instead of counting brackets by hand,
    so a ``]`` or ``}`` inside a string value (e.g. ``"Wings (6]"``) no longer
    cuts the extraction short.

    Args:
        generated_text: Raw model output.

    Returns:
        The decoded list/dict, or ``None`` when the input is not a string,
        contains no ``[``/``{``, or the JSON starting there is malformed.
    """
    if not isinstance(generated_text, str):
        print(f"Warning: Unexpected error during JSON parsing: expected str, got {type(generated_text).__name__}.")
        return None

    # Position of the first structural opener, if there is one at all.
    openers = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not openers:
        return None
    candidate = generated_text[min(openers):]

    try:
        # raw_decode parses one JSON value and ignores whatever follows it
        # (closing fences, trailing chatter, ...).
        json_data, _ = json.JSONDecoder().raw_decode(candidate)
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{candidate}'")
        return None
    return json_data
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
# End each prompt with the newline that format_data_for_sft places between the
# order text and the JSON, so inference prompts match the training layout.
all_prompts = [f"{prefix}{text}\n" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Results are collected per batch so that a failure part-way through a
    # batch cannot leave all_results with more (or fewer) entries than prompts.
    batch_results = []
    try:
        batch_outputs = pipe(
            batch_prompts,
            max_new_tokens=1024,
            do_sample=False,  # greedy decoding keeps the evaluation reproducible
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id,
            batch_size=len(batch_prompts),
        )
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results.append(parse_json_robustly(generated_text))
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    # Exactly one entry per prompt: trim any surplus, fill any gap with None.
    batch_results = batch_results[:len(batch_prompts)]
    batch_results.extend([None] * (len(batch_prompts) - len(batch_results)))
    all_results.extend(batch_results)
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased.

    Lists and dict values are processed recursively; dict keys and all
    non-string leaves are left exactly as they are.
    """
    if isinstance(obj, list):
        return [to_lower(item) for item in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Case-insensitive textual similarity between two JSON-like structures.

    Both inputs are lower-cased with ``to_lower``, serialized to compact JSON
    with sorted keys, and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher's ``ratio()``, or ``0.0`` when either input is ``None`` or
        when anything goes wrong while computing the score (the error is
        printed).
    """
    if target is None or pred is None:
        return 0.0
    try:
        pred = to_lower(pred)
        target = to_lower(target)

        # Identical serialization settings on both sides keep the comparison fair.
        dump_options = {"sort_keys": True, "separators": (',', ':')}
        matcher = SequenceMatcher(
            None,
            json.dumps(pred, **dump_options),
            json.dumps(target, **dump_options),
        )
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")


def _row_similarity(row):
    """Similarity between one row's prediction and its 'items' entry."""
    return similarity_score(row['predicted_items'], row['items'])


def _row_exact_match(row):
    """True when both sides are present and equal after lower-casing."""
    predicted, expected = row['predicted_items'], row['items']
    if predicted is None or expected is None:
        return False
    return to_lower(predicted) == to_lower(expected)


df['similarity'] = df.apply(_row_similarity, axis=1)
df['exact_match'] = df.apply(_row_exact_match, axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Rows whose similarity falls below 0.8 ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) with quantization...
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-small-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 8/8 [05:46<00:00, 43.37s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9216
✅ Exact Match Accuracy: 45.83%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                    transcribed_text                                                                                                                                                                                                                                                                                    items                                                                                                                                                                                                                                                                                                                                        predicted_items  




### **Medium 1b Model, Small Dataset**


In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-medium-json-LoRA-4bit-128-16"
# Instruction text placed in front of every transcribed order.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 16

# --- Require a CUDA device (warn when it is not an L4) ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
device_name = torch.cuda.get_device_name(0)
print(f"CUDA is available. Using GPU: {device_name}")
if "L4" not in device_name:
    print("Warning: The detected GPU is not an L4. Performance may vary.")
device = 0

# --- Garbage-collect and empty the CUDA cache before loading ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- 4-bit NF4 quantization settings ---
# bfloat16 when the device's major compute capability is >= 8, float16 otherwise.
if torch.cuda.get_device_capability(device)[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Tokenizer: left-padded, with EOS doubling as the pad token ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
tokenizer.pad_token = eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Quantized base model ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    torch_dtype=compute_dtype,
    device_map="auto",
    quantization_config=bnb_config,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
# Only the adapter load itself is guarded here: a failure is reported and then
# re-raised unchanged (bare `raise`), because nothing below can run without it.
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
except Exception as e:
    print(f"❌ Failed to load LoRA adapter: {e}")
    raise
print("LoRA adapter loaded successfully.")

# Merging is best-effort: when it fails the un-merged PEFT model is used as-is.
# NOTE(review): the base model is loaded in 4-bit; merging LoRA weights into
# quantized layers can change generations through rounding — confirm that the
# merged model is really what should be evaluated.
print("Attempting to merge LoRA adapter...")
try:
    model = model.merge_and_unload()
except Exception as e:
    print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
else:
    print("LoRA adapter merged successfully.")

# --- Build the Inference Pipeline ---
# Only CRITICAL messages from the transformers logger are shown from here on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'):
    print(f"Pipeline device: {pipe.device}")
else:
    print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
if 'validation_data' not in globals():
    # Nothing named validation_data exists yet: load the dataset and recreate
    # the seeded split, keeping its 'test' part.
    print("validation_data not found, attempting reload...")
    from datasets import load_dataset
    dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
    split_name = "train"
    columns_to_keep = ["transcribed_text", "items", "speaker"]
    dataset = load_dataset(dataset_id, split=split_name)
    dataset = dataset.remove_columns(
        [name for name in dataset.column_names if name not in columns_to_keep]
    )
    dataset = dataset.select(range(min(len(dataset), 1200)))
    train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    validation_data = train_val_split['test']
    print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split from here on.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays inside ``obj`` with plain Python lists.

    Arrays are turned into lists via ``tolist()`` and then processed like any
    other list; lists and dict values are converted element by element; every
    other value is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(item) for item in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Convert every entry of the 'items' column to plain Python containers.
df['items'] = df['items'].map(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in ``generated_text``.

    The text may carry leading prose or markdown fences before the JSON and
    arbitrary text after it. Decoding starts at the first ``[`` or ``{``
    (whichever comes first) and stops at the end of that JSON value.

    The standard-library decoder is used instead of counting brackets by hand,
    so a ``]`` or ``}`` inside a string value (e.g. ``"Wings (6]"``) no longer
    cuts the extraction short.

    Args:
        generated_text: Raw model output.

    Returns:
        The decoded list/dict, or ``None`` when the input is not a string,
        contains no ``[``/``{``, or the JSON starting there is malformed.
    """
    if not isinstance(generated_text, str):
        print(f"Warning: Unexpected error during JSON parsing: expected str, got {type(generated_text).__name__}.")
        return None

    # Position of the first structural opener, if there is one at all.
    openers = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not openers:
        return None
    candidate = generated_text[min(openers):]

    try:
        # raw_decode parses one JSON value and ignores whatever follows it
        # (closing fences, trailing chatter, ...).
        json_data, _ = json.JSONDecoder().raw_decode(candidate)
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{candidate}'")
        return None
    return json_data
# --- End Updated Function ---


# --- Apply Inference with Batching (Unchanged) ---
print(f"Running batched inference (batch size: {inference_batch_size})...")
# End each prompt with the newline that format_data_for_sft places between the
# order text and the JSON, so inference prompts match the training layout.
all_prompts = [f"{prefix}{text}\n" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Results are collected per batch so that a failure part-way through a
    # batch cannot leave all_results with more (or fewer) entries than prompts.
    batch_results = []
    try:
        batch_outputs = pipe(
            batch_prompts,
            max_new_tokens=1024,
            do_sample=False,  # greedy decoding keeps the evaluation reproducible
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id,
            batch_size=len(batch_prompts),
        )
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results.append(parse_json_robustly(generated_text))
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    # Exactly one entry per prompt: trim any surplus, fill any gap with None.
    batch_results = batch_results[:len(batch_prompts)]
    batch_results.extend([None] * (len(batch_prompts) - len(batch_results)))
    all_results.extend(batch_results)
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased.

    Lists and dict values are processed recursively; dict keys and all
    non-string leaves are left exactly as they are.
    """
    if isinstance(obj, list):
        return [to_lower(item) for item in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Case-insensitive textual similarity between two JSON-like structures.

    Both inputs are lower-cased with ``to_lower``, serialized to compact JSON
    with sorted keys, and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher's ``ratio()``, or ``0.0`` when either input is ``None`` or
        when anything goes wrong while computing the score (the error is
        printed).
    """
    if target is None or pred is None:
        return 0.0
    try:
        pred = to_lower(pred)
        target = to_lower(target)

        # Identical serialization settings on both sides keep the comparison fair.
        dump_options = {"sort_keys": True, "separators": (',', ':')}
        matcher = SequenceMatcher(
            None,
            json.dumps(pred, **dump_options),
            json.dumps(target, **dump_options),
        )
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")


def _row_similarity(row):
    """Similarity between one row's prediction and its 'items' entry."""
    return similarity_score(row['predicted_items'], row['items'])


def _row_exact_match(row):
    """True when both sides are present and equal after lower-casing."""
    predicted, expected = row['predicted_items'], row['items']
    if predicted is None or expected is None:
        return False
    return to_lower(predicted) == to_lower(expected)


df['similarity'] = df.apply(_row_similarity, axis=1)
df['exact_match'] = df.apply(_row_exact_match, axis=1)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Rows whose similarity falls below 0.8 ---
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) with quantization...
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-medium-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches:  38%|███▊      | 3/8 [02:11<02:49, 33.90s/it]

Attempted to parse (after extraction): '[{"customizations":["Toasted Sesame Seeds","Chili Oil"],"name":"Miso Soup"},{"customizations":[],"name":"Chocolate"},{"customizations":[],"name":"Strawberry"},{"customizations":[],"name":"Berry Parfait"},{"customizations":["Extra Berries"],"name":"Mashed Potatoes",""customizations":["Rice","Fries"],"name":"Mashed Potatoes"}]'


Inference Batches: 100%|██████████| 8/8 [06:07<00:00, 45.98s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.6947
✅ Exact Match Accuracy: 6.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                    transcribed_text                                                                                                                                                                                                                                                                                         items                                                                                                                                                                                                                                                                                                                                                     




### **Large 1b Model, Small Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter evaluated by this cell.
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-large-json-LoRA-4bit-128-16"
# Instruction text prepended to every transcribed order to build the prompt.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Fail fast without CUDA; otherwise report the GPU and warn if it is not an L4.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index used for the capability query below.
    device = 0

# --- Memory Cleanup ---
# Collect Python garbage and release cached CUDA blocks before loading a model.
# NOTE(review): a model still referenced by `model`/`pipe` from a previous cell
# is not freed by this; presumably it is released only when those names are
# rebound below — confirm if memory is tight.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 compute when the GPU's major compute capability is >= 8, else float16.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 weight loading, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so every prompt in a batch ends at the right edge, where
# generation continues.
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
# Loads the base checkpoint with the 4-bit config built above; placement is
# left to device_map="auto".
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Attach the adapter, then try to fold it into the base weights. A failed
# merge is tolerated (the PEFT-wrapped model is used instead); a failed
# adapter load is re-raised after logging.
# NOTE(review): merging into 4-bit quantized weights may change outputs
# slightly due to rounding — confirm against the un-merged adapter.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# `logging` here is transformers.logging: only CRITICAL messages are shown
# from this point on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
# Report where the pipeline runs, if the pipeline object exposes a device.
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` if an earlier cell left it in the kernel namespace;
# otherwise rebuild it: first (up to) 1200 rows of the train split, three
# columns kept, 10% held out with a fixed seed.
# NOTE(review): when the variable already exists it is used as-is, so the rows
# evaluated depend on which cells ran before this one — confirm it matches the
# split rebuilt here.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split from here on.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays nested inside ``obj`` with Python lists.

    Dict values and list elements are converted in turn; an ``np.ndarray`` is
    turned into a list with ``tolist()`` and that list is walked again. Any
    other value is returned unchanged.
    """
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            converted[key] = deep_convert(value)
        return converted
    if isinstance(obj, list):
        return list(map(deep_convert, obj))
    if isinstance(obj, np.ndarray):
        # tolist() may still contain dicts holding arrays, so recurse on it.
        return deep_convert(obj.tolist())
    return obj

# Rewrite the reference column in place so each entry is built from plain
# Python lists/dicts rather than NumPy arrays (json.dumps, used later for
# scoring, cannot serialise ndarrays).
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in model output.

    Text surrounding the JSON (for example markdown code fences or a short
    preamble) is ignored: parsing starts at the first '[' or '{' and stops at
    its matching closer.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value, or None when the text has no opening
        bracket/brace, no matching closer is found, or the extracted span is
        not valid JSON (a warning is printed in that last case).
    """
    json_str = None
    try:
        # The JSON value begins at whichever opener appears first. Fenced
        # output needs no special case: the search simply skips the fence.
        openers = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
        if not openers:
            return None
        start_index = min(openers)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Walk forward to the matching closer. Brackets inside JSON string
        # literals must not be counted, otherwise a value such as
        # "extra ] sauce" ends the span early. Backslash escapes are skipped
        # so an escaped quote does not terminate the string.
        depth = 0
        end_index = -1
        in_string = False
        escaped = False
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Unbalanced (e.g. generation was cut off): nothing to parse.
            return None

        json_str = generated_text[start_index:end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Report the span that was attempted so bad generations can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Best effort: any other failure (e.g. non-string input) yields None.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Each prompt is the instruction prefix followed by one transcribed order.
# Exactly one entry (parsed JSON or None) is recorded per prompt so that
# predictions stay aligned with the rows of df.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Results are gathered per batch and committed only once the batch is
    # complete; a failure part-way through must not leave partial results
    # behind and shift every later prediction onto the wrong row.
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    if len(batch_results) != len(batch_prompts):
        # Alignment inside the batch is unknown, so the whole batch counts as failed.
        print(f"Warning: Result count mismatch! Padding with None.")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lowercased.

    Lists and dict values are processed recursively; dict keys are kept as-is
    and any non-string leaf is returned unchanged.
    """
    if isinstance(obj, str):
        return obj.lower()
    if isinstance(obj, dict):
        return dict((key, to_lower(value)) for key, value in obj.items())
    if isinstance(obj, list):
        return list(map(to_lower, obj))
    return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted order matches the reference order.

    Both structures are lowercased with ``to_lower``, serialised to compact
    JSON with sorted keys, and compared with ``SequenceMatcher.ratio()``.

    Returns:
        The ratio as a float, or 0.0 when either argument is None or when
        lowercasing/serialisation/comparison raises (the error is printed).
    """
    if pred is None:
        return 0.0
    if target is None:
        return 0.0
    try:
        # Case-insensitive comparison: normalise both sides first.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form so key order and whitespace do not affect the score.
        dump_options = {"sort_keys": True, "separators": (',', ':')}
        matcher = SequenceMatcher(
            None,
            json.dumps(pred, **dump_options),
            json.dumps(target, **dump_options),
        )
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics ---
print("Calculating metrics...")


def _row_similarity(row):
    """Similarity between one row's prediction and its reference items."""
    return similarity_score(row['predicted_items'], row['items'])


def _row_exact_match(row):
    """True when prediction and reference both exist and are equal ignoring case."""
    predicted = row['predicted_items']
    expected = row['items']
    if predicted is None or expected is None:
        return False
    return to_lower(predicted) == to_lower(expected)


df['similarity'] = df.apply(_row_similarity, axis=1)
df['exact_match'] = df.apply(_row_exact_match, axis=1)

# Aggregate over all validation rows; unparsable predictions count as 0 / False.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches ---
print("\n--- Low Similarity Examples (< 0.8) ---")
report_columns = ['transcribed_text', 'items', 'predicted_items', 'similarity']
low_sim_df = df.loc[df['similarity'] < 0.8, report_columns]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) with quantization...
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-large-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 8/8 [05:08<00:00, 38.57s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.5369
✅ Exact Match Accuracy: 1.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                             transcribed_text                                                                                                                                                                                                                                                                                                       items                                                                                                                                                                                                                                                                                                              

### **Medium 4b Model, Small Dataset**


In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter evaluated by this cell.
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-medium-json-LoRA-4bit-128-16"
# Instruction text prepended to every transcribed order to build the prompt.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Fail fast without CUDA; otherwise report the GPU and warn if it is not an L4.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index used for the capability query below.
    device = 0

# --- Memory Cleanup ---
# Collect Python garbage and release cached CUDA blocks before loading a model.
# NOTE(review): a model still referenced by `model`/`pipe` from a previous cell
# is not freed by this; presumably it is released only when those names are
# rebound below — confirm if memory is tight.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 compute when the GPU's major compute capability is >= 8, else float16.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 weight loading, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so every prompt in a batch ends at the right edge, where
# generation continues.
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
# Loads the base checkpoint with the 4-bit config built above; placement is
# left to device_map="auto".
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Attach the adapter, then try to fold it into the base weights. A failed
# merge is tolerated (the PEFT-wrapped model is used instead); a failed
# adapter load is re-raised after logging.
# NOTE(review): merging into 4-bit quantized weights may change outputs
# slightly due to rounding — confirm against the un-merged adapter.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# `logging` here is transformers.logging: only CRITICAL messages are shown
# from this point on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
# Report where the pipeline runs, if the pipeline object exposes a device.
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` if an earlier cell left it in the kernel namespace;
# otherwise rebuild it: first (up to) 1200 rows of the train split, three
# columns kept, 10% held out with a fixed seed.
# NOTE(review): when the variable already exists it is used as-is, so the rows
# evaluated depend on which cells ran before this one — confirm it matches the
# split rebuilt here.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split from here on.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays nested inside ``obj`` with Python lists.

    Dict values and list elements are converted in turn; an ``np.ndarray`` is
    turned into a list with ``tolist()`` and that list is walked again. Any
    other value is returned unchanged.
    """
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            converted[key] = deep_convert(value)
        return converted
    if isinstance(obj, list):
        return list(map(deep_convert, obj))
    if isinstance(obj, np.ndarray):
        # tolist() may still contain dicts holding arrays, so recurse on it.
        return deep_convert(obj.tolist())
    return obj

# Rewrite the reference column in place so each entry is built from plain
# Python lists/dicts rather than NumPy arrays (json.dumps, used later for
# scoring, cannot serialise ndarrays).
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in model output.

    Text surrounding the JSON (for example markdown code fences or a short
    preamble) is ignored: parsing starts at the first '[' or '{' and stops at
    its matching closer.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value, or None when the text has no opening
        bracket/brace, no matching closer is found, or the extracted span is
        not valid JSON (a warning is printed in that last case).
    """
    json_str = None
    try:
        # The JSON value begins at whichever opener appears first. Fenced
        # output needs no special case: the search simply skips the fence.
        openers = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
        if not openers:
            return None
        start_index = min(openers)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Walk forward to the matching closer. Brackets inside JSON string
        # literals must not be counted, otherwise a value such as
        # "extra ] sauce" ends the span early. Backslash escapes are skipped
        # so an escaped quote does not terminate the string.
        depth = 0
        end_index = -1
        in_string = False
        escaped = False
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Unbalanced (e.g. generation was cut off): nothing to parse.
            return None

        json_str = generated_text[start_index:end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Report the span that was attempted so bad generations can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Best effort: any other failure (e.g. non-string input) yields None.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Each prompt is the instruction prefix followed by one transcribed order.
# Exactly one entry (parsed JSON or None) is recorded per prompt so that
# predictions stay aligned with the rows of df.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Results are gathered per batch and committed only once the batch is
    # complete; a failure part-way through must not leave partial results
    # behind and shift every later prediction onto the wrong row.
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    if len(batch_results) != len(batch_prompts):
        # Alignment inside the batch is unknown, so the whole batch counts as failed.
        print(f"Warning: Result count mismatch! Padding with None.")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lowercased.

    Lists and dict values are processed recursively; dict keys are kept as-is
    and any non-string leaf is returned unchanged.
    """
    if isinstance(obj, str):
        return obj.lower()
    if isinstance(obj, dict):
        return dict((key, to_lower(value)) for key, value in obj.items())
    if isinstance(obj, list):
        return list(map(to_lower, obj))
    return obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted order matches the reference order.

    Both structures are lowercased with ``to_lower``, serialised to compact
    JSON with sorted keys, and compared with ``SequenceMatcher.ratio()``.

    Returns:
        The ratio as a float, or 0.0 when either argument is None or when
        lowercasing/serialisation/comparison raises (the error is printed).
    """
    if pred is None:
        return 0.0
    if target is None:
        return 0.0
    try:
        # Case-insensitive comparison: normalise both sides first.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form so key order and whitespace do not affect the score.
        dump_options = {"sort_keys": True, "separators": (',', ':')}
        matcher = SequenceMatcher(
            None,
            json.dumps(pred, **dump_options),
            json.dumps(target, **dump_options),
        )
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics ---
print("Calculating metrics...")


def _row_similarity(row):
    """Similarity between one row's prediction and its reference items."""
    return similarity_score(row['predicted_items'], row['items'])


def _row_exact_match(row):
    """True when prediction and reference both exist and are equal ignoring case."""
    predicted = row['predicted_items']
    expected = row['items']
    if predicted is None or expected is None:
        return False
    return to_lower(predicted) == to_lower(expected)


df['similarity'] = df.apply(_row_similarity, axis=1)
df['exact_match'] = df.apply(_row_exact_match, axis=1)

# Aggregate over all validation rows; unparsable predictions count as 0 / False.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches ---
print("\n--- Low Similarity Examples (< 0.8) ---")
report_columns = ['transcribed_text', 'items', 'predicted_items', 'similarity']
low_sim_df = df.loc[df['similarity'] < 0.8, report_columns]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-4b-it...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) with quantization...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-medium-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/131M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 8/8 [13:38<00:00, 102.29s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.7279
✅ Exact Match Accuracy: 11.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                   transcribed_text                                                                                                                                                                                                                                                                               items                                                                                                                                                                                                                                                                                                                                                                                               




### **Large 4b Model, Small Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter evaluated by this cell.
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-large-json-LoRA-4bit-128-16"
# Instruction text prepended to every transcribed order to build the prompt.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Fail fast without CUDA; otherwise report the GPU and warn if it is not an L4.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index used for the capability query below.
    device = 0

# --- Memory Cleanup ---
# Collect Python garbage and release cached CUDA blocks before loading a model.
# NOTE(review): a model still referenced by `model`/`pipe` from a previous cell
# is not freed by this; presumably it is released only when those names are
# rebound below — confirm if memory is tight.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 compute when the GPU's major compute capability is >= 8, else float16.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 weight loading, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so every prompt in a batch ends at the right edge, where
# generation continues.
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
# Loads the base checkpoint with the 4-bit config built above; placement is
# left to device_map="auto".
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Attach the adapter, then try to fold it into the base weights. A failed
# merge is tolerated (the PEFT-wrapped model is used instead); a failed
# adapter load is re-raised after logging.
# NOTE(review): merging into 4-bit quantized weights may change outputs
# slightly due to rounding — confirm against the un-merged adapter.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# `logging` here is transformers.logging: only CRITICAL messages are shown
# from this point on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
# Report where the pipeline runs, if the pipeline object exposes a device.
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` if an earlier cell left it in the kernel namespace;
# otherwise rebuild it: first (up to) 1200 rows of the train split, three
# columns kept, 10% held out with a fixed seed.
# NOTE(review): when the variable already exists it is used as-is, so the rows
# evaluated depend on which cells ran before this one — confirm it matches the
# split rebuilt here.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split from here on.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays nested inside ``obj`` with Python lists.

    Dict values and list elements are converted in turn; an ``np.ndarray`` is
    turned into a list with ``tolist()`` and that list is walked again. Any
    other value is returned unchanged.
    """
    if isinstance(obj, dict):
        converted = {}
        for key, value in obj.items():
            converted[key] = deep_convert(value)
        return converted
    if isinstance(obj, list):
        return list(map(deep_convert, obj))
    if isinstance(obj, np.ndarray):
        # tolist() may still contain dicts holding arrays, so recurse on it.
        return deep_convert(obj.tolist())
    return obj

# Rewrite the reference column in place so each entry is built from plain
# Python lists/dicts rather than NumPy arrays (json.dumps, used later for
# scoring, cannot serialise ndarrays).
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array or object embedded in model output.

    Text before the first '[' or '{' (for example a markdown code fence or a
    chatty preamble) and text after the matching closing bracket is ignored.
    The closing bracket is located by depth counting that skips over JSON
    string literals, so brackets or braces inside values such as "no ice]"
    do not end the extraction early.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value (list or dict), or None when no JSON start is
        found, the structure is never closed, the extracted text is not valid
        JSON, or any unexpected error occurs (e.g. a non-string input).
    """
    json_str = None
    try:
        # The JSON payload starts at whichever of '[' / '{' appears first.
        candidates = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
        if not candidates:
            return None
        start_index = min(candidates)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Walk forward tracking nesting depth, ignoring anything inside a
        # double-quoted string (including backslash-escaped quotes).
        depth = 0
        in_string = False
        escaped = False
        end_index = -1
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Truncated or unbalanced output: nothing parseable to extract.
            return None

        json_str = generated_text[start_index : end_index + 1]
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Log the exact substring that failed so bad generations can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Deliberate best effort: one malformed output must not abort the evaluation run.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Runs every validation prompt through `pipe` in fixed-size batches and stores
# exactly one result per prompt (parsed JSON, or None on any failure) in
# df['predicted_items'], in the same order as the rows of `df`.
print(f"Running batched inference (batch size: {inference_batch_size})...")
# Each prompt is the instruction prefix immediately followed by the transcript.
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Buffer this batch separately. Appending straight to `all_results` and then
    # adding a full batch of None in the except handler would over-count whenever
    # the failure happens after some outputs were already stored, shifting every
    # later prediction onto the wrong row.
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Best effort: log and continue; whatever is missing is padded with None below.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    if len(batch_results) != len(batch_prompts):
        print(f"Warning: batch {i // inference_batch_size + 1} produced {len(batch_results)} results for {len(batch_prompts)} prompts; padding/truncating with None.")
        # Pad with None (or drop extras) so the batch contributes one entry per prompt.
        batch_results = (batch_results + [None] * len(batch_prompts))[:len(batch_prompts)]
    all_results.extend(batch_results)
# Per-batch normalisation above guarantees alignment; fail loudly if that ever breaks.
assert len(all_results) == len(all_prompts), "Result count mismatch between prompts and predictions"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return a copy of `obj` in which every string value is lowercased.

    Lists and dict values are processed recursively; dict keys and values of
    any other type are left exactly as they are.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted structure matches the target, from 0.0 to 1.0.

    Both values are lowercased with `to_lower`, serialised to compact JSON with
    sorted keys, and compared with difflib's SequenceMatcher ratio. A missing
    prediction or target, or any error along the way, scores 0.0.
    """
    if any(value is None for value in (pred, target)):
        return 0.0

    def canonical(value):
        # Compact, key-sorted JSON so dict key order does not affect the score.
        return json.dumps(value, sort_keys=True, separators=(',', ':'))

    try:
        pred = to_lower(pred)
        target = to_lower(target)
        matcher = SequenceMatcher(None, canonical(pred), canonical(target))
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
# Per-row fuzzy score; similarity_score returns 0.0 when the prediction is None.
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
# Exact match = Python equality of the lowercased structures (so list order
# matters); rows with a missing prediction or target count as False.
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Both averages run over every row, including those whose output failed to parse.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
# to_string() renders every selected row in full, so this output can be very wide.
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-large-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/131M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 8/8 [15:25<00:00, 115.66s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.5536
✅ Exact Match Accuracy: 3.33%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                             transcribed_text                                                                                                                                                                                                                                                                                                       items                                                                                                                                                                                                                                                                                                              

### **Medium 12b Model, Small Dataset**


In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter evaluated by this cell.
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-medium-json-LoRA-4bit-128-16"
# Instruction text prepended verbatim to every transcript; it ends with a
# newline, so the transcript starts on its own line.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts sent to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Abort early without CUDA; any non-L4 GPU only triggers a warning.
# NOTE(review): SystemError is conventionally reserved for interpreter-internal
# failures; RuntimeError may be a better fit here.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index used for the capability query below.
    device = 0

# --- Memory Cleanup ---
# Run the garbage collector, then release cached CUDA blocks, before loading a new model.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 when the GPU's compute-capability major version is >= 8, else float16.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so that, in a padded batch, each prompt's last real token sits
# at the end of the sequence where generation continues.
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
# Loads the base checkpoint with the 4-bit config above; device placement is
# delegated to device_map="auto" and attention uses the "eager" implementation.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# A failure to load the adapter is fatal (re-raised); a failure to merge it is
# not — the cell then continues with the unmerged PEFT-wrapped model.
# NOTE(review): merging LoRA weights into a 4-bit quantized base may introduce
# rounding differences compared with running the unmerged adapter — confirm this
# is acceptable for the reported metrics.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Restrict transformers logging to CRITICAL so the generation loop's output stays readable.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# Wrap the already-loaded `model` and `tokenizer`; no device argument is passed,
# so the pipeline uses whatever placement the model already has.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
# Report the resolved device when the pipeline object exposes one.
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` if an earlier cell left it in the kernel namespace;
# otherwise rebuild it from the hub dataset.
# NOTE(review): the fallback only reproduces the original held-out rows if the
# training cell used the same 1200-row subset, test_size=0.1 and seed=42 — confirm.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     # Keep only the three columns named below; every other column is dropped.
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     # Cap the dataset at its first 1200 rows (or fewer if it is smaller).
     dataset = dataset.select(range(min(1200, len(dataset))))
     # Seeded 90/10 shuffle split; the 'test' part is used as validation data.
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas frame so predictions and metrics can be attached as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Return `obj` with every NumPy array replaced by a plain Python list.

    Dicts and lists are rebuilt recursively so arrays nested at any depth are
    converted as well; any other value is returned unchanged.
    """
    # Unwrap arrays first, then let the container checks below handle the result.
    if isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    return obj

# Normalise every ground-truth entry to plain lists/dicts so it can be compared
# with, and serialised the same way as, the parsed model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array or object embedded in model output.

    Text before the first '[' or '{' (for example a markdown code fence or a
    chatty preamble) and text after the matching closing bracket is ignored.
    The closing bracket is located by depth counting that skips over JSON
    string literals, so brackets or braces inside values such as "no ice]"
    do not end the extraction early.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value (list or dict), or None when no JSON start is
        found, the structure is never closed, the extracted text is not valid
        JSON, or any unexpected error occurs (e.g. a non-string input).
    """
    json_str = None
    try:
        # The JSON payload starts at whichever of '[' / '{' appears first.
        candidates = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
        if not candidates:
            return None
        start_index = min(candidates)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Walk forward tracking nesting depth, ignoring anything inside a
        # double-quoted string (including backslash-escaped quotes).
        depth = 0
        in_string = False
        escaped = False
        end_index = -1
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Truncated or unbalanced output: nothing parseable to extract.
            return None

        json_str = generated_text[start_index : end_index + 1]
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Log the exact substring that failed so bad generations can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Deliberate best effort: one malformed output must not abort the evaluation run.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Runs every validation prompt through `pipe` in fixed-size batches and stores
# exactly one result per prompt (parsed JSON, or None on any failure) in
# df['predicted_items'], in the same order as the rows of `df`.
print(f"Running batched inference (batch size: {inference_batch_size})...")
# Each prompt is the instruction prefix immediately followed by the transcript.
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Buffer this batch separately. Appending straight to `all_results` and then
    # adding a full batch of None in the except handler would over-count whenever
    # the failure happens after some outputs were already stored, shifting every
    # later prediction onto the wrong row.
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Best effort: log and continue; whatever is missing is padded with None below.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    if len(batch_results) != len(batch_prompts):
        print(f"Warning: batch {i // inference_batch_size + 1} produced {len(batch_results)} results for {len(batch_prompts)} prompts; padding/truncating with None.")
        # Pad with None (or drop extras) so the batch contributes one entry per prompt.
        batch_results = (batch_results + [None] * len(batch_prompts))[:len(batch_prompts)]
    all_results.extend(batch_results)
# Per-batch normalisation above guarantees alignment; fail loudly if that ever breaks.
assert len(all_results) == len(all_prompts), "Result count mismatch between prompts and predictions"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return a copy of `obj` in which every string value is lowercased.

    Lists and dict values are processed recursively; dict keys and values of
    any other type are left exactly as they are.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted structure matches the target, from 0.0 to 1.0.

    Both values are lowercased with `to_lower`, serialised to compact JSON with
    sorted keys, and compared with difflib's SequenceMatcher ratio. A missing
    prediction or target, or any error along the way, scores 0.0.
    """
    if any(value is None for value in (pred, target)):
        return 0.0

    def canonical(value):
        # Compact, key-sorted JSON so dict key order does not affect the score.
        return json.dumps(value, sort_keys=True, separators=(',', ':'))

    try:
        pred = to_lower(pred)
        target = to_lower(target)
        matcher = SequenceMatcher(None, canonical(pred), canonical(target))
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
# Per-row fuzzy score; similarity_score returns 0.0 when the prediction is None.
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
# Exact match = Python equality of the lowercased structures (so list order
# matters); rows with a missing prediction or target count as False.
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Both averages run over every row, including those whose output failed to parse.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
print("\n--- Low Similarity Examples (< 0.8) ---")
# to_string() renders every selected row in full, so this output can be very wide.
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-medium-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 8/8 [46:22<00:00, 347.77s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.6712
✅ Exact Match Accuracy: 5.00%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                          transcribed_text                                                                                                                                                                                                                                                                                            items                                                                                                                                                                                                                                                                                                                                            

### **Large 12b Model, Small Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter evaluated by this cell.
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-large-json-LoRA-4bit-128-16"
# Instruction text prepended verbatim to every transcript; it ends with a
# newline, so the transcript starts on its own line.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts sent to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Abort early without CUDA; any non-L4 GPU only triggers a warning.
# NOTE(review): SystemError is conventionally reserved for interpreter-internal
# failures; RuntimeError may be a better fit here.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index used for the capability query below.
    device = 0

# --- Memory Cleanup ---
# Run the garbage collector, then release cached CUDA blocks, before loading a new model.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 when the GPU's compute-capability major version is >= 8, else float16.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so that, in a padded batch, each prompt's last real token sits
# at the end of the sequence where generation continues.
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
# Loads the base checkpoint with the 4-bit config above; device placement is
# delegated to device_map="auto".
# NOTE(review): unlike the "Medium 12b" cell, no attn_implementation is passed
# here, so the library default is used — confirm the two runs are meant to differ.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# A failure to load the adapter is fatal (re-raised); a failure to merge it is
# not — the cell then continues with the unmerged PEFT-wrapped model.
# NOTE(review): merging LoRA weights into a 4-bit quantized base may introduce
# rounding differences compared with running the unmerged adapter — confirm this
# is acceptable for the reported metrics.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Restrict transformers logging to CRITICAL so the generation loop's output stays readable.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# Wrap the already-loaded `model` and `tokenizer`; no device argument is passed,
# so the pipeline uses whatever placement the model already has.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
# Report the resolved device when the pipeline object exposes one.
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` if an earlier cell left it in the kernel namespace;
# otherwise rebuild it from the hub dataset.
# NOTE(review): the fallback only reproduces the original held-out rows if the
# training cell used the same 1200-row subset, test_size=0.1 and seed=42 — confirm.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     # Keep only the three columns named below; every other column is dropped.
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     # Cap the dataset at its first 1200 rows (or fewer if it is smaller).
     dataset = dataset.select(range(min(1200, len(dataset))))
     # Seeded 90/10 shuffle split; the 'test' part is used as validation data.
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas frame so predictions and metrics can be attached as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Return `obj` with every NumPy array replaced by a plain Python list.

    Dicts and lists are rebuilt recursively so arrays nested at any depth are
    converted as well; any other value is returned unchanged.
    """
    # Unwrap arrays first, then let the container checks below handle the result.
    if isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    return obj

# Normalise every ground-truth entry to plain lists/dicts so it can be compared
# with, and serialised the same way as, the parsed model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array or object embedded in model output.

    Text before the first '[' or '{' (for example a markdown code fence or a
    chatty preamble) and text after the matching closing bracket is ignored.
    The closing bracket is located by depth counting that skips over JSON
    string literals, so brackets or braces inside values such as "no ice]"
    do not end the extraction early.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value (list or dict), or None when no JSON start is
        found, the structure is never closed, the extracted text is not valid
        JSON, or any unexpected error occurs (e.g. a non-string input).
    """
    json_str = None
    try:
        # The JSON payload starts at whichever of '[' / '{' appears first.
        candidates = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
        if not candidates:
            return None
        start_index = min(candidates)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Walk forward tracking nesting depth, ignoring anything inside a
        # double-quoted string (including backslash-escaped quotes).
        depth = 0
        in_string = False
        escaped = False
        end_index = -1
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Truncated or unbalanced output: nothing parseable to extract.
            return None

        json_str = generated_text[start_index : end_index + 1]
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Log the exact substring that failed so bad generations can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Deliberate best effort: one malformed output must not abort the evaluation run.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Runs every validation prompt through `pipe` in fixed-size batches and stores
# exactly one result per prompt (parsed JSON, or None on any failure) in
# df['predicted_items'], in the same order as the rows of `df`.
print(f"Running batched inference (batch size: {inference_batch_size})...")
# Each prompt is the instruction prefix immediately followed by the transcript.
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Buffer this batch separately. Appending straight to `all_results` and then
    # adding a full batch of None in the except handler would over-count whenever
    # the failure happens after some outputs were already stored, shifting every
    # later prediction onto the wrong row.
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Best effort: log and continue; whatever is missing is padded with None below.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    if len(batch_results) != len(batch_prompts):
        print(f"Warning: batch {i // inference_batch_size + 1} produced {len(batch_results)} results for {len(batch_prompts)} prompts; padding/truncating with None.")
        # Pad with None (or drop extras) so the batch contributes one entry per prompt.
        batch_results = (batch_results + [None] * len(batch_prompts))[:len(batch_prompts)]
    all_results.extend(batch_results)
# Per-batch normalisation above guarantees alignment; fail loudly if that ever breaks.
assert len(all_results) == len(all_prompts), "Result count mismatch between prompts and predictions"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return a copy of `obj` in which every string value is lowercased.

    Lists and dict values are processed recursively; dict keys and values of
    any other type are left exactly as they are.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how close a predicted structure is to the target, in [0.0, 1.0].

    Both values are lower-cased with ``to_lower``, serialised to compact JSON
    with sorted keys, and compared with ``SequenceMatcher.ratio()``. A ``None``
    on either side, or any error while comparing, yields 0.0.
    """
    if pred is None or target is None:
        return 0.0

    def _canonical(value):
        # Compact, key-sorted JSON so dict key order does not affect the score.
        return json.dumps(value, sort_keys=True, separators=(',', ':'))

    try:
        pred = to_lower(pred)
        target = to_lower(target)
        matcher = SequenceMatcher(None, _canonical(pred), _canonical(target))
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
# Adds two per-row columns to df:
#   similarity  - similarity_score() between the prediction and the ground truth
#   exact_match - True only when both sides are present and equal after to_lower()
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Rows whose prediction is None score 0.0 / False above, so they lower both averages.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Prints every row scoring below 0.8 in full (to_string() does not truncate),
# so this output can be very large.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-large-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/274M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 8/8 [38:32<00:00, 289.04s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.5419
✅ Exact Match Accuracy: 1.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                             transcribed_text                                                                                                                                                                                                                                                                                                       items                                                                                                                                                                                                                                                                                                              

# **Load Medium Dataset**

In [None]:
from datasets import load_dataset
import json

# Hub dataset and split used for the "medium" experiments.
dataset_id = "iTzMiNOS/voice-orders-medium-clean-18k"
split_name = "train"

print(f"Loading dataset '{dataset_id}' (split: '{split_name}')...")
dataset = load_dataset(dataset_id, split=split_name)

# Drop every column except the transcription, the target items and the speaker.
columns_to_keep = ["transcribed_text", "items", "speaker"]
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])

# Keep only the first 1800 rows (or fewer if the split is smaller).
dataset = dataset.select(range(min(1800, len(dataset))))

print(f"Dataset loaded: {dataset}")

# Instruction text placed in front of every transcription by format_data_for_sft below.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""

# Seeded 90/10 shuffle split; `validation_data` is what the evaluation cells read.
print("Splitting dataset into train and validation sets...")
train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data = train_val_split['train']
validation_data = train_val_split['test']

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")

def format_data_for_sft(example):
    """Build one supervised fine-tuning string from a dataset row.

    The result is the global ``prefix``, the row's ``transcribed_text``, a
    newline, the row's ``items`` as compact JSON, and the tokenizer's EOS token.

    Args:
        example: Mapping with ``"transcribed_text"`` and ``"items"`` entries.

    Returns:
        ``{"formatted_text": <the assembled string>}``.

    Raises:
        NameError: if no global ``tokenizer`` has been defined yet.
    """
    order_text = example["transcribed_text"]
    target_json = json.dumps(example["items"], separators=(',', ':'))
    if 'tokenizer' not in globals():
        raise NameError("Tokenizer not found. Please run the tokenizer loading cell (Cell 5) first.")
    end_marker = tokenizer.eos_token
    return {"formatted_text": f"{prefix}{order_text}\n{target_json}{end_marker}"}

# Format both splits only when a global `tokenizer` exists (format_data_for_sft
# needs its EOS token). The original columns are removed, so each mapped dataset
# carries only the column returned by the formatter.
if 'tokenizer' in globals():
    print("Applying formatting function to the datasets...")
    train_dataset = train_data.map(format_data_for_sft, remove_columns=train_data.column_names)
    validation_dataset = validation_data.map(format_data_for_sft, remove_columns=validation_data.column_names) # Process validation data too!
    print("Dataset formatting complete.")
    print(f"Train dataset features: {train_dataset.features}")
    print(f"Validation dataset features: {validation_dataset.features}")
else:
    # No tokenizer yet: leave both names defined as None rather than undefined.
    print("WARNING: Tokenizer not loaded yet. Re-run Cell 5 and this cell's mapping part.")
    train_dataset = None
    validation_dataset = None

Loading dataset 'iTzMiNOS/voice-orders-medium-clean-18k' (split: 'train')...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/518 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/28 [00:00<?, ?files/s]

train-00000-of-00028.parquet:   0%|          | 0.00/487M [00:00<?, ?B/s]

train-00001-of-00028.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

train-00002-of-00028.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00003-of-00028.parquet:   0%|          | 0.00/449M [00:00<?, ?B/s]

train-00004-of-00028.parquet:   0%|          | 0.00/381M [00:00<?, ?B/s]

train-00005-of-00028.parquet:   0%|          | 0.00/472M [00:00<?, ?B/s]

train-00006-of-00028.parquet:   0%|          | 0.00/610M [00:00<?, ?B/s]

train-00007-of-00028.parquet:   0%|          | 0.00/516M [00:00<?, ?B/s]

train-00008-of-00028.parquet:   0%|          | 0.00/473M [00:00<?, ?B/s]

train-00009-of-00028.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

train-00010-of-00028.parquet:   0%|          | 0.00/358M [00:00<?, ?B/s]

train-00011-of-00028.parquet:   0%|          | 0.00/490M [00:00<?, ?B/s]

train-00012-of-00028.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00013-of-00028.parquet:   0%|          | 0.00/383M [00:00<?, ?B/s]

train-00014-of-00028.parquet:   0%|          | 0.00/450M [00:00<?, ?B/s]

train-00015-of-00028.parquet:   0%|          | 0.00/385M [00:00<?, ?B/s]

train-00016-of-00028.parquet:   0%|          | 0.00/357M [00:00<?, ?B/s]

train-00017-of-00028.parquet:   0%|          | 0.00/407M [00:00<?, ?B/s]

train-00018-of-00028.parquet:   0%|          | 0.00/346M [00:00<?, ?B/s]

train-00019-of-00028.parquet:   0%|          | 0.00/365M [00:00<?, ?B/s]

train-00020-of-00028.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

train-00021-of-00028.parquet:   0%|          | 0.00/344M [00:00<?, ?B/s]

train-00022-of-00028.parquet:   0%|          | 0.00/386M [00:00<?, ?B/s]

train-00023-of-00028.parquet:   0%|          | 0.00/380M [00:00<?, ?B/s]

train-00024-of-00028.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

train-00025-of-00028.parquet:   0%|          | 0.00/451M [00:00<?, ?B/s]

train-00026-of-00028.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

train-00027-of-00028.parquet:   0%|          | 0.00/357M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18000 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Dataset loaded: Dataset({
    features: ['transcribed_text', 'speaker', 'items'],
    num_rows: 1800
})
Splitting dataset into train and validation sets...
Train set size: 1620
Validation set size: 180


### **Gemma 3 1b with the "small" LoRA Adapter, Evaluated on the Medium Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter evaluated in this cell.
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-small-json-LoRA-4bit-128-16"
# Instruction text prepended to every transcription when building inference
# prompts; same wording as the `prefix` used when formatting the training data.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Abort early without CUDA; the L4 check only prints a warning.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index used for the capability query below.
    device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 when the GPU's major compute capability is >= 8, otherwise float16.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the left for batched generation.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# A failure to load the adapter is re-raised. The merge is best-effort: if it
# fails, the un-merged PEFT-wrapped model is used instead.
# NOTE(review): merging LoRA weights into a 4-bit quantized base may introduce
# rounding differences versus the un-merged adapter - confirm against PEFT docs.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Only CRITICAL-level transformers log messages are shown from here on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# No device argument is passed; the model was already loaded with device_map="auto".
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Normally `validation_data` comes from the "Load Medium Dataset" cell. If it is
# missing (e.g. fresh kernel), rebuild that exact split here: same dataset id,
# same 1800-row subset and same seeded 90/10 split, so the evaluation below
# really runs on the medium validation set rather than the small one.
# NOTE(review): a `validation_data` left over from another dataset section is
# used as-is; re-run the medium loading cell first if unsure.
if 'validation_data' not in globals():
    print("validation_data not found, attempting reload...")
    from datasets import load_dataset
    dataset_id = "iTzMiNOS/voice-orders-medium-clean-18k"
    split_name = "train"
    dataset = load_dataset(dataset_id, split=split_name)
    columns_to_keep = ["transcribed_text", "items", "speaker"]
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
    dataset = dataset.select(range(min(1800, len(dataset))))
    train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    validation_data = train_val_split['test']
    print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays inside ``obj`` with plain Python lists.

    Dicts and lists are rebuilt with their contents converted; a NumPy array is
    turned into a list via ``tolist()`` and then converted the same way. Any
    other value is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() never yields an ndarray, so the checks below finish the job.
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(item) for item in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(val) for key, val in obj.items()}
    return obj

# Replace the ground-truth column with its deep_convert()-ed values so that any
# NumPy arrays in it become plain lists before comparison and JSON serialisation.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array or object found in model output.

    The text may carry extra material around the JSON (markdown code fences,
    commentary before or after); everything outside the first balanced
    ``[...]`` / ``{...}`` span is ignored. Delimiters that appear inside JSON
    string values (e.g. an item named ``"Fish ] Chips"``) are not counted when
    balancing, so they cannot cut the span short.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value, or ``None`` when the input is not a string,
        contains no ``[`` or ``{``, the opening delimiter is never closed, or
        the extracted span is not valid JSON (a warning is printed in that
        last case).
    """
    if not isinstance(generated_text, str):
        return None

    json_str = None
    try:
        # Start at whichever opening delimiter comes first in the text.
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')
        if first_bracket == -1 and first_brace == -1:
            return None
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index, start_char, end_char = first_bracket, '[', ']'
        else:
            start_index, start_char, end_char = first_brace, '{', '}'

        # Find the matching closing delimiter. Only delimiters of the opening
        # kind are balanced, and anything inside a double-quoted string
        # (honouring backslash escapes) is skipped.
        depth = 0
        in_string = False
        escaped = False
        end_index = -1
        for pos in range(start_index, len(generated_text)):
            ch = generated_text[pos]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = pos
                    break

        if end_index == -1:
            # Opening delimiter never closed (e.g. generation was cut off).
            return None

        json_str = generated_text[start_index : end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Log the exact span that failed so malformed generations can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Any other unexpected failure is reported and treated as "no prediction".
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# One prompt per validation row, generated in fixed-size batches. Results are
# gathered per batch and normalised to exactly one entry per prompt before they
# are appended, so a failure part-way through a batch can never shift later
# predictions onto the wrong rows.
# NOTE(review): the SFT formatter in this notebook joins prompt and answer with
# "\n" (f"{prefix}{text}\n{json}"), while these prompts end right after the
# transcription - confirm the missing newline is intended.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Best effort: keep whatever was produced before the failure; the
        # remaining prompts of this batch are filled with None below.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    missing = len(batch_prompts) - len(batch_results)
    if missing > 0:
        print(f"Warning: Result count mismatch! Padding with None.")
        batch_results.extend([None] * missing)
    elif missing < 0:
        # More outputs than prompts: drop the surplus so rows stay aligned.
        print(f"Warning: Result count mismatch! Dropping {-missing} extra result(s).")
        del batch_results[len(batch_prompts):]
    all_results.extend(batch_results)
# Guaranteed by the per-batch normalisation above; guards the column assignment.
assert len(all_results) == len(all_prompts), "prediction/prompt count mismatch"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return a copy of ``obj`` in which every string value is lower-cased.

    Lists and dict values are walked recursively. Dict keys are kept as they
    are, and any value that is not a str, dict or list is returned untouched.
    """
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how close a predicted structure is to the target, in [0.0, 1.0].

    Both values are lower-cased with ``to_lower``, serialised to compact JSON
    with sorted keys, and compared with ``SequenceMatcher.ratio()``. A ``None``
    on either side, or any error while comparing, yields 0.0.
    """
    if pred is None or target is None:
        return 0.0

    def _canonical(value):
        # Compact, key-sorted JSON so dict key order does not affect the score.
        return json.dumps(value, sort_keys=True, separators=(',', ':'))

    try:
        pred = to_lower(pred)
        target = to_lower(target)
        matcher = SequenceMatcher(None, _canonical(pred), _canonical(target))
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
# Adds two per-row columns to df:
#   similarity  - similarity_score() between the prediction and the ground truth
#   exact_match - True only when both sides are present and equal after to_lower()
print("Calculating metrics...")
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Rows whose prediction is None score 0.0 / False above, so they lower both averages.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Prints every row scoring below 0.8 in full (to_string() does not truncate),
# so this output can be very large.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) with quantization...
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-small-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches:  25%|██▌       | 3/12 [01:59<07:37, 50.78s/it]

Attempted to parse (after extraction): '[{"customizations":["Spicy"],"name":"Coleslaw"},{"customizations":["Fries","Fish and Chips"],"name":"Crispy"]'


Inference Batches:  33%|███▎      | 4/12 [02:09<04:38, 34.85s/it]

Attempted to parse (after extraction): '[{"customizations":["BBQ","Spicy"],"name":"Chicken Wings"},{"customizations":["Soy Ginger","Peanut Sauce"],"name":"Grilled Tofu"},{"customizations":["Coke","Lemonade","Extra Whipped Cream"],"name":"Soft Drinks","Extra Whipped Cream"}]'


Inference Batches: 100%|██████████| 12/12 [07:59<00:00, 39.96s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.8279
✅ Exact Match Accuracy: 20.56%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                      




### **Gemma 3 1b with the "large" LoRA Adapter, Evaluated on the Medium Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter evaluated in this cell.
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-large-json-LoRA-4bit-128-16"
# Instruction text prepended to every transcription when building inference
# prompts; same wording as the `prefix` used when formatting the training data.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Abort early without CUDA; the L4 check only prints a warning.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index used for the capability query below.
    device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 when the GPU's major compute capability is >= 8, otherwise float16.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the left for batched generation.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# A failure to load the adapter is re-raised. The merge is best-effort: if it
# fails, the un-merged PEFT-wrapped model is used instead.
# NOTE(review): merging LoRA weights into a 4-bit quantized base may introduce
# rounding differences versus the un-merged adapter - confirm against PEFT docs.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Only CRITICAL-level transformers log messages are shown from here on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# No device argument is passed; the model was already loaded with device_map="auto".
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Normally `validation_data` comes from the "Load Medium Dataset" cell. If it is
# missing (e.g. fresh kernel), rebuild that exact split here: same dataset id,
# same 1800-row subset and same seeded 90/10 split, so the evaluation below
# really runs on the medium validation set rather than the small one.
# NOTE(review): a `validation_data` left over from another dataset section is
# used as-is; re-run the medium loading cell first if unsure.
if 'validation_data' not in globals():
    print("validation_data not found, attempting reload...")
    from datasets import load_dataset
    dataset_id = "iTzMiNOS/voice-orders-medium-clean-18k"
    split_name = "train"
    dataset = load_dataset(dataset_id, split=split_name)
    columns_to_keep = ["transcribed_text", "items", "speaker"]
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
    dataset = dataset.select(range(min(1800, len(dataset))))
    train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    validation_data = train_val_split['test']
    print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays inside ``obj`` with plain Python lists.

    Dicts and lists are rebuilt with their contents converted; a NumPy array is
    turned into a list via ``tolist()`` and then converted the same way. Any
    other value is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() never yields an ndarray, so the checks below finish the job.
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(item) for item in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(val) for key, val in obj.items()}
    return obj

# Replace the ground-truth column with its deep_convert()-ed values so that any
# NumPy arrays in it become plain lists before comparison and JSON serialisation.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array or object found in model output.

    The text may carry extra material around the JSON (markdown code fences,
    commentary before or after); everything outside the first balanced
    ``[...]`` / ``{...}`` span is ignored. Delimiters that appear inside JSON
    string values (e.g. an item named ``"Fish ] Chips"``) are not counted when
    balancing, so they cannot cut the span short.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value, or ``None`` when the input is not a string,
        contains no ``[`` or ``{``, the opening delimiter is never closed, or
        the extracted span is not valid JSON (a warning is printed in that
        last case).
    """
    if not isinstance(generated_text, str):
        return None

    json_str = None
    try:
        # Start at whichever opening delimiter comes first in the text.
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')
        if first_bracket == -1 and first_brace == -1:
            return None
        if first_bracket != -1 and (first_brace == -1 or first_bracket < first_brace):
            start_index, start_char, end_char = first_bracket, '[', ']'
        else:
            start_index, start_char, end_char = first_brace, '{', '}'

        # Find the matching closing delimiter. Only delimiters of the opening
        # kind are balanced, and anything inside a double-quoted string
        # (honouring backslash escapes) is skipped.
        depth = 0
        in_string = False
        escaped = False
        end_index = -1
        for pos in range(start_index, len(generated_text)):
            ch = generated_text[pos]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = pos
                    break

        if end_index == -1:
            # Opening delimiter never closed (e.g. generation was cut off).
            return None

        json_str = generated_text[start_index : end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Log the exact span that failed so malformed generations can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Any other unexpected failure is reported and treated as "no prediction".
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# One prompt per validation row, generated in fixed-size batches. Results are
# gathered per batch and normalised to exactly one entry per prompt before they
# are appended, so a failure part-way through a batch can never shift later
# predictions onto the wrong rows.
# NOTE(review): the SFT formatter in this notebook joins prompt and answer with
# "\n" (f"{prefix}{text}\n{json}"), while these prompts end right after the
# transcription - confirm the missing newline is intended.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Best effort: keep whatever was produced before the failure; the
        # remaining prompts of this batch are filled with None below.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    missing = len(batch_prompts) - len(batch_results)
    if missing > 0:
        print(f"Warning: Result count mismatch! Padding with None.")
        batch_results.extend([None] * missing)
    elif missing < 0:
        # More outputs than prompts: drop the surplus so rows stay aligned.
        print(f"Warning: Result count mismatch! Dropping {-missing} extra result(s).")
        del batch_results[len(batch_prompts):]
    all_results.extend(batch_results)
# Guaranteed by the per-batch normalisation above; guards the column assignment.
assert len(all_results) == len(all_prompts), "prediction/prompt count mismatch"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return a copy of ``obj`` in which every string value is lower-cased.

    List elements and dict values are processed recursively. Dict keys and
    objects of any other type are passed through untouched.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted structure matches the reference one.

    Both inputs are lower-cased with ``to_lower``, serialised to compact JSON
    with sorted keys, and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher ratio as a float, or ``0.0`` when either input is ``None``
        or when any step of the comparison raises.
    """
    both_present = pred is not None and target is not None
    if not both_present:
        return 0.0
    try:
        # Case-insensitive comparison: normalise before serialising.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form (sorted keys, no padding) for each side.
        serialized = [
            json.dumps(structure, sort_keys=True, separators=(',', ':'))
            for structure in (pred, target)
        ]
        matcher = SequenceMatcher(None, serialized[0], serialized[1])
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics ---
print("Calculating metrics...")

# Fuzzy, row-wise similarity between the parsed prediction and the reference.
df['similarity'] = df.apply(
    lambda row: similarity_score(row['predicted_items'], row['items']),
    axis=1,
)

# Strict, case-insensitive equality; a missing prediction or reference never matches.
df['exact_match'] = df.apply(
    lambda row: (
        row['predicted_items'] is not None
        and row['items'] is not None
        and to_lower(row['predicted_items']) == to_lower(row['items'])
    ),
    axis=1,
)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches ---
# Show every row scoring below 0.8 together with its input and reference.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) with quantization...
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-large-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches:  58%|█████▊    | 7/12 [05:17<05:23, 64.75s/it]

Attempted to parse (after extraction): '[
  { "customizations": [], "name": "Grilled Chicken Breast" },
  { "customizations": ["Garlic Butter"], "name": "Grilled Salmon" },
  { "customizations": ["Lemon Dill","Garlic Butter"],"name":"Grilled Tofu" },
  { "customizations": ["Chili Lime","Soy Ginger"],"name":"Mint Chocolate Chip" },
  { "customizations": [], "name":"Almond Milk" }
  { "customizations": ["Oat Milk","Almond Milk"],"name":"Milk Alternatives" }
]'


Inference Batches:  67%|██████▋   | 8/12 [06:00<03:51, 57.89s/it]

Attempted to parse (after extraction): '[{"customizations":["Lemon Wedge"],"name":"Chicken Soup"},{"customizations":["No Protein"],"name":"Greek Salad"},{"customizations":["Crispy Onion Rings","Extra Crispy Fries"],"name":"Fish and Chips"},{"customizations":["Chili Lime"],"name":"Grilled Tofu"},{"customizations":[],"name":"Apple Pie"},{"customizations":[],"name":"Berry Parfait"},{"customizations":["Milk Alternatives"],"name":"Oat Milk","name":"Almond Milk"},{"customizations":["Extra Cheese"],"name":"Cheese"}}]'


Inference Batches: 100%|██████████| 12/12 [09:12<00:00, 46.02s/it]


Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.7267
✅ Exact Match Accuracy: 12.22%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                      

### **Small 4b Model, Medium Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-small-json-LoRA-4bit-128-16"
# Instruction text placed in front of every transcription to build the prompt.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
# gc.collect()/empty_cache() can only release memory that nothing references any
# more. A model or pipeline left in the kernel by a previously run cell would
# otherwise stay resident while the next model is loaded, so drop those names
# first (this cell re-creates both further down).
globals().pop("pipe", None)
globals().pop("model", None)
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 compute is selected when the GPU's major compute capability is >= 8,
# float16 otherwise; weights are loaded as 4-bit NF4 without double quantization.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# EOS doubles as the padding token for batched generation.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so that, within a batch, every prompt ends where generation starts.
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# The adapter is attached to the quantized base model and then folded into it.
# A failed merge is tolerated (the un-merged PEFT model is used instead); a
# failed adapter load is reported and re-raised.
# NOTE(review): merging LoRA weights into 4-bit quantized layers may change
# generations slightly due to rounding — confirm this is acceptable for the
# reported scores.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Restrict transformers logging to CRITICAL messages for the rest of the run.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` when an earlier cell left it in the kernel; otherwise
# rebuild a split here from the hub dataset.
# NOTE(review): the fallback keeps at most the first 1200 rows and holds out 10%
# (seed 42), i.e. at most 120 validation rows — confirm this matches the split
# created by the earlier cells, otherwise a fresh kernel and a warm kernel
# evaluate on different data.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     # Keep only the columns used downstream.
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy so predictions and metrics can be added as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays inside ``obj`` with plain Python lists.

    Dicts and lists are rebuilt with their values/elements converted; an
    ``np.ndarray`` is turned into its ``tolist()`` form and then processed the
    same way. Anything else is returned unchanged.
    """
    # Unwrap arrays first; the unwrapped value is then handled like any other input.
    while isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Normalise every reference 'items' cell to plain Python containers so it can be
# compared with (and serialised like) the parsed model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract the first JSON array/object embedded in ``generated_text`` and parse it.

    Scanning starts at the first ``[`` or ``{`` (whichever occurs earlier), so
    leading prose or a markdown code fence in front of the payload is skipped.
    The matching closing delimiter is found by depth counting that ignores
    delimiters appearing inside JSON string literals (including escaped
    quotes), so values such as ``"Wrap ] special"`` do not truncate the payload.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value, or ``None`` when no opening delimiter exists,
        the structure is never closed, or the extracted text is not valid JSON.
        Parse failures are reported on stdout.
    """
    json_str = None
    try:
        # Locate the earliest opening delimiter of either kind.
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')
        candidates = [idx for idx in (first_bracket, first_brace) if idx != -1]
        if not candidates:
            return None
        start_index = min(candidates)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Walk forward tracking nesting depth of the outer delimiter type only,
        # skipping over string literals so their content cannot affect the count.
        depth = 0
        end_index = -1
        in_string = False
        escaped = False
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False      # character after a backslash is literal
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Structure was opened but never closed (e.g. truncated generation).
            return None

        json_str = generated_text[start_index:end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Report the failure together with the text that was handed to the parser.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Anything else (e.g. a non-string input) is also treated as "no result".
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Each transcription is prefixed with the instruction prompt and generated in
# fixed-size batches. Exactly one entry (parsed JSON or None) is recorded per
# prompt, in prompt order, so the results can be assigned back onto `df`.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # One pre-filled slot per prompt: if generation or post-processing fails
    # part-way through the batch, the untouched slots simply stay None and the
    # overall list never grows longer (or shorter) than the prompt list.
    batch_results = [None] * len(batch_prompts)
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        # zip() against the slot range ignores surplus outputs and leaves
        # missing ones as None, keeping every result aligned with its prompt.
        for slot, output_list in zip(range(len(batch_prompts)), batch_outputs):
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results[slot] = parse_json_robustly(generated_text)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
    except Exception as e:
        # Best effort: report the failure and keep going with the next batch.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    all_results.extend(batch_results)
assert len(all_results) == len(all_prompts), "one result per prompt is required"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return a copy of ``obj`` in which every string value is lower-cased.

    List elements and dict values are processed recursively. Dict keys and
    objects of any other type are passed through untouched.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted structure matches the reference one.

    Both inputs are lower-cased with ``to_lower``, serialised to compact JSON
    with sorted keys, and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher ratio as a float, or ``0.0`` when either input is ``None``
        or when any step of the comparison raises.
    """
    both_present = pred is not None and target is not None
    if not both_present:
        return 0.0
    try:
        # Case-insensitive comparison: normalise before serialising.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form (sorted keys, no padding) for each side.
        serialized = [
            json.dumps(structure, sort_keys=True, separators=(',', ':'))
            for structure in (pred, target)
        ]
        matcher = SequenceMatcher(None, serialized[0], serialized[1])
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics ---
print("Calculating metrics...")

# Fuzzy, row-wise similarity between the parsed prediction and the reference.
df['similarity'] = df.apply(
    lambda row: similarity_score(row['predicted_items'], row['items']),
    axis=1,
)

# Strict, case-insensitive equality; a missing prediction or reference never matches.
df['exact_match'] = df.apply(
    lambda row: (
        row['predicted_items'] is not None
        and row['items'] is not None
        and to_lower(row['predicted_items']) == to_lower(row['items'])
    ),
    axis=1,
)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches ---
# Show every row scoring below 0.8 together with its input and reference.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-small-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/131M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 12/12 [05:34<00:00, 27.85s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9651
✅ Exact Match Accuracy: 55.00%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                            transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  items                                   




### **Large 4b Model, Medium Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-large-json-LoRA-4bit-128-16"
# Instruction text placed in front of every transcription to build the prompt.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    device = 0

# --- Memory Cleanup ---
print("Cleaning up memory before loading...")
# gc.collect()/empty_cache() can only release memory that nothing references any
# more. A model or pipeline left in the kernel by a previously run cell would
# otherwise stay resident while the next model is loaded, so drop those names
# first (this cell re-creates both further down).
globals().pop("pipe", None)
globals().pop("model", None)
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 compute is selected when the GPU's major compute capability is >= 8,
# float16 otherwise; weights are loaded as 4-bit NF4 without double quantization.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# EOS doubles as the padding token for batched generation.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so that, within a batch, every prompt ends where generation starts.
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# The adapter is attached to the quantized base model and then folded into it.
# A failed merge is tolerated (the un-merged PEFT model is used instead); a
# failed adapter load is reported and re-raised.
# NOTE(review): merging LoRA weights into 4-bit quantized layers may change
# generations slightly due to rounding — confirm this is acceptable for the
# reported scores.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Restrict transformers logging to CRITICAL messages for the rest of the run.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` when an earlier cell left it in the kernel; otherwise
# rebuild a split here from the hub dataset.
# NOTE(review): the fallback keeps at most the first 1200 rows and holds out 10%
# (seed 42), i.e. at most 120 validation rows — confirm this matches the split
# created by the earlier cells, otherwise a fresh kernel and a warm kernel
# evaluate on different data.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     # Keep only the columns used downstream.
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy so predictions and metrics can be added as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays inside ``obj`` with plain Python lists.

    Dicts and lists are rebuilt with their values/elements converted; an
    ``np.ndarray`` is turned into its ``tolist()`` form and then processed the
    same way. Anything else is returned unchanged.
    """
    # Unwrap arrays first; the unwrapped value is then handled like any other input.
    while isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Normalise every reference 'items' cell to plain Python containers so it can be
# compared with (and serialised like) the parsed model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract the first JSON array/object embedded in ``generated_text`` and parse it.

    Scanning starts at the first ``[`` or ``{`` (whichever occurs earlier), so
    leading prose or a markdown code fence in front of the payload is skipped.
    The matching closing delimiter is found by depth counting that ignores
    delimiters appearing inside JSON string literals (including escaped
    quotes), so values such as ``"Wrap ] special"`` do not truncate the payload.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value, or ``None`` when no opening delimiter exists,
        the structure is never closed, or the extracted text is not valid JSON.
        Parse failures are reported on stdout.
    """
    json_str = None
    try:
        # Locate the earliest opening delimiter of either kind.
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')
        candidates = [idx for idx in (first_bracket, first_brace) if idx != -1]
        if not candidates:
            return None
        start_index = min(candidates)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Walk forward tracking nesting depth of the outer delimiter type only,
        # skipping over string literals so their content cannot affect the count.
        depth = 0
        end_index = -1
        in_string = False
        escaped = False
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False      # character after a backslash is literal
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                depth += 1
            elif ch == end_char:
                depth -= 1
                if depth == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Structure was opened but never closed (e.g. truncated generation).
            return None

        json_str = generated_text[start_index:end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Report the failure together with the text that was handed to the parser.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Anything else (e.g. a non-string input) is also treated as "no result".
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Each transcription is prefixed with the instruction prompt and generated in
# fixed-size batches. Exactly one entry (parsed JSON or None) is recorded per
# prompt, in prompt order, so the results can be assigned back onto `df`.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # One pre-filled slot per prompt: if generation or post-processing fails
    # part-way through the batch, the untouched slots simply stay None and the
    # overall list never grows longer (or shorter) than the prompt list.
    batch_results = [None] * len(batch_prompts)
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        # zip() against the slot range ignores surplus outputs and leaves
        # missing ones as None, keeping every result aligned with its prompt.
        for slot, output_list in zip(range(len(batch_prompts)), batch_outputs):
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results[slot] = parse_json_robustly(generated_text)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
    except Exception as e:
        # Best effort: report the failure and keep going with the next batch.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
    all_results.extend(batch_results)
assert len(all_results) == len(all_prompts), "one result per prompt is required"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return a copy of ``obj`` in which every string value is lower-cased.

    List elements and dict values are processed recursively. Dict keys and
    objects of any other type are passed through untouched.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted structure matches the reference one.

    Both inputs are lower-cased with ``to_lower``, serialised to compact JSON
    with sorted keys, and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher ratio as a float, or ``0.0`` when either input is ``None``
        or when any step of the comparison raises.
    """
    both_present = pred is not None and target is not None
    if not both_present:
        return 0.0
    try:
        # Case-insensitive comparison: normalise before serialising.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form (sorted keys, no padding) for each side.
        serialized = [
            json.dumps(structure, sort_keys=True, separators=(',', ':'))
            for structure in (pred, target)
        ]
        matcher = SequenceMatcher(None, serialized[0], serialized[1])
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics ---
print("Calculating metrics...")

# Fuzzy, row-wise similarity between the parsed prediction and the reference.
df['similarity'] = df.apply(
    lambda row: similarity_score(row['predicted_items'], row['items']),
    axis=1,
)

# Strict, case-insensitive equality; a missing prediction or reference never matches.
df['exact_match'] = df.apply(
    lambda row: (
        row['predicted_items'] is not None
        and row['items'] is not None
        and to_lower(row['predicted_items']) == to_lower(row['items'])
    ),
    axis=1,
)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches ---
# Show every row scoring below 0.8 together with its input and reference.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA L4
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-large-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 12/12 [08:24<00:00, 42.03s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9113
✅ Exact Match Accuracy: 28.33%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                     transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               items                             




### **Small 12b Model, Medium Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter that this cell evaluates.
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-small-json-LoRA-4bit-128-16"
# Instruction text prepended verbatim to every transcript when prompts are built.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Abort when CUDA is missing; a GPU whose name lacks "L4" only triggers a warning.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    device = 0  # CUDA device index used for the capability query below

# --- Memory Cleanup ---
# Run the Python garbage collector, then release PyTorch's cached CUDA blocks.
# NOTE(review): if `model` / `pipe` from an earlier cell are still bound in the
# kernel, they presumably keep that model's memory alive through this step --
# confirm whether they should be deleted first.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 is chosen when the device's major compute capability is >= 8,
# float16 otherwise; the same dtype is later passed as the model's torch_dtype.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# EOS doubles as the padding token, and padding goes on the left so that in a
# batch every prompt ends right where generation begins.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
# The base checkpoint is loaded with the 4-bit config defined above;
# device_map="auto" leaves device placement to the library.
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Two-level error handling: a failure to LOAD the adapter is fatal (re-raised),
# whereas a failure to MERGE it is tolerated and the un-merged PEFT model is
# used for inference instead.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Raise the transformers log threshold to CRITICAL to keep the cell output quiet.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# No device argument is passed: the model was already placed at load time.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuses `validation_data` when an earlier cell left it in the kernel namespace;
# otherwise rebuilds a split: first 1200 rows, 10% held out, seed 42.
# NOTE(review): the fallback loads the *small* dataset, while this section is
# titled "Medium Dataset" -- the evaluated split therefore depends on which
# cell ran last. Confirm the intended dataset id.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     # Keep only the columns the evaluation reads.
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy so predictions and metrics can be added as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively turn NumPy arrays nested inside ``obj`` into plain lists.

    Dicts and lists are rebuilt with every value/element converted; an
    ``np.ndarray`` is converted with ``tolist()`` and the result is walked
    again. Any other object is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() may still contain dicts that hold arrays, so recurse.
        return deep_convert(obj.tolist())
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return list(map(deep_convert, obj))
    return obj

# Replace each row's 'items' value with the result of deep_convert so the
# references can later be compared with, and serialised like, parsed JSON.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array or object found in model output.

    Anything before the first ``[`` or ``{`` (markdown fences such as
    ```` ```json ````, introductory prose) is skipped, and anything after the
    end of the JSON value (closing fence, trailing chatter) is ignored.

    The value is decoded with ``json.JSONDecoder.raw_decode`` instead of by
    counting brackets. A bracket counter is unaware of string literals, so an
    unbalanced ``]`` or ``}`` inside a string (e.g. ``"name": "a]b"``) would
    cut the extracted text short and turn a valid answer into a parse failure.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value (list or dict), or ``None`` when the input is
        not a string, contains no ``[``/``{``, or the text starting at the
        first bracket/brace is not valid JSON (for example truncated output).
    """
    if not isinstance(generated_text, str):
        return None

    # Earliest position at which a JSON array or object could start.
    candidates = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not candidates:
        return None
    start_index = min(candidates)

    try:
        # raw_decode stops at the end of the first complete JSON value and
        # tolerates arbitrary trailing text.
        json_data, _ = json.JSONDecoder().raw_decode(generated_text, start_index)
        return json_data
    except (json.JSONDecodeError, RecursionError) as e:
        # Best-effort parser: report the failure and let the caller score the row as None.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{generated_text[start_index:]}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Results are gathered per batch and only appended to `all_results` once the
# batch is finished. This keeps row i of `all_results` aligned with prompt i
# even when a batch fails half-way: the whole batch is then recorded as None
# instead of mixing partial results with a full batch of padding.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results.append(parse_json_robustly(generated_text))
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Best-effort evaluation: a failing batch is scored as all-None rather than aborting the run.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    if len(batch_results) != len(batch_prompts):
        # Outputs cannot be attributed to prompts reliably; discard the batch.
        print(f"Warning: Result count mismatch in batch {i // inference_batch_size + 1}! Marking the batch as None.")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased, recursively.

    Lists and dicts are rebuilt; dictionary keys are kept exactly as given and
    only their values are processed. Objects of any other type are returned
    unchanged.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted structure matches the reference.

    Both inputs are lower-cased with ``to_lower``, serialised to compact JSON
    with sorted keys, and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher's ratio, or 0.0 when either input is ``None`` or when
        lower-casing/serialisation raises (the error is printed).
    """
    if pred is None or target is None:
        return 0.0
    try:
        # Case-insensitive comparison: normalise both sides first.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form: sorted keys, no whitespace between tokens.
        dump_options = {"sort_keys": True, "separators": (',', ':')}
        pred_str, target_str = (json.dumps(value, **dump_options) for value in (pred, target))

        matcher = SequenceMatcher(None, pred_str, target_str)
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
# Row-wise similarity between prediction and reference; similarity_score
# returns 0.0 when either side is None.
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
# A row is an exact match only when both sides are present and compare equal
# after their string values are lower-cased.
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Averages over all rows, so rows without a parsed prediction pull both numbers down.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Print every row whose similarity is below 0.8, untruncated, for manual inspection.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-12b-it...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) with quantization...


config.json:   0%|          | 0.00/916 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/109k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-small-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/274M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 12/12 [19:37<00:00, 98.08s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9718
✅ Exact Match Accuracy: 51.67%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                             transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                               items                                                                                     




### **Large 12b Model, Medium Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter that this cell evaluates.
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-large-json-LoRA-4bit-128-16"
# Instruction text prepended verbatim to every transcript when prompts are built.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 16

# --- Check GPU Availability & L4 ---
# Abort when CUDA is missing; a GPU whose name lacks "L4" only triggers a warning.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    device = 0  # CUDA device index used for the capability query below

# --- Memory Cleanup ---
# Run the Python garbage collector, then release PyTorch's cached CUDA blocks.
# NOTE(review): if `model` / `pipe` from an earlier cell are still bound in the
# kernel, they presumably keep that model's memory alive through this step --
# confirm whether they should be deleted first.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 is chosen when the device's major compute capability is >= 8,
# float16 otherwise; the same dtype is later passed as the model's torch_dtype.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# EOS doubles as the padding token, and padding goes on the left so that in a
# batch every prompt ends right where generation begins.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
# The base checkpoint is loaded with the 4-bit config defined above;
# device_map="auto" leaves device placement to the library.
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Two-level error handling: a failure to LOAD the adapter is fatal (re-raised),
# whereas a failure to MERGE it is tolerated and the un-merged PEFT model is
# used for inference instead.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Raise the transformers log threshold to CRITICAL to keep the cell output quiet.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# No device argument is passed: the model was already placed at load time.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuses `validation_data` when an earlier cell left it in the kernel namespace;
# otherwise rebuilds a split: first 1200 rows, 10% held out, seed 42.
# NOTE(review): the fallback loads the *small* dataset, while this section is
# titled "Medium Dataset" -- the evaluated split therefore depends on which
# cell ran last. Confirm the intended dataset id.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     # Keep only the columns the evaluation reads.
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy so predictions and metrics can be added as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively turn NumPy arrays nested inside ``obj`` into plain lists.

    Dicts and lists are rebuilt with every value/element converted; an
    ``np.ndarray`` is converted with ``tolist()`` and the result is walked
    again. Any other object is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() may still contain dicts that hold arrays, so recurse.
        return deep_convert(obj.tolist())
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return list(map(deep_convert, obj))
    return obj

# Replace each row's 'items' value with the result of deep_convert so the
# references can later be compared with, and serialised like, parsed JSON.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array or object found in model output.

    Anything before the first ``[`` or ``{`` (markdown fences such as
    ```` ```json ````, introductory prose) is skipped, and anything after the
    end of the JSON value (closing fence, trailing chatter) is ignored.

    The value is decoded with ``json.JSONDecoder.raw_decode`` instead of by
    counting brackets. A bracket counter is unaware of string literals, so an
    unbalanced ``]`` or ``}`` inside a string (e.g. ``"name": "a]b"``) would
    cut the extracted text short and turn a valid answer into a parse failure.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded JSON value (list or dict), or ``None`` when the input is
        not a string, contains no ``[``/``{``, or the text starting at the
        first bracket/brace is not valid JSON (for example truncated output).
    """
    if not isinstance(generated_text, str):
        return None

    # Earliest position at which a JSON array or object could start.
    candidates = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not candidates:
        return None
    start_index = min(candidates)

    try:
        # raw_decode stops at the end of the first complete JSON value and
        # tolerates arbitrary trailing text.
        json_data, _ = json.JSONDecoder().raw_decode(generated_text, start_index)
        return json_data
    except (json.JSONDecodeError, RecursionError) as e:
        # Best-effort parser: report the failure and let the caller score the row as None.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{generated_text[start_index:]}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Results are gathered per batch and only appended to `all_results` once the
# batch is finished. This keeps row i of `all_results` aligned with prompt i
# even when a batch fails half-way: the whole batch is then recorded as None
# instead of mixing partial results with a full batch of padding.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results.append(parse_json_robustly(generated_text))
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Best-effort evaluation: a failing batch is scored as all-None rather than aborting the run.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    if len(batch_results) != len(batch_prompts):
        # Outputs cannot be attributed to prompts reliably; discard the batch.
        print(f"Warning: Result count mismatch in batch {i // inference_batch_size + 1}! Marking the batch as None.")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased, recursively.

    Lists and dicts are rebuilt; dictionary keys are kept exactly as given and
    only their values are processed. Objects of any other type are returned
    unchanged.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted structure matches the reference.

    Both inputs are lower-cased with ``to_lower``, serialised to compact JSON
    with sorted keys, and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher's ratio, or 0.0 when either input is ``None`` or when
        lower-casing/serialisation raises (the error is printed).
    """
    if pred is None or target is None:
        return 0.0
    try:
        # Case-insensitive comparison: normalise both sides first.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form: sorted keys, no whitespace between tokens.
        dump_options = {"sort_keys": True, "separators": (',', ':')}
        pred_str, target_str = (json.dumps(value, **dump_options) for value in (pred, target))

        matcher = SequenceMatcher(None, pred_str, target_str)
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
# Row-wise similarity between prediction and reference; similarity_score
# returns 0.0 when either side is None.
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
# A row is an exact match only when both sides are present and compare equal
# after their string values are lower-cased.
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Averages over all rows, so rows without a parsed prediction pull both numbers down.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Print every row whose similarity is below 0.8, untruncated, for manual inspection.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-large-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/274M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 180 rows.
Conversion complete.
Running batched inference (batch size: 16)...


Inference Batches: 100%|██████████| 12/12 [16:35<00:00, 82.94s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.8859
✅ Exact Match Accuracy: 27.22%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                       transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                                               items                                                                           




# **Load Large Dataset**

In [None]:
from datasets import load_dataset
import json

# Dataset to load and the split to read from it.
dataset_id = "iTzMiNOS/voice-orders-large-clean-12k"
split_name = "train"

print(f"Loading dataset '{dataset_id}' (split: '{split_name}')...")
dataset = load_dataset(dataset_id, split=split_name)

# Drop every column except the three used downstream.
columns_to_keep = ["transcribed_text", "items", "speaker"]
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])

# Work on at most the first 1200 rows.
dataset = dataset.select(range(min(1200, len(dataset))))

print(f"Dataset loaded: {dataset}")

# Instruction text placed in front of every transcript (read by format_data_for_sft).
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""

# 90/10 train/validation split; the fixed seed makes the shuffle repeatable.
# `validation_data` set here is what later evaluation cells pick up from the
# kernel namespace.
print("Splitting dataset into train and validation sets...")
train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_data = train_val_split['train']
validation_data = train_val_split['test']

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")

def format_data_for_sft(example):
    """Build one supervised fine-tuning text from a dataset row.

    The result is the global ``prefix``, the row's transcript, a newline, the
    row's items serialised as compact JSON, and the tokenizer's EOS token.

    Args:
        example: Mapping with ``"transcribed_text"`` and ``"items"`` entries.

    Returns:
        ``{"formatted_text": <training string>}``.

    Raises:
        NameError: If no global ``tokenizer`` has been defined yet.
    """
    transcript = example["transcribed_text"]
    # Compact separators keep the target JSON free of optional whitespace.
    target_json = json.dumps(example["items"], separators=(',', ':'))
    if 'tokenizer' in globals():
        end_marker = tokenizer.eos_token
        return {"formatted_text": f"{prefix}{transcript}\n{target_json}{end_marker}"}
    raise NameError("Tokenizer not found. Please run the tokenizer loading cell (Cell 5) first.")

# Formatting needs the tokenizer's EOS token, so the mapping only runs when a
# global `tokenizer` exists; otherwise both datasets are set to None and a
# warning tells the user which cell to re-run.
if 'tokenizer' in globals():
    print("Applying formatting function to the datasets...")
    # remove_columns drops the original columns, leaving only "formatted_text".
    train_dataset = train_data.map(format_data_for_sft, remove_columns=train_data.column_names)
    validation_dataset = validation_data.map(format_data_for_sft, remove_columns=validation_data.column_names) # Process validation data too!
    print("Dataset formatting complete.")
    print(f"Train dataset features: {train_dataset.features}")
    print(f"Validation dataset features: {validation_dataset.features}")
else:
    print("WARNING: Tokenizer not loaded yet. Re-run Cell 5 and this cell's mapping part.")
    train_dataset = None
    validation_dataset = None

Loading dataset 'iTzMiNOS/voice-orders-large-clean-12k' (split: 'train')...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

Dataset loaded: Dataset({
    features: ['transcribed_text', 'speaker', 'items'],
    num_rows: 1200
})
Splitting dataset into train and validation sets...
Train set size: 1080
Validation set size: 120


### **Small 1b Model, Large Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter that is loaded on top of it later in this cell.
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-small-json-LoRA-4bit-128-16"
# Instruction text placed in front of every transcribed order to form the prompt.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call in the inference loop.
inference_batch_size = 64

# --- Check GPU Availability & L4 ---
# Abort early when CUDA is missing; a GPU other than an L4 only produces a warning.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # Index of the GPU that was inspected above.
    device = 0

# --- Memory Cleanup ---
# Collect unreachable Python objects first, then release PyTorch's cached CUDA
# memory, before the model for this section is loaded.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# Choose the 4-bit compute dtype from the GPU instead of hard-coding bfloat16:
# devices with compute capability below 8 fall back to float16.  This is the
# same rule the other evaluation cells of this notebook apply.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# NF4 4-bit weights, no double quantization; matmuls run in `compute_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the left; prompts are sent to the
# pipeline in batches further down.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
# The base checkpoint is loaded with the bitsandbytes config defined above;
# device placement is left to device_map="auto".
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Attach the adapter, then try to fold its weights into the base model.
# A failed merge is tolerated (the PEFT-wrapped model is used as is);
# a failed adapter load is reported and re-raised.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Restrict transformers logging to CRITICAL from here on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# No device argument is given: the model was already placed when it was loaded.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Prefer the split left in the kernel by the dataset cell.  The fallback below
# rebuilds it (e.g. after a kernel restart) and therefore has to read the
# LARGE dataset this section evaluates on; it previously pointed at the small
# dataset, which would have silently scored the model on the wrong data.
# NOTE(review): assumes the dataset cell uses the same 1200-row subset,
# test_size and seed -- confirm so both paths yield the same validation rows.
if 'validation_data' not in globals():
    print("validation_data not found, attempting reload...")
    from datasets import load_dataset
    dataset_id = "iTzMiNOS/voice-orders-large-clean-12k"
    split_name = "train"
    dataset = load_dataset(dataset_id, split=split_name)
    columns_to_keep = ["transcribed_text", "items", "speaker"]
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
    dataset = dataset.select(range(min(1200, len(dataset))))
    train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    validation_data = train_val_split['test']
    print("Reloaded and split dataset.")

# Work on a pandas copy so predictions and metrics can be added as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace numpy arrays nested in ``obj`` with plain Python lists.

    Dicts and lists are rebuilt with their values/elements converted; any other
    value is returned unchanged.
    """
    # Unwrap an array first, then treat whatever tolist() produced like any
    # other value (it may itself hold dicts or further arrays).
    if isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Replace any numpy arrays inside each row's reference 'items' value with plain
# Python lists (see deep_convert) before the values are compared and serialised.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in model output.

    The text may carry extra material around the JSON (markdown code fences,
    explanations, trailing tokens).  Decoding starts at the first ``[`` or
    ``{`` and stops where that JSON value ends, so anything before or after it
    is ignored.  A real JSON decoder is used rather than counting brackets, so
    ``]`` / ``}`` characters inside string values no longer cut the value short.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded Python object (list or dict), or ``None`` when the text
        contains no ``[`` / ``{`` or the value starting there is not valid
        JSON (for example because generation was cut off).  Failures are
        reported with a printed warning; nothing is raised.
    """
    if not isinstance(generated_text, str):
        print(f"Warning: Unexpected error during JSON parsing: expected str, got {type(generated_text).__name__}.")
        print(f"Attempted to parse (after extraction): '{None}'")
        return None

    # Earliest opening bracket or brace, whichever comes first.
    candidates = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not candidates:
        # No JSON structure at all (a fence without '[' or '{' cannot hold any).
        return None
    start_index = min(candidates)

    try:
        # raw_decode parses exactly one JSON value starting at start_index and
        # ignores whatever follows it (closing fence, chatter, ...).
        json_data, _ = json.JSONDecoder().raw_decode(generated_text, start_index)
        return json_data
    except json.JSONDecodeError as e:
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{generated_text[start_index:].strip()}'")
        return None
    except RecursionError as e:
        # Pathologically deep nesting; treat like any other unparsable output.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{generated_text[start_index:].strip()}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Each batch collects its results in a local list that is committed only once
# the whole batch has been handled.  A failure part-way through a batch
# therefore replaces that batch with Nones instead of appending extra entries,
# which keeps all_results aligned one-to-one with the rows of df.
# NOTE(review): no do_sample/seed is passed to the pipeline, so decoding follows
# the model's default generation settings -- confirm they are deterministic if
# the reported scores must be reproducible.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results.append(parse_json_robustly(generated_text))
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
        if len(batch_results) != len(batch_prompts):
            # The pipeline returned a different number of outputs than prompts;
            # positions can no longer be trusted, so discard the batch.
            raise ValueError(f"expected {len(batch_prompts)} outputs, got {len(batch_results)}")
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)

# Guaranteed by the per-batch bookkeeping above; fail loudly if it is ever broken.
assert len(all_results) == len(all_prompts), "prediction count does not match prompt count"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased.

    Lists and dict values are processed recursively.  Dict keys and all
    non-string leaves are returned as they are.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        lowered = {}
        for key, value in obj.items():
            lowered[key] = to_lower(value)  # keys are left untouched
        return lowered
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Case-insensitive textual similarity between a prediction and its target.

    Both structures are lower-cased, serialised to compact JSON with sorted
    keys and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher's ratio, or ``0.0`` when either argument is ``None`` or
        the comparison raises.
    """
    if any(value is None for value in (pred, target)):
        return 0.0

    def _canonical(structure):
        # Compact, key-sorted JSON text of the structure.
        return json.dumps(structure, sort_keys=True, separators=(',', ':'))

    try:
        pred = to_lower(pred)
        target = to_lower(target)
        matcher = SequenceMatcher(None, _canonical(pred), _canonical(target))
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
# Per-row fuzzy score; rows without a parsed prediction score 0.0 (see similarity_score).
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
# Per-row strict score: lower-cased prediction must equal the lower-cased
# reference. This is a plain == comparison, so list order matters.
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Averages over all validation rows, including rows whose output failed to parse.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Print every row scoring below 0.8 in full (no column truncation) for manual inspection.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) with quantization...
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-small-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 64)...


Inference Batches:  50%|█████     | 1/2 [01:44<01:44, 104.63s/it]

Attempted to parse (after extraction): '[{"customizations":["Ranch","Blue Cheese"],"name":"Extra Dipping Sauce"},{"customizations":[],"name":"Fruit Salad"},{"customizations":["Apple"],"name":"Fresh Juices"},{"customizations":[],"name":"Berry Parfait"},{"customizations":[],"name":"Chocolate"},{"customizations":["Fanta"],"name":"Soft Drinks"},{"customizations":["Whiskey","Vodka","Rum"],"name":"Extra Shot of Alcohol"},{"customizations":["Greek"],"name":"Salad"},{"customizations":["Shrimp"],"name":"Cocktail"]'
Attempted to parse (after extraction): '[{"customizations":["IPA","Craft Beer"],"name":"Craft Beer"},{"customizations":["Vegetarian","Chicken"],"name":"Spring Rolls"},{"customizations":["Mild","Spicy"],"name":"Coleslaw"},{"customizations":["Ranch","BBQ"],"name":"Onion Rings"},{"customizations":["Garlic Soy","Teriyaki","Peanut Sauce"],"name":"Vegetable Stir-fry"},{"customizations":["Chopped Parsley"],"name":"Cream of Mushroom"},{"customizations":["Ranch","Blue Cheese"],"name":"Extra D

Inference Batches: 100%|██████████| 2/2 [03:28<00:00, 104.03s/it]

Attempted to parse (after extraction): '[{"customizations":["Lemon Wedge","Fresh Parsley","Ranch"],"name":"Chicken Soup"},{"customizations":["Ranch","BBQ Sauce"],"name":"Onion Rings"},{"customizations":["Blue Cheese","Ranch"],"name":"Extra Dipping Sauce"},{"customizations":["Carrot","Apple","Orange"],"name":"Fresh Juices"},{"customizations":["Falafel","Grilled Chicken"],"name":"Greek Salad"},{"customizations":["Lager","IPA"],"name":"Craft Beer"},{"customizations":["Crispy"],"name":"Fish and Chips"},{"customizations":["Lemon Herb","Spicy Cocktail Sauce"],"name":"Shrimp Cocktail"]'
Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.4988
✅ Exact Match Accuracy: 2.50%

--- Low Similarity Examples (< 0.8) ---





                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [None]:
pip install -U transformers peft bitsandbytes accelerate



### **Medium 1b Model, Large Dataset**


In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter that is loaded on top of it later in this cell.
base_model_id = "google/gemma-3-1b-it"
adapter_model_id = "iTzMiNOS/gemma-3-1b-it-medium-json-LoRA-4bit-128-16"
# Instruction text placed in front of every transcribed order to form the prompt.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call in the inference loop.
inference_batch_size = 32

# --- Check GPU Availability & L4 ---
# Abort early when CUDA is missing; a GPU other than an L4 only produces a warning.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # GPU index; used below to query the device's compute capability.
    device = 0

# --- Memory Cleanup ---
# Collect unreachable Python objects first, then release PyTorch's cached CUDA
# memory, before the model for this section is loaded.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 when the GPU's major compute capability is at least 8, float16 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# NF4 4-bit weights, no double quantization; matmuls run in `compute_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the left; prompts are sent to the
# pipeline in batches further down.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
# The base checkpoint is loaded with the bitsandbytes config defined above;
# device placement is left to device_map="auto".
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Attach the adapter, then try to fold its weights into the base model.
# A failed merge is tolerated (the PEFT-wrapped model is used as is);
# a failed adapter load is reported and re-raised.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# Restrict transformers logging to CRITICAL from here on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
# No device argument is given: the model was already placed when it was loaded.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Prefer the split left in the kernel by the dataset cell.  The fallback below
# rebuilds it (e.g. after a kernel restart) and therefore has to read the
# LARGE dataset this section evaluates on; it previously pointed at the small
# dataset, which would have silently scored the model on the wrong data.
# NOTE(review): assumes the dataset cell uses the same 1200-row subset,
# test_size and seed -- confirm so both paths yield the same validation rows.
if 'validation_data' not in globals():
    print("validation_data not found, attempting reload...")
    from datasets import load_dataset
    dataset_id = "iTzMiNOS/voice-orders-large-clean-12k"
    split_name = "train"
    dataset = load_dataset(dataset_id, split=split_name)
    columns_to_keep = ["transcribed_text", "items", "speaker"]
    dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
    dataset = dataset.select(range(min(1200, len(dataset))))
    train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    validation_data = train_val_split['test']
    print("Reloaded and split dataset.")

# Work on a pandas copy so predictions and metrics can be added as columns.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace numpy arrays nested in ``obj`` with plain Python lists.

    Dicts and lists are rebuilt with their values/elements converted; any other
    value is returned unchanged.
    """
    # Unwrap an array first, then treat whatever tolist() produced like any
    # other value (it may itself hold dicts or further arrays).
    if isinstance(obj, np.ndarray):
        obj = obj.tolist()
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Replace any numpy arrays inside each row's reference 'items' value with plain
# Python lists (see deep_convert) before the values are compared and serialised.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in model output.

    The text may carry extra material around the JSON (markdown code fences,
    explanations, trailing tokens).  Decoding starts at the first ``[`` or
    ``{`` and stops where that JSON value ends, so anything before or after it
    is ignored.  A real JSON decoder is used rather than counting brackets, so
    ``]`` / ``}`` characters inside string values no longer cut the value short.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded Python object (list or dict), or ``None`` when the text
        contains no ``[`` / ``{`` or the value starting there is not valid
        JSON (for example because generation was cut off).  Failures are
        reported with a printed warning; nothing is raised.
    """
    if not isinstance(generated_text, str):
        print(f"Warning: Unexpected error during JSON parsing: expected str, got {type(generated_text).__name__}.")
        print(f"Attempted to parse (after extraction): '{None}'")
        return None

    # Earliest opening bracket or brace, whichever comes first.
    candidates = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not candidates:
        # No JSON structure at all (a fence without '[' or '{' cannot hold any).
        return None
    start_index = min(candidates)

    try:
        # raw_decode parses exactly one JSON value starting at start_index and
        # ignores whatever follows it (closing fence, chatter, ...).
        json_data, _ = json.JSONDecoder().raw_decode(generated_text, start_index)
        return json_data
    except json.JSONDecodeError as e:
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{generated_text[start_index:].strip()}'")
        return None
    except RecursionError as e:
        # Pathologically deep nesting; treat like any other unparsable output.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{generated_text[start_index:].strip()}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Each batch collects its results in a local list that is committed only once
# the whole batch has been handled.  A failure part-way through a batch
# therefore replaces that batch with Nones instead of appending extra entries,
# which keeps all_results aligned one-to-one with the rows of df.
# NOTE(review): no do_sample/seed is passed to the pipeline, so decoding follows
# the model's default generation settings -- confirm they are deterministic if
# the reported scores must be reproducible.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                batch_results.append(parse_json_robustly(generated_text))
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
        if len(batch_results) != len(batch_prompts):
            # The pipeline returned a different number of outputs than prompts;
            # positions can no longer be trusted, so discard the batch.
            raise ValueError(f"expected {len(batch_prompts)} outputs, got {len(batch_results)}")
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)

# Guaranteed by the per-batch bookkeeping above; fail loudly if it is ever broken.
assert len(all_results) == len(all_prompts), "prediction count does not match prompt count"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased.

    Lists and dict values are processed recursively.  Dict keys and all
    non-string leaves are returned as they are.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        lowered = {}
        for key, value in obj.items():
            lowered[key] = to_lower(value)  # keys are left untouched
        return lowered
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Case-insensitive textual similarity between a prediction and its target.

    Both structures are lower-cased, serialised to compact JSON with sorted
    keys and compared with ``difflib.SequenceMatcher``.

    Returns:
        The matcher's ratio, or ``0.0`` when either argument is ``None`` or
        the comparison raises.
    """
    if any(value is None for value in (pred, target)):
        return 0.0

    def _canonical(structure):
        # Compact, key-sorted JSON text of the structure.
        return json.dumps(structure, sort_keys=True, separators=(',', ':'))

    try:
        pred = to_lower(pred)
        target = to_lower(target)
        matcher = SequenceMatcher(None, _canonical(pred), _canonical(target))
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
print("Calculating metrics...")
# Per-row fuzzy score; rows without a parsed prediction score 0.0 (see similarity_score).
df['similarity'] = df.apply(lambda row: similarity_score(row['predicted_items'], row['items']), axis=1)
# Per-row strict score: lower-cased prediction must equal the lower-cased
# reference. This is a plain == comparison, so list order matters.
df['exact_match'] = df.apply(lambda row:
                             row['predicted_items'] is not None and
                             row['items'] is not None and
                             to_lower(row['predicted_items']) == to_lower(row['items']),
                             axis=1)

# Averages over all validation rows, including rows whose output failed to parse.
average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Print every row scoring below 0.8 in full (no column truncation) for manual inspection.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df[df['similarity'] < 0.8][['transcribed_text', 'items', 'predicted_items', 'similarity']]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-1b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-1b-it) with quantization...
Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-1b-it-medium-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...
LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches:  25%|██▌       | 1/4 [00:24<01:13, 24.52s/it]

Attempted to parse (after extraction): '[{"customizations":["BBQ Sauce","Gravy","Pesto"],"name":"Extra Sauce"},{"customizations":["Marinara"],"name":"Mozzarella Sticks"},{"customizations":["Toasted Sesame Seeds"],"name":"Miso Soup"},{"customizations":[],"name":"Vanilla"},{"customizations":[],"name":"Mint Chocolate Chip"},{"customizations":["Ranch","BBQ Sauce"],"name":"Onion Rings"},{"customizations":["Chocolate Chips","Berries"],"name":"Extra Topping"},{"customizations":["Cheddar Cheese","Croutons","Basil Pesto"],"name":"Tomato Soup"},{"customConvertToJSONStringArray(
{"customizations":["Garlic Butter"],"name":"Grilled Chicken Breast"},{"customizations":["Fries","Veggies"],"name":"Side"}]'


Inference Batches: 100%|██████████| 4/4 [01:22<00:00, 20.62s/it]

Attempted to parse (after extraction): '[{"customizations":["Cheesecake","Caesar Salad","Grilled Shrimp","Grilled Chicken","Chicken Soup","Fresh Parsley","Lemon Wedge"],"name":"Cheesecake"},{"customizations":["Grilled Shrimp","Grilled Chicken"],"name":"Caesar Salad"},{"customizations":["Fresh Parsley","Lemon Wedge"],"name":"Chicken Soup"},{"customizations":["Nuts","Chocolate Chips"],"name":"Extra Topping"},{"customizations":[],"name":"Extra Ice Cream Scoop"},{"customizations":[],"name":"Extra Bun or Bread Roll"},{"customizations":["Oat Milk","Almond Milk"],"name":"Milk Alternatives"},{"customConvertToJSONStringFormat("{json}")": "Cheesecake, Caesar Salad, Grilled Shrimp, Grilled Chicken, Chicken Soup, Fresh Parsley, Lemon Wedge"},{"customizations":["Grilled Shrimp","Grilled Chicken"],"name":"Caesar Salad"},{"customizations":["Fresh Parsley","Lemon Wedge"],"name":"Chicken Soup"},{"customizations":["Nuts","Chocolate Chips"],"name":"Extra Topping"},{"customizations":[],"name":"Extra Ice C




                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               transcribed_text                                                                                                                                                                                                                                                                                                                                                                                                                                         

### **Small 4b Model, Large Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter that is loaded on top of it below.
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-small-json-LoRA-4bit-128-16"
# Instruction text that is prepended verbatim to every transcribed order when
# the inference prompts are built later in this cell.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 32

# --- Check GPU Availability & L4 ---
# Without CUDA the cell aborts; a GPU other than an L4 only produces a warning
# and the run continues.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index; used below to query the compute capability.
    device = 0

# --- Memory Cleanup ---
# Release Python garbage and cached CUDA blocks left by earlier cells before
# another model is loaded.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 is selected when the GPU's major compute capability is >= 8,
# float16 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# EOS doubles as the padding token, and padding goes on the left.
# NOTE(review): left padding is presumably chosen so that, in batched
# generation, new tokens directly follow each prompt - confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# A failure to load the adapter is fatal (re-raised). A failure to merge it is
# not: the cell then continues with the un-merged PeftModel wrapper.
# NOTE(review): merging LoRA weights into a 4-bit quantized base may introduce
# rounding differences versus running the adapter un-merged - confirm this is
# acceptable for the reported metrics.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# `logging` here is the transformers logging module imported at the top of the
# cell; only CRITICAL messages remain visible from this point on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# NOTE(review): this depends on kernel state. If `validation_data` is left over
# from an earlier cell it is reused as-is; otherwise the first 1200 rows of the
# dataset are re-split 90/10 with seed 42. Confirm both paths give the same
# rows. The section title mentions a large dataset while this fallback loads
# the "small-clean-12k" dataset capped at 1200 rows - confirm the intended
# evaluation set.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split for inference and scoring.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays with plain Python lists.

    Dict values and list elements are visited recursively; an ndarray is first
    turned into a list and that list is then visited as well. Any other value
    is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() may still contain arrays (object dtype), so recurse on it.
        return deep_convert(obj.tolist())
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Normalise every ground-truth 'items' entry to plain Python containers
# (NumPy arrays become lists) before it is scored against the model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in model output.

    The JSON may be wrapped in markdown fences or surrounded by prose:
    everything before the first '[' or '{' and everything after its matching
    closing character is ignored. While searching for the matching closer,
    brackets and braces that occur inside JSON string literals are skipped, so
    a value such as "extra ] sauce" does not truncate the extracted span.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded Python object (list or dict), or None when the text has no
        opening bracket/brace, no matching closer is found, or the extracted
        span is not valid JSON. Decode failures are reported via print.
    """
    json_str = None
    try:
        # The JSON value starts at whichever of '[' / '{' appears first.
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')
        candidates = [idx for idx in (first_bracket, first_brace) if idx != -1]
        if not candidates:
            return None
        start_index = min(candidates)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Balance the opening character against its closer, ignoring anything
        # inside double-quoted strings (including backslash-escaped quotes).
        open_count = 0
        end_index = -1
        in_string = False
        escaped = False
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                open_count += 1
            elif ch == end_char:
                open_count -= 1
                if open_count == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Unterminated structure (e.g. generation was cut off).
            return None

        # Extract the candidate span and parse it.
        json_str = generated_text[start_index:end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Log parse failures together with the string that was attempted.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Any other problem (e.g. non-string input) is also reported, not raised.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Prompts are generated batch by batch. Every prompt ends up with exactly one
# entry in `all_results` (the parsed JSON, or None on any failure) so that the
# predictions stay row-aligned with `df`.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Results are collected per batch and only committed once the whole batch
    # has been processed. Appending straight to `all_results` would leave
    # partial entries behind if an error occurs mid-batch, and the None padding
    # added by the error handler would then shift every later prediction.
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
        # A batch that yields the wrong number of outputs cannot be aligned
        # with its prompts, so it is treated as a failed batch.
        if len(batch_results) != len(batch_prompts):
            raise ValueError(
                f"expected {len(batch_prompts)} outputs, got {len(batch_results)}"
            )
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)
# Guaranteed by the per-batch handling above; kept as a cheap sanity check.
assert len(all_results) == len(all_prompts), "Result count mismatch after inference."
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased.

    List elements and dict values are processed recursively. Dict keys are
    left as they are, and values of any other type are returned unchanged.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted order matches the reference order.

    Both structures are lower-cased with ``to_lower``, serialised to compact
    JSON with sorted keys, and the two strings are compared with difflib's
    SequenceMatcher.

    Returns:
        The SequenceMatcher ratio, or 0.0 when either argument is None or when
        normalising/serialising raises (the error is printed).
    """
    if any(value is None for value in (pred, target)):
        return 0.0
    try:
        # Case-insensitive comparison: normalise both sides first.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form: sorted keys, no whitespace between tokens.
        serialized = [
            json.dumps(structure, sort_keys=True, separators=(',', ':'))
            for structure in (pred, target)
        ]
        return SequenceMatcher(None, *serialized).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics ---
# Per-row scores: a fuzzy similarity ratio plus a strict, case-insensitive
# equality flag (False whenever either side is missing).
print("Calculating metrics...")
df['similarity'] = df.apply(
    lambda record: similarity_score(record['predicted_items'], record['items']),
    axis=1,
)
df['exact_match'] = df.apply(
    lambda record: (
        not (record['predicted_items'] is None or record['items'] is None)
        and to_lower(record['predicted_items']) == to_lower(record['items'])
    ),
    axis=1,
)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches ---
# Rows scoring below 0.8, shown with inputs, reference and prediction.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-4b-it...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) with quantization...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-small-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/131M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [01:21<00:00, 20.28s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.8598
✅ Exact Match Accuracy: 20.00%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                            transcribed_text                                                                                                                                                                                                                                                                                                                                                          




### **Medium 4b Model, Large Dataset**


In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter that is loaded on top of it below.
base_model_id = "google/gemma-3-4b-it"
adapter_model_id = "iTzMiNOS/gemma-3-4b-it-medium-json-LoRA-4bit-128-16"
# Instruction text that is prepended verbatim to every transcribed order when
# the inference prompts are built later in this cell.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Number of prompts handed to the pipeline per call.
inference_batch_size = 32

# --- Check GPU Availability & L4 ---
# Without CUDA the cell aborts; a GPU other than an L4 only produces a warning
# and the run continues.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index; used below to query the compute capability.
    device = 0

# --- Memory Cleanup ---
# Release Python garbage and cached CUDA blocks left by earlier cells before
# another model is loaded.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 is selected when the GPU's major compute capability is >= 8,
# float16 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# EOS doubles as the padding token, and padding goes on the left.
# NOTE(review): left padding is presumably chosen so that, in batched
# generation, new tokens directly follow each prompt - confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# A failure to load the adapter is fatal (re-raised). A failure to merge it is
# not: the cell then continues with the un-merged PeftModel wrapper.
# NOTE(review): merging LoRA weights into a 4-bit quantized base may introduce
# rounding differences versus running the adapter un-merged - confirm this is
# acceptable for the reported metrics.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# `logging` here is the transformers logging module imported at the top of the
# cell; only CRITICAL messages remain visible from this point on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# NOTE(review): this depends on kernel state. If `validation_data` is left over
# from an earlier cell it is reused as-is; otherwise the first 1200 rows of the
# dataset are re-split 90/10 with seed 42. Confirm both paths give the same
# rows. The section title mentions a large dataset while this fallback loads
# the "small-clean-12k" dataset capped at 1200 rows - confirm the intended
# evaluation set.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split for inference and scoring.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively replace NumPy arrays with plain Python lists.

    Dict values and list elements are visited recursively; an ndarray is first
    turned into a list and that list is then visited as well. Any other value
    is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() may still contain arrays (object dtype), so recurse on it.
        return deep_convert(obj.tolist())
    if isinstance(obj, list):
        return [deep_convert(element) for element in obj]
    if isinstance(obj, dict):
        return {key: deep_convert(value) for key, value in obj.items()}
    return obj

# Normalise every ground-truth 'items' entry to plain Python containers
# (NumPy arrays become lists) before it is scored against the model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON array/object embedded in model output.

    The JSON may be wrapped in markdown fences or surrounded by prose:
    everything before the first '[' or '{' and everything after its matching
    closing character is ignored. While searching for the matching closer,
    brackets and braces that occur inside JSON string literals are skipped, so
    a value such as "extra ] sauce" does not truncate the extracted span.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded Python object (list or dict), or None when the text has no
        opening bracket/brace, no matching closer is found, or the extracted
        span is not valid JSON. Decode failures are reported via print.
    """
    json_str = None
    try:
        # The JSON value starts at whichever of '[' / '{' appears first.
        first_bracket = generated_text.find('[')
        first_brace = generated_text.find('{')
        candidates = [idx for idx in (first_bracket, first_brace) if idx != -1]
        if not candidates:
            return None
        start_index = min(candidates)
        start_char = generated_text[start_index]
        end_char = ']' if start_char == '[' else '}'

        # Balance the opening character against its closer, ignoring anything
        # inside double-quoted strings (including backslash-escaped quotes).
        open_count = 0
        end_index = -1
        in_string = False
        escaped = False
        for i in range(start_index, len(generated_text)):
            ch = generated_text[i]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == start_char:
                open_count += 1
            elif ch == end_char:
                open_count -= 1
                if open_count == 0:
                    end_index = i
                    break

        if end_index == -1:
            # Unterminated structure (e.g. generation was cut off).
            return None

        # Extract the candidate span and parse it.
        json_str = generated_text[start_index:end_index + 1].strip()
        return json.loads(json_str)

    except json.JSONDecodeError as e:
        # Log parse failures together with the string that was attempted.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Any other problem (e.g. non-string input) is also reported, not raised.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Prompts are generated batch by batch. Every prompt ends up with exactly one
# entry in `all_results` (the parsed JSON, or None on any failure) so that the
# predictions stay row-aligned with `df`.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    # Results are collected per batch and only committed once the whole batch
    # has been processed. Appending straight to `all_results` would leave
    # partial entries behind if an error occurs mid-batch, and the None padding
    # added by the error handler would then shift every later prediction.
    batch_results = []
    try:
        batch_outputs = pipe(batch_prompts, max_new_tokens=1024, return_full_text=False, pad_token_id=tokenizer.eos_token_id, batch_size=len(batch_prompts))
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
        # A batch that yields the wrong number of outputs cannot be aligned
        # with its prompts, so it is treated as a failed batch.
        if len(batch_results) != len(batch_prompts):
            raise ValueError(
                f"expected {len(batch_prompts)} outputs, got {len(batch_results)}"
            )
    except Exception as e:
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)
# Guaranteed by the per-batch handling above; kept as a cheap sanity check.
assert len(all_results) == len(all_prompts), "Result count mismatch after inference."
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Return ``obj`` with every string value lower-cased.

    List elements and dict values are processed recursively. Dict keys are
    left as they are, and values of any other type are returned unchanged.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Score how closely a predicted order matches the reference order.

    Both structures are lower-cased with ``to_lower``, serialised to compact
    JSON with sorted keys, and the two strings are compared with difflib's
    SequenceMatcher.

    Returns:
        The SequenceMatcher ratio, or 0.0 when either argument is None or when
        normalising/serialising raises (the error is printed).
    """
    if any(value is None for value in (pred, target)):
        return 0.0
    try:
        # Case-insensitive comparison: normalise both sides first.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form: sorted keys, no whitespace between tokens.
        serialized = [
            json.dumps(structure, sort_keys=True, separators=(',', ':'))
            for structure in (pred, target)
        ]
        return SequenceMatcher(None, *serialized).ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics ---
# Per-row scores: a fuzzy similarity ratio plus a strict, case-insensitive
# equality flag (False whenever either side is missing).
print("Calculating metrics...")
df['similarity'] = df.apply(
    lambda record: similarity_score(record['predicted_items'], record['items']),
    axis=1,
)
df['exact_match'] = df.apply(
    lambda record: (
        not (record['predicted_items'] is None or record['items'] is None)
        and to_lower(record['predicted_items']) == to_lower(record['items'])
    ),
    axis=1,
)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches ---
# Rows scoring below 0.8, shown with inputs, reference and prediction.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-4b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-4b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-4b-it-medium-json-LoRA-4bit-128-16) onto the base model...


adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/131M [00:00<?, ?B/s]

LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches:  75%|███████▌  | 3/4 [01:01<00:20, 20.32s/it]

Attempted to parse (after extraction): '[{"customizations":["Teriyaki","Peanut Sauce"],"name":"Vegetable Stir-fry"},{""customizations":["BBQ"],"name":"Extra Sauce"},{""customizations":["BBQ"],"name":"Onion Rings"},{""customizations":["Truffle Oil"],"name":"Cream of Mushroom"},{""customizations":["Rum","Vodka","Whiskey"],"name":"Extra Shot of Alcohol"},{""customizations":["Garlic Butter","Lemon Dill"],"name":"Grilled Salmon"},{""customizations":["Sweetened"],"name":"Iced Tea"},{""customizations":[],"name":"Extra Whipped Cream"}]'


Inference Batches: 100%|██████████| 4/4 [01:21<00:00, 20.43s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9019
✅ Exact Match Accuracy: 34.17%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  transcribed_text                                                                                                                                                                                                                                                                                                                                                    




### **Small 12b Model, Large Dataset**

In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
# Base checkpoint and the LoRA adapter that is loaded on top of it below.
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-small-json-LoRA-4bit-128-16"
# Instruction text intended as the prompt prefix for each transcribed order.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
# Batch size intended for the inference calls later in this cell.
inference_batch_size = 32

# --- Check GPU Availability & L4 ---
# Without CUDA the cell aborts; a GPU other than an L4 only produces a warning
# and the run continues.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"CUDA is available. Using GPU: {device_name}")
    if "L4" not in device_name:
         print("Warning: The detected GPU is not an L4. Performance may vary.")
    # CUDA device index; used below to query the compute capability.
    device = 0

# --- Memory Cleanup ---
# Release Python garbage and cached CUDA blocks left by earlier cells before
# another model is loaded.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 is selected when the GPU's major compute capability is >= 8,
# float16 otherwise.
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(device)[0] >= 8 else torch.float16
# 4-bit NF4 quantization without double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# EOS doubles as the padding token, and padding goes on the left.
# NOTE(review): left padding is presumably chosen so that, in batched
# generation, new tokens directly follow each prompt - confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
eos_token = tokenizer.eos_token
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# A failure to load the adapter is fatal (re-raised). A failure to merge it is
# not: the cell then continues with the un-merged PeftModel wrapper.
# NOTE(review): merging LoRA weights into a 4-bit quantized base may introduce
# rounding differences versus running the adapter un-merged - confirm this is
# acceptable for the reported metrics.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
        print("LoRA adapter merged successfully.")
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
except Exception as e:
     print(f"❌ Failed to load LoRA adapter: {e}")
     raise e

# --- Build the Inference Pipeline ---
# `logging` here is the transformers logging module imported at the top of the
# cell; only CRITICAL messages remain visible from this point on.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
if hasattr(pipe, 'device'): print(f"Pipeline device: {pipe.device}")
else: print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# NOTE(review): this depends on kernel state. If `validation_data` is left over
# from an earlier cell it is reused as-is; otherwise the first 1200 rows of the
# dataset are re-split 90/10 with seed 42. Confirm both paths give the same
# rows. The section title mentions a large dataset while this fallback loads
# the "small-clean-12k" dataset capped at 1200 rows - confirm the intended
# evaluation set.
if 'validation_data' not in globals():
     print("validation_data not found, attempting reload...")
     from datasets import load_dataset
     dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
     split_name = "train"
     dataset = load_dataset(dataset_id, split=split_name)
     columns_to_keep = ["transcribed_text", "items", "speaker"]
     dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])
     dataset = dataset.select(range(min(1200, len(dataset))))
     train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
     validation_data = train_val_split['test']
     print("Reloaded and split dataset.")

# Work on a pandas copy of the validation split.
df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively turn numpy containers and scalars into plain Python objects.

    The result is meant to be JSON-serialisable and directly comparable with
    the output of ``json.loads``.

    Args:
        obj: Any value. Dicts, lists, tuples and ``np.ndarray`` instances are
            walked recursively; numpy scalars are unwrapped.

    Returns:
        - dict: a new dict with the same keys and converted values.
        - list / tuple / ndarray: a new ``list`` of converted elements
          (JSON has no tuple type, so tuples become lists).
        - numpy scalar (``np.generic``): the result of ``obj.item()``.
        - anything else: ``obj`` unchanged.
    """
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [deep_convert(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() only unwraps the array itself; object arrays may still hold
        # dicts/arrays, so the resulting list is walked again.
        return deep_convert(obj.tolist())
    if isinstance(obj, np.generic):
        # Not recursed on purpose: item() is the final conversion step.
        return obj.item()
    return obj

# Normalise every ground-truth cell with deep_convert so that it holds plain
# Python containers before it is compared against parsed model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON list/object embedded in model output.

    Everything before the first ``[`` or ``{`` (prose, markdown code fences,
    a ``json`` language tag) is skipped, and anything after the end of the
    JSON value is ignored. The value itself is decoded by the real JSON
    decoder, so brackets or braces that appear inside string literals do not
    confuse the extraction.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded Python object (usually a list or dict), or ``None`` when
        the input is not a string, contains no ``[``/``{``, or the text
        starting at that character is not valid JSON (e.g. truncated output).
    """
    if not isinstance(generated_text, str):
        return None

    # The earliest list/object opener marks where the JSON payload begins.
    openers = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not openers:
        return None
    json_str = generated_text[min(openers):]

    try:
        # raw_decode parses exactly one JSON value and tolerates trailing text
        # (closing fences, explanations) instead of raising "Extra data".
        json_data, _end = json.JSONDecoder().raw_decode(json_str)
        return json_data
    except json.JSONDecodeError as e:
        # Log the text that was attempted so failures can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Anything else (e.g. RecursionError on absurdly deep nesting) is also
        # treated as "no prediction" rather than aborting the evaluation.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Results are first gathered per batch and only added to `all_results` once the
# batch is known to contain exactly one entry per prompt. This keeps
# all_results[k] aligned with row k of df even when a batch fails part-way
# through or the pipeline returns an unexpected number of outputs.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        # NOTE(review): no `do_sample` argument is passed, so decoding follows
        # whatever the model's generation config specifies - confirm that the
        # evaluation is meant to be run that way (greedy decoding would make
        # the reported scores reproducible).
        batch_outputs = pipe(
            batch_prompts,
            max_new_tokens=1024,
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id,
            batch_size=len(batch_prompts),
        )
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Discard any partial results: a failed batch counts as all-missing.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    if len(batch_results) != len(batch_prompts):
        # Outputs can no longer be matched to prompts, so none of them is trusted.
        print(f"Warning: Result count mismatch in batch {i // inference_batch_size + 1}! Marking the whole batch as None.")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)

assert len(all_results) == len(all_prompts), "Result count does not match prompt count"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Lower-case every string found in a nested list/dict structure.

    Lists and dicts are rebuilt with their elements / values processed
    recursively. Dict keys are left as they are, and any value that is not a
    str, dict or list is returned unchanged.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Return a character-level similarity ratio between prediction and reference.

    Both structures are lower-cased with ``to_lower``, serialised to compact
    JSON with sorted keys, and compared with ``SequenceMatcher.ratio``.

    Returns 0.0 when either argument is None, and also when any step raises
    (the error is printed instead of being propagated).
    """
    if any(side is None for side in (pred, target)):
        return 0.0
    dump_options = {"sort_keys": True, "separators": (',', ':')}
    try:
        # Case differences should not count against the prediction.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form: dict key order and whitespace no longer matter.
        pred_str = json.dumps(pred, **dump_options)
        target_str = json.dumps(target, **dump_options)

        # NOTE(review): SequenceMatcher is used with its default `autojunk`
        # setting - confirm that is intended for long JSON strings.
        matcher = SequenceMatcher(None, pred_str, target_str)
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
# Two per-row metrics: a fuzzy similarity ratio and a strict, case-insensitive
# equality check. Rows without a parsed prediction score 0.0 / False.
print("Calculating metrics...")
df['similarity'] = df.apply(
    lambda rec: similarity_score(rec['predicted_items'], rec['items']),
    axis=1,
)
df['exact_match'] = df.apply(
    lambda rec: (
        rec['predicted_items'] is not None
        and rec['items'] is not None
        and to_lower(rec['predicted_items']) == to_lower(rec['items'])
    ),
    axis=1,
)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Dump every row whose similarity falls below 0.8 for manual inspection.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-small-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [08:36<00:00, 129.19s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9106
✅ Exact Match Accuracy: 33.33%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                            transcribed_text                                                                                                                                                                                                                                                                                                                                                          




### **Medium 12b Model, Large Dataset**


In [None]:
# Cell: Inference and Evaluation (Batched & GPU Ensured & Fence Cleaning)

import json
import pandas as pd
from datasets import Dataset
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging
)
from peft import PeftModel
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from difflib import SequenceMatcher
import torch
import gc
import os
import math
import numpy as np
import re # Import regex for cleaning fences

# --- Configuration ---
base_model_id = "google/gemma-3-12b-it"
adapter_model_id = "iTzMiNOS/gemma-3-12b-it-medium-json-LoRA-4bit-128-16"
# Instruction text placed in front of every transcribed order.
prefix = """Convert the following food order into JSON format using this structure:
[
  { "customizations": ["Customization 1", "Customization 2"], "name": "Item Name" },
  { "customizations": [], "name": "Another Item" }
]:
"""
inference_batch_size = 32

# --- Check GPU Availability & L4 ---
# A GPU is mandatory; any GPU other than an L4 only triggers a warning.
if not torch.cuda.is_available():
    raise SystemError("CUDA is not available. This script requires a GPU.")
device = 0
device_name = torch.cuda.get_device_name(device)
print(f"CUDA is available. Using GPU: {device_name}")
if "L4" not in device_name:
    print("Warning: The detected GPU is not an L4. Performance may vary.")

# --- Memory Cleanup ---
# Free Python garbage and cached CUDA blocks left over from earlier cells.
print("Cleaning up memory before loading...")
gc.collect()
torch.cuda.empty_cache()
print("CUDA cache cleared.")

# --- Setup Quantization ---
# bfloat16 is selected when the device reports compute capability major >= 8,
# float16 otherwise.
if torch.cuda.get_device_capability(device)[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)
print(f"Using compute dtype: {compute_dtype} for BnB config.")

# --- Load Tokenizer ---
# EOS doubles as the padding token, and padding goes on the left of each prompt.
print(f"Loading tokenizer from {base_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
eos_token = tokenizer.eos_token
tokenizer.pad_token = eos_token
tokenizer.padding_side = "left"
assert tokenizer.pad_token_id is not None, "Tokenizer pad_token_id is not set!"
print("Tokenizer loaded.")

# --- Load Base Model with Quantization ---
print(f"Loading base model ({base_model_id}) with quantization...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
    attn_implementation="eager",
    trust_remote_code=True,
)
print(f"Base model loaded. Device map: {model.hf_device_map}")

# --- Load LoRA Adapter ---
# Loading the adapter is mandatory (failure aborts the cell); merging it into the
# base weights is best-effort and falls back to the un-merged PEFT wrapper.
print(f"Loading LoRA adapter ({adapter_model_id}) onto the base model...")
try:
    model = PeftModel.from_pretrained(model, adapter_model_id)
except Exception as e:
    print(f"❌ Failed to load LoRA adapter: {e}")
    raise e
else:
    print("LoRA adapter loaded successfully.")
    print("Attempting to merge LoRA adapter...")
    try:
        model = model.merge_and_unload()
    except Exception as e:
        print(f"⚠️ Could not merge LoRA adapter: {e}. Proceeding with PEFT model.")
    else:
        print("LoRA adapter merged successfully.")

# --- Build the Inference Pipeline ---
# Silence transformers' own log output for the rest of the cell.
logging.set_verbosity(logging.CRITICAL)
print("Building text-generation pipeline...")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
if hasattr(pipe, 'device'):
    print(f"Pipeline device: {pipe.device}")
else:
    print("Pipeline device managed by model's device_map.")

# --- Load Validation Data ---
# Reuse `validation_data` if an earlier cell left it in the kernel; otherwise
# rebuild it from the hub dataset with a fixed seed.
# NOTE(review): this section is headed "Large Dataset", yet the fallback loads
# the "small-clean-12k" dataset capped at 1200 rows - confirm that this is the
# split the adapter should be evaluated on.
if 'validation_data' not in globals():
    print("validation_data not found, attempting reload...")
    from datasets import load_dataset
    dataset_id = "iTzMiNOS/voice-orders-small-clean-12k"
    split_name = "train"
    columns_to_keep = ["transcribed_text", "items", "speaker"]
    dataset = load_dataset(dataset_id, split=split_name)
    # Drop every column that is not needed for evaluation.
    dataset = dataset.remove_columns(
        [col for col in dataset.column_names if col not in columns_to_keep]
    )
    # Keep at most the first 1200 rows, then hold out 10% as the validation split.
    dataset = dataset.select(range(min(1200, len(dataset))))
    train_val_split = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    validation_data = train_val_split['test']
    print("Reloaded and split dataset.")

df = validation_data.to_pandas()
print(f"Loaded validation data with {len(df)} rows.")

# --- Convert Numpy arrays in 'items' column ---
def deep_convert(obj):
    """Recursively turn numpy containers and scalars into plain Python objects.

    The result is meant to be JSON-serialisable and directly comparable with
    the output of ``json.loads``.

    Args:
        obj: Any value. Dicts, lists, tuples and ``np.ndarray`` instances are
            walked recursively; numpy scalars are unwrapped.

    Returns:
        - dict: a new dict with the same keys and converted values.
        - list / tuple / ndarray: a new ``list`` of converted elements
          (JSON has no tuple type, so tuples become lists).
        - numpy scalar (``np.generic``): the result of ``obj.item()``.
        - anything else: ``obj`` unchanged.
    """
    if isinstance(obj, dict):
        return {k: deep_convert(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [deep_convert(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() only unwraps the array itself; object arrays may still hold
        # dicts/arrays, so the resulting list is walked again.
        return deep_convert(obj.tolist())
    if isinstance(obj, np.generic):
        # Not recursed on purpose: item() is the final conversion step.
        return obj.item()
    return obj

# Normalise every ground-truth cell with deep_convert so that it holds plain
# Python containers before it is compared against parsed model output.
df['items'] = df['items'].apply(deep_convert)
print("Conversion complete.")


# --- ***** UPDATED: Function for Robust JSON Parsing (Handles Fences) ***** ---
def parse_json_robustly(generated_text):
    """Extract and parse the first JSON list/object embedded in model output.

    Everything before the first ``[`` or ``{`` (prose, markdown code fences,
    a ``json`` language tag) is skipped, and anything after the end of the
    JSON value is ignored. The value itself is decoded by the real JSON
    decoder, so brackets or braces that appear inside string literals do not
    confuse the extraction.

    Args:
        generated_text: Raw text produced by the model.

    Returns:
        The decoded Python object (usually a list or dict), or ``None`` when
        the input is not a string, contains no ``[``/``{``, or the text
        starting at that character is not valid JSON (e.g. truncated output).
    """
    if not isinstance(generated_text, str):
        return None

    # The earliest list/object opener marks where the JSON payload begins.
    openers = [pos for pos in (generated_text.find('['), generated_text.find('{')) if pos != -1]
    if not openers:
        return None
    json_str = generated_text[min(openers):]

    try:
        # raw_decode parses exactly one JSON value and tolerates trailing text
        # (closing fences, explanations) instead of raising "Extra data".
        json_data, _end = json.JSONDecoder().raw_decode(json_str)
        return json_data
    except json.JSONDecodeError as e:
        # Log the text that was attempted so failures can be inspected.
        print(f"Warning: Could not parse JSON: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
    except Exception as e:
        # Anything else (e.g. RecursionError on absurdly deep nesting) is also
        # treated as "no prediction" rather than aborting the evaluation.
        print(f"Warning: Unexpected error during JSON parsing: {e}.")
        print(f"Attempted to parse (after extraction): '{json_str}'")
        return None
# --- End Updated Function ---


# --- Apply Inference with Batching ---
# Results are first gathered per batch and only added to `all_results` once the
# batch is known to contain exactly one entry per prompt. This keeps
# all_results[k] aligned with row k of df even when a batch fails part-way
# through or the pipeline returns an unexpected number of outputs.
print(f"Running batched inference (batch size: {inference_batch_size})...")
all_prompts = [f"{prefix}{text}" for text in df['transcribed_text']]
all_results = []
num_batches = math.ceil(len(all_prompts) / inference_batch_size)
for i in tqdm(range(0, len(all_prompts), inference_batch_size), desc="Inference Batches", total=num_batches):
    batch_prompts = all_prompts[i:i+inference_batch_size]
    batch_results = []
    try:
        # NOTE(review): no `do_sample` argument is passed, so decoding follows
        # whatever the model's generation config specifies - confirm that the
        # evaluation is meant to be run that way (greedy decoding would make
        # the reported scores reproducible).
        batch_outputs = pipe(
            batch_prompts,
            max_new_tokens=1024,
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id,
            batch_size=len(batch_prompts),
        )
        for output_list in batch_outputs:
            if output_list and isinstance(output_list, list):
                generated_text = output_list[0]["generated_text"].strip()
                parsed_json = parse_json_robustly(generated_text)
                batch_results.append(parsed_json)
            else:
                print(f"Warning: Unexpected output format: {output_list}")
                batch_results.append(None)
    except Exception as e:
        # Discard any partial results: a failed batch counts as all-missing.
        print(f"\n--- ERROR during batch {i // inference_batch_size + 1} --- Error: {e}")
        batch_results = [None] * len(batch_prompts)
    if len(batch_results) != len(batch_prompts):
        # Outputs can no longer be matched to prompts, so none of them is trusted.
        print(f"Warning: Result count mismatch in batch {i // inference_batch_size + 1}! Marking the whole batch as None.")
        batch_results = [None] * len(batch_prompts)
    all_results.extend(batch_results)

assert len(all_results) == len(all_prompts), "Result count does not match prompt count"
df['predicted_items'] = all_results
print("Inference complete.")

def to_lower(obj):
    """Lower-case every string found in a nested list/dict structure.

    Lists and dicts are rebuilt with their elements / values processed
    recursively. Dict keys are left as they are, and any value that is not a
    str, dict or list is returned unchanged.
    """
    if isinstance(obj, list):
        return [to_lower(element) for element in obj]
    if isinstance(obj, dict):
        return {key: to_lower(value) for key, value in obj.items()}
    return obj.lower() if isinstance(obj, str) else obj

# --- Comparison Metric (Unchanged) ---
def similarity_score(pred, target):
    """Return a character-level similarity ratio between prediction and reference.

    Both structures are lower-cased with ``to_lower``, serialised to compact
    JSON with sorted keys, and compared with ``SequenceMatcher.ratio``.

    Returns 0.0 when either argument is None, and also when any step raises
    (the error is printed instead of being propagated).
    """
    if any(side is None for side in (pred, target)):
        return 0.0
    dump_options = {"sort_keys": True, "separators": (',', ':')}
    try:
        # Case differences should not count against the prediction.
        pred = to_lower(pred)
        target = to_lower(target)

        # Canonical text form: dict key order and whitespace no longer matter.
        pred_str = json.dumps(pred, **dump_options)
        target_str = json.dumps(target, **dump_options)

        # NOTE(review): SequenceMatcher is used with its default `autojunk`
        # setting - confirm that is intended for long JSON strings.
        matcher = SequenceMatcher(None, pred_str, target_str)
        return matcher.ratio()
    except Exception as e:
        print(f"Error calculating similarity: Pred={pred}, Target={target}, Error={e}")
        return 0.0
# --- Calculate Metrics (Unchanged) ---
# Two per-row metrics: a fuzzy similarity ratio and a strict, case-insensitive
# equality check. Rows without a parsed prediction score 0.0 / False.
print("Calculating metrics...")
df['similarity'] = df.apply(
    lambda rec: similarity_score(rec['predicted_items'], rec['items']),
    axis=1,
)
df['exact_match'] = df.apply(
    lambda rec: (
        rec['predicted_items'] is not None
        and rec['items'] is not None
        and to_lower(rec['predicted_items']) == to_lower(rec['items'])
    ),
    axis=1,
)

average_similarity = df['similarity'].mean()
exact_match_accuracy = df['exact_match'].mean()

print("\n--- Evaluation Results ---")
print(f"🔍 Average Similarity Score: {average_similarity:.4f}")
print(f"✅ Exact Match Accuracy: {exact_match_accuracy:.2%}")

# --- Display Mismatches (Unchanged) ---
# Dump every row whose similarity falls below 0.8 for manual inspection.
print("\n--- Low Similarity Examples (< 0.8) ---")
low_sim_df = df.loc[
    df['similarity'] < 0.8,
    ['transcribed_text', 'items', 'predicted_items', 'similarity'],
]
print(low_sim_df.to_string())

CUDA is available. Using GPU: NVIDIA A100-SXM4-40GB
Cleaning up memory before loading...
CUDA cache cleared.
Using compute dtype: torch.bfloat16 for BnB config.
Loading tokenizer from google/gemma-3-12b-it...
Tokenizer loaded.
Loading base model (google/gemma-3-12b-it) with quantization...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Base model loaded. Device map: {'': 0}
Loading LoRA adapter (iTzMiNOS/gemma-3-12b-it-medium-json-LoRA-4bit-128-16) onto the base model...
LoRA adapter loaded successfully.
Attempting to merge LoRA adapter...




LoRA adapter merged successfully.
Building text-generation pipeline...
Pipeline device: cuda:0
Loaded validation data with 120 rows.
Conversion complete.
Running batched inference (batch size: 32)...


Inference Batches: 100%|██████████| 4/4 [10:00<00:00, 150.18s/it]

Inference complete.
Calculating metrics...

--- Evaluation Results ---
🔍 Average Similarity Score: 0.9411
✅ Exact Match Accuracy: 44.17%

--- Low Similarity Examples (< 0.8) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                            transcribed_text                                                                                                                                                                                                                                                                                                                                                          


