# Evaluating Llama-3.2-1B on Multiple Datasets

This notebook evaluates the Llama-3.2-1B-Instruct model on the following datasets:
- **VitaminC**: Fact verification
- **FEVER/FEVEROUS**: Fact verification
- **HotpotQA/2WikiMultihopQA**: Multi-hop question answering
- **SVAMP**: Math word problems
- **Bamboogle**: General question answering

The evaluation uses dataset-specific prompting strategies and metrics.

In [None]:
import json
import time
import re
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import torch
import math

# ANSI escape sequences used to colorize the notebook's printed output.
# A message is wrapped as f"{COLOR}...{ENDC}"; ENDC resets the styling.
BLUE = '\033[94m'     # For sample information
GREEN = '\033[92m'    # For prompts, correct predictions and success messages
RED = '\033[91m'      # For incorrect predictions and error messages
YELLOW = '\033[93m'   # For predictions, true labels and warnings
CYAN = '\033[96m'     # For progress information
PURPLE = '\033[95m'   # For model responses
BOLD = '\033[1m'      # Bold text
ENDC = '\033[0m'      # Reset: ends any color/bold started by the codes above

# Mount Google Drive at /content/drive (Google Colab only); the model and
# dataset paths used later in this notebook live under that mount point.
drive.mount('/content/drive')

## Dataset Loading Functions

Functions to load different dataset formats (JSON and JSONL)

In [None]:
def load_dataset(file_path, dataset_name, limit=150):
    """Load up to ``limit`` samples from a JSON or JSONL dataset file.

    Args:
        file_path: Path to the dataset. A path ending in ``.jsonl`` is read
            line by line (blank lines are skipped); any other file is parsed
            as a single JSON document.
        dataset_name: Display name, used only in the progress messages.
        limit: Maximum number of samples to return. ``None`` loads every
            sample; zero or a negative value loads none.

    Returns:
        A list of samples. The list is empty when the file does not exist.
        For a JSON document that is neither a list nor a dict holding a list
        under ``"data"``, a warning is printed and the whole object is
        returned as a single sample.

    Raises:
        json.JSONDecodeError: If the file (or a JSONL line) is not valid JSON.
    """
    print(f"{CYAN}Loading {dataset_name} dataset from {file_path}...{ENDC}")

    if not os.path.exists(file_path):
        print(f"{RED}File not found: {file_path}{ENDC}")
        return []

    # Normalize the limit once so both file formats apply it the same way.
    max_samples = None if limit is None else max(limit, 0)

    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        if file_path.endswith('.jsonl'):
            for line in f:
                # Check the limit *before* appending so that limit=0 yields
                # no samples, matching the slicing used for .json files.
                if max_samples is not None and len(data) >= max_samples:
                    break
                if line.strip():  # Skip empty lines
                    data.append(json.loads(line))
        else:  # .json files
            json_data = json.load(f)
            if isinstance(json_data, list):
                data = json_data[:max_samples]
            elif isinstance(json_data, dict) and isinstance(json_data.get('data'), list):
                # Common wrapper layout: {"data": [...]}
                data = json_data['data'][:max_samples]
            else:
                print(f"{YELLOW}Warning: Unexpected JSON structure in {file_path}{ENDC}")
                data = [json_data]  # Just use the whole object as one sample

    print(f"{CYAN}Loaded {len(data)} samples from {dataset_name}.{ENDC}")
    return data

## Prompt Creation Functions

Different prompting strategies for each dataset type

In [None]:
def create_prompt(sample, dataset_name):
    """Build the model prompt for ``sample`` according to its dataset.

    Fact-verification datasets (VitaminC, FEVER, FEVEROUS) use the
    fact-checking template, HotpotQA / 2WikiMultihopQA / Bamboogle use the
    question-answering template, and SVAMP uses the math template. Any other
    dataset name gets a generic prompt containing the sample dumped as JSON.

    Args:
        sample: Mapping holding the raw dataset record.
        dataset_name: Name selecting which fields are read and which
            template is used.

    Returns:
        The prompt string.

    Raises:
        KeyError: If a fact-verification sample has no ``"claim"`` (or a
            VitaminC sample has no ``"evidence"``).
    """
    if dataset_name == "VitaminC":
        # VitaminC records are required to carry both fields.
        return create_fact_verification_prompt(sample["claim"], sample["evidence"])

    if dataset_name in ("FEVER", "FEVEROUS"):
        claim_text = sample["claim"]
        # Evidence may be stored under either key; empty values fall through.
        evidence_text = sample.get("evidence", "") or sample.get("context", "")
        return create_fact_verification_prompt(claim_text, evidence_text)

    if dataset_name in ("HotpotQA", "2WikiMultihopQA"):
        question_text = sample.get("question", "") or sample.get("query", "")
        passage = sample.get("context", "")
        if not passage and "original_context" in sample:
            passage = sample["original_context"]
        return create_qa_prompt(question_text, passage)

    if dataset_name == "SVAMP":
        return create_math_prompt(sample.get("question", "") or sample.get("body", ""))

    if dataset_name == "Bamboogle":
        question_text = sample.get("question", "")
        if not question_text and "answer" in sample:
            # No question available but an answer exists: use a generic prompt.
            return "Please provide a detailed answer for this question.\n\nQuestion: What is the answer?\n\nAnswer:"
        return create_qa_prompt(question_text, sample.get("context", ""))

    # Unknown dataset: hand the raw record to the model.
    return f"Please analyze this data and provide a detailed response:\n\n{json.dumps(sample, indent=2)}"

def create_fact_verification_prompt(claim, evidence):
    """Return the step-by-step fact-checking prompt for one claim/evidence pair.

    The claim and the evidence are inserted verbatim, each wrapped in double
    quotes, and the prompt ends with ``My detailed analysis:`` so the model
    continues with its reasoning.
    """
    prompt_lines = [
        "You are a fact-checking AI. Determine if the evidence SUPPORTS, REFUTES, or provides NOT ENOUGH INFO for the claim.",
        "",
        f'Claim: "{claim}"',
        f'Evidence: "{evidence}"',
        "",
        "Let's think through this step-by-step:",
        "",
        "1) First, I'll analyze what the claim is stating:",
        "   - What is the main assertion?",
        "   - Are there specific details, numbers, or comparisons?",
        "",
        "2) Next, I'll examine what the evidence contains:",
        "   - What information is provided in the evidence?",
        "   - Are there specific facts, figures, or statements?",
        "",
        "3) Now, I'll compare the claim with the evidence:",
        "   - Does the evidence directly address the claim?",
        "   - Is there alignment or contradiction between claim and evidence?",
        "   - Is any critical information missing from the evidence needed to verify the claim?",
        "",
        "4) Finally, I'll determine my verdict with reasoning:",
        "   - SUPPORTS: Evidence confirms the claim is true",
        "   - REFUTES: Evidence contradicts the claim",
        "   - NOT ENOUGH INFO: Evidence is insufficient to either support or refute the claim",
        "",
        "My detailed analysis:",
    ]
    return "\n".join(prompt_lines)

def create_qa_prompt(question, context):
    """Return the step-by-step question-answering prompt.

    The context is placed on its own line(s) under ``Context:``, followed by
    the question; the prompt ends with ``My reasoning:`` so the model
    continues with its reasoning.
    """
    prompt_lines = [
        "You are an AI assistant that helps with question answering. Read the context provided and answer the question.",
        "",
        "Context:",
        f"{context}",
        "",
        f"Question: {question}",
        "",
        "Let's think step-by-step to find the answer based on the context:",
        "",
        "1) First, I'll identify what the question is asking for.",
        "",
        "2) Next, I'll search the context for relevant information related to the question.",
        "",
        "3) Then, I'll reason about the information to determine the answer.",
        "",
        "4) Finally, I'll provide a concise and direct answer to the question.",
        "",
        "My reasoning:",
    ]
    return "\n".join(prompt_lines)

def create_math_prompt(question):
    """Return the step-by-step prompt for a math word problem.

    The problem text is inserted after ``Problem:`` and the prompt ends with
    ``My solution:`` so the model continues with its working.
    """
    prompt_lines = [
        "You are an AI assistant that helps solve math word problems. Solve the following problem step-by-step.",
        "",
        f"Problem: {question}",
        "",
        "I'll solve this step-by-step:",
        "",
        "1) First, I'll identify the key information in the problem.",
        "",
        "2) Next, I'll determine which mathematical operations are needed.",
        "",
        "3) Then, I'll set up and solve the equations.",
        "",
        "4) Finally, I'll provide the numeric answer.",
        "",
        "My solution:",
    ]
    return "\n".join(prompt_lines)

## Model Inference Function

In [None]:
def run_inference(model, tokenizer, prompt, device="cuda"):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; it is called with
            ``padding=True``, so it must have a pad token configured.
        prompt: Prompt text to complete.
        device: Device the input tensors are moved to before generation.

    Returns:
        The decoded continuation, without the prompt and without special
        tokens. Sampling is enabled (``do_sample=True``, temperature 0.2),
        so repeated calls may return different text.
    """
    # Tokenize input with attention mask
    encoding = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128000)
    input_ids = encoding.input_ids.to(device)
    attention_mask = encoding.attention_mask.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=2048,  # Adjust based on desired output length
            temperature=0.2,      # Lower temperature for focused output
            do_sample=True,
        )

    # For decoder-only models generate() returns prompt + continuation.
    # Decode only the continuation: the prompts themselves contain phrases
    # such as "the evidence SUPPORTS" and "answer the question", which the
    # prediction extractors would otherwise mistake for the model's verdict.
    prompt_length = input_ids.shape[1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


## Prediction Extraction Functions

Functions to extract structured predictions from model outputs

In [None]:
def extract_prediction(output, dataset_name, sample=None):
    """Turn raw model output into a prediction using the dataset's extractor.

    Args:
        output: Text produced by the model.
        dataset_name: Selects the extractor: fact-verification label,
            QA answer, or numeric math answer.
        sample: Accepted for interface compatibility; not used.

    Returns:
        The extracted prediction; for an unrecognised dataset, the output
        with surrounding whitespace stripped.
    """
    if dataset_name in ("VitaminC", "FEVER", "FEVEROUS"):
        # Fact verification: map the text to one of the three labels.
        return extract_fact_verification_prediction(output)

    if dataset_name in ("HotpotQA", "2WikiMultihopQA", "Bamboogle"):
        # Question answering: pull out the stated answer.
        return extract_qa_prediction(output)

    if dataset_name == "SVAMP":
        # Math word problems: pull out the numeric answer.
        return extract_math_prediction(output)

    # Unknown dataset: no structured extraction.
    return output.strip()

def extract_fact_verification_prediction(output):
    """Map free-form model output to SUPPORTS / REFUTES / NOT ENOUGH INFO.

    Matching is case-insensitive and proceeds in three stages:
    1. explicit verdict phrasings, tried in priority order, using the last
       occurrence of the first phrasing that appears;
    2. a bare label mentioned in the last quarter of the text;
    3. a bare label mentioned anywhere in the text.
    If nothing is found the result is ``"NOT ENOUGH INFO"``.
    """
    text = output.lower()

    verdict_regexes = (
        r"verdict\s*:?\s*(supports|refutes|not enough info)",
        r"therefore,?\s+(the evidence)?\s*(supports|refutes|provides not enough info)",
        r"(my conclusion|my answer) is\s*:?\s*(supports|refutes|not enough info)",
        r"the evidence (supports|refutes|provides not enough info)",
        r"(supports|refutes|not enough info) the claim",
    )
    # (substring, label) pairs; "not enough" is tested first in every stage.
    verdict_labels = (
        ("not enough", "NOT ENOUGH INFO"),
        ("supports", "SUPPORTS"),
        ("refutes", "REFUTES"),
    )

    # Stage 1: explicit verdict statements.
    for regex in verdict_regexes:
        hits = re.findall(regex, text)
        if not hits:
            continue
        final_hit = hits[-1]
        if isinstance(final_hit, tuple):
            # Multi-group patterns: the label is always the last group.
            final_hit = final_hit[-1]
        for needle, label in verdict_labels:
            if needle in final_hit:
                return label

    # Stages 2 and 3: bare label mentions, tail of the text first.
    bare_labels = (
        ("not enough info", "NOT ENOUGH INFO"),
        ("supports", "SUPPORTS"),
        ("refutes", "REFUTES"),
    )
    tail = text[3 * len(text) // 4:]
    for region in (tail, text):
        for needle, label in bare_labels:
            if needle in region:
                return label

    # Default
    return "NOT ENOUGH INFO"

def extract_qa_prediction(output):
    """Extract the final answer from question-answering model output.

    The text is searched (case-insensitively) for an explicit answer
    statement such as ``answer is X``, ``answer: X`` or ``final answer: X``,
    then for ``in conclusion, X``. The *last* such statement wins, because
    step-by-step reasoning states its conclusion at the end. The captured
    answer runs to the end of the sentence or line; a period inside a number
    (``3.5``) does not end it.

    The word "answer" must be followed by ``is`` or ``:`` to count, so
    phrases like "question answering" or "to answer the question" are no
    longer mistaken for an answer statement.

    Returns:
        The lower-cased answer when an answer statement is found; otherwise
        the last non-empty line of ``output`` (original casing); otherwise
        ``output`` stripped.
    """
    text = output.lower()

    # The lookahead ends the capture at a sentence-final period, a newline,
    # or the end of the text, without consuming it.
    answer_end = r"(?=\.(?:\s|$)|\n|$)"
    answer_patterns = [
        r"\banswer\s*(?:is\b\s*:?|:)\s*(.+?)" + answer_end,
        r"\bin conclusion\s*[:,]?\s*(.+?)" + answer_end,
    ]

    for pattern in answer_patterns:
        candidates = [found.strip() for found in re.findall(pattern, text)]
        candidates = [candidate for candidate in candidates if candidate]
        if candidates:
            return candidates[-1]

    # If no pattern matches, return the last non-empty line as a fallback
    lines = [line.strip() for line in output.split('\n') if line.strip()]
    if lines:
        return lines[-1]

    return output.strip()

def extract_math_prediction(output):
    """Extract the final numeric answer from a math solution.

    Patterns are tried in priority order: an explicit answer/result
    statement, then an ``=``/``equals`` expression, then ``in conclusion``.
    Within a pattern the *last* match is used, so the final result of a
    multi-step calculation is returned rather than the first intermediate
    value. Thousands separators and a leading ``$`` are accepted.

    If no pattern matches, the last five lines are scanned from the bottom
    and the first number on the lowest line containing one is returned.

    Returns:
        The answer as a ``float``, or ``output`` stripped (a ``str``) when
        the text contains no usable number.
    """
    text = output.lower()

    # A number must start with a digit (after an optional minus sign), so a
    # lone comma or period can never be captured.
    number = r"\$?(-?\d[\d,]*(?:\.\d+)?)"
    answer_patterns = [
        # Also covers "the answer", "final answer", "therefore the answer".
        r"(?:answer|result)(?:\s+is)?(?:\s*:)?\s*" + number,
        r"(?:=|equals)\s*" + number,
        r"in conclusion(?:\s*:)?\s*" + number,
    ]

    for pattern in answer_patterns:
        matches = re.findall(pattern, text)
        if matches:
            # Clean the number format (remove commas) and use the last match.
            return float(matches[-1].replace(',', ''))

    # Fall back to looking for any number in the last few lines
    for line in reversed(output.split('\n')[-5:]):
        found = re.search(r"-?\d[\d,]*(?:\.\d+)?", line)
        if found:
            return float(found.group(0).replace(',', ''))

    return output.strip()

## Evaluation Metrics Functions

Functions to compare predictions with ground truth

In [None]:
def evaluate_correctness(prediction, ground_truth, dataset_name):
    """Decide whether ``prediction`` is correct using the dataset's metric.

    Question-answering datasets use normalized, flexible string matching,
    SVAMP uses numeric comparison with a tolerance, and every other dataset
    (fact verification and unknown names) uses exact equality.
    """
    if dataset_name in ("HotpotQA", "2WikiMultihopQA", "Bamboogle"):
        # QA evaluation - normalize and compare
        return normalize_qa_answers(prediction, ground_truth)

    if dataset_name == "SVAMP":
        # Math evaluation - numeric comparison
        return evaluate_math_correctness(prediction, ground_truth)

    # Fact verification and unknown datasets - direct comparison
    return prediction == ground_truth

def _normalize_answer_text(value):
    """Lower-case ``value``, drop punctuation and collapse whitespace runs."""
    text = re.sub(r'[^\w\s]', '', str(value).lower())
    return " ".join(text.split())


def normalize_qa_answers(prediction, ground_truth):
    """Compare a QA prediction with the ground truth using flexible matching.

    Both sides are lower-cased, stripped of punctuation and whitespace-
    normalized. The prediction counts as correct when it contains an
    accepted answer or an accepted answer contains it.

    Args:
        prediction: Predicted answer text.
        ground_truth: A single answer, a list/tuple of accepted answers
            (matching any one of them is enough), or a dict whose
            ``"answer"`` entry holds the answer(s).

    Returns:
        ``True`` on a match. ``False`` when either side is empty, including
        when it becomes empty after normalization (e.g. a prediction that is
        only punctuation), which previously matched everything.
    """
    if not prediction or not ground_truth:
        return False

    # Unwrap dictionary ground truths first; the payload may itself be a list.
    if isinstance(ground_truth, dict):
        ground_truth = ground_truth["answer"] if "answer" in ground_truth else str(ground_truth)

    if isinstance(ground_truth, (list, tuple)):
        accepted_answers = list(ground_truth)
    else:
        accepted_answers = [ground_truth]

    pred_norm = _normalize_answer_text(prediction)
    if not pred_norm:
        # An empty string is a substring of everything; never count it.
        return False

    for answer in accepted_answers:
        truth_norm = _normalize_answer_text(answer)
        if truth_norm and (pred_norm in truth_norm or truth_norm in pred_norm):
            return True
    return False

def evaluate_math_correctness(prediction, ground_truth):
    """Return True when two math answers differ by less than 0.01.

    String inputs have commas removed and their first run of digits/periods
    (with an optional leading minus) parsed as the number. Anything that
    cannot be turned into a float makes the result ``False``.
    """
    def _parse(value):
        # Non-strings are passed through and converted by float() below.
        if not isinstance(value, str):
            return value
        return float(re.search(r'(-?[\d.]+)', value.replace(',', '')).group(1))

    try:
        predicted, expected = _parse(prediction), _parse(ground_truth)
        # Compare with tolerance
        return abs(float(predicted) - float(expected)) < 0.01
    except (ValueError, TypeError, AttributeError):
        return False

def get_ground_truth(sample, dataset_name):
    """Return the reference answer stored in ``sample`` for this dataset.

    Fact-verification datasets read ``"label"`` and the QA datasets read
    ``"answer"``, both defaulting to an empty string. SVAMP reads
    ``"answer"`` and defaults to ``None``. Unknown dataset names yield
    ``None``.
    """
    if dataset_name in ("VitaminC", "FEVER", "FEVEROUS"):
        return sample.get("label", "")

    if dataset_name in ("HotpotQA", "2WikiMultihopQA", "Bamboogle"):
        return sample.get("answer", "")

    if dataset_name == "SVAMP":
        return sample.get("answer", None)

    return None

## Visualization Functions

Functions to create visualizations of results

In [None]:
def _write_sample_section(handle, heading, samples):
    """Write one headed section of example rows to the open text file."""
    handle.write(f"{heading}\n\n")
    for _, sample in samples.iterrows():
        handle.write(f"Input: {sample.get('input', '')}\n")
        handle.write(f"True: {sample.get('true_label', '')}\n")
        handle.write(f"Prediction: {sample.get('prediction', '')}\n")
        handle.write("\n---\n\n")


def visualize_results(results_df, dataset_name):
    """Save plots and example predictions for one dataset's results.

    Files are written to ``results_<dataset_name>/``:
    ``confusion_matrix.png`` and ``class_accuracy.png`` (when the frame has
    ``true_label`` and ``prediction`` columns and at least one row),
    ``overall_accuracy.png``, and ``sample_results.txt`` with up to three
    correct and three incorrect examples.

    Args:
        results_df: DataFrame with a ``correct`` column and, optionally,
            ``input``, ``true_label`` and ``prediction`` columns.
        dataset_name: Used in the output directory name and plot titles.

    A failure while building the confusion matrix or class-accuracy plot is
    reported and skipped; the remaining outputs are still produced.
    """
    output_dir = f"results_{dataset_name}"
    os.makedirs(output_dir, exist_ok=True)

    if 'true_label' in results_df.columns and 'prediction' in results_df.columns:
        # Label-based plots. NOTE(review): these columns are present for every
        # dataset, so free-text QA answers are plotted too - confirm whether
        # this should be limited to the fact verification datasets.
        if len(results_df) > 0:
            fig = None  # Figure currently open, so it can be closed on failure
            try:
                labels = sorted(set(results_df['true_label'].unique()).union(results_df['prediction'].unique()))

                # Create confusion matrix
                cm = confusion_matrix(
                    results_df['true_label'],
                    results_df['prediction'],
                    labels=labels
                )

                fig = plt.figure(figsize=(10, 8))
                sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
                plt.xlabel('Predicted')
                plt.ylabel('True')
                plt.title(f'Confusion Matrix - {dataset_name}')
                plt.tight_layout()
                plt.savefig(f"{output_dir}/confusion_matrix.png")
                plt.close(fig)
                fig = None

                # Class-wise accuracy
                class_accuracy = {}
                for cls in labels:
                    cls_indices = results_df['true_label'] == cls
                    if cls_indices.any():
                        correct = (results_df.loc[cls_indices, 'prediction'] == cls).sum()
                        class_accuracy[cls] = correct / cls_indices.sum()

                fig = plt.figure(figsize=(10, 6))
                sns.barplot(x=list(class_accuracy.keys()), y=list(class_accuracy.values()))
                plt.ylim(0, 1)
                plt.title(f'Class-wise Accuracy - {dataset_name}')
                plt.ylabel('Accuracy')
                plt.tight_layout()
                plt.savefig(f"{output_dir}/class_accuracy.png")
                plt.close(fig)
                fig = None
            except Exception as e:
                print(f"{RED}Error creating visualizations for {dataset_name}: {e}{ENDC}")
            finally:
                # Do not leak a half-drawn figure when plotting failed mid-way.
                if fig is not None:
                    plt.close(fig)

    # Overall accuracy
    fig = plt.figure(figsize=(6, 4))
    accuracy = results_df['correct'].mean()
    plt.bar(['Accuracy'], [accuracy])
    plt.ylim(0, 1)
    plt.title(f'Overall Accuracy - {dataset_name}')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/overall_accuracy.png")
    plt.close(fig)

    # Save sample correct/incorrect examples. Cast to bool so the mask and
    # its negation stay valid even if the column was stored as objects.
    is_correct = results_df['correct'].astype(bool)
    correct_samples = results_df[is_correct].head(3)
    incorrect_samples = results_df[~is_correct].head(3)

    # Explicit UTF-8: prompts and model output may contain non-ASCII text.
    with open(f"{output_dir}/sample_results.txt", 'w', encoding='utf-8') as f:
        _write_sample_section(f, f"=== Sample Correct Examples ({dataset_name}) ===", correct_samples)
        _write_sample_section(f, f"=== Sample Incorrect Examples ({dataset_name}) ===", incorrect_samples)

## Dataset Evaluation Function

Function to evaluate model on each dataset

In [None]:
def evaluate_dataset(model, tokenizer, data, dataset_name):
    """Run the model on every sample of a dataset and score the predictions.

    For each sample the prompt is built, inference is run, the prediction is
    extracted and compared with the ground truth, and everything is printed.
    Results are saved to ``results_<dataset_name>/results.csv`` and the
    per-dataset visualizations are generated.

    Args:
        model: Language model passed to ``run_inference``.
        tokenizer: Tokenizer passed to ``run_inference``.
        data: Sized sequence of dataset samples.
        dataset_name: Dataset name; selects prompts, extraction and metric.

    Returns:
        ``(accuracy, results_df)`` where ``accuracy`` is the mean of the
        ``correct`` column and ``results_df`` has the columns ``input``,
        ``output``, ``prediction``, ``true_label`` and ``correct``. For empty
        ``data`` the accuracy is ``nan``, the frame is empty, and nothing is
        written to disk.
    """
    print(f"{BOLD}{CYAN}Starting evaluation of {dataset_name} dataset...{ENDC}")

    if len(data) == 0:
        # An empty result frame has no 'correct' column, so computing the
        # accuracy below would raise KeyError; report and return instead.
        print(f"{YELLOW}No samples to evaluate for {dataset_name}{ENDC}")
        empty_df = pd.DataFrame(columns=["input", "output", "prediction", "true_label", "correct"])
        return float("nan"), empty_df

    results = []
    start_time = time.time()

    for i, sample in enumerate(data):
        # Create appropriate prompt
        prompt = create_prompt(sample, dataset_name)

        # Print the prompt being sent to the model
        print(f"\n{BOLD}{'='*80}{ENDC}")
        print(f"{BLUE}{BOLD}SAMPLE {i+1}/{len(data)} - DATASET: {dataset_name}{ENDC}")
        print(f"{BLUE}{'-'*80}{ENDC}")
        print(f"{GREEN}PROMPT:{ENDC}")
        print(f"{GREEN}{prompt}{ENDC}")
        print(f"{BLUE}{'-'*80}{ENDC}")

        # Run inference
        output = run_inference(model, tokenizer, prompt)

        # Print the model's response
        print(f"{PURPLE}MODEL RESPONSE:{ENDC}")
        print(f"{PURPLE}{output}{ENDC}")

        # Extract prediction and ground truth
        prediction = extract_prediction(output, dataset_name, sample)
        true_label = get_ground_truth(sample, dataset_name)

        # Evaluate correctness
        correct = evaluate_correctness(prediction, true_label, dataset_name)
        correct_color = GREEN if correct else RED

        print(f"{BLUE}{'-'*80}{ENDC}")
        print(f"{YELLOW}PREDICTION: {prediction}{ENDC}")
        print(f"{YELLOW}TRUE LABEL: {true_label}{ENDC}")
        print(f"{correct_color}CORRECT: {correct}{ENDC}")
        print(f"{BOLD}{'='*80}{ENDC}")

        # Store result
        results.append({
            "input": prompt,
            "output": output,
            "prediction": prediction,
            "true_label": true_label,
            "correct": correct,
        })

        # Print progress after the first sample and then every 10 samples
        if (i+1) % 10 == 0 or i == 0:
            elapsed = time.time() - start_time
            avg_time = elapsed / (i+1)
            remaining = avg_time * (len(data) - i - 1)
            print(f"{CYAN}Processed {i+1}/{len(data)} samples - "
                  f"Avg time per sample: {avg_time:.2f}s - "
                  f"Estimated time remaining: {remaining/60:.1f} minutes{ENDC}")

    # Calculate overall accuracy
    results_df = pd.DataFrame(results)
    accuracy = results_df['correct'].mean()

    print(f"\n{BOLD}Results for {dataset_name}:{ENDC}")
    print(f"{BOLD}Overall accuracy: {accuracy:.2f}{ENDC}")

    # Save results
    output_dir = f"results_{dataset_name}"
    os.makedirs(output_dir, exist_ok=True)
    results_df.to_csv(f"{output_dir}/results.csv", index=False)

    # Create visualizations
    visualize_results(results_df, dataset_name)

    print(f"{GREEN}Results saved to {output_dir}/{ENDC}")

    return accuracy, results_df

## Model Loading and Dataset Evaluation

Load the model and evaluate it on all datasets

In [None]:
# Load the tokenizer and model from the checkpoint directory on the mounted
# Google Drive, and move the model to the GPU.
print("Loading model and tokenizer...")
model_path = "/content/drive/Shareddrives/517 nlp project/Models/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

# run_inference tokenizes with padding=True, which needs a pad token.
# If the tokenizer defines none, register the EOS token as the pad token and
# resize the model's embedding matrix to the tokenizer's vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

print(f"{GREEN}Model loaded successfully{ENDC}")

In [None]:
# Map each dataset name to the path of its test file on the mounted Drive.
# The keys are the names the prompt, extraction and metric functions switch on.
datasets = {
    "VitaminC": "/content/drive/Shareddrives/517 nlp project/data/VitaminC/vitaminc/test.jsonl",
    "2WikiMultihopQA": "/content/drive/Shareddrives/517 nlp project/data/2WikiMultihopQA/test.json",
    "Bamboogle": "/content/drive/Shareddrives/517 nlp project/data/Bamboogle/test.json",
    "FEVER": "/content/drive/Shareddrives/517 nlp project/data/FEVER/fever_test.jsonl",
    "FEVEROUS": "/content/drive/Shareddrives/517 nlp project/data/FEVEROUS/feverous_test.jsonl",
    "HotpotQA": "/content/drive/Shareddrives/517 nlp project/data/HotpotQA/test.json",
    "SVAMP": "/content/drive/Shareddrives/517 nlp project/data/SVAMP/test.json"
}

# Display available datasets as a 1-based numbered list
print(f"{CYAN}Available datasets for evaluation:{ENDC}")
for i, dataset_name in enumerate(datasets.keys()):
    print(f"{i+1}. {dataset_name}")

First, run a single sample from each dataset as a smoke test to confirm that the whole pipeline works end to end.

In [None]:
# Smoke test: run the full pipeline on one sample from every dataset.
datasets_to_evaluate = [
    "VitaminC",
    "FEVER",
    "FEVEROUS",
    "HotpotQA",
    "2WikiMultihopQA",
    "SVAMP",
    "Bamboogle"
]  # Evaluate all available datasets

# Initialize results summary (dataset name -> accuracy)
summary = {}

# Process each selected dataset
for dataset_name in datasets_to_evaluate:
    # Skip names that have no configured file path
    if dataset_name not in datasets:
        print(f"{RED}Dataset {dataset_name} not found in available datasets{ENDC}")
        continue

    # Load dataset
    file_path = datasets[dataset_name]
    data = load_dataset(file_path, dataset_name, limit=1)  # Limit to 1 sample

    # load_dataset returns an empty list when the file is missing
    if data:
        # Evaluate on the dataset
        accuracy, _ = evaluate_dataset(model, tokenizer, data, dataset_name)
        summary[dataset_name] = accuracy

In [None]:
# Full evaluation run (up to 150 samples per dataset).
# Choose which datasets to evaluate by commenting entries in or out below.
# datasets_to_evaluate = ["VitaminC", "FEVER"]  # Uncomment to evaluate specific datasets
datasets_to_evaluate = [
    "VitaminC",
    # "FEVER",
    # "FEVEROUS",
    # "HotpotQA",
    # "2WikiMultihopQA",
    # "SVAMP",
    # "Bamboogle"
]  # Only the uncommented entries are evaluated
# Initialize results summary (this discards the smoke-test results)
summary = {}

# Process each selected dataset
for dataset_name in datasets_to_evaluate:
    # Skip names that have no configured file path
    if dataset_name not in datasets:
        print(f"{RED}Dataset {dataset_name} not found in available datasets{ENDC}")
        continue

    # Load dataset
    file_path = datasets[dataset_name]
    data = load_dataset(file_path, dataset_name, limit=150)  # Limit to 150 samples

    # load_dataset returns an empty list when the file is missing
    if data:
        # Evaluate on the dataset
        accuracy, _ = evaluate_dataset(model, tokenizer, data, dataset_name)
        summary[dataset_name] = accuracy

## Results Visualization

Create summary visualizations of model performance across datasets

In [None]:
# Print the per-dataset accuracies collected in `summary`
print(f"\n{BOLD}{CYAN}Summary of Results:{ENDC}")
for dataset, accuracy in summary.items():
    print(f"{dataset}: {accuracy:.4f}")

# Create summary visualization
if summary:
    plt.figure(figsize=(12, 6))
    # Use a dedicated name here: rebinding `datasets` would overwrite the
    # name -> path dict and break any evaluation cell that is re-run later.
    dataset_names = list(summary.keys())
    accuracies = list(summary.values())

    # Sort by accuracy, highest first
    sorted_indices = np.argsort(accuracies)[::-1]
    sorted_datasets = [dataset_names[i] for i in sorted_indices]
    sorted_accuracies = [accuracies[i] for i in sorted_indices]

    bars = plt.bar(sorted_datasets, sorted_accuracies)

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.4f}', ha='center', va='bottom')

    plt.ylim(0, 1.1)  # Set y limit to 0-1 with a small margin
    plt.xlabel('Datasets')
    plt.ylabel('Accuracy')
    plt.title('Llama-3.2-1B Performance Across Datasets')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig("summary_results.png")
    plt.show()

    # Save summary to CSV, highest accuracy first
    summary_df = pd.DataFrame(list(summary.items()), columns=['Dataset', 'Accuracy'])
    summary_df = summary_df.sort_values('Accuracy', ascending=False)
    summary_df.to_csv("summary_results.csv", index=False)

    print(f"\n{GREEN}Summary saved to summary_results.csv and summary_results.png{ENDC}")
else:
    print(f"\n{RED}No results to visualize{ENDC}")