# Evaluating Llama-3.2-1B on Multiple Datasets

This notebook evaluates the Llama-3.2-1B-Instruct model on multiple datasets:
- **VitaminC**: Fact verification
- **FEVER/FEVEROUS**: Fact verification
- **HotpotQA/2WikiMultihopQA**: Multi-hop question answering
- **SVAMP**: Math word problems
- **Bamboogle**: General question answering

The evaluation uses dataset-specific prompting strategies and metrics.

In [None]:
import json
import time
import re
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import torch
import math

# ANSI escape sequences used to colour console output.
# Every code is the Control Sequence Introducer (ESC + "[") followed by an SGR number and "m".
_CSI = "\x1b["

BLUE = f"{_CSI}94m"    # sample information
GREEN = f"{_CSI}92m"   # correct predictions and success messages
RED = f"{_CSI}91m"     # incorrect predictions
YELLOW = f"{_CSI}93m"  # predictions and headers
CYAN = f"{_CSI}96m"    # progress information
PURPLE = f"{_CSI}95m"  # model responses
BOLD = f"{_CSI}1m"     # bold text
ENDC = f"{_CSI}0m"     # reset all attributes

# Mount Google Drive at /content/drive; the model and dataset paths used later
# in this notebook all live under that mount point.
# NOTE(review): `drive` comes from `google.colab`, so this presumably only runs inside Colab — confirm.
drive.mount('/content/drive')

Mounted at /content/drive


## Dataset Loading Functions

Functions to load different dataset formats (JSON and JSONL)

In [None]:
def load_dataset(file_path, dataset_name, limit=150):
    """Read up to ``limit`` samples from a ``.jsonl`` or ``.json`` file.

    Args:
        file_path: Path of the dataset file. A ``.jsonl`` suffix selects
            line-by-line parsing; anything else is parsed as one JSON document.
        dataset_name: Human-readable name, used only in log messages.
        limit: Maximum number of samples to keep.

    Returns:
        A list of samples. An empty list is returned when the file does not
        exist. A JSON document that is neither a list nor a mapping with a
        ``"data"`` key is returned as a single-element list.
    """
    print(f"{CYAN}Loading {dataset_name} dataset from {file_path}...{ENDC}")

    # Guard clause: nothing to read.
    if not os.path.exists(file_path):
        print(f"{RED}File not found: {file_path}{ENDC}")
        return []

    with open(file_path, 'r', encoding='utf-8') as handle:
        if file_path.endswith('.jsonl'):
            # One JSON object per line; blank lines are ignored.
            records = []
            for raw_line in handle:
                if not raw_line.strip():
                    continue
                records.append(json.loads(raw_line))
                if len(records) >= limit:
                    break
        else:
            payload = json.load(handle)
            if isinstance(payload, list):
                records = payload[:limit]
            elif 'data' in payload:
                # Samples nested under a top-level "data" key.
                records = payload['data'][:limit]
            else:
                print(f"{YELLOW}Warning: Unexpected JSON structure in {file_path}{ENDC}")
                records = [payload]  # treat the whole object as one sample

    print(f"{CYAN}Loaded {len(records)} samples from {dataset_name}.{ENDC}")
    return records

## Prompt Creation Functions

Different prompting strategies for each dataset type

In [None]:
def create_prompt(sample, dataset_name):
    """Build the prompt string for one sample of the given dataset.

    Args:
        sample: Mapping holding the raw fields of one dataset record.
        dataset_name: One of "VitaminC", "FEVER", "FEVEROUS", "HotpotQA",
            "2WikiMultihopQA", "SVAMP" or "Bamboogle". Any other name falls
            back to a generic prompt that embeds the sample as JSON.

    Returns:
        The prompt as a string. This function always returns a string: a
        Bamboogle sample without a question gets the generic prompt instead of
        ``None``, which previously propagated into the tokenizer.

    Raises:
        KeyError: If a fact-verification sample has no ``"claim"`` (or a
            VitaminC sample has no ``"evidence"``).
    """

    def _generic_prompt():
        # Fallback used for unknown datasets and for samples lacking a question.
        return f"Please analyze this data and think step-by-step:\n\n{json.dumps(sample, indent=2)}\n\nAfter your thinking, provide your answer in a latex boxed format:\n$\\boxed{{<finalAnswer>}}$"

    if dataset_name in ("VitaminC", "FEVER", "FEVEROUS"):
        # Fact verification datasets
        claim = sample["claim"]
        if dataset_name == "VitaminC":
            evidence = sample["evidence"]
        else:
            # FEVER / FEVEROUS: evidence may be stored under either key.
            evidence = sample.get("evidence", "") or sample.get("context", "")
        return create_fact_verification_prompt(claim, evidence)

    if dataset_name in ("HotpotQA", "2WikiMultihopQA"):
        # Multi-hop QA datasets
        question = sample.get("question", "") or sample.get("query", "")
        context = sample.get("context", "")
        if not context and "original_context" in sample:
            context = sample["original_context"]
        return create_qa_prompt(question, context)

    if dataset_name == "SVAMP":
        # Math word problem dataset
        # NOTE(review): only one of "question"/"body" is used; if both fields
        # carry distinct text the body is dropped — confirm against the data file.
        question = sample.get("question", "") or sample.get("body", "")
        return create_math_prompt(question)

    if dataset_name == "Bamboogle":
        # Bamboogle is treated as a QA task.
        question = sample.get("question", "")
        if not question:
            # No question to ask: use the generic prompt rather than returning None.
            return _generic_prompt()
        return create_qa_prompt(question, sample.get("context", ""))

    # Generic prompt for unknown datasets
    return _generic_prompt()

def create_fact_verification_prompt(claim, evidence):
    """Assemble the fact-checking prompt for a claim/evidence pair.

    The prompt asks the model to reason step by step and to finish with one of
    the three labels wrapped in a LaTeX ``\\boxed{...}`` expression.
    """
    prompt_lines = [
        f"Claim: {claim}",
        "",
        f"Evidence: {evidence}",
        "",
        "Think step-by-step to determine if the evidence SUPPORTS, REFUTES, or provides NOT ENOUGH INFO for the claim.",
        "",
        "After your thinking, end your answer with one of these in a latex boxed format:",
        "$\\boxed{SUPPORTS}$ or $\\boxed{REFUTES}$ or $\\boxed{NOT ENOUGH INFO}$",
        "",
        "Always end with your answer in a latex boxed format:",
        "$\\boxed{<finalAnswer>}$",
    ]
    return "\n".join(prompt_lines)

def create_qa_prompt(question, context):
    """Create a prompt for question answering tasks.

    Args:
        question: The question text.
        context: Supporting context for the question. May be empty or ``None``.
            Non-string values are rendered with ``str()``.

    Returns:
        The prompt string. When a context is supplied it is placed before the
        question (it used to be silently dropped even though the instructions
        referred to it). Without a context the instruction no longer mentions
        one.
    """
    context_text = "" if context is None else str(context).strip()

    if context_text:
        header = f"Context: {context_text}\n\nQuestion: {question}\n\n"
        instruction = "Think step-by-step to answer the question based on the context.\n\n"
    else:
        header = f"Question: {question}\n\n"
        instruction = "Think step-by-step to answer the question.\n\n"

    footer = (
        "After your thinking, provide your final answer in a latex boxed format:\n"
        "$\\boxed{<finalAnswer>}$\n"
    )
    return header + instruction + footer

def create_math_prompt(question):
    """Return the step-by-step prompt used for math word problems.

    The model is asked for a final numeric answer inside a LaTeX
    ``\\boxed{...}`` expression.
    """
    template = (
        "Problem: {problem}\n"
        "\n"
        "Think step-by-step to solve this math problem.\n"
        "\n"
        "After your thinking, provide your final numeric answer in a latex boxed format:\n"
        "$\\boxed{{<finalAnswer>}}$\n"
    )
    return template.format(problem=question)


In [None]:
# Quick visual check of the math prompt template.
print(create_math_prompt("hi"))

Problem: hi

Think step-by-step to solve this math problem.

After your thinking, provide your final numeric answer in a latex boxed format:
$\boxed{<finalAnswer>}$



## Model Inference Function

In [None]:
def run_inference(model, tokenizer, prompt, device="cuda"):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Object exposing ``generate(input_ids=..., attention_mask=..., ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        prompt: Prompt string to complete.
        device: Device the input tensors are moved to.

    Returns:
        The decoded continuation, without the prompt. ``generate`` on a
        decoder-only model returns the prompt tokens followed by the new
        tokens. Decoding the whole sequence put the prompt's own
        ``\\boxed{<finalAnswer>}`` template into the output, where the answer
        extractor could pick it up as the prediction.
    """
    # Tokenize input with attention mask
    encoding = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128000)
    input_ids = encoding.input_ids.to(device)
    attention_mask = encoding.attention_mask.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=2048,  # upper bound on the length of the completion
            temperature=0.2,      # low temperature for focused output
            do_sample=True,       # NOTE: sampling is unseeded, so runs are not reproducible
        )

    # Drop the echoed prompt tokens and keep only what the model generated.
    prompt_length = input_ids.shape[-1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


## Prediction Extraction Functions

Functions to extract structured predictions from model outputs

In [None]:
import re

def extract_prediction(output, dataset_name, sample=None):
    r"""Return the content of the last non-empty ``\boxed{...}`` in ``output``.

    Braces are matched by depth, so nested groups such as
    ``\boxed{\frac{1}{2}}`` come back whole (``\frac{1}{2}``) instead of being
    cut at the first ``}``.

    Args:
        output: Model output text. ``None`` or an empty string yields ``None``.
        dataset_name: Currently unused; kept for interface compatibility.
        sample: Currently unused; kept for interface compatibility.

    Returns:
        The stripped content of the last top-level boxed expression that is
        not empty or whitespace-only, or ``None`` when there is none. Boxes
        whose braces never close are ignored.
    """
    if not output:
        return None

    answers = []
    consumed_until = 0  # end index of the last box already parsed
    for opener in re.finditer(r'\\boxed\s*\{', output):
        if opener.start() < consumed_until:
            # This \boxed sits inside a box we already captured.
            continue

        depth = 1
        pos = opener.end()
        while pos < len(output) and depth > 0:
            char = output[pos]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            pos += 1

        if depth != 0:
            # Unbalanced braces: no closing "}" for this box.
            continue

        consumed_until = pos
        content = output[opener.end():pos - 1].strip()
        if content:
            answers.append(content)

    return answers[-1] if answers else None


def run_tests():
    """Run sanity checks for ``extract_prediction`` and print a report.

    Expected values are the literal LaTeX inside the box (a single backslash,
    e.g. ``\\frac{1}{2}``). The earlier expectations were raw strings holding
    two backslashes and could never equal the extracted text.

    Returns:
        True when every case passes, otherwise False.
    """
    test_cases = [
        {
            "name": "Fraction inside box - expecting \\frac{1}{2}",
            "dataset": "RandomDataset",
            "output": r"No recognized dataset... final is $\boxed{\frac{1}{2}}$",
            "expected": r"\frac{1}{2}"
        },
        {
            # The text must not contain a box at all for this case to mean what it says.
            "name": "No box at all (should return None)",
            "dataset": "VitaminC",
            "output": r"This text does not contain any boxed expression.",
            "expected": None
        },
        {
            "name": "Box with empty content (should return None)",
            "dataset": "SVAMP",
            "output": r"Check \boxed{} here. That's all.",
            "expected": None
        },
        {
            "name": "Multiple boxes, last is fraction",
            "dataset": "HotpotQA",
            "output": (
                r"Mid: $\boxed{SUPPORTS}$ "
                r"Then final: $\boxed{\frac{2}{5}}$"
            ),
            "expected": r"\frac{2}{5}"
        },
    ]

    passes = 0
    total = len(test_cases)

    for t in test_cases:
        pred = extract_prediction(t["output"], t["dataset"])
        success = (pred == t["expected"])
        if success:
            passes += 1

        print(f"Test: {t['name']}")
        print(f"  Dataset: {t['dataset']}")
        print(f"  Output: {t['output']}")
        print(f"  Expected: {t['expected']!r}, Got: {pred!r}")
        print(f"  => {'PASSED' if success else 'FAILED'}\n")

    print(f"Passed {passes}/{total} tests.")
    return passes == total


# Run the checks when this cell is executed as the main module (true in a notebook kernel).
if __name__ == "__main__":
    run_tests()


Test: Fraction inside box - expecting \frac{1}{2}
  Dataset: RandomDataset
  Output: No recognized dataset... final is $\boxed{\frac{1}{2}}$
  Expected: '\\\\frac{1}{2}', Got: '\\frac{1'
  => FAILED

Test: No box at all (should return None)
  Dataset: VitaminC
  Output: This text does not contain any \boxed{} expression.
  Expected: None, Got: None
  => PASSED

Test: Box with empty content (should return None)
  Dataset: SVAMP
  Output: Check \boxed{} here. That's all.
  Expected: None, Got: None
  => PASSED

Test: Multiple boxes, last is fraction
  Dataset: HotpotQA
  Output: Mid: $\boxed{SUPPORTS}$ Then final: $\boxed{\frac{2}{5}}$
  Expected: '\\\\frac{2}{5}', Got: '\\frac{2'
  => FAILED

Passed 2/4 tests.


## Evaluation Metrics Functions

Functions to compare predictions with ground truth

In [None]:
def _normalize_label(value):
    """Upper-case a label and collapse runs of whitespace to single spaces."""
    return " ".join(str(value).upper().split())


def evaluate_correctness(prediction, ground_truth, dataset_name):
    """Decide whether ``prediction`` is correct for the given dataset.

    Args:
        prediction: Extracted model answer, or ``None`` when none was found.
        ground_truth: Reference answer taken from the sample.
        dataset_name: Selects the comparison strategy.

    Returns:
        bool. Fact-verification datasets compare labels case-insensitively and
        ignore surrounding or repeated whitespace, so ``Supports`` counts as
        ``SUPPORTS``. A missing prediction or label is always wrong. QA and
        math datasets delegate to their dedicated comparators. Unknown
        datasets use plain equality.
    """
    if dataset_name in ("VitaminC", "FEVER", "FEVEROUS"):
        # Fact verification - label comparison, tolerant of case and spacing
        if prediction is None or ground_truth is None:
            return False
        return _normalize_label(prediction) == _normalize_label(ground_truth)

    if dataset_name in ("HotpotQA", "2WikiMultihopQA", "Bamboogle"):
        # QA evaluation - normalize and compare
        return normalize_qa_answers(prediction, ground_truth)

    if dataset_name == "SVAMP":
        # Math evaluation - numeric comparison
        return evaluate_math_correctness(prediction, ground_truth)

    # Generic comparison for unknown datasets
    return prediction == ground_truth

def _normalize_answer_text(value):
    """Lower-case ``value``, drop punctuation and trim surrounding whitespace."""
    text = str(value).lower().strip()
    return re.sub(r'[^\w\s]', '', text).strip()


def normalize_qa_answers(prediction, ground_truth):
    """Leniently compare a QA prediction with the reference answer(s).

    Both sides are lower-cased and stripped of punctuation. They match when
    one normalized string contains the other (substring match, deliberately
    permissive).

    Args:
        prediction: Extracted model answer.
        ground_truth: A string, a list/tuple of acceptable answers, or a dict.
            For a dict, the ``"answer"`` entry is used when present, otherwise
            the dict's string form.

    Returns:
        bool. Returns False when either side is missing or normalizes to an
        empty string. Previously a punctuation-only prediction such as ``"?"``
        matched every answer, because the empty string is a substring of
        anything. List entries are checked one by one instead of being joined
        into a single string.
    """
    if not prediction or not ground_truth:
        return False

    if isinstance(ground_truth, dict):
        ground_truth = ground_truth["answer"] if "answer" in ground_truth else str(ground_truth)

    if isinstance(ground_truth, (list, tuple)):
        candidates = list(ground_truth)
    else:
        candidates = [ground_truth]

    pred_norm = _normalize_answer_text(prediction)
    if not pred_norm:
        return False

    for candidate in candidates:
        truth_norm = _normalize_answer_text(candidate)
        if not truth_norm:
            continue
        # Check if prediction contains ground truth or vice versa
        if pred_norm in truth_norm or truth_norm in pred_norm:
            return True
    return False

def _first_number(value):
    """Return the first number found in ``value`` as a float, or ``None``.

    Strings have thousands separators (commas) removed before searching.
    Non-string values are passed to ``float``.
    """
    if value is None:
        return None
    if isinstance(value, str):
        # Require at least one digit so a stray "." (e.g. the end of a
        # sentence) is never mistaken for a number.
        match = re.search(r'-?(?:\d+(?:\.\d*)?|\.\d+)', value.replace(',', ''))
        return float(match.group(0)) if match else None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def evaluate_math_correctness(prediction, ground_truth):
    """Compare two numeric answers with an absolute tolerance of 0.01.

    Args:
        prediction: Model answer, either a number or a string containing one.
        ground_truth: Reference answer, either a number or a string containing one.

    Returns:
        bool. True when both sides yield a number and the two differ by less
        than 0.01. Returns False when either side has no parseable number.
    """
    predicted = _first_number(prediction)
    expected = _first_number(ground_truth)
    if predicted is None or expected is None:
        return False

    # Compare with tolerance
    tolerance = 0.01
    return abs(predicted - expected) < tolerance

def get_ground_truth(sample, dataset_name):
    """Look up the reference answer of ``sample`` for the given dataset.

    Fact-verification datasets store it under ``"label"``; QA and math
    datasets store it under ``"answer"``. Unknown datasets yield ``None``.
    """
    # (dataset names, field to read, value returned when the field is absent)
    field_table = (
        (("VitaminC", "FEVER", "FEVEROUS"), "label", ""),
        (("HotpotQA", "2WikiMultihopQA", "Bamboogle"), "answer", ""),
        (("SVAMP",), "answer", None),
    )

    for names, field, fallback in field_table:
        if dataset_name in names:
            return sample.get(field, fallback)

    return None

## Visualization Functions

Functions to create visualizations of results

In [None]:
def visualize_results(results_df, dataset_name):
    """Write plots and example files for one dataset into ``results_<dataset_name>/``.

    Produces ``confusion_matrix.png`` and ``class_accuracy.png`` (when the
    frame has ``true_label`` and ``prediction`` columns and at least one row),
    ``overall_accuracy.png`` and ``sample_results.txt``.

    Args:
        results_df: DataFrame with a ``correct`` column and optionally
            ``input``, ``true_label`` and ``prediction`` columns.
        dataset_name: Used in plot titles and in the output directory name.
    """
    output_dir = f"results_{dataset_name}"
    os.makedirs(output_dir, exist_ok=True)

    def _as_label(value):
        # Missing predictions (None/NaN) and unhashable truths (lists, dicts)
        # are mapped to text so the labels can be sorted and compared. Sorting
        # a mix of None and str raised TypeError and skipped the plots.
        if value is None or (isinstance(value, float) and value != value):
            return "<missing>"
        return str(value)

    if 'true_label' in results_df.columns and 'prediction' in results_df.columns:
        if len(results_df) > 0:
            try:
                true_labels = results_df['true_label'].map(_as_label)
                pred_labels = results_df['prediction'].map(_as_label)
                labels = sorted(set(true_labels) | set(pred_labels))

                # Create confusion matrix
                cm = confusion_matrix(true_labels, pred_labels, labels=labels)

                plt.figure(figsize=(10, 8))
                sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
                plt.xlabel('Predicted')
                plt.ylabel('True')
                plt.title(f'Confusion Matrix - {dataset_name}')
                plt.tight_layout()
                plt.savefig(f"{output_dir}/confusion_matrix.png")
                plt.close()

                # Class-wise accuracy (only for classes that occur as a true label)
                class_accuracy = {}
                for cls in labels:
                    cls_mask = true_labels == cls
                    if cls_mask.any():
                        hits = (pred_labels[cls_mask] == cls).sum()
                        class_accuracy[cls] = hits / cls_mask.sum()

                plt.figure(figsize=(10, 6))
                sns.barplot(x=list(class_accuracy.keys()), y=list(class_accuracy.values()))
                plt.ylim(0, 1)
                plt.title(f'Class-wise Accuracy - {dataset_name}')
                plt.ylabel('Accuracy')
                plt.tight_layout()
                plt.savefig(f"{output_dir}/class_accuracy.png")
                plt.close()
            except Exception as e:
                # Plotting is best-effort; report the problem and continue with the rest.
                plt.close('all')
                print(f"{RED}Error creating visualizations for {dataset_name}: {e}{ENDC}")

    # Overall accuracy
    plt.figure(figsize=(6, 4))
    accuracy = results_df['correct'].mean()
    plt.bar(['Accuracy'], [accuracy])
    plt.ylim(0, 1)
    plt.title(f'Overall Accuracy - {dataset_name}')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/overall_accuracy.png")
    plt.close()

    # Save sample correct/incorrect examples
    is_correct = results_df['correct'].astype(bool)
    example_groups = (
        ("Correct", results_df[is_correct].head(3)),
        ("Incorrect", results_df[~is_correct].head(3)),
    )

    # Explicit UTF-8: prompts may contain non-ASCII characters.
    with open(f"{output_dir}/sample_results.txt", 'w', encoding='utf-8') as f:
        for group_name, examples in example_groups:
            f.write(f"=== Sample {group_name} Examples ({dataset_name}) ===\n\n")
            for _, sample in examples.iterrows():
                f.write(f"Input: {sample.get('input', '')}\n")
                f.write(f"True: {sample.get('true_label', '')}\n")
                f.write(f"Prediction: {sample.get('prediction', '')}\n")
                f.write("\n---\n\n")

## Dataset Evaluation Function

Function to evaluate model on each dataset

In [None]:
def evaluate_dataset(model, tokenizer, data, dataset_name):
    """Run the model over ``data`` and score every sample.

    For each sample this builds the prompt, generates a response, extracts the
    boxed prediction, compares it with the ground truth and logs everything to
    the console. Results are written to ``results_<dataset_name>/results.csv``
    and passed to ``visualize_results``.

    Args:
        model: Language model handed to ``run_inference``.
        tokenizer: Tokenizer handed to ``run_inference``.
        data: Sequence of dataset samples.
        dataset_name: Selects the prompt, ground-truth field and metric.

    Returns:
        ``(accuracy, results_df)``. When no sample could be evaluated (empty
        ``data`` or no usable prompt) the accuracy is NaN, the frame is empty
        and nothing is written to disk.
    """
    print(f"{BOLD}{CYAN}Starting evaluation of {dataset_name} dataset...{ENDC}")

    results = []
    start_time = time.time()
    total = len(data)

    for i, sample in enumerate(data):
        # Create appropriate prompt
        prompt = create_prompt(sample, dataset_name)
        if prompt is None:
            # Without a prompt there is nothing to send to the model.
            print(f"{YELLOW}Skipping sample {i+1}/{total}: no prompt could be built{ENDC}")
            continue

        # Print the prompt being sent to the model
        print(f"\n{BOLD}{'='*80}{ENDC}")
        print(f"{BLUE}{BOLD}SAMPLE {i+1}/{total} - DATASET: {dataset_name}{ENDC}")
        print(f"{BLUE}{'-'*80}{ENDC}")
        print(f"{GREEN}PROMPT:{ENDC}")
        print(f"{GREEN}{prompt}{ENDC}")
        print(f"{BLUE}{'-'*80}{ENDC}")

        # Run inference
        output = run_inference(model, tokenizer, prompt)

        # Print the model's response
        print(f"{PURPLE}MODEL RESPONSE:{ENDC}")
        print(f"{PURPLE}{output}{ENDC}")

        # Extract prediction and ground truth
        prediction = extract_prediction(output, dataset_name, sample)
        true_label = get_ground_truth(sample, dataset_name)

        # Evaluate correctness
        correct = evaluate_correctness(prediction, true_label, dataset_name)
        correct_color = GREEN if correct else RED

        print(f"{BLUE}{'-'*80}{ENDC}")
        print(f"{YELLOW}PREDICTION: {prediction}{ENDC}")
        print(f"{YELLOW}TRUE LABEL: {true_label}{ENDC}")
        print(f"{correct_color}CORRECT: {correct}{ENDC}")
        print(f"{BOLD}{'='*80}{ENDC}")

        # Store result
        results.append({
            "input": prompt,
            "output": output,
            "prediction": prediction,
            "true_label": true_label,
            "correct": correct
        })

        # Print progress on the first sample and then every tenth
        if (i+1) % 10 == 0 or i == 0:
            elapsed = time.time() - start_time
            avg_time = elapsed / (i+1)
            remaining = avg_time * (total - i - 1)
            print(f"{CYAN}Processed {i+1}/{total} samples - "
                  f"Avg time per sample: {avg_time:.2f}s - "
                  f"Estimated time remaining: {remaining/60:.1f} minutes{ENDC}")

    results_df = pd.DataFrame(results)
    if results_df.empty:
        # An empty frame has no 'correct' column; bail out before indexing it.
        print(f"{RED}No samples were evaluated for {dataset_name}{ENDC}")
        return float("nan"), results_df

    # Calculate overall accuracy
    accuracy = results_df['correct'].mean()

    print(f"\n{BOLD}Results for {dataset_name}:{ENDC}")
    print(f"{BOLD}Overall accuracy: {accuracy:.2f}{ENDC}")

    # Save results
    output_dir = f"results_{dataset_name}"
    os.makedirs(output_dir, exist_ok=True)
    results_df.to_csv(f"{output_dir}/results.csv", index=False)

    # Create visualizations
    visualize_results(results_df, dataset_name)

    print(f"{GREEN}Results saved to {output_dir}/{ENDC}")

    return accuracy, results_df

## Model Loading and Dataset Evaluation

Load the model and evaluate it on all datasets

In [None]:
# Load model and tokenizer from the mounted Google Drive folder.
# `model`, `tokenizer` and `model_path` are module-level names used by later cells.
print("Loading model and tokenizer...")
model_path = "/content/drive/Shareddrives/517 nlp project/Models/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# The model is moved to the GPU; this line fails on a runtime without CUDA.
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

# Ensure the tokenizer has a pad_token: run_inference tokenizes with padding=True.
# The EOS token is reused as the pad token, and the embedding matrix is resized
# to the tokenizer's current vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

print(f"{GREEN}Model loaded successfully{ENDC}")

Loading model and tokenizer...
[92mModel loaded successfully[0m


In [None]:
# Root folder on the mounted shared drive; each dataset lives in its own sub-folder.
_DATA_ROOT = "/content/drive/Shareddrives/517 nlp project/data"

# Dataset name -> path of the file to evaluate on
datasets = {
    "VitaminC": f"{_DATA_ROOT}/VitaminC/vitaminc/test.jsonl",
    "2WikiMultihopQA": f"{_DATA_ROOT}/2WikiMultihopQA/truncated_first_150.json",
    "Bamboogle": f"{_DATA_ROOT}/Bamboogle/test.json",
    "FEVER": f"{_DATA_ROOT}/FEVER/fever_test.jsonl",
    "FEVEROUS": f"{_DATA_ROOT}/FEVEROUS/feverous_test.jsonl",
    "HotpotQA": f"{_DATA_ROOT}/HotpotQA/truncated_first_150.json",
    "SVAMP": f"{_DATA_ROOT}/SVAMP/test.json",
}

# List the configured datasets, numbered from 1
print(f"{CYAN}Available datasets for evaluation:{ENDC}")
for i, dataset_name in enumerate(datasets):
    print(f"{i+1}. {dataset_name}")

[96mAvailable datasets for evaluation:[0m
1. VitaminC
2. 2WikiMultihopQA
3. Bamboogle
4. FEVER
5. FEVEROUS
6. HotpotQA
7. SVAMP


Smoke test: run a single sample from every dataset to confirm the whole pipeline works before starting the full evaluation.

In [None]:
# Datasets covered by the smoke test (one sample each)
datasets_to_evaluate = [
    "VitaminC",
    "FEVER",
    "FEVEROUS",
    "HotpotQA",
    "2WikiMultihopQA",
    "SVAMP",
    "Bamboogle",
]

# Accuracy per dataset, filled in as each one finishes
summary = {}

for dataset_name in datasets_to_evaluate:
    if dataset_name in datasets:
        # Load a single sample
        file_path = datasets[dataset_name]
        data = load_dataset(file_path, dataset_name, limit=1)

        # Datasets that failed to load are left out of the summary
        if data:
            accuracy, _ = evaluate_dataset(model, tokenizer, data, dataset_name)
            summary[dataset_name] = accuracy
    else:
        print(f"{RED}Dataset {dataset_name} not found in available datasets{ENDC}")

[96mLoading VitaminC dataset from /content/drive/Shareddrives/517 nlp project/data/VitaminC/vitaminc/test.jsonl...[0m


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[96mLoaded 1 samples from VitaminC.[0m
[1m[96mStarting evaluation of VitaminC dataset...[0m

[94m[1mSAMPLE 1/1 - DATASET: VitaminC[0m
[94m--------------------------------------------------------------------------------[0m
[92mPROMPT:[0m
[92mClaim: Westlife made under 23.5 million sales in the UK .

Evidence: According to the British Phonographic Industry ( BPI ) , Westlife has been certified for 13 million albums and 9.8�million singles , with a total of more than 23 million combined sales in the UK .

Think step-by-step to determine if the evidence SUPPORTS, REFUTES, or provides NOT ENOUGH INFO for the claim.

After your thinking, end your answer with one of these in a latex boxed format:
$\boxed{SUPPORTS}$ or $\boxed{REFUTES}$ or $\boxed{NOT ENOUGH INFO}$

Always end with your answer in a latex boxed format:
$\boxed{<finalAnswer>}$[0m
[94m--------------------------------------------------------------------------------[0m


KeyboardInterrupt: 

In [None]:
# Datasets included in the full run. Edit this list to evaluate a subset,
# e.g. datasets_to_evaluate = ["VitaminC", "FEVER"].
# FEVER and FEVEROUS are currently excluded from this run.
datasets_to_evaluate = [
    "VitaminC",
    # "FEVER",
    # "FEVEROUS",
    "HotpotQA",
    "2WikiMultihopQA",
    "SVAMP",
    "Bamboogle",
]

# Accuracy per dataset, filled in as each one finishes
summary = {}

for dataset_name in datasets_to_evaluate:
    if dataset_name in datasets:
        # Load at most 150 samples
        file_path = datasets[dataset_name]
        data = load_dataset(file_path, dataset_name, limit=150)

        # Datasets that failed to load are left out of the summary
        if data:
            accuracy, _ = evaluate_dataset(model, tokenizer, data, dataset_name)
            summary[dataset_name] = accuracy
    else:
        print(f"{RED}Dataset {dataset_name} not found in available datasets{ENDC}")

Output hidden; open in https://colab.research.google.com to view.

## Results Visualization

Create summary visualizations of model performance across datasets

In [None]:
# Print summary of results
print(f"\n{BOLD}{CYAN}Summary of Results:{ENDC}")
for dataset, accuracy in summary.items():
    print(f"{dataset}: {accuracy:.4f}")

# Create summary visualization
if summary:
    # Rank datasets from highest to lowest accuracy.
    # The names are kept in `sorted_datasets`, not `datasets`: rebinding the
    # global `datasets` path dictionary here broke any later re-run of the
    # evaluation cells, which index `datasets[dataset_name]`.
    ranked = sorted(summary.items(), key=lambda item: item[1], reverse=True)
    sorted_datasets = [name for name, _ in ranked]
    sorted_accuracies = [value for _, value in ranked]

    plt.figure(figsize=(12, 6))
    bars = plt.bar(sorted_datasets, sorted_accuracies)

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.4f}', ha='center', va='bottom')

    plt.ylim(0, 1.1)  # accuracy range 0-1 plus a margin for the value labels
    plt.xlabel('Datasets')
    plt.ylabel('Accuracy')
    plt.title('Llama-3.2-1B Performance Across Datasets')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig("summary_results.png")
    plt.show()

    # Save summary to CSV
    summary_df = pd.DataFrame(list(summary.items()), columns=['Dataset', 'Accuracy'])
    summary_df = summary_df.sort_values('Accuracy', ascending=False)
    summary_df.to_csv("summary_results.csv", index=False)

    print(f"\n{GREEN}Summary saved to summary_results.csv and summary_results.png{ENDC}")
else:
    print(f"\n{RED}No results to visualize{ENDC}")

NameError: name 'BOLD' is not defined