In [1]:

import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import argparse
import math
import itertools

# Import necessary functions from the original code
from LLMs_attack import (
    load_model, eval, gen_prompt, format_example,
    move_answers_to_position, format_subject,
    full_search_eval, reduce_choices_and_answer
)

# Create the local directory layout the experiments read from:
# <data_dir>/dev/<subject>_dev.csv and <data_dir>/test/<subject>_test.csv.
# exist_ok=True makes re-running this cell harmless.
os.makedirs('data/MMLU/dev', exist_ok=True)
os.makedirs('data/MMLU/test', exist_ok=True)

# Download a subset of MMLU dataset: the dev and test CSVs for four subjects.
# NOTE: the leading `!` is IPython/Jupyter shell-escape syntax, so these lines
# only run inside a notebook kernel, not under plain `python`.
# wget flags: -q suppresses output, -O writes the download to the given path.
!wget -q -O data/MMLU/dev/abstract_algebra_dev.csv https://raw.githubusercontent.com/hendrycks/test/master/data/dev/abstract_algebra_dev.csv
!wget -q -O data/MMLU/test/abstract_algebra_test.csv https://raw.githubusercontent.com/hendrycks/test/master/data/test/abstract_algebra_test.csv
!wget -q -O data/MMLU/dev/anatomy_dev.csv https://raw.githubusercontent.com/hendrycks/test/master/data/dev/anatomy_dev.csv
!wget -q -O data/MMLU/test/anatomy_test.csv https://raw.githubusercontent.com/hendrycks/test/master/data/test/anatomy_test.csv
!wget -q -O data/MMLU/dev/computer_security_dev.csv https://raw.githubusercontent.com/hendrycks/test/master/data/dev/computer_security_dev.csv
!wget -q -O data/MMLU/test/computer_security_test.csv https://raw.githubusercontent.com/hendrycks/test/master/data/test/computer_security_test.csv
!wget -q -O data/MMLU/dev/high_school_mathematics_dev.csv https://raw.githubusercontent.com/hendrycks/test/master/data/dev/high_school_mathematics_dev.csv
!wget -q -O data/MMLU/test/high_school_mathematics_test.csv https://raw.githubusercontent.com/hendrycks/test/master/data/test/high_school_mathematics_test.csv

# Settings container passed as `args` to the imported helpers
# (load_model, eval, full_search_eval).
class Args:
    """Mutable bag of experiment settings, created with notebook defaults."""

    def __init__(self):
        # The table is rebuilt on every call so each instance owns its own
        # `engine` list instead of sharing one mutable default.
        defaults = {
            "ntrain": 5,  # number of few-shot examples
            "data_dir": "data/MMLU",  # root folder holding dev/ and test/
            "engine": ["gemma", "llama", "qwen"],  # models to evaluate
            "n_reduced": None,  # None = keep all answer choices
            "use_subset": True,  # evaluate on a subset of the test data
            "permutation_attack": False,  # permutation attack off by default
            "position_permute": False,  # position permutation off by default
            "reduce_attack": False,  # reduced-choices attack off by default
            "load_in_8bit": False,  # 8-bit loading off by default
        }
        for option, value in defaults.items():
            setattr(self, option, value)


args = Args()

# ## Experiment 1: Basic MMLU Performance Comparison
# 
# We'll compare the performance of three small models on a subset of MMLU benchmark.

# Subjects covered by the experiments (one dev/test CSV pair each)
subjects = [
    'abstract_algebra',
    'anatomy',
    'computer_security',
    'high_school_mathematics',
]
print(f"Evaluating models on subjects: {subjects}")

# Experiment configuration
# NOTE(review): use_subset is presumably read inside LLMs_attack; the cap of
# 20 test questions per subject is applied explicitly further down.
args.use_subset = True
args.ntrain = 5  # 5-shot: keep the first five dev rows as few-shot examples
args.engine = ["gemma", "llama", "qwen"]  # Using Qwen as third model instead of Mistral

# engine -> {'subject_accs': {subject: accuracy}, 'overall_acc': pooled accuracy}
basic_results = {}

banner = "====================================="

for engine in args.engine:
    print("\n" + banner)
    print(f"Engine: {engine}")
    print(banner)

    all_cors, all_accs, subject_results = [], [], {}

    model, tokenizer = load_model(args, engine)

    # The model is only moved explicitly when it was not loaded in 8-bit
    # and a CUDA device is present.
    move_to_gpu = (not args.load_in_8bit) and torch.cuda.is_available()
    if move_to_gpu:
        model.cuda()

    for subject in subjects:
        print(f"Evaluating {subject}...")

        dev_path = os.path.join(args.data_dir, "dev", subject + "_dev.csv")
        test_path = os.path.join(args.data_dir, "test", subject + "_test.csv")

        # Few-shot examples: first `ntrain` rows of the dev split
        dev_df = pd.read_csv(dev_path, header=None)[:args.ntrain]
        # Test questions: first 20 rows only, for a faster run
        test_df = pd.read_csv(test_path, header=None)[:20]

        cors, acc = eval(
            args,
            format_subject(subject),
            dev_df,
            test_df,
            model,
            tokenizer,
            n_reduced=args.n_reduced,
        )

        all_cors.append(cors)
        all_accs.append(acc)
        subject_results[subject] = acc

    # Pool the per-question results of every subject before averaging, so
    # the overall figure is weighted by how many questions each subject had.
    pooled_cors = np.concatenate(all_cors)
    weighted_acc = np.mean(pooled_cors)
    print(f"Average accuracy: {weighted_acc*100:.2f}%")

    basic_results[engine] = dict(
        subject_accs=subject_results,
        overall_acc=weighted_acc,
    )

    # Drop the references and release cached GPU memory before the next model
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
# Plot basic results
#
# Figure 1: grouped bars, one cluster per subject and one bar per engine.
# Figure 2: one bar per engine with its overall accuracy.
engines = list(basic_results.keys())
subjects_list = list(basic_results[engines[0]]['subject_accs'].keys())

plt.figure(figsize=(12, 6))

x = np.arange(len(subjects_list))
# 0.25 matches the original three-engine layout; the cap keeps a cluster
# within 0.8 of a unit so neighbouring subjects never overlap when more
# engines are evaluated.
width = min(0.25, 0.8 / len(engines))

for multiplier, engine in enumerate(engines):
    offset = width * multiplier
    accs = [basic_results[engine]['subject_accs'][subject] * 100 for subject in subjects_list]
    plt.bar(x + offset, accs, width, label=engine)

# Add labels and title
plt.xlabel('Subjects')
plt.ylabel('Accuracy (%)')
plt.title('MMLU Performance by Model and Subject (5-shot)')
# Centre each tick under its cluster for any number of engines
# (for three engines this is x + width, as before).
cluster_centre = x + width * (len(engines) - 1) / 2
plt.xticks(cluster_centre, [subject.replace('_', ' ').title() for subject in subjects_list], rotation=45)
plt.legend(loc='upper left')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()  # run last so it accounts for the rotated tick labels

# Show overall accuracies
plt.figure(figsize=(8, 5))
plt.bar(engines, [basic_results[engine]['overall_acc'] * 100 for engine in engines])
plt.xlabel('Models')
plt.ylabel('Accuracy (%)')
plt.title('Overall MMLU Accuracy by Model (5-shot)')
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Accuracy is a percentage; a 0-50 axis would silently clip any model
# scoring above 50%.
plt.ylim(0, 100)

# ## Experiment 2: Position Bias Analysis
# 
# Testing if the models have a bias towards choosing answers at specific positions (A, B, C, D)

# Position-bias run: a single model (Gemma) to keep the runtime down
args.engine = ["gemma"]
args.position_permute = True  # switched back off once the run has finished

# engine -> list with one {position: accuracy} dict per subject
position_results = {}
engines = args.engine

banner = "====================================="

for engine in engines:
    print("\n" + banner)
    print(f"Position Bias Analysis for Engine: {engine}")
    print(banner)

    all_accs = []

    model, tokenizer = load_model(args, engine)

    # The model is only moved explicitly when it was not loaded in 8-bit
    # and a CUDA device is present.
    move_to_gpu = (not args.load_in_8bit) and torch.cuda.is_available()
    if move_to_gpu:
        model.cuda()

    for subject in subjects:
        print(f"Evaluating {subject}...")

        dev_path = os.path.join(args.data_dir, "dev", subject + "_dev.csv")
        test_path = os.path.join(args.data_dir, "test", subject + "_test.csv")

        # Few-shot examples: first `ntrain` dev rows; test set capped at 10 rows
        dev_df = pd.read_csv(dev_path, header=None)[:args.ntrain]
        test_df = pd.read_csv(test_path, header=None)[:10]

        # Evaluate once per target position, on the frame produced by
        # move_answers_to_position for that position.
        position_accs = {}
        for position in ('A', 'B', 'C', 'D'):
            print(f"Testing with answers at position {position}...")
            new_df = move_answers_to_position(test_df, position)
            cors, acc = eval(
                args,
                format_subject(subject),
                dev_df,
                new_df,
                model,
                tokenizer,
                n_reduced=args.n_reduced,
                permute_pos=position,
            )
            position_accs[position] = acc

        all_accs.append(position_accs)
        print(f"Position accuracies for {subject}: {position_accs}")

    position_results[engine] = all_accs

    # Drop the references and release cached GPU memory
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Later experiments expect position permutation to be off again
args.position_permute = False

# Plot position bias results
positions = ['A', 'B', 'C', 'D']

# Average accuracy for each position across the subjects that were actually
# evaluated for that engine. Dividing by the number of recorded results
# (rather than the global `subjects` list) keeps the mean correct if the two
# ever differ, and the guard avoids a ZeroDivisionError on an empty run.
avg_position_accs = {}
for engine, per_subject in position_results.items():
    n_evaluated = len(per_subject)
    avg_position_accs[engine] = {
        pos: (sum(result[pos] for result in per_subject) / n_evaluated) if n_evaluated else 0.0
        for pos in positions
    }

# Grouped bar chart: one cluster per answer position, one bar per engine.
# With a single engine this is one bar of width 0.8 centred on each tick;
# with several engines the bars sit side by side instead of being painted
# over each other.
plt.figure(figsize=(10, 6))
pos_x = np.arange(len(positions))
n_engines = max(len(avg_position_accs), 1)
bar_width = 0.8 / n_engines

for idx, engine in enumerate(avg_position_accs):
    heights = [avg_position_accs[engine][pos] * 100 for pos in positions]
    plt.bar(pos_x + idx * bar_width, heights, bar_width, label=engine)

plt.xticks(pos_x + bar_width * (n_engines - 1) / 2, positions)
plt.xlabel('Answer Position')
plt.ylabel('Accuracy (%)')
plt.title('Position Bias Analysis: Accuracy by Answer Position')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Accuracy is a percentage; a 0-50 axis would clip any position above 50%.
plt.ylim(0, 100)

# ## Experiment 3: Shot Analysis
# 
# Testing how performance changes with different numbers of few-shot examples

# Choose one model for shot analysis
args.engine = ["llama"]  # Using Llama for shot analysis
shot_counts = [0, 1, 3, 5]  # Different numbers of shots to test

# engine -> {subject: {shot_count: accuracy}}
shot_results = {}
engines = args.engine

# The sweep overwrites args.ntrain. Remember the configured value and put it
# back afterwards (even if the run is interrupted) so later experiments do
# not silently inherit whatever shot count happened to come last.
original_ntrain = args.ntrain

try:
    for engine in engines:
        print(f"\n=====================================")
        print(f"Shot Analysis for Engine: {engine}")
        print(f"=====================================")

        engine_results = {}

        # Load model
        model, tokenizer = load_model(args, engine)

        # Move to GPU if not using 8-bit quantization
        if not args.load_in_8bit and torch.cuda.is_available():
            model.cuda()

        for subject in subjects[:2]:  # Using only first two subjects to save time
            print(f"Evaluating {subject}...")

            subject_results = {}

            # Load test data
            test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)
            test_df = test_df[:10]  # Use only 10 examples

            # Read the dev split once per subject; each shot count below
            # only needs a different-length prefix of it.
            full_dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)

            for shot_count in shot_counts:
                print(f"Testing with {shot_count} shots...")

                # Update ntrain parameter
                args.ntrain = shot_count

                # First `shot_count` dev rows (an empty frame for 0 shots)
                dev_df = full_dev_df[:args.ntrain]

                # Evaluate model
                cors, acc = eval(args, format_subject(subject), dev_df, test_df, model, tokenizer, n_reduced=args.n_reduced)

                subject_results[shot_count] = acc
                print(f"{shot_count}-shot accuracy: {acc * 100:.2f}%")

            engine_results[subject] = subject_results

        # Store results
        shot_results[engine] = engine_results

        # Clean up
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
finally:
    args.ntrain = original_ntrain

# Plot shot analysis results: one line per (engine, subject) pair, with the
# number of few-shot examples on the x axis and accuracy in percent on y.
plt.figure(figsize=(10, 6))

for engine in shot_results:
    for subject in shot_results[engine]:
        plt.plot(shot_counts,
                [shot_results[engine][subject][shot] * 100 for shot in shot_counts],
                label=f"{engine} - {subject.replace('_', ' ').title()}",
                marker='o')

plt.xlabel('Number of Few-Shot Examples')
plt.ylabel('Accuracy (%)')
plt.title('Effect of Shot Count on MMLU Performance')
plt.legend()
plt.grid(linestyle='--', alpha=0.7)
plt.xticks(shot_counts)
# Accuracy is a percentage. With a 0-50 axis any point above 50% would be
# drawn outside the visible area, so show the full range.
plt.ylim(0, 100)

# ## Experiment 4: Reduced Choices Attack
# 
# Testing how models perform when the number of choices is reduced (e.g., from 4 to 2 or 3)

# Choose one model for reduced choices analysis
args.engine = ["qwen"]  # Using Qwen for reduced choices analysis
args.reduce_attack = True  # Enable reduce attack

# The sweep overwrites args.n_reduced. Remember the configured value so it
# can be restored afterwards instead of being left at the last swept value.
previous_n_reduced = args.n_reduced
reduced_choice_counts = [2, 3, 4]  # Original is 4 choices, test with 2, 3, 4 choices

# engine -> {subject: {n_choices: accuracy}}
reduced_results = {}
engines = args.engine

try:
    for engine in engines:
        print(f"\n=====================================")
        print(f"Reduced Choices Analysis for Engine: {engine}")
        print(f"=====================================")

        engine_results = {}

        # Load model
        model, tokenizer = load_model(args, engine)

        # Move to GPU if not using 8-bit quantization
        if not args.load_in_8bit and torch.cuda.is_available():
            model.cuda()

        for subject in subjects[:1]:  # Using only one subject to save time
            print(f"Evaluating {subject}...")

            subject_results = {}

            # Load development and test data
            dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None)[:args.ntrain]
            test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None)

            # Use only first 5 examples for faster evaluation
            test_df = test_df[:5]

            # Test with different numbers of reduced choices
            for n_reduced in reduced_choice_counts:
                print(f"Testing with {n_reduced} choices...")

                args.n_reduced = n_reduced

                # Run evaluation
                cors, acc = full_search_eval(args, format_subject(subject), dev_df, test_df, model, tokenizer, n_reduced=n_reduced)

                subject_results[n_reduced] = acc
                print(f"Accuracy with {n_reduced} choices: {acc * 100:.2f}%")

            engine_results[subject] = subject_results

        # Store results
        reduced_results[engine] = engine_results

        # Clean up
        del model
        del tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
finally:
    # Undo both settings this experiment changed, even if the run was
    # interrupted, so later code sees the attack disabled and the original
    # n_reduced value.
    args.reduce_attack = False
    args.n_reduced = previous_n_reduced

# Line chart of accuracy against the number of answer choices offered,
# one line per (engine, subject) pair.
plotted_choice_counts = [2, 3, 4]

plt.figure(figsize=(8, 5))

for engine, per_subject in reduced_results.items():
    for subject, acc_by_choices in per_subject.items():
        percentages = [acc_by_choices[n] * 100 for n in plotted_choice_counts]
        series_label = f"{engine} - {subject.replace('_', ' ').title()}"
        plt.plot(plotted_choice_counts, percentages, label=series_label, marker='o')

# Axis labels, title and legend
plt.xlabel('Number of Choices')
plt.ylabel('Accuracy (%)')
plt.title('Effect of Number of Choices on Performance')
plt.legend()

# Dashed grid, ticks only at the tested choice counts, full percentage axis
plt.grid(linestyle='--', alpha=0.7)
plt.xticks(plotted_choice_counts)
plt.ylim(0, 100)

# ## Summary of Findings

# Print summary table of basic results: overall accuracy plus the best and
# worst subject for each engine.
#
# Subject columns are 25 wide so the longest title in use
# ("High School Mathematics", 23 chars) does not push later columns out of
# line, and every rule is derived from its header so the two always match.
summary_header = (
    f"{'Model':<10} | {'Overall Acc (%)':<15} | "
    f"{'Best Subject':<25} | {'Worst Subject':<25}"
)
summary_rule = "-" * len(summary_header)

print("\nModel Performance Summary (5-shot):")
print(summary_rule)
print(summary_header)
print(summary_rule)

for engine in basic_results:
    subject_accs = basic_results[engine]['subject_accs']
    best_subject = max(subject_accs, key=subject_accs.get)
    worst_subject = min(subject_accs, key=subject_accs.get)
    overall_pct = basic_results[engine]['overall_acc'] * 100

    # 14 digits + '%' fills the 15-character accuracy column exactly
    print(f"{engine:<10} | {overall_pct:>14.2f}% | "
          f"{best_subject.replace('_', ' ').title():<25} | "
          f"{worst_subject.replace('_', ' ').title():<25}")

# Print position bias summary (skipped when no position run was recorded)
if position_results:
    position_header = f"{'Position':<10} | {'Accuracy (%)':<15}"
    position_rule = "-" * len(position_header)

    print("\nPosition Bias Summary:")
    print("-" * 50)

    for engine in avg_position_accs:
        print(f"Model: {engine}")
        print(position_header)
        print(position_rule)

        # Sort positions by accuracy (descending)
        sorted_positions = sorted(positions, key=lambda pos: avg_position_accs[engine][pos], reverse=True)

        for pos in sorted_positions:
            print(f"{pos:<10} | {avg_position_accs[engine][pos]*100:>14.2f}%")

        print()

KeyboardInterrupt: 

### 1. Introduction and Setup

The analysis begins with the setup of necessary libraries and modules for data manipulation, machine learning, and visualization. Custom functions from a module named `LLMs_attack` are imported to facilitate model loading and evaluation.

- **Libraries Imported:**
  - `os`, `numpy`, `pandas`, `torch`, `matplotlib`, `seaborn`, and others for data handling and visualization.
  - Custom functions from `LLMs_attack` for model operations.

- **Data Directory Creation:**
  - Directories are created to store the MMLU dataset subsets, which are then downloaded for evaluation.

- **Custom Args Class:**
  - A custom `Args` class is defined to manage configuration parameters for the experiments, such as the number of training examples, models to evaluate, and flags for different attack strategies.

### 2. Experiment 1: Basic MMLU Performance Comparison

This experiment compares the performance of three models (`gemma`, `llama`, `qwen`) on a subset of the MMLU benchmark. The subjects evaluated include `abstract_algebra`, `anatomy`, `computer_security`, and `high_school_mathematics`.

- **Subjects Evaluated:**
  - The models are tested on four subjects to assess their performance across different domains.

- **Model Evaluation:**
  - For each model, the code loads the model and tokenizer, evaluates it on the specified subjects, and stores the results.
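  - The evaluation uses 5-shot prompting: the first five rows of each subject's dev split are passed to the evaluation as few-shot examples, and only the first 20 test questions per subject are scored to keep the run fast.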

- **Visualization:**
  - The results are visualized using bar charts to compare the performance of each model across different subjects.

### 3. Experiment 2: Position Bias Analysis

This experiment investigates whether the models exhibit a bias towards choosing answers at specific positions (A, B, C, D). The analysis is performed using the `gemma` model.

- **Position Bias:**
  - The code evaluates the model's performance when the correct answer is placed at different positions to identify any position bias.

- **Visualization:**
  - The results are visualized to show the accuracy for each answer position, highlighting any potential biases.

### 4. Experiment 3: Shot Analysis

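This experiment examines how the performance of the `llama` model changes with different numbers of few-shot examples (0, 1, 3, 5). To save time it is run on the first two subjects only (`abstract_algebra` and `anatomy`), using the first 10 test questions of each.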

- **Few-Shot Learning:**
  - The model's performance is evaluated with varying numbers of few-shot examples to understand the impact of example quantity on accuracy.

- **Visualization:**
  - The results are visualized to show the impact of the number of few-shot examples on performance.

### 5. Experiment 4: Reduced Choices Attack

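This experiment evaluates the `qwen` model with 2, 3, and 4 answer choices per question, where 4 is the original, unreduced setting. To save time it is run on a single subject (`abstract_algebra`) using the first 5 test questions.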

- **Reduced Choices:**
  - The model's performance is evaluated with a reduced number of choices to understand how it affects accuracy.

- **Visualization:**
  - The results are visualized to show the impact of reducing the number of choices on performance.

### 6. Summary of Findings

The analysis concludes with a summary of the findings, including the overall performance of the models and any identified biases.

- **Performance Summary:**
  - A table summarizes the overall accuracy of each model and highlights the best and worst-performing subjects.

- **Position Bias Summary:**
  - The position bias analysis is summarized as per-position accuracies, averaged over the evaluated subjects and sorted from highest to lowest, for the one model analyzed (`gemma`).
