### Installation
Install LLM framework packages for OpenAI, Gemini, Fireworks AI, and Anthropic (Claude) models

In [None]:
# Install the LangChain provider integrations and supporting packages used by this notebook.
# Each %pip line installs into the running kernel's environment; -q keeps pip's output quiet.
%pip install -q langchain-openai       # LLM framework for OpenAI
%pip install -q langchain-core
%pip install -q langchain-google-genai # LLM framework for Gemini
%pip install -q langchain-fireworks   # LLM framework for Fireworks AI (open-source models)
%pip install -q langchain_google_vertexai anthropic[vertex] # LLM framework for Anthropic (Claude models)
%pip install -q langchain
%pip install -q openai
%pip install -q langchain-community
# NOTE(review): no versions are pinned, so a fresh install may resolve to different releases over time.
%pip install -q Langchainhub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.9/62.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.5 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.18 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.8/193.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.4

### API Keys
Set API keys for LLM providers (OpenAI, Gemini, Fireworks, Anthropic via VertexAI)

***Note: Replace placeholder keys with your own keys***

For VertexAI, requires a service account JSON key file

In [None]:
# OpenAI
import os
os.environ["OPENAI_API_KEY"] = "Placeholder"

# Gemini
import google.generativeai as genai
os.environ["GOOGLE_API_KEY"] = "Placeholder"

# Fireworks
import fireworks.client
os.environ["FIREWORKS_API_KEY"] = "Placeholder"

# Anthropic (from Vertex AI)
import json
# Replace this dict with the contents of your service-account JSON key.
# It must be a dict: a set literal such as {"Placeholder"} is not JSON
# serializable and makes json.dump raise TypeError.
anthropic_key = {
    "Placeholder": "Placeholder"
    }
anthropic_key_path = 'anthropic_key.json'
with open(anthropic_key_path, 'w') as f:
    json.dump(anthropic_key, f, indent=4)
from google.oauth2 import service_account
# Point Google's default-credential lookup at the key file written above.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.abspath(anthropic_key_path)

### Libraries
Imports for multi-LLM setup (OpenAI, Gemini, Fireworks, Anthropic) with LangChain core components

Includes initialization for VertexAI (Anthropic Claude)

In [None]:
# OpenAI
import os
from langchain_openai import ChatOpenAI

# Gemini
from langchain_google_genai import ChatGoogleGenerativeAI

# Fireworks
from langchain_fireworks import ChatFireworks

# Anthropic (from Vertex AI)
# Only write the key when none (or only the placeholder) is present, so a real
# key exported by an earlier cell is not overwritten with "Placeholder".
if os.environ.get("GOOGLE_API_KEY") in (None, "", "Placeholder"):
    os.environ["GOOGLE_API_KEY"] = "Placeholder"
project = "Placeholder"   # Google Cloud project ID used for Vertex AI
location = "Placeholder"  # Vertex AI region
import vertexai
vertexai.init(project=project, location=location)
from langchain_google_vertexai.model_garden import ChatAnthropicVertex

# LangChain (chain)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

### LLM Models
Initialize multi-LLM chat models for different providers:
- OpenAI: GPT-4.1 Mini
- Google: Gemini 2.0 Flash
- Fireworks: Llama 3.3 70B Instruct, Qwen3 235B-A22B, DeepSeek V3
- Anthropic: Claude 3.5 Haiku (via VertexAI)

In [None]:
# One chat-model handle per provider; the evaluation cells refer to these names.

# OpenAI
llm_ChatGPT = ChatOpenAI(
    model="gpt-4.1-mini",
)

# Google
llm_Gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
)

# Fireworks-hosted models
llm_Llama = ChatFireworks(
    model="accounts/fireworks/models/llama-v3p3-70b-instruct",  # Llama 3.3 70B Instruct
)
llm_Qwen = ChatFireworks(
    model="accounts/fireworks/models/qwen3-235b-a22b",  # Qwen3 235B-A22B
)
llm_DeepSeek = ChatFireworks(
    model="accounts/fireworks/models/deepseek-v3",  # DeepSeek V3
)

# Anthropic via Vertex AI (reuses the project/location configured earlier)
llm_Claude = ChatAnthropicVertex(
    model_name="claude-3-5-haiku@20241022",  # Claude 3.5 Haiku
    project=project,
    location=location,
)



### Prompt
Purpose: Forces LLM to output ONLY the letter corresponding to the correct answer choice
Behavior:
  1. Performs analysis of context and choices
  2. Returns strictly formatted single-letter response (A/B/C/D)
  3. Explicitly prohibits explanations or additional text

Format Enforcement:
- Uses clear instruction boundaries (### Instruction: / ### Response Format:)
- Provides concrete example of expected input/output
- Emphasizes strict compliance through wording ("only", "strictly avoid")

In [None]:
# Single-variable chat prompt: the instructions and one worked example are fixed text;
# the only placeholder is {input}, which receives the question plus its answer options.
# The template asks the model to reply with just the answer letter.
prompt = ChatPromptTemplate.from_template(
"""
### Instruction:
You are a reasoning assistant.

1. Analyze the given context thoroughly.
2. Identify the best option from the provided choices.

### Response Format:
- Provide **only** the letter corresponding to the correct answer (e.g., A, B, C, or D).
- Strictly avoid additional text, explanations, or context in your response.

EXAMPLE INPUT
 Dolores Huerta’s advocacy on behalf of farmworkers was rooted in her experience as a schoolteacher in Stockton,
 California, in the early 1950s. Hoping to help her students and their families outside the ______ Huerta left teaching to
 start the Stockton chapter of the Community Service Organization, a group focused on the needs of local farmworkers.

 Which choice completes the text so that it conforms to the conventions of Standard English?
 A. classroom.
 B. classroom;
 C. classroom,
 D. classroom

 EXAMPLE OUTPUT
 C

{input}
"""
)

## Import questions
SAT Question Data Loader
Imports and formats SAT practice questions from a GitHub repository.

Data Structure:
- Loads `#`-separated data (no header row) from a GitHub raw URL
- Contains question metadata (ID, Difficulty, Type of Question/Skill)
- Includes question text, 4 multiple-choice options (A-D), and correct answer
- Formats into pandas DataFrame with descriptive column names

In [None]:
import pandas as pd

# GitHub raw URL of the SAT question file (fields are separated by '#')
url = "https://raw.githubusercontent.com/1082098-LWSD/SAT-question-evaluation/main/SATQuestions"

# Read the file with '#' as the field separator; the file has no header row
sat_questions = pd.read_csv(
    url,
    sep="#",            # Field delimiter is '#'
    engine="python",        # Parse with pandas' pure-Python engine
    header=None           # No existing header row
)

# Apply human-readable column names (the assignment requires exactly nine columns)
sat_questions.columns = [
    "ID",             # Unique question identifier
    "Difficulty",         # Question difficulty level
    "Type of Question",     # Question category/type
    "Question",          # Question text
    "Option A",         # Multiple choice option A
    "Option B",         # Multiple choice option B
    "Option C",         # Multiple choice option C
    "Option D",         # Multiple choice option D
    "Correct Answer"       # Letter of correct option (A-D)
]

# Preview the structured data
sat_questions.head()

Unnamed: 0,ID,Difficulty,Type of Question,Question,Option A,Option B,Option C,Option D,Correct Answer
0,87aa7bab,medium,central ideas and details,A common assumption among art historians is th...,A. Factors other than the rise of photography ...,B. Although portrait miniatures became less co...,C. The popularity of the portrait miniature li...,D. As demand for portrait miniatures decreased...,a
1,d748c3fd,medium,inferences,In her 2021 article “Throwaway History: Toward...,A. demonstrate the difficulties faced by conte...,B. represent the challenge of incorporating ex...,C. lend support to arguments by historians and...,D. illustrate both the relatively low scholarl...,d
2,22e4d633,medium,command of evidence,"Although many transposons, DNA sequences that ...",A. The LINE transposon in O. vulgaris and O. b...,B. The human genome contains multiple transpos...,C. A consistent number of copies of LINE trans...,D. O. vulgaris and O. bimaculoides have smalle...,a
3,359902ae,medium,words in context,The following text is adapted from Nathaniel H...,A. A lonely,B. A disagreeable,C. An acceptable,D. An extraordinary,d
4,2af2016f,medium,text structure and purpose,A study by Dr. Paul Hanel and colleagues concl...,A. To describe a widely held belief and how a ...,B. To argue that researchers were surprised by...,C. To suggest ways to improve a certain study’...,D. To explain a study’s conclusion and how a r...,d


### Constructing Query
Construct standardized query strings for LLM processing by combining:
1. Question context
2. All multiple-choice options (A-D)

Format:

Context and Question: [question text]

Options:

[Option A]

[Option B]

[Option C]

[Option D]


In [None]:
# Assemble the per-question query text piece by piece, left to right:
# the question, then "Options: " followed by the four option strings on separate lines.
query_parts = [
    sat_questions['Question'], "\n",
    "Options: ",
    sat_questions['Option A'], "\n",
    sat_questions['Option B'], "\n",
    sat_questions['Option C'], "\n",
    sat_questions['Option D'],
]
query_text = "Context and Question: "
for part in query_parts:
    query_text = query_text + part
sat_questions['Query'] = query_text


### SAT Question Evaluation System

This script evaluates a singular LLM's performance on SAT-style questions across multiple dimensions:
- Difficulty levels (easy/medium/hard)
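- Question types / skills (e.g., boundaries, inferences, command of evidence, transitions)
- Skill subdivisions (Standard English Conventions, Information and Ideas, Craft and Structure, Expression of Ideas)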

Key Features:
1. Multi-run evaluation: Each question is processed 3 times to assess consistency
2. Error handling: Automatic retries with delay for failed attempts
3. Comprehensive analysis:
   - Accuracy by difficulty and question type
   - Response variability (consistency across runs)
   - Skill subdivision breakdowns
4. Data preservation: Saves raw results and aggregated metrics

Output Includes:
1. Accuracy pivot tables (% correct by subdivision/difficulty)
2. Count tables (question distribution)
3. Full accuracy analysis (all categories combined)
4. CSV exports of raw results and aggregated metrics

Usage Notes:
- Configure model_name and model at start
- Set questions_per_difficulty for sampling
- Adjust max_retries and retry_delay as needed

In [None]:
from collections import Counter
import pandas as pd
import time

# Define the model to evaluate
model_name = "llm_ChatGPT" # Model identifier for reporting
model = llm_ChatGPT # Actual model instance

# Define all possible categories
all_difficulties = ['easy', 'medium', 'hard']
skill_subdivisions = {
    'Standard English Conventions': ["boundaries", "form, structure, and sense"],
    'Information and Ideas': ["central ideas and details", "inferences", "command of evidence"],
    'Craft and Structure': ["words in context", "text structure and purpose", "cross-text connections"],
    'Expression of Ideas': ["rhetorical synthesis", "transitions"]
}
all_subdivisions = list(skill_subdivisions.keys())
all_skills = [skill for sublist in skill_subdivisions.values() for skill in sublist]

# Initialize an empty list to store results
results = []

questions_per_difficulty = 30  # Maximum number of questions evaluated per difficulty level
runs_per_question = 3          # Each question is asked this many times to measure consistency
max_retries = 3                # Maximum number of attempts for a single run
retry_delay = 5                # Seconds to wait between retries

# Build the chain once; it does not depend on the question.
# NOTE: sampling settings such as temperature / max_tokens are not prompt variables.
# Passing them in the invoke() input dict has no effect on the model, so configure
# them on the model object itself if they are needed.
chain = (
    RunnablePassthrough.assign(context=(lambda x: x["input"]))
    | prompt
    | model
    | StrOutputParser()
)

# Take the first `questions_per_difficulty` rows of every difficulty level
# (original row order is kept) instead of simply the first N*3 rows of the file.
evaluation_questions = sat_questions.groupby('Difficulty', sort=False).head(questions_per_difficulty)

# Loop through each selected question
for _, row in evaluation_questions.iterrows():
    query = row['Query']
    correct_answer = row['Correct Answer']
    difficulty = row['Difficulty']
    skill = row['Type of Question']

    model_responses = []

    # Run the model several times for the current query
    for _run in range(runs_per_question):
        # The retry budget is per run; sharing one counter across runs would let an
        # early failure silently skip the remaining runs.
        retry_count = 0
        while retry_count < max_retries:
            try:
                result = chain.invoke({'input': query})
                model_responses.append(result.strip().upper())
                break  # Success - exit retry loop
            except Exception as e:  # Best effort: record the failure and keep evaluating
                retry_count += 1
                if retry_count >= max_retries:
                    model_responses.append("ERROR")
                    print(f"Failed to get response for query after {max_retries} attempts: {query}")
                    print(f"Error: {str(e)}")
                else:
                    print(f"Retry {retry_count} for query: {query}")
                    time.sleep(retry_delay)

    # Filter out error responses before calculating mode
    valid_responses = [r for r in model_responses if r != "ERROR"]
    if not valid_responses:
        most_frequent_answer = "ERROR"
        variability = 0
    else:
        most_frequent_answer = Counter(valid_responses).most_common(1)[0][0]
        variability = len(set(valid_responses))  # Number of distinct answers (1 = fully consistent)

    normalized_correct_answer = str(correct_answer).strip().upper()
    is_correct = most_frequent_answer == normalized_correct_answer if most_frequent_answer != "ERROR" else False

    # Map the skill to its parent subdivision (None when the skill is not listed)
    subdivision = next((subdiv for subdiv, skills in skill_subdivisions.items()
                      if skill in skills), None)

    results.append({
        'Query': query,
        'Difficulty': difficulty,
        'Type of Question': skill,
        'Subdivision': subdivision,
        'Correct Answer': normalized_correct_answer,
        'Model Mode Answer': most_frequent_answer,
        'Is Correct': is_correct,
        'Variability': variability,
        'Responses': model_responses,
        'Attempts': len(valid_responses)  # Runs that produced a response
    })

# Convert results into DataFrame
results_df = pd.DataFrame(results)

# Create two-way tables (Subdivision x Difficulty) for accuracy and counts
accuracy_pivot = results_df.pivot_table(
    index='Subdivision',
    columns='Difficulty',
    values='Is Correct',
    aggfunc=lambda x: (x.sum() / len(x)) * 100,
    fill_value=0
)

count_pivot = results_df.pivot_table(
    index='Subdivision',
    columns='Difficulty',
    values='Is Correct',
    aggfunc='count',
    fill_value=0
)

# Row totals (per subdivision)
accuracy_pivot['Total'] = results_df.groupby('Subdivision')['Is Correct'].mean() * 100
count_pivot['Total'] = results_df.groupby('Subdivision')['Is Correct'].count()

# Column totals (per difficulty). The grand total is added explicitly; without it
# the Total/Total cell is NaN and is printed as "0%" / "0".
accuracy_total_row = results_df.groupby('Difficulty')['Is Correct'].mean() * 100
accuracy_total_row['Total'] = results_df['Is Correct'].mean() * 100
accuracy_pivot.loc['Total'] = accuracy_total_row

count_total_row = results_df.groupby('Difficulty')['Is Correct'].count()
count_total_row['Total'] = results_df['Is Correct'].count()
count_pivot.loc['Total'] = count_total_row

# Round after the totals are in place so that every cell has one decimal
accuracy_pivot = accuracy_pivot.astype(float).round(1)

# Format tables - using map() instead of applymap()
accuracy_pivot = accuracy_pivot.map(lambda x: f"{x}%" if pd.notna(x) else "0%")
count_pivot = count_pivot.map(lambda x: f"{int(x)}" if pd.notna(x) else "0")

print("\n" + "="*60)
print("Accuracy by Subdivision and Difficulty (% correct):")
print("="*60)
print(accuracy_pivot)

print("\n" + "="*60)
print("Number of Questions by Subdivision and Difficulty:")
print("="*60)
print(count_pivot)

# Create template DataFrames with all categories
def create_template_df(categories, name_col):
    """Return a zero-filled summary frame with one row per category.

    Args:
        categories: Category labels; they become the values of ``name_col``.
        name_col: Name of the column holding the category labels.

    Returns:
        DataFrame with columns ``name_col``, ``total_questions`` (0),
        ``correct_answers`` (0) and ``average_variability`` (0.0).
    """
    columns = {name_col: categories}
    columns.update(
        total_questions=0,
        correct_answers=0,
        average_variability=0.0,
    )
    return pd.DataFrame(columns)

# Zero-filled scaffolds, one per reporting dimension, so that categories with no
# evaluated questions still appear in the final tables.
difficulty_template, subdivision_template, skill_template = (
    create_template_df(category_values, column_label)
    for category_values, column_label in (
        (all_difficulties, 'Difficulty'),
        (all_subdivisions, 'Subdivision'),
        (all_skills, 'Type of Question'),
    )
)

# Calculate actual results
def calculate_accuracy(df, group_col):
    """Summarise results per value of ``group_col``.

    Args:
        df: Results frame containing 'Is Correct' and 'Variability' columns.
        group_col: Column to group by.

    Returns:
        DataFrame with columns ``group_col``, ``total_questions`` (group size),
        ``correct_answers`` (sum of 'Is Correct') and ``average_variability``
        (mean of 'Variability'), or None when ``group_col`` is not a column of ``df``.
    """
    if group_col not in df.columns:
        return None

    summary = df.groupby(group_col).agg(
        total_questions=('Is Correct', 'size'),
        correct_answers=('Is Correct', 'sum'),
        average_variability=('Variability', 'mean'),
    )
    summary = summary.reset_index()
    summary.columns = [group_col, 'total_questions', 'correct_answers', 'average_variability']
    return summary

# Merge actual results with templates
def merge_with_template(actual, template, name_col):
    """Overlay measured metrics on a zero-filled template.

    Every category of ``template`` is kept (left merge on ``name_col``). Metrics
    present in ``actual`` win; categories missing from ``actual`` keep the
    template's values.

    Args:
        actual: Frame with ``name_col``, 'total_questions', 'correct_answers' and
            'average_variability' columns, or None.
        template: Frame with the same columns, one row per expected category.
        name_col: Column to merge on.

    Returns:
        The merged frame, or ``template`` itself when ``actual`` is None. The two
        count columns are returned as integers.
    """
    if actual is None:
        return template

    metric_cols = ['total_questions', 'correct_answers', 'average_variability']
    merged = template.merge(actual, on=name_col, how='left', suffixes=('_template', ''))
    for col in metric_cols:
        # fillna keeps the measured value and falls back to the template value.
        # It replaces the earlier `.where(...)` form, which emitted a pandas
        # downcasting FutureWarning and had a version-dependent result dtype.
        merged[col] = merged[col].fillna(merged[col + '_template'])
    merged = merged.drop(columns=[col + '_template' for col in metric_cols])
    # The left merge turns counts into floats whenever a category is missing; cast back.
    return merged.astype({'total_questions': int, 'correct_answers': int})

# Per-dimension summaries, padded with the templates so empty categories still show up
difficulty_accuracy = merge_with_template(
    calculate_accuracy(results_df, 'Difficulty'),
    difficulty_template,
    'Difficulty'
)

subdivision_accuracy = merge_with_template(
    calculate_accuracy(results_df, 'Subdivision'),
    subdivision_template,
    'Subdivision'
)

skill_accuracy = merge_with_template(
    calculate_accuracy(results_df, 'Type of Question'),
    skill_template,
    'Type of Question'
)

# Calculate accuracy percentage (columns are added to the three frames in place)
for accuracy_df in (difficulty_accuracy, subdivision_accuracy, skill_accuracy):
    # replace(0, 1) avoids dividing by zero for categories without questions (0 / 1 -> 0%)
    accuracy_df['Accuracy (%)'] = (
        accuracy_df['correct_answers'] / accuracy_df['total_questions'].replace(0, 1)
    ) * 100
    accuracy_df['Accuracy (%)'] = accuracy_df['Accuracy (%)'].fillna(0)

# Combine all accuracy tables into one long table keyed by (Category, Name)
full_accuracy_table = pd.concat([
    difficulty_accuracy.assign(Category='Difficulty', Name=difficulty_accuracy['Difficulty']),
    subdivision_accuracy.assign(Category='Subdivision', Name=subdivision_accuracy['Subdivision']),
    skill_accuracy.assign(Category='Type of Question', Name=skill_accuracy['Type of Question'])
])[['Category', 'Name', 'total_questions', 'correct_answers', 'Accuracy (%)', 'average_variability']]

# Display the full accuracy table
print("\n" + "="*60)
print("Full Accuracy Analysis:")
print("="*60)
print(full_accuracy_table)

# Save results. The file names follow the configured model_name, so results of
# different models are neither mislabelled nor overwritten by each other.
results_df.to_csv(f'{model_name}_results_with_skills.csv', index=False)
full_accuracy_table.to_csv(f'{model_name}_accuracy_analysis.csv', index=False)


Accuracy by Subdivision and Difficulty (% correct):
Difficulty                                medium   Total
Subdivision                                             
Craft and Structure                       100.0%  100.0%
Expression of Ideas                       100.0%  100.0%
Information and Ideas                     100.0%  100.0%
Standard English Conventions                0.0%    0.0%
Total                         88.88888888888889%      0%

Number of Questions by Subdivision and Difficulty:
Difficulty                   medium Total
Subdivision                              
Craft and Structure               3     3
Expression of Ideas               2     2
Information and Ideas             3     3
Standard English Conventions      1     1
Total                             9     0

Full Accuracy Analysis:
           Category                          Name  total_questions  \
0        Difficulty                          easy                0   
1        Difficulty                  

  merged[col] = merged[col+'_template'].where(pd.isna(merged[col]), merged[col])
  merged[col] = merged[col+'_template'].where(pd.isna(merged[col]), merged[col])
  merged[col] = merged[col+'_template'].where(pd.isna(merged[col]), merged[col])
  merged[col] = merged[col+'_template'].where(pd.isna(merged[col]), merged[col])


### SAT Question Evaluation System (Multi-Model)
This script evaluates LLM performance on SAT questions through:  

**Core Features**  
- Cross-Model Comparison: Tests multiple LLMs on same question subset  
- Difficulty Analysis: Evaluates easy/medium/hard questions equally (30 per level)  
- Skill Breakdown: Analyzes 4 skill categories and 10 subtypes  
- Low-Accuracy Tracking: Identifies the N hardest questions across all models (N = `NUM_LOW_ACCURACY_QUESTIONS`)  
- Consistency Metrics: 3 attempts/question with variability scoring  

**Key Processes**  
1. Stratified Sampling: Balanced question selection by difficulty  
2. Error-Resilient Evaluation: 3 retries per question with delay  
3. Multi-Dimensional Tracking:  
   - Accuracy (%) by difficulty/skill  
   - Response consistency (1.0 = perfect stability)  
   - Model processing speed  

**Outputs Generated**  
- Accuracy Tables: By difficulty/skill/subtype  
- Variability Reports: Consistency metrics across attempts  
- Time Metrics: Processing speed per model  
- Low-Accuracy Export: CSV of hardest questions  
- Raw Data: Full results in full_evaluation_results.csv  

**Configuration Points**  
- NUM_LOW_ACCURACY_QUESTIONS: Set number of challenging questions to track  
- questions_per_difficulty: Questions per difficulty level (default:30)  
- max_retries/delay: Error handling parameters  
- Model List: Comment/uncomment models in evaluation pool  

In [None]:
from collections import defaultdict, Counter
import pandas as pd
import time
import re
import numpy as np
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# ====== 1. INITIALIZATION ======
# (display name, model handle) pairs; comment entries out to shrink the evaluation pool
models = [
    ("Claude 3.5 Haiku", llm_Claude),
    ("ChatGPT 4.1 mini", llm_ChatGPT),
    ("Gemini 2.0 Flash", llm_Gemini),
    ("Llama 3.3 70B Instruct", llm_Llama),
    ("Qwen3 235B-A22B", llm_Qwen),
    ("DeepSeek V3", llm_DeepSeek)
]

difficulty_levels = ['easy', 'medium', 'hard']
questions_per_difficulty = 30
total_questions_per_model = questions_per_difficulty * len(difficulty_levels)
max_retries = 3
retry_delay = 5
NUM_LOW_ACCURACY_QUESTIONS = 10
SAMPLING_SEED = 42  # Fixed seed so a re-run draws the same question subset

skill_subdivisions = {
    'Standard English Conventions': ["boundaries", "form, structure, and sense"],
    'Information and Ideas': ["central ideas and details", "inferences", "command of evidence"],
    'Craft and Structure': ["words in context", "text structure and purpose", "cross-text connections"],
    'Expression of Ideas': ["rhetorical synthesis", "transitions"]
}

# ====== 2. DATA VERIFICATION ======
required_columns = [
    "ID", "Question", "Correct Answer", "Difficulty",
    "Type of Question", "Option A", "Option B",
    "Option C", "Option D"
]
missing = [col for col in required_columns if col not in sat_questions.columns]
if missing:
    raise KeyError(f"Missing columns: {missing}")

# Fail early if any difficulty level has too few questions to sample from
for diff in difficulty_levels:
    count = len(sat_questions[sat_questions['Difficulty'] == diff])
    if count < questions_per_difficulty:
        raise ValueError(f"Need {questions_per_difficulty} {diff} questions, found {count}")

# Stratified sample: the same number of questions from every difficulty level,
# shared by all models. The pieces are concatenated once rather than in a loop.
shared_questions = pd.concat([
    sat_questions[sat_questions['Difficulty'] == diff].sample(
        n=questions_per_difficulty,
        random_state=SAMPLING_SEED,
    )
    for diff in difficulty_levels
])

if len(shared_questions) != total_questions_per_model:
    raise ValueError(f"Sampling failed. Expected {total_questions_per_model}, got {len(shared_questions)}")

# ====== 3. ANSWER PROCESSING FUNCTIONS ======
def extract_answer(response: str) -> str:
    """Extract the answer letter (A-D) from a model response.

    Args:
        response: Either plain text or an object with a ``content`` attribute.
            ``content`` may be a string or a list of parts (strings, or dicts
            carrying a 'text' entry).

    Returns:
        The first standalone letter A, B, C or D found in the response
        (case-insensitive), or "ERROR" when there is none.

    Notes:
        - Text inside ``<think>...</think>`` is ignored, so reasoning emitted
          before the final answer is not mistaken for the answer.
        - Letters embedded in words are never treated as answers; an earlier
          character-scan fallback turned "I don't know" into "D".
        - Limitation: a response that starts with the English article "A"
          ("A close call, but C") is still read as answer A.
    """
    content = response.content if hasattr(response, 'content') else response

    # Flatten list-style content into one string
    if isinstance(content, (list, tuple)):
        text_parts = []
        for part in content:
            if isinstance(part, dict):
                text_parts.append(str(part.get('text', '')))
            else:
                text_parts.append(str(part))
        content = ' '.join(text_parts)

    text = re.sub(r'<think>.*?</think>', ' ', str(content), flags=re.IGNORECASE | re.DOTALL)
    text = text.strip().upper()

    standalone_match = re.search(r'\b([A-D])\b', text)
    if standalone_match:
        return standalone_match.group(1)
    return "ERROR"

# ====== 4. EVALUATION PIPELINE ======
all_results = []      # One record per (model, question)
time_tracking = {}    # Model name -> wall-clock seconds for its full pass
question_stats = {}   # Question ID -> metadata plus per-model correctness tallies

for _, row in shared_questions.iterrows():
    question_stats[row['ID']] = {
        'Question': row['Question'],
        # str() mirrors the single-model cell and tolerates non-string cells
        'Correct Answer': str(row['Correct Answer']).strip().upper(),
        'Difficulty': row['Difficulty'],
        'Type of Question': row['Type of Question'],
        'correct_attempts': 0,  # Models whose mode answer was correct
        'total_attempts': 0     # Models that were evaluated on this question
    }

for model_name, model in models:
    print(f"\nEvaluating {model_name}...")
    start_time = time.time()
    model_results = []

    for _, row in shared_questions.iterrows():
        query = (
            f"Question: {row['Question']}\n"
            f"A) {row['Option A']}\n"
            f"B) {row['Option B']}\n"
            f"C) {row['Option C']}\n"
            f"D) {row['Option D']}"
        )
        correct_answer = str(row['Correct Answer']).strip().upper()
        # The prompt is identical for every attempt on this question, so format it once
        formatted_prompt = prompt.format_prompt(input=query)
        responses = []

        # Three runs per question; each run has its own retry budget
        for _ in range(3):
            current_retry = 0
            while current_retry < max_retries:
                try:
                    result = model.invoke(formatted_prompt)
                    cleaned_response = extract_answer(result)
                    responses.append(cleaned_response)
                    break
                except Exception as e:  # Best effort: record the failure and continue
                    current_retry += 1
                    if current_retry >= max_retries:
                        responses.append("ERROR")
                        print(f"Failed processing question: {e}")
                    else:
                        time.sleep(retry_delay)

        # Mode of the valid answers; ties resolve to the answer seen first
        valid_responses = [r for r in responses if r != 'ERROR']
        if not valid_responses:
            mode_response = 'ERROR'
        else:
            mode_response = Counter(valid_responses).most_common(1)[0][0]

        q_id = row['ID']
        question_stats[q_id]['total_attempts'] += 1
        if mode_response == correct_answer:
            question_stats[q_id]['correct_attempts'] += 1

        # Store all three responses for variability calculation
        model_results.append({
            "Question_ID": q_id,
            "Model": model_name,
            "Difficulty": row['Difficulty'],
            "Skill Type": row['Type of Question'],
            "Response": mode_response,
            "Correct Answer": correct_answer,
            "All_Responses": responses  # Store all responses
        })

    time_tracking[model_name] = round(time.time() - start_time, 2)
    all_results.extend(model_results)
    print(f"Completed {model_name} in {time_tracking[model_name]}s")

# ====== 5. ANALYSIS PIPELINE ======
# One row per question: metadata plus how many models answered it correctly
stats_df = (
    pd.DataFrame.from_dict(question_stats, orient='index')
    .reset_index()
    .rename(columns={'index': 'Question_ID'})
)
stats_df['Accuracy (%)'] = (stats_df['correct_attempts'] / stats_df['total_attempts']) * 100

# Keep the N questions with the lowest cross-model accuracy
low_accuracy_columns = [
    'Question_ID',
    'Difficulty',
    'Type of Question',
    'correct_attempts',
    'total_attempts',
    'Accuracy (%)',
    'Question',
]
low_accuracy_renames = {
    'correct_attempts': 'Correct Answers',
    'total_attempts': 'Total Answers',
}
hardest_questions = stats_df.sort_values('Accuracy (%)').head(NUM_LOW_ACCURACY_QUESTIONS)
low_accuracy_df = hardest_questions[low_accuracy_columns].rename(columns=low_accuracy_renames)

# Printed values are rounded for display; the CSV keeps full precision
print(f"\n{'#'*40}")
print(f"{NUM_LOW_ACCURACY_QUESTIONS} Lowest Accuracy Questions")
print(low_accuracy_df.round(1).to_string(index=False))
low_accuracy_df.to_csv('low_accuracy_questions.csv', index=False)

def full_analysis(model_df, subdivisions=None):
    """Build the accuracy / variability report for one model.

    Args:
        model_df: Per-question results with columns 'Difficulty', 'Skill Type',
            'Response', 'Correct Answer' and 'All_Responses' (list of answers).
            The frame is copied, not modified.
        subdivisions: Mapping of subdivision name -> list of skill names.
            Defaults to the notebook-level ``skill_subdivisions``.

    Returns:
        DataFrame with columns 'Category', 'Name', 'total', 'correct',
        'Accuracy (%)' and 'avg_variability'. It has one row per difficulty
        (plus a 'Total' row), per subdivision and per skill; categories without
        questions are reported with zeros.
    """
    if subdivisions is None:
        subdivisions = skill_subdivisions

    skill_to_subdivision = {
        skill: subdivision
        for subdivision, skills in subdivisions.items()
        for skill in skills
    }

    model_df = model_df.copy()
    model_df['Subdivision'] = model_df['Skill Type'].map(skill_to_subdivision)
    # An 'ERROR' response never equals a correct answer, so it counts as wrong
    model_df['is_correct'] = model_df['Response'] == model_df['Correct Answer']

    # Calculate per-question variability
    def calculate_variability(responses):
        """Number of distinct non-error answers (0 when every attempt failed)."""
        valid = [r for r in responses if r != 'ERROR']
        return len(set(valid)) if valid else 0

    model_df['variability'] = model_df['All_Responses'].apply(calculate_variability)

    analysis = []
    categories = {
        'Difficulty': ['easy', 'medium', 'hard'],
        'Subdivision': list(subdivisions.keys()),
        'Skill Type': [skill for sublist in subdivisions.values() for skill in sublist]
    }

    for cat_type, cat_list in categories.items():
        # Left-merge onto the full category list so empty categories appear as zeros
        temp_df = pd.DataFrame({cat_type: cat_list})
        grouped = model_df.groupby(cat_type, observed=True).agg(
            total=('is_correct', 'size'),
            correct=('is_correct', 'sum'),
            avg_variability=('variability', 'mean')
        ).reset_index()

        merged = temp_df.merge(grouped, how='left', on=cat_type).fillna(0)
        merged['Category'] = cat_type

        if cat_type == 'Difficulty':
            total_row = pd.DataFrame({
                cat_type: ['Total'],
                'total': [merged['total'].sum()],
                'correct': [merged['correct'].sum()],
                'avg_variability': [model_df['variability'].mean()],
                'Category': ['Difficulty']
            })
            merged = pd.concat([merged, total_row], ignore_index=True)

        analysis.append(merged.rename(columns={cat_type: 'Name'}))

    full = pd.concat(analysis)
    # The left merge turns counts into floats when a category is empty; report integers
    full = full.astype({'total': int, 'correct': int})
    # replace(0, 1) avoids dividing by zero for empty categories (0 / 1 -> 0%)
    full['Accuracy (%)'] = (full['correct'] / full['total'].replace(0, 1)) * 100
    return full[['Category', 'Name', 'total', 'correct', 'Accuracy (%)', 'avg_variability']]

# Generate reports
results_df = pd.DataFrame(all_results)
# The loop variable is deliberately not called `model`: that name holds the model
# handle used by the evaluation cells and must not be rebound to a display string.
for evaluated_model in results_df['Model'].unique():
    model_report = full_analysis(results_df[results_df['Model'] == evaluated_model])
    print(f"\n{'#'*40}\nFull Analysis for {evaluated_model}\n{'#'*40}")
    print(model_report.to_string(
        formatters={
            'Accuracy (%)': '{:.1f}%'.format,
            'avg_variability': '{:.2f}'.format
        },
        index=False
    ))

# Persist the raw per-(model, question) records
results_df.to_csv('full_evaluation_results.csv', index=False)
print("\nEvaluation complete! Results saved to full_evaluation_results.csv")


Evaluating Claude 3.5 Haiku...
Completed Claude 3.5 Haiku in 319.5s

Evaluating ChatGPT 4.1 mini...
Completed ChatGPT 4.1 mini in 184.5s

Evaluating Gemini 2.0 Flash...
Completed Gemini 2.0 Flash in 94.51s

Evaluating Llama 3.3 70B Instruct...
Completed Llama 3.3 70B Instruct in 843.92s

Evaluating Qwen3 235B-A22B...
Completed Qwen3 235B-A22B in 2750.02s

Evaluating DeepSeek V3...
Completed DeepSeek V3 in 221.84s

########################################
10 Lowest Accuracy Questions
Question_ID Difficulty           Type of Question  Correct Answers  Total Answers  Accuracy (%)                                                                                                                                                                                                                                                                                                                                                                                                                                