In [1]:
import json
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def load_results(file_path):
    """Load results from JSON file."""
    with open(file_path, 'r') as f:
        return json.load(f)

In [3]:
def calculate_accuracy(predictions, ground_truths):
    """Calculate accuracy between predictions and ground truths."""
    # Convert Series to lists to avoid pandas Series ambiguity
    if hasattr(predictions, 'tolist'):
        predictions = predictions.tolist()
    if hasattr(ground_truths, 'tolist'):
        ground_truths = ground_truths.tolist()
    
    # Calculate accuracy
    correct = sum(1 for p, g in zip(predictions, ground_truths) if str(p) == str(g))
    return correct / len(predictions) if predictions else 0

In [4]:
def analyze_results(results_file, model_name):
    """Analyze results across different dimensions."""
    results = load_results(results_file)
    
    # Convert results to DataFrame for easier analysis
    df = pd.DataFrame(results)
    
    # 1. Overall Accuracy
    overall_accuracy = calculate_accuracy(df['model_answer'], df['ground_truth'])
    print(f"\n{model_name} Overall Accuracy: {overall_accuracy:.2%}")
    
    # 2. Question-type Analysis
    df['question_type'] = df.apply(lambda x: int(x['question_id'].replace('Q', '')), axis=1)
    df['question_category'] = df['question_type'].map({
        1: 'Direct Recognition',
        2: 'Property Inference',
        3: 'Counterfactual'
    })
    
    question_accuracies = df.groupby('question_category').apply(
        lambda x: calculate_accuracy(x['model_answer'], x['ground_truth'])
    )
    print("\nAccuracy by Question Type:")
    for q_type, acc in question_accuracies.items():
        print(f"{q_type}: {acc:.2%}")
    
    # 3. Image Type Analysis
    image_type_accuracies = df.groupby('image_type').apply(
        lambda x: calculate_accuracy(x['model_answer'], x['ground_truth'])
    )
    print("\nAccuracy by Image Type:")
    for img_type, acc in image_type_accuracies.items():
        print(f"{img_type}: {acc:.2%}")

    # 4. Property Category Analysis
    property_accuracies = df.groupby('property_category').apply(
        lambda x: calculate_accuracy(x['model_answer'], x['ground_truth'])
    )
    print("\nAccuracy by Property Category:")
    for prop, acc in property_accuracies.items():
        print(f"{prop}: {acc:.2%}")
    
    return {
        'overall': overall_accuracy,
        'question_type': question_accuracies,
        'image_type': image_type_accuracies,
        'property': property_accuracies,
        'df': df
    }

In [5]:
# def plot_results(analysis_results, model_name):
#     """Create visualizations for the results."""
#     # Set style
#     plt.style.use('seaborn')
    
#     # Create figure with subplots
#     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
#     # 1. Question Type Accuracy Plot
#     question_acc = analysis_results['question_type']
#     question_acc.plot(kind='bar', ax=ax1)
#     ax1.set_title(f'{model_name}: Accuracy by Question Type')
#     ax1.set_ylabel('Accuracy')
#     ax1.tick_params(axis='x', rotation=45)
    
#     # 2. Image Type Accuracy Plot
#     image_acc = analysis_results['image_type']
#     image_acc.plot(kind='bar', ax=ax2)
#     ax2.set_title(f'{model_name}: Accuracy by Image Type')
#     ax2.set_ylabel('Accuracy')
#     ax2.tick_params(axis='x', rotation=45)
    
#     plt.tight_layout()
#     plt.savefig(f'{model_name}_analysis.png', bbox_inches='tight', dpi=300)
#     plt.close()

In [8]:
# def plot_results(analysis_results, model_name):
#     """Create visualizations for the results."""
#     # Import necessary libraries (add these if not already imported)
#     import matplotlib.pyplot as plt
#     import seaborn as sns
    
#     # Set style using a built-in matplotlib style instead
#     plt.style.use('ggplot')  # Alternative style that's built into matplotlib
#     # Or you can use seaborn directly to set the theme
#     sns.set_theme()  # This will apply the default seaborn styling
    
#     # Create figure with subplots
#     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
#     # 1. Question Type Accuracy Plot
#     question_acc = analysis_results['question_type']
#     sns.barplot(x=question_acc.index, y=question_acc.values, ax=ax1)
#     ax1.set_title(f'{model_name}: Accuracy by Question Type')
#     ax1.set_ylabel('Accuracy')
#     ax1.tick_params(axis='x', rotation=45)
    
#     # 2. Image Type Accuracy Plot
#     image_acc = analysis_results['image_type']
#     sns.barplot(x=image_acc.index, y=image_acc.values, ax=ax2)
#     ax2.set_title(f'{model_name}: Accuracy by Image Type')
#     ax2.set_ylabel('Accuracy')
#     ax2.tick_params(axis='x', rotation=45)
    
#     plt.tight_layout()
#     plt.savefig(f'{model_name}_analysis.png', bbox_inches='tight', dpi=300)
#     plt.close()

In [11]:
def plot_results(analysis_results, model_name):
    """Create visualizations for the results."""
    # Use a simple style that's guaranteed to work
    plt.style.use('default')
    
    # Set color scheme manually
    colors = ['#2ecc71', '#3498db', '#e74c3c']
    
    # Create figure with subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    # 1. Question Type Accuracy Plot
    question_acc = analysis_results['question_type']
    bars1 = question_acc.plot(kind='bar', ax=ax1, color=colors)
    ax1.set_title(f'{model_name}: Accuracy by Question Type', pad=20)
    ax1.set_ylabel('Accuracy')
    ax1.set_xlabel('')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(True, alpha=0.3)
    
    # Add value labels on top of bars
    for i, v in enumerate(question_acc):
        ax1.text(i, v + 0.01, f'{v:.1%}', ha='center', va='bottom')
    
    # 2. Image Type Accuracy Plot
    image_acc = analysis_results['image_type']
    bars2 = image_acc.plot(kind='bar', ax=ax2, color=colors)
    ax2.set_title(f'{model_name}: Accuracy by Image Type', pad=20)
    ax2.set_ylabel('Accuracy')
    ax2.set_xlabel('')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(True, alpha=0.3)

    # Add value labels on top of bars
    for i, v in enumerate(image_acc):
        ax2.text(i, v + 0.01, f'{v:.1%}', ha='center', va='bottom')
    
    # Adjust layout
    plt.tight_layout()
    
    # Save figure
    plt.savefig(f'{model_name}_analysis.png', bbox_inches='tight', dpi=300)
    plt.close()

In [14]:
def compare_models(model_results_dict):
    """Compare results across different models."""
    comparison_df = pd.DataFrame({
        model: {
            'Overall Accuracy': results['overall'],
            'Direct Recognition': results['question_type']['Direct Recognition'],
            'Property Inference': results['question_type']['Property Inference'],
            'Counterfactual': results['question_type']['Counterfactual']
        }
        for model, results in model_results_dict.items()
    }).T
    
    # Format the DataFrame to display percentages with two decimal places
    formatted_df = comparison_df.applymap(lambda x: f"{x * 100:.2f}%")

    print("\nModel Comparison:")
    print(formatted_df.to_string())
    
    # Plot model comparison
    plt.figure(figsize=(12, 6))
    comparison_df.plot(kind='bar')
    plt.title('Model Comparison Across Question Types')
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig('model_comparison.png', bbox_inches='tight', dpi=300)
    plt.close()

In [18]:
def compare_models_extended(model_results_dict):
    """Compare results across different models for both question types and image types."""
    # Use default style
    plt.style.use('default')
    
    # Create two DataFrames - one for question types and one for image types
    question_df = pd.DataFrame({
        model: {
            'Overall Accuracy': results['overall'],
            'Direct Recognition': results['question_type']['Direct Recognition'],
            'Property Inference': results['question_type']['Property Inference'],
            'Counterfactual': results['question_type']['Counterfactual']
        }
        for model, results in model_results_dict.items()
    }).T

    image_df = pd.DataFrame({
        model: {
            'REAL': results['image_type']['REAL'],
            'ANIMATED': results['image_type']['ANIMATED'],
            'AI_GENERATED': results['image_type']['AI_GENERATED']
        }
        for model, results in model_results_dict.items()
    }).T
    
    # Format DataFrames to display percentages
    formatted_question_df = question_df.applymap(lambda x: f"{x * 100:.2f}%")
    formatted_image_df = image_df.applymap(lambda x: f"{x * 100:.2f}%")

    # Print formatted results
    print("\nModel Comparison by Question Type:")
    print(formatted_question_df.to_string())
    print("\nModel Comparison by Image Type:")
    print(formatted_image_df.to_string())
    
    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))
    
    # Plot 1: Question Types
    question_df.plot(kind='bar', ax=ax1, width=0.8)
    ax1.set_title('Model Comparison Across Question Types', pad=20)
    ax1.set_xlabel('')
    ax1.set_ylabel('Accuracy')
    ax1.tick_params(axis='x', rotation=45)
    ax1.set_xticklabels(ax1.get_xticklabels(), ha='right')
    ax1.grid(True, alpha=0.3)
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # Add value labels for question types
    for i in range(len(question_df)):
        for j in range(len(question_df.columns)):
            value = question_df.iloc[i, j]
            ax1.text(i, value, f'{value:.1%}', 
                    ha='center', va='bottom')
    
    # Plot 2: Image Types
    image_df.plot(kind='bar', ax=ax2, width=0.8)
    ax2.set_title('Model Comparison Across Image Types', pad=20)
    ax2.set_xlabel('Models')
    ax2.set_ylabel('Accuracy')
    ax2.tick_params(axis='x', rotation=45)
    ax2.set_xticklabels(ax2.get_xticklabels(), ha='right')
    ax2.grid(True, alpha=0.3)
    ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    # Add value labels for image types
    for i in range(len(image_df)):
        for j in range(len(image_df.columns)):
            value = image_df.iloc[i, j]
            ax2.text(i, value, f'{value:.1%}', 
                    ha='center', va='bottom')
    
    # Adjust layout and save
    plt.tight_layout()
    plt.savefig('model_comparison_extended.png', bbox_inches='tight', dpi=300)
    plt.close()

    return question_df, image_df

# Use this function in your main analysis
model_results = {}

# List of result files to analyze
result_files = {
    'BLIP2-OPT-6.7B': 'blip2-opt6.7b_results.json',
    'BLIP2-OPT-2.7B': 'blip2-opt2.7b_results.json',
    'SmolVLM2-2.2B-Instruct': 'smolVLM2_results.json',
    'InternVL2.5-4B-MPO': 'InternVL2.5_results.json'
}

# Analyze each model
for model_name, file_path in result_files.items():
    if Path(file_path).exists():
        print(f"\nAnalyzing {model_name}...")
        results = analyze_results(file_path, model_name)
        model_results[model_name] = results
        plot_results(results, model_name)

# Compare models if we have multiple results
if len(model_results) > 1:
    question_df, image_df = compare_models_extended(model_results)


Analyzing BLIP2-OPT-6.7B...

BLIP2-OPT-6.7B Overall Accuracy: 18.67%

Accuracy by Question Type:
Counterfactual: 24.00%
Direct Recognition: 20.00%
Property Inference: 12.00%

Accuracy by Image Type:
AI_GENERATED: 26.67%
ANIMATED: 16.67%
REAL: 16.67%

Accuracy by Property Category:
functional: 14.29%
functional/relational: 0.00%
physical: 27.27%
relational: 20.00%
taxonomic: 16.67%
taxonomic/relational: 0.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(



Analyzing BLIP2-OPT-2.7B...

BLIP2-OPT-2.7B Overall Accuracy: 9.33%

Accuracy by Question Type:
Counterfactual: 16.00%
Direct Recognition: 4.00%
Property Inference: 8.00%

Accuracy by Image Type:
AI_GENERATED: 6.67%
ANIMATED: 6.67%
REAL: 13.33%

Accuracy by Property Category:
functional: 7.14%
functional/relational: 0.00%
physical: 9.09%
relational: 20.00%
taxonomic: 5.56%
taxonomic/relational: 100.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(



Analyzing SmolVLM2-2.2B-Instruct...

SmolVLM2-2.2B-Instruct Overall Accuracy: 16.00%

Accuracy by Question Type:
Counterfactual: 16.00%
Direct Recognition: 8.00%
Property Inference: 24.00%

Accuracy by Image Type:
AI_GENERATED: 13.33%
ANIMATED: 23.33%
REAL: 10.00%

Accuracy by Property Category:
functional: 21.43%
functional/relational: 0.00%
physical: 9.09%
relational: 0.00%
taxonomic: 22.22%
taxonomic/relational: 0.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(



Analyzing InternVL2.5-4B-MPO...

InternVL2.5-4B-MPO Overall Accuracy: 14.67%

Accuracy by Question Type:
Counterfactual: 20.00%
Direct Recognition: 0.00%
Property Inference: 24.00%

Accuracy by Image Type:
AI_GENERATED: 6.67%
ANIMATED: 13.33%
REAL: 20.00%

Accuracy by Property Category:
functional: 21.43%
functional/relational: 0.00%
physical: 4.55%
relational: 40.00%
taxonomic: 5.56%
taxonomic/relational: 100.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(
  formatted_question_df = question_df.applymap(lambda x: f"{x * 100:.2f}%")
  formatted_image_df = image_df.applymap(lambda x: f"{x * 100:.2f}%")



Model Comparison by Question Type:
                       Overall Accuracy Direct Recognition Property Inference Counterfactual
BLIP2-OPT-6.7B                   18.67%             20.00%             12.00%         24.00%
BLIP2-OPT-2.7B                    9.33%              4.00%              8.00%         16.00%
SmolVLM2-2.2B-Instruct           16.00%              8.00%             24.00%         16.00%
InternVL2.5-4B-MPO               14.67%              0.00%             24.00%         20.00%

Model Comparison by Image Type:
                          REAL ANIMATED AI_GENERATED
BLIP2-OPT-6.7B          16.67%   16.67%       26.67%
BLIP2-OPT-2.7B          13.33%    6.67%        6.67%
SmolVLM2-2.2B-Instruct  10.00%   23.33%       13.33%
InternVL2.5-4B-MPO      20.00%   13.33%        6.67%


In [15]:
# Analyze results for each model
model_results = {}

# List of result files to analyze
result_files = {
    'BLIP2-OPT-6.7B': 'blip2-opt6.7b_results.json',
    'BLIP2-OPT-2.7B': 'blip2-opt2.7b_results.json',
    'SmolVLM2-2.2B-Instruct': 'smolVLM2_results.json',
    'InternVL2.5-4B-MPO': 'InternVL2.5_results.json'
    # Add other model result files here
    # 'FUYU-8B': 'fuyu_results.json',
}

for model_name, file_path in result_files.items():
    if Path(file_path).exists():
        print(f"\nAnalyzing {model_name}...")
        results = analyze_results(file_path, model_name)
        model_results[model_name] = results
        plot_results(results, model_name)

# Compare models if we have multiple results
if len(model_results) > 1:
    compare_models(model_results)


Analyzing BLIP2-OPT-6.7B...

BLIP2-OPT-6.7B Overall Accuracy: 18.67%

Accuracy by Question Type:
Counterfactual: 24.00%
Direct Recognition: 20.00%
Property Inference: 12.00%

Accuracy by Image Type:
AI_GENERATED: 26.67%
ANIMATED: 16.67%
REAL: 16.67%

Accuracy by Property Category:
functional: 14.29%
functional/relational: 0.00%
physical: 27.27%
relational: 20.00%
taxonomic: 16.67%
taxonomic/relational: 0.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(



Analyzing BLIP2-OPT-2.7B...

BLIP2-OPT-2.7B Overall Accuracy: 9.33%

Accuracy by Question Type:
Counterfactual: 16.00%
Direct Recognition: 4.00%
Property Inference: 8.00%

Accuracy by Image Type:
AI_GENERATED: 6.67%
ANIMATED: 6.67%
REAL: 13.33%

Accuracy by Property Category:
functional: 7.14%
functional/relational: 0.00%
physical: 9.09%
relational: 20.00%
taxonomic: 5.56%
taxonomic/relational: 100.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(



Analyzing SmolVLM2-2.2B-Instruct...

SmolVLM2-2.2B-Instruct Overall Accuracy: 16.00%

Accuracy by Question Type:
Counterfactual: 16.00%
Direct Recognition: 8.00%
Property Inference: 24.00%

Accuracy by Image Type:
AI_GENERATED: 13.33%
ANIMATED: 23.33%
REAL: 10.00%

Accuracy by Property Category:
functional: 21.43%
functional/relational: 0.00%
physical: 9.09%
relational: 0.00%
taxonomic: 22.22%
taxonomic/relational: 0.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(



Analyzing InternVL2.5-4B-MPO...

InternVL2.5-4B-MPO Overall Accuracy: 14.67%

Accuracy by Question Type:
Counterfactual: 20.00%
Direct Recognition: 0.00%
Property Inference: 24.00%

Accuracy by Image Type:
AI_GENERATED: 6.67%
ANIMATED: 13.33%
REAL: 20.00%

Accuracy by Property Category:
functional: 21.43%
functional/relational: 0.00%
physical: 4.55%
relational: 40.00%
taxonomic: 5.56%
taxonomic/relational: 100.00%


  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(
  formatted_df = comparison_df.applymap(lambda x: f"{x * 100:.2f}%")



Model Comparison:
                       Overall Accuracy Direct Recognition Property Inference Counterfactual
BLIP2-OPT-6.7B                   18.67%             20.00%             12.00%         24.00%
BLIP2-OPT-2.7B                    9.33%              4.00%              8.00%         16.00%
SmolVLM2-2.2B-Instruct           16.00%              8.00%             24.00%         16.00%
InternVL2.5-4B-MPO               14.67%              0.00%             24.00%         20.00%


<Figure size 1200x600 with 0 Axes>

In [None]:
def analyze_correlations(df):
    """Analyze correlations between different factors"""
    return {
        'question_image_correlation': df.pivot_table(
            values='accuracy', 
            index='question_type',
            columns='image_type'
        ),
        'property_image_correlation': df.pivot_table(
            values='accuracy',
            index='property_category',
            columns='image_type'
        )
    }

analyze_correlations()