# Descriptor Performance Overview

This notebook provides a comprehensive overview of descriptor performance across different configurations.

## Analysis Objectives:
1. **Performance Baseline**: Establish baseline performance for different descriptor types
2. **Comparative Analysis**: Compare performance across descriptor families
3. **Dataset Analysis**: Performance breakdown by scene type (illumination vs viewpoint)
4. **Metrics Deep Dive**: Mean average precision (MAP), precision@K (P@K), and processing times


In [None]:
# Import required libraries
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set up plotting style
# NOTE: 'seaborn-v0_8-darkgrid' is the style-sheet name used by matplotlib >= 3.6;
# older matplotlib releases only know it as 'seaborn-darkgrid'.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# None removes the column limit so wide query results are displayed in full.
pd.set_option('display.max_columns', None)

# Database connection
# Relative path: it is resolved against the kernel's current working directory.
DB_PATH = "../../build/experiments.db"

In [None]:
# Connect to database and load experiment data
def load_experiment_data(db_path=None) -> pd.DataFrame:
    """Load every experiment joined with its results into a DataFrame.

    Args:
        db_path: Path (str or ``pathlib.Path``) of the SQLite experiments
            database. When omitted, the module-level ``DB_PATH`` is used.

    Returns:
        A DataFrame with one row per joined (experiment, result) pair,
        ordered by experiment timestamp, newest first. The two timestamp
        columns are exposed as ``exp_timestamp`` and ``result_timestamp``.

    Raises:
        FileNotFoundError: If the database file does not exist.
    """
    if db_path is None:
        db_path = DB_PATH

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would surface later as a confusing "no such table"
    # error and leave a stray file behind. Fail early instead.
    if not Path(db_path).is_file():
        raise FileNotFoundError(f"Experiments database not found: {db_path}")

    query = """
    SELECT
        e.id as experiment_id,
        e.descriptor_type,
        e.pooling_strategy,
        e.timestamp as exp_timestamp,
        r.mean_average_precision,
        r.precision_at_1,
        r.precision_at_5,
        r.recall_at_1,
        r.recall_at_5,
        r.total_matches,
        r.total_keypoints,
        r.processing_time_ms,
        r.timestamp as result_timestamp
    FROM experiments e
    JOIN results r ON e.id = r.experiment_id
    ORDER BY e.timestamp DESC
    """

    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

# Load data
# Query the database once; the resulting DataFrame is reused by later cells.
df = load_experiment_data()
print(f"Loaded {len(df)} experiment results")
# Trailing expression: shown as the cell output (first rows of the result).
df.head()

In [None]:
# Data preprocessing and feature extraction
def extract_descriptor_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive analysis columns from the descriptor and pooling names.

    The input is not modified; a copy with these extra columns is returned:

    * ``uses_color``: True when ``descriptor_type`` contains "rgb".
    * ``base_descriptor``: upper-cased descriptor family (SIFT, RGBSIFT,
      HONC, VGG or DNN); NaN when no family name occurs in the descriptor.
    * ``normalization``: "L1" when ``descriptor_type`` contains "l1",
      otherwise "L2".
    * ``pooling_clean``: display name for ``pooling_strategy``; values
      without a known display name are kept unchanged.

    All name matching is case-insensitive, and a missing (NULL)
    ``descriptor_type`` is treated as matching nothing.

    Args:
        df: DataFrame with ``descriptor_type`` and ``pooling_strategy``
            columns.

    Returns:
        A copy of ``df`` with the four columns above added.
    """
    df = df.copy()
    descriptor = df['descriptor_type']

    # na=False keeps the result strictly boolean when descriptor_type is NULL.
    df['uses_color'] = descriptor.str.contains('rgb', case=False, na=False)

    # (?i) makes the family lookup case-insensitive like the other matches.
    df['base_descriptor'] = descriptor.str.extract(
        r'(?i)(sift|rgbsift|honc|vgg|dnn)', expand=False
    ).str.upper()

    # L2 is the default; only names that mention "l1" are switched to L1.
    # Without na=False a NULL descriptor_type would put NaN into the mask and
    # the .loc assignment below would raise.
    df['normalization'] = 'L2'
    is_l1 = descriptor.str.contains('l1', case=False, na=False)
    df.loc[is_l1, 'normalization'] = 'L1'

    pooling_map = {
        'none': 'None',
        'domain_size_pooling': 'DSP',
        'stacking': 'Stacking',
    }
    # Unmapped strategies fall back to their original name.
    df['pooling_clean'] = (
        df['pooling_strategy'].map(pooling_map).fillna(df['pooling_strategy'])
    )

    return df

# Enrich the raw results with the derived descriptor/pooling columns;
# later cells read from df_processed.
df_processed = extract_descriptor_features(df)
# Print the distinct values of each derived column as a quick sanity check.
print("\nProcessed descriptor features:")
print(f"Base descriptors: {df_processed['base_descriptor'].unique()}")
print(f"Pooling strategies: {df_processed['pooling_clean'].unique()}")
print(f"Normalization types: {df_processed['normalization'].unique()}")

## Performance Overview Dashboard

In [None]:
# Build the 2x2 subplot grid that hosts the four dashboard panels
panel_titles = (
    'Mean Average Precision by Descriptor Type',
    'Precision@1 vs Precision@5',
    'Processing Time vs Performance',
    'Performance by Pooling Strategy',
)
# Every panel uses a single y-axis.
panel_specs = [[{"secondary_y": False} for _ in range(2)] for _ in range(2)]

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=panel_titles,
    specs=panel_specs,
)

# Panel (1, 1): mean MAP per base descriptor, one bar series per pooling strategy
desc_performance = (
    df_processed
    .groupby(['base_descriptor', 'pooling_clean'])['mean_average_precision']
    .mean()
    .reset_index()
)
for pooling in desc_performance['pooling_clean'].unique():
    is_current_pooling = desc_performance['pooling_clean'] == pooling
    data = desc_performance[is_current_pooling]
    pooling_bars = go.Bar(
        name=f'{pooling}',
        x=data['base_descriptor'],
        y=data['mean_average_precision'],
    )
    fig.add_trace(pooling_bars, row=1, col=1)

# Panel (1, 2): Precision@1 against Precision@5, markers colored by MAP
map_colored_marker = dict(
    size=10,
    color=df_processed['mean_average_precision'],
    colorscale='Viridis',
    showscale=True,
    colorbar=dict(title="MAP"),
)
precision_scatter = go.Scatter(
    x=df_processed['precision_at_1'],
    y=df_processed['precision_at_5'],
    mode='markers',
    text=df_processed['descriptor_type'],  # hover label
    marker=map_colored_marker,
    showlegend=False,
)
fig.add_trace(precision_scatter, row=1, col=2)

# Plot 3: Processing time vs performance
# 'Set1' is a qualitative (discrete) palette, not one of plotly's named
# continuous colorscales, so passing it as marker.colorscale is rejected.
# Give each pooling strategy an explicit Set1 color instead.
pooling_codes = df_processed['pooling_clean'].astype('category').cat.codes
set1_palette = px.colors.qualitative.Set1
# The modulo wraps around when there are more strategies than palette entries
# and also maps the -1 code of a missing strategy onto a valid index.
pooling_colors = [
    set1_palette[int(code) % len(set1_palette)] for code in pooling_codes
]

fig.add_trace(
    go.Scatter(
        x=df_processed['processing_time_ms'] / 1000,  # Convert to seconds
        y=df_processed['mean_average_precision'],
        mode='markers',
        text=df_processed['descriptor_type'],
        marker=dict(
            size=8,
            color=pooling_colors
        ),
        showlegend=False
    ),
    row=2, col=1
)

# Panel (2, 2): mean MAP per pooling strategy with standard-deviation error bars
pooling_performance = (
    df_processed
    .groupby('pooling_clean')['mean_average_precision']
    .agg(['mean', 'std'])
    .reset_index()
)
std_error_bars = dict(type='data', array=pooling_performance['std'])
pooling_bars = go.Bar(
    x=pooling_performance['pooling_clean'],
    y=pooling_performance['mean'],
    error_y=std_error_bars,
    showlegend=False,
)
fig.add_trace(pooling_bars, row=2, col=2)

# Figure-level layout
fig.update_layout(
    height=800,
    title_text="Descriptor Performance Analysis Dashboard",
    showlegend=True,
)

# Axis titles, keyed by (row, col) of the panel they belong to
axis_titles = {
    (1, 1): ("Base Descriptor", "Mean Average Precision"),
    (1, 2): ("Precision@1", "Precision@5"),
    (2, 1): ("Processing Time (seconds)", "Mean Average Precision"),
    (2, 2): ("Pooling Strategy", "Mean Average Precision"),
}
for (panel_row, panel_col), (x_title, y_title) in axis_titles.items():
    fig.update_xaxes(title_text=x_title, row=panel_row, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=panel_row, col=panel_col)

fig.show()

## Statistical Analysis

In [None]:
# Print summary statistics of the headline metrics, overall and per factor
metric_columns = ['mean_average_precision', 'precision_at_1', 'precision_at_5']
summary_functions = ['mean', 'std', 'count']

print("=== PERFORMANCE SUMMARY ===")
print("\n1. Overall Performance Statistics:")
print(df_processed[metric_columns].describe())

print("\n2. Performance by Base Descriptor:")
base_desc_stats = (
    df_processed.groupby('base_descriptor')[metric_columns].agg(summary_functions)
)
print(base_desc_stats.round(4))

print("\n3. Performance by Pooling Strategy:")
pooling_stats = (
    df_processed.groupby('pooling_clean')[metric_columns].agg(summary_functions)
)
print(pooling_stats.round(4))

print("\n4. Performance by Color Usage:")
color_stats = (
    df_processed.groupby('uses_color')[metric_columns].agg(summary_functions)
)
print(color_stats.round(4))

In [None]:
# Export results for further analysis
output_dir = Path("../outputs")
# parents=True so the export also works if an intermediate directory is missing.
output_dir.mkdir(parents=True, exist_ok=True)

# Save processed data
df_processed.to_csv(output_dir / "descriptor_performance_analysis.csv", index=False)

# Save summary statistics (unrounded tables from the statistical summary cell)
summary_sections = [
    ("BASE DESCRIPTOR PERFORMANCE:", base_desc_stats),
    ("POOLING STRATEGY PERFORMANCE:", pooling_stats),
    ("COLOR USAGE PERFORMANCE:", color_stats),
]
# An explicit encoding keeps the file identical across platforms instead of
# depending on the locale's default encoding.
with open(output_dir / "performance_summary.txt", 'w', encoding="utf-8") as f:
    f.write("DESCRIPTOR PERFORMANCE ANALYSIS SUMMARY\n")
    f.write("=" * 50 + "\n\n")
    f.write("\n\n".join(
        f"{heading}\n{stats.to_string()}" for heading, stats in summary_sections
    ))
    # End the file with a newline.
    f.write("\n")

print(f"Analysis results exported to {output_dir}")
print(f"Total experiments analyzed: {len(df_processed)}")