In [None]:
# Setup
import os
import sys
# Prepend a directory two levels up to the import path.
# NOTE: '__file__' is a quoted string literal here (notebooks do not define
# the __file__ attribute), so os.path.dirname() returns '' and the resulting
# path is '../..' resolved against the current working directory.
# Presumably this is what makes project-local packages importable from the
# notebook - confirm against the directory the kernel is started in.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname('__file__'), '../..')))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set style
# Global plotting defaults: seaborn's 'whitegrid' theme and a 12x6 default
# figure size for any figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ Environment setup complete")

## Load Results from Both Experiments

In [None]:
# Locations of the per-backbone result tables written by each experiment.
exp1_path = 'experiments/exer_recog/results/exp_01_baseline/backbone_comparison.csv'
exp2_path = 'experiments/exer_recog/results/exp_02_progressive/backbone_comparison_exp2.csv'

# Read each table with its first CSV column as the index (used as the backbone
# label by the rest of the notebook) and tag every row with the training
# strategy it came from.
exp1_df = pd.read_csv(exp1_path, index_col=0).assign(experiment='Baseline')
exp2_df = pd.read_csv(exp2_path, index_col=0).assign(experiment='Progressive')

# Stack both experiments into one long table. The index is NOT reset, so each
# index label appears once per experiment; the label is also exposed as a
# regular 'backbone' column.
combined_df = pd.concat([exp1_df, exp2_df]).assign(backbone=lambda frame: frame.index)

# Print the accuracy columns of each experiment, one table after the other.
for _heading, _frame in (
    ("Experiment 1 Summary:", exp1_df),
    ("\nExperiment 2 Summary:", exp2_df),
):
    print(_heading)
    print(_frame[['mean_test_acc', 'std_test_acc']].to_string())

## Accuracy Comparison

In [None]:
# Create comparison bar plot: one group of two bars (Baseline, Progressive)
# per backbone, with the per-backbone std as error bars.
fig, ax = plt.subplots(figsize=(14, 6))

# The baseline table defines which backbones are shown and in what order.
backbones = exp1_df.index.tolist()
x = np.arange(len(backbones))
width = 0.35

# Select the progressive rows BY LABEL in the baseline order. Bars are drawn
# positionally against `x`, so without this a different row order in the two
# CSVs would put progressive bars under the wrong backbone tick label.
# A backbone missing from Experiment 2 raises a KeyError naming it.
exp2_mean = exp2_df.loc[backbones, 'mean_test_acc']
exp2_std = exp2_df.loc[backbones, 'std_test_acc']

# Plot bars (left bar of each pair = Baseline, right bar = Progressive)
bars1 = ax.bar(x - width/2, exp1_df['mean_test_acc'], width,
               yerr=exp1_df['std_test_acc'], label='Baseline',
               alpha=0.8, capsize=5)
bars2 = ax.bar(x + width/2, exp2_mean, width,
               yerr=exp2_std, label='Progressive',
               alpha=0.8, capsize=5)

# Formatting
ax.set_ylabel('Mean Test Accuracy', fontsize=12)
ax.set_title('Baseline vs Progressive Training: Accuracy Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(backbones, rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('experiments/exer_recog/results/cross_experiment_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Comparison plot saved")

## Statistical Analysis

In [None]:
# Compare training strategies
print("="*80)
print("STATISTICAL COMPARISON: Baseline vs Progressive")
print("="*80)

# Per-backbone report: absolute accuracies and the relative change (in percent
# of the baseline accuracy) when switching to progressive training.
for backbone in backbones:
    exp1_acc = exp1_df.loc[backbone, 'mean_test_acc']
    exp2_acc = exp2_df.loc[backbone, 'mean_test_acc']

    improvement = ((exp2_acc - exp1_acc) / exp1_acc) * 100

    print(f"\n{backbone}:")
    print(f"  Baseline:    {exp1_acc:.4f} ± {exp1_df.loc[backbone, 'std_test_acc']:.4f}")
    print(f"  Progressive: {exp2_acc:.4f} ± {exp2_df.loc[backbone, 'std_test_acc']:.4f}")
    print(f"  Improvement: {improvement:+.2f}%")

# Overall statistics
print("\n" + "="*80)
print("OVERALL STATISTICS")
print("="*80)
print(f"\nBaseline Mean:    {exp1_df['mean_test_acc'].mean():.4f} ± {exp1_df['mean_test_acc'].std():.4f}")
print(f"Progressive Mean: {exp2_df['mean_test_acc'].mean():.4f} ± {exp2_df['mean_test_acc'].std():.4f}")

# Paired t-test
# ttest_rel pairs observations by POSITION, so both samples are selected by
# backbone label first; otherwise a different row order in the two CSVs would
# pair accuracies of different backbones.
exp1_paired = exp1_df.loc[backbones, 'mean_test_acc']
exp2_paired = exp2_df.loc[backbones, 'mean_test_acc']
# Argument order (baseline, progressive) is kept, so a negative t statistic
# means progressive scored higher on average.
t_stat, p_value = stats.ttest_rel(exp1_paired, exp2_paired)
# The test is two-sided: a small p-value only says the methods differ. The sign
# of the mean paired difference decides whether that is an improvement.
mean_diff = (exp2_paired.to_numpy() - exp1_paired.to_numpy()).mean()
print(f"\nPaired t-test: t={t_stat:.4f}, p={p_value:.4f}")
if p_value < 0.05:
    if mean_diff > 0:
        print("✓ Progressive training shows STATISTICALLY SIGNIFICANT improvement (p < 0.05)")
    else:
        print("✗ Progressive training is STATISTICALLY SIGNIFICANTLY WORSE than baseline (p < 0.05)")
else:
    print("✗ No statistically significant difference between methods (p ≥ 0.05)")

## Best Model Selection

In [None]:
print("="*80)
print("BEST MODEL SELECTION")
print("="*80)

# Find best model overall
# combined_df keeps the backbone label as its index, and every backbone occurs
# once per experiment, so index labels are duplicated. A label lookup
# (`combined_df.loc[label]`) would therefore return BOTH experiments' rows as a
# DataFrame and break the scalar formatting below. Locate the winning row by
# position instead so exactly one row (a Series) is selected.
best_pos = combined_df['mean_test_acc'].reset_index(drop=True).idxmax()
best_row = combined_df.iloc[best_pos]
best_idx = best_row.name  # backbone label of the winning row

print(f"\nBest Overall Model:")
print(f"  Backbone:   {best_row['backbone']}")
print(f"  Strategy:   {best_row['experiment']}")
print(f"  Accuracy:   {best_row['mean_test_acc']:.4f} ± {best_row['std_test_acc']:.4f}")

# Top 3 models
# Ranked over (backbone, strategy) combinations, so the same backbone can
# appear twice if both of its strategies are among the three best.
print("\nTop 3 Models:")
top3 = combined_df.nlargest(3, 'mean_test_acc')[['backbone', 'experiment', 'mean_test_acc', 'std_test_acc']]
for idx, row in top3.iterrows():
    print(f"  {row['backbone']:20s} ({row['experiment']:12s}): {row['mean_test_acc']:.4f} ± {row['std_test_acc']:.4f}")

print("\n" + "="*80)

## Performance vs Complexity

In [None]:
# Parameter counts come from a project helper; the call below requests them
# for the listed backbones with img_size=224 and num_classes=15.
from src.utils import get_all_model_parameters

backbones_list = [
    'efficientnet_b0', 'efficientnet_b2', 'efficientnet_b3',
    'resnet50',
    'vgg16',
    'mobilenet_v2', 'mobilenet_v3_large',
]

params_df = get_all_model_parameters(backbones_list, img_size=224, num_classes=15)

# Attach parameter counts to each experiment's results, matching the result
# index against the 'backbone' column of the parameter table (default inner
# join, so rows without a match on either side are dropped).
exp1_with_params = exp1_df.merge(params_df, left_index=True, right_on='backbone')
exp2_with_params = exp2_df.merge(params_df, left_index=True, right_on='backbone')

# Accuracy against model size (in millions of parameters), one point series
# per training strategy.
fig, ax = plt.subplots(figsize=(12, 6))

for _label, _merged in (('Baseline', exp1_with_params), ('Progressive', exp2_with_params)):
    ax.scatter(_merged['total_params']/1e6, _merged['mean_test_acc'],
               s=100, alpha=0.6, label=_label)

# Only the baseline points carry a backbone-name annotation.
for _name, _n_params, _acc in zip(exp1_with_params['backbone'],
                                  exp1_with_params['total_params'],
                                  exp1_with_params['mean_test_acc']):
    ax.annotate(_name, (_n_params/1e6, _acc), fontsize=8, alpha=0.7)

ax.set_title('Performance vs Model Complexity', fontsize=14, fontweight='bold')
ax.set_xlabel('Model Parameters (millions)', fontsize=12)
ax.set_ylabel('Mean Test Accuracy', fontsize=12)
ax.grid(alpha=0.3)
ax.legend()

plt.tight_layout()
plt.savefig('experiments/exer_recog/results/performance_vs_complexity.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Performance vs complexity plot saved")

## Summary Report

In [None]:
# Generate comprehensive summary
# Writes a Markdown report combining the result tables, the best model chosen
# earlier (`best_row`) and the paired t-test outcome (`t_stat`, `p_value`).
summary_path = 'experiments/exer_recog/results/COMPARISON_SUMMARY.md'

# The t-test is two-sided, so a small p-value alone does not say which method
# is better. Use the sign of the mean accuracy difference to word the verdict.
mean_delta = exp2_df['mean_test_acc'].mean() - exp1_df['mean_test_acc'].mean()

# Explicit UTF-8: the report contains non-ASCII characters ('±', '✓', '✗') that
# raise UnicodeEncodeError when the platform default encoding cannot encode
# them (e.g. cp1252 on Windows).
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("# Exercise Recognition: Cross-Experiment Comparison\n\n")

    f.write("## Overview\n\n")
    f.write("This report compares two training strategies across 7 CNN backbones:\n")
    f.write("- **Experiment 1 (Baseline):** 2-phase transfer learning\n")
    f.write("- **Experiment 2 (Progressive):** 3-stage progressive unfreezing with custom heads\n\n")

    f.write("## Results Summary\n\n")
    f.write("### Experiment 1 (Baseline)\n\n")
    f.write(exp1_df[['mean_test_acc', 'std_test_acc', 'num_runs']].to_markdown())
    f.write("\n\n### Experiment 2 (Progressive)\n\n")
    f.write(exp2_df[['mean_test_acc', 'std_test_acc']].to_markdown())

    f.write("\n\n## Best Model\n\n")
    f.write(f"**Backbone:** {best_row['backbone']}\n\n")
    f.write(f"**Strategy:** {best_row['experiment']}\n\n")
    f.write(f"**Accuracy:** {best_row['mean_test_acc']:.4f} ± {best_row['std_test_acc']:.4f}\n\n")

    f.write("## Statistical Analysis\n\n")
    f.write(f"**Paired t-test:** t={t_stat:.4f}, p={p_value:.4f}\n\n")
    if p_value < 0.05:
        if mean_delta > 0:
            f.write("✓ Progressive training shows **statistically significant** improvement over baseline.\n")
        else:
            # Significant, but in the wrong direction: do not report it as a gain.
            f.write("✗ Progressive training is **statistically significantly worse** than baseline.\n")
    else:
        f.write("✗ No statistically significant difference between methods.\n")

    f.write("\n## Recommendations\n\n")
    f.write("Based on the analysis:\n\n")
    f.write(f"1. **Best accuracy:** Use {best_row['backbone']} with {best_row['experiment'].lower()} training\n")
    f.write("2. **Best efficiency:** Consider MobileNet variants for deployment\n")
    f.write("3. **Best balance:** EfficientNet models offer strong accuracy with moderate complexity\n")

print(f"✓ Summary report saved to: {summary_path}")
print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)