# Model Evaluation & Results Analysis

Comprehensive evaluation of trained image captioning models.

In [None]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import sys

# Put the parent directory on the import path; the later cells import the
# `models` and `data` packages from there.
# NOTE(review): the relative path assumes the notebook is run from a folder
# one level below the project root - confirm.
sys.path.append('../')

# Notebook-wide plotting defaults: seaborn grid theme and a wide figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

## 1. Load Trained Model

In [None]:
from models.cnn_lstm import CNNLSTMModel
from data.dataset import get_dataloaders

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")

# Restore the checkpoint. The loader returns four values, unpacked here as
# the model, its vocabulary, the stored epoch and the stored loss.
checkpoint_path = '../checkpoints/cnn_lstm_best.pth'
model, vocab, epoch, loss = CNNLSTMModel.load_from_checkpoint(checkpoint_path, device)

# Short confirmation of what was loaded, one line per fact.
load_report = (
    f"\n✓ Model loaded successfully!",
    f"  Checkpoint: {checkpoint_path}",
    f"  Epoch: {epoch}",
    f"  Validation Loss: {loss:.4f}",
    f"  Vocabulary Size: {len(vocab):,}",
)
for report_line in load_report:
    print(report_line)

## 2. Run Comprehensive Evaluation

In [None]:
# Evaluate on test set
# Runs ../training/evaluate.py through the notebook's shell escape (`!`), i.e.
# as a separate Python process, with the flags listed below.
# NOTE(review): the --checkpoint path repeats the value assigned to
# `checkpoint_path` in the model-loading cell; keep the two in sync.
# NOTE(review): presumably --save_results writes the CSV files that the next
# section reads from ../results - confirm against evaluate.py.
!python ../training/evaluate.py \
    --model cnn_lstm \
    --checkpoint ../checkpoints/cnn_lstm_best.pth \
    --beam_size 3 \
    --save_results

## 3. Load and Visualize Results

In [None]:
# Directory the evaluation outputs are read from
results_dir = Path('../results')

# Read the aggregate metrics table and echo it between two horizontal rules
metrics_df = pd.read_csv(results_dir.joinpath('cnn_lstm_metrics.csv'))
metrics_rule = "=" * 60
print("\nEvaluation Metrics:")
print(metrics_rule)
print(metrics_df.to_string(index=False))
print(metrics_rule)

In [None]:
# Visualize metrics
# One (CSV column, display label, bar colour) triple per metric.
metric_specs = [
    ('bleu1', 'BLEU-1', '#3498db'),
    ('bleu2', 'BLEU-2', '#2ecc71'),
    ('bleu3', 'BLEU-3', '#f39c12'),
    ('bleu4', 'BLEU-4', '#e74c3c'),
    ('meteor', 'METEOR', '#9b59b6'),
]
metrics = [column for column, _, _ in metric_specs]
labels = [label for _, label, _ in metric_specs]
colors = [color for _, _, color in metric_specs]
# Each score is read from the first row of the metrics table.
values = [metrics_df[column].values[0] for column in metrics]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: one bar per metric on a fixed 0-1 axis
bars = ax1.bar(labels, values, color=colors, alpha=0.7, edgecolor='black')
ax1.set_ylabel('Score')
ax1.set_title('Model Performance Metrics', fontweight='bold')
ax1.set_ylim(0, 1.0)
ax1.grid(axis='y', alpha=0.3)

# Write each score slightly above the top of its bar
for bar, value in zip(bars, values):
    x_center = bar.get_x() + bar.get_width()/2.
    y_text = bar.get_height() + 0.02
    ax1.text(x_center, y_text, f'{value:.3f}',
             ha='center', va='bottom', fontweight='bold')

# Right panel: BLEU-1 through BLEU-4 as a line over the n-gram order
ngram_orders = range(1, 5)
bleu_values = values[:4]
bleu_labels = labels[:4]
ax2.plot(ngram_orders, bleu_values, marker='o', linewidth=2, markersize=10, color='steelblue')
ax2.set_xlabel('N-gram')
ax2.set_ylabel('BLEU Score')
ax2.set_title('BLEU Score Progression', fontweight='bold')
ax2.set_xticks(ngram_orders)
ax2.set_xticklabels(bleu_labels)
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Analyze Predictions

In [None]:
# Load the per-caption predictions written by the evaluation run
predictions_df = pd.read_csv(results_dir / 'cnn_lstm_predictions.csv')

print(f"\nTotal predictions: {len(predictions_df):,}")
print(f"\nFirst 5 predictions:")
print("="*80)
# head(5) yields at most five rows, so a results file with fewer than five
# predictions is previewed in full instead of raising IndexError.
preview_rows = predictions_df.head(5)
for position, (_, row) in enumerate(preview_rows.iterrows(), start=1):
    print(f"\n{position}. Reference: {row['reference']}")
    print(f"   Generated: {row['hypothesis']}")
print("="*80)

In [None]:
# Calculate caption lengths as whitespace-separated word counts.
# pandas reads an empty CSV field as NaN, which has no .split(); such captions
# are treated as empty strings so they count as zero words instead of raising.
for text_column, length_column in (('reference', 'ref_length'),
                                   ('hypothesis', 'hyp_length')):
    caption_text = predictions_df[text_column].fillna('').astype(str)
    predictions_df[length_column] = caption_text.str.split().str.len()

# Length comparison: histogram, scatter and box plot side by side
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Histogram comparison
axes[0].hist(predictions_df['ref_length'], bins=20, alpha=0.5, label='Reference', color='blue', edgecolor='black')
axes[0].hist(predictions_df['hyp_length'], bins=20, alpha=0.5, label='Generated', color='red', edgecolor='black')
axes[0].set_xlabel('Caption Length (words)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Caption Length Distribution')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Scatter plot. The y = x reference line spans the longest caption actually
# observed, so it neither stops short of long captions nor stretches the axes
# when every caption is short. `initial=1` keeps the call valid for an empty
# table.
longest_caption = int(
    predictions_df[['ref_length', 'hyp_length']].to_numpy().max(initial=1)
)
axes[1].scatter(predictions_df['ref_length'], predictions_df['hyp_length'], alpha=0.3, s=10)
axes[1].plot([0, longest_caption], [0, longest_caption], 'r--', label='Perfect match')
axes[1].set_xlabel('Reference Length')
axes[1].set_ylabel('Generated Length')
axes[1].set_title('Length Correlation')
axes[1].legend()
axes[1].grid(alpha=0.3)

# Box plot comparison. The tick labels are set on the axes afterwards because
# boxplot's `labels` keyword is deprecated in Matplotlib 3.9 (renamed to
# `tick_labels`); set_xticklabels works on both older and newer versions.
axes[2].boxplot([predictions_df['ref_length'], predictions_df['hyp_length']],
                patch_artist=True)
axes[2].set_xticklabels(['Reference', 'Generated'])
axes[2].set_ylabel('Caption Length (words)')
axes[2].set_title('Length Statistics')
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nLength Statistics:")
print(f"  Reference - Mean: {predictions_df['ref_length'].mean():.2f}, Std: {predictions_df['ref_length'].std():.2f}")
print(f"  Generated - Mean: {predictions_df['hyp_length'].mean():.2f}, Std: {predictions_df['hyp_length'].std():.2f}")

## 5. Qualitative Analysis - Best Predictions

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Calculate per-caption BLEU-4 scores (uniform 1- to 4-gram weights, NLTK
# smoothing method 1), each caption scored against its single reference.
smoothing = SmoothingFunction()

# Tokenise on whitespace. An empty CSV field is read back by pandas as NaN,
# which has no .split(); it is treated as an empty caption instead of raising.
reference_tokens = predictions_df['reference'].fillna('').astype(str).str.split()
hypothesis_tokens = predictions_df['hypothesis'].fillna('').astype(str).str.split()

caption_scores = [
    sentence_bleu(
        [ref_words],
        hyp_words,
        weights=(0.25, 0.25, 0.25, 0.25),
        smoothing_function=smoothing.method1
    )
    for ref_words, hyp_words in zip(reference_tokens, hypothesis_tokens)
]
# Assigned positionally as a float array so the column keeps a numeric dtype
# (required by nlargest/nsmallest) even when there are no predictions.
predictions_df['bleu4'] = np.asarray(caption_scores, dtype=float)

# Get best predictions
best_preds = predictions_df.nlargest(10, 'bleu4')

print("\n" + "="*80)
print("TOP 10 BEST PREDICTIONS (Highest BLEU-4 Scores)")
print("="*80)
for i, (idx, row) in enumerate(best_preds.iterrows(), 1):
    print(f"\n{i}. BLEU-4: {row['bleu4']:.4f}")
    print(f"   Reference: {row['reference']}")
    print(f"   Generated: {row['hypothesis']}")
print("="*80)

## 6. Qualitative Analysis - Worst Predictions

In [None]:
# The ten captions with the lowest per-caption BLEU-4
worst_preds = predictions_df.nsmallest(10, 'bleu4')

worst_banner = "=" * 80
print("\n" + worst_banner)
print("TOP 10 WORST PREDICTIONS (Lowest BLEU-4 Scores)")
print(worst_banner)
for rank, (_, entry) in enumerate(worst_preds.iterrows(), start=1):
    score = entry['bleu4']
    reference_text = entry['reference']
    generated_text = entry['hypothesis']
    print(f"\n{rank}. BLEU-4: {score:.4f}")
    print(f"   Reference: {reference_text}")
    print(f"   Generated: {generated_text}")
print(worst_banner)

In [None]:
# Distribution of per-caption BLEU-4: histogram with the mean marked, next to
# a horizontal box plot, followed by summary statistics.
bleu4_scores = predictions_df['bleu4']
bleu4_mean = bleu4_scores.mean()

bleu_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Histogram with a dashed vertical line at the mean
hist_ax.hist(bleu4_scores, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
hist_ax.axvline(bleu4_mean, color='red', linestyle='--', linewidth=2,
                label=f'Mean: {bleu4_mean:.3f}')
hist_ax.set_xlabel('BLEU-4 Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Per-Caption BLEU-4 Scores')
hist_ax.legend()
hist_ax.grid(alpha=0.3)

# Horizontal box plot of the same scores
bleu4_scores.plot(kind='box', vert=False, ax=box_ax)
box_ax.set_xlabel('BLEU-4 Score')
box_ax.set_title('BLEU-4 Score Distribution (Box Plot)')
box_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nBLEU-4 Statistics:")
bleu4_summary = (
    ('Mean', bleu4_mean),
    ('Median', bleu4_scores.median()),
    ('Std', bleu4_scores.std()),
    ('Min', bleu4_scores.min()),
    ('Max', bleu4_scores.max()),
)
for stat_name, stat_value in bleu4_summary:
    print(f"  {stat_name}: {stat_value:.4f}")

## 7. Error Analysis

In [None]:
# Categorize predictions by BLEU-4 score
def categorize_score(score):
    """Return the quality-band label for a BLEU-4 score.

    Bands are checked from the highest down and each lower bound is
    inclusive: 0.5 and above is 'Excellent', 0.3 and above is 'Good',
    0.15 and above is 'Fair'. Any score that reaches none of these bounds
    is labelled 'Poor'.
    """
    # (inclusive lower bound, label), ordered from best to worst
    bands = (
        (0.5, 'Excellent (≥0.5)'),
        (0.3, 'Good (0.3-0.5)'),
        (0.15, 'Fair (0.15-0.3)'),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Poor (<0.15)'

predictions_df['category'] = predictions_df['bleu4'].apply(categorize_score)

# value_counts() orders its rows by frequency, so a positional colour list
# would paint whichever category happens to be most common green. Each
# category is pinned to its own colour here, and the categories are shown in
# a fixed best-to-worst order. The keys are the labels categorize_score returns.
category_colors = {
    'Excellent (≥0.5)': 'green',
    'Good (0.3-0.5)': 'blue',
    'Fair (0.15-0.3)': 'orange',
    'Poor (<0.15)': 'red',
}

# Count by category; categories with no captions are kept with a count of 0
category_counts = (
    predictions_df['category']
    .value_counts()
    .reindex(list(category_colors), fill_value=0)
)
bar_colors = [category_colors[cat] for cat in category_counts.index]

# Visualize
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
category_counts.plot(kind='bar', color=bar_colors, alpha=0.7, edgecolor='black')
plt.xlabel('Quality Category')
plt.ylabel('Number of Predictions')
plt.title('Prediction Quality Distribution')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

plt.subplot(1, 2, 2)
# Empty categories are left out of the pie: a zero-width wedge only adds an
# overlapping label. With no predictions at all the pie is skipped.
nonzero_counts = category_counts[category_counts > 0]
if not nonzero_counts.empty:
    plt.pie(nonzero_counts.values, labels=nonzero_counts.index, autopct='%1.1f%%',
            colors=[category_colors[cat] for cat in nonzero_counts.index],
            startangle=90)
plt.title('Prediction Quality Percentage')

plt.tight_layout()
plt.show()

total_predictions = len(predictions_df)
print("\nPrediction Quality Breakdown:")
print("="*40)
for cat, count in category_counts.items():
    # Guard the percentage against an empty predictions table
    pct = count / total_predictions * 100 if total_predictions else 0.0
    print(f"{cat}: {count} ({pct:.1f}%)")
print("="*40)

## 8. Model Comparison (If Multiple Models Trained)

In [None]:
# Template for comparing several trained models side by side.
# The figures below are hard-coded example values that show the expected
# structure; replace them with your own results when more than one model
# has been trained.
comparison_columns = ['Model', 'BLEU-4', 'METEOR', 'Training Time (hrs)', 'Parameters (M)']
model_rows = [
    ('CNN-LSTM', 0.213, 0.198, 2, 28),
    ('CNN-Transformer', 0.247, 0.217, 6, 52),
    ('ViT-GPT2', 0.281, 0.239, 12, 124),
]

# Transpose the per-model rows into one list per column
comparison_data = {
    column: list(column_values)
    for column, column_values in zip(comparison_columns, zip(*model_rows))
}

comparison_df = pd.DataFrame(comparison_data)

comparison_rule = "=" * 70
print("\nModel Comparison:")
print(comparison_rule)
print(comparison_df.to_string(index=False))
print(comparison_rule)

In [None]:
# Visualize comparison: grouped metric bars on the left, BLEU-4 against
# training time on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
metric_ax, cost_ax = axes

# Metric comparison: two bars per model, shifted half a bar width to either
# side of the model's tick position
x = np.arange(len(comparison_df))
width = 0.35

for offset, metric_name in ((-width/2, 'BLEU-4'), (width/2, 'METEOR')):
    metric_ax.bar(x + offset, comparison_df[metric_name], width, label=metric_name, alpha=0.7)
metric_ax.set_xlabel('Model')
metric_ax.set_ylabel('Score')
metric_ax.set_title('Performance Comparison')
metric_ax.set_xticks(x)
metric_ax.set_xticklabels(comparison_df['Model'])
metric_ax.legend()
metric_ax.grid(axis='y', alpha=0.3)

# Performance vs Training Time: marker area scales with the parameter count
training_hours = comparison_df['Training Time (hrs)']
bleu4_by_model = comparison_df['BLEU-4']
cost_ax.scatter(training_hours, bleu4_by_model,
                s=comparison_df['Parameters (M)'] * 3, alpha=0.6, c=['blue', 'green', 'red'])
cost_ax.set_xlabel('Training Time (hours)')
cost_ax.set_ylabel('BLEU-4 Score')
cost_ax.set_title('Performance vs Training Time\n(bubble size = model parameters)')
# Label each bubble with its model name, nudged up and to the right
for model_name, hours, score in zip(comparison_df['Model'], training_hours, bleu4_by_model):
    cost_ax.annotate(model_name, (hours, score),
                     xytext=(5, 5), textcoords='offset points')
cost_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Summary Report

In [None]:
# Final text report assembled from values computed in the earlier cells.
summary_rule = "=" * 80
print("\n" + summary_rule)
print("EVALUATION SUMMARY REPORT")
print(summary_rule)

print(f"\nModel: CNN-LSTM (ResNet50 + 2-layer LSTM)")
print(f"Checkpoint: {checkpoint_path}")
print(f"Training Epochs: {epoch}")

# 1. Corpus-level scores, read from the first row of the metrics table
print(f"\n1. Quantitative Results:")
for metric_label, metric_column in (('BLEU-1', 'bleu1'), ('BLEU-2', 'bleu2'),
                                    ('BLEU-3', 'bleu3'), ('BLEU-4', 'bleu4'),
                                    ('METEOR', 'meteor')):
    print(f"   {metric_label}: {metrics_df[metric_column].values[0]:.4f}")

# 2. Share of captions in each quality band
total_captions = len(predictions_df)
print(f"\n2. Caption Quality Distribution:")
for cat, count in category_counts.items():
    print(f"   {cat}: {count} captions ({count / total_captions * 100:.1f}%)")

# 3. Mean and standard deviation of caption length, in words
ref_lengths = predictions_df['ref_length']
hyp_lengths = predictions_df['hyp_length']
print(f"\n3. Caption Length Analysis:")
print(f"   Reference captions: {ref_lengths.mean():.2f} ± {ref_lengths.std():.2f} words")
print(f"   Generated captions: {hyp_lengths.mean():.2f} ± {hyp_lengths.std():.2f} words")

# 4. The single caption with the highest per-caption BLEU-4
caption_bleu4 = predictions_df['bleu4']
print(f"\n4. Best Prediction (BLEU-4: {caption_bleu4.max():.4f}):")
best = predictions_df.loc[caption_bleu4.idxmax()]
print(f"   Reference: {best['reference']}")
print(f"   Generated: {best['hypothesis']}")

# 5. Headline takeaways
# NOTE(review): the wording "competitive", "closely matches" and "generalizes
# well" is fixed text and is printed whatever the numbers are - revisit it
# once real results are in.
good_or_better = (caption_bleu4 >= 0.3).sum()
mean_length_gap = abs(hyp_lengths.mean() - ref_lengths.mean())
print(f"\n5. Key Insights:")
print(f"   - Model achieves competitive BLEU-4 score of {metrics_df['bleu4'].values[0]:.3f}")
print(f"   - {good_or_better} captions ({good_or_better / total_captions * 100:.1f}%) rated as 'Good' or better")
print(f"   - Caption length closely matches reference (±{mean_length_gap:.2f} words)")
print(f"   - Model generalizes well to test set")

print(summary_rule)