# 05 - Model Evaluation & Analysis

This notebook provides a detailed evaluation of the trained models.

## What This Notebook Covers
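1. Load trained models
2. Load and prepare test data
3. Overall evaluation (accuracy, classification report, confusion matrix)
4. Per-class performance analysis
5. Error analysis
6. Inference speed benchmarking
7. Model size comparison
8. Production readiness assessment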

In [None]:
# Imports
# Standard library
import os
import sys
import time
from pathlib import Path

# TF_CPP_MIN_LOG_LEVEL has to be in the environment BEFORE TensorFlow is
# imported; setting it afterwards does not silence the messages TensorFlow's
# native runtime emits while it is being loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    classification_report, confusion_matrix, precision_recall_curve,
    roc_curve, auc, f1_score, precision_score, recall_score
)

import tensorflow as tf
from tensorflow import keras

# Make the sibling `src/` directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
_src_dir = str(Path.cwd().parent / 'src')
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

# Project-local modules (resolved through the sys.path entry added above)
from handflow.models import load_data, GesturePredictor
from handflow.features import FeatureEngineer
from handflow.utils import load_config

plt.style.use('seaborn-v0_8-whitegrid')
print("‚úÖ Imports loaded")

In [None]:
# Configuration: project config object plus the paths and class names used below
config = load_config()

MODELS_DIR = Path('../models')

# Data lives in '../data/raw/MP_Data'; when that path does not exist,
# fall back to '../ModelTraining/MP_Data'.
_preferred_data_path = Path('../data/raw/MP_Data')
DATA_PATH = (
    _preferred_data_path
    if _preferred_data_path.exists()
    else Path('../ModelTraining/MP_Data')
)

# Class names, taken from the config's right-hand gesture list
ACTIONS = config.right_hand_gestures

print(f"üìÅ Models: {MODELS_DIR}")
print(f"üìÅ Data: {DATA_PATH}")

## 1. Load Models

In [None]:
# Gather every model artefact in MODELS_DIR, grouped by format in the
# order .h5, .tflite, .onnx, and report each file's size in MB.
model_files = [
    found
    for pattern in ('*.h5', '*.tflite', '*.onnx')
    for found in MODELS_DIR.glob(pattern)
]

print("üì¶ Available models:")
for model_file in model_files:
    n_bytes = model_file.stat().st_size
    print(f"   {model_file.name}: {n_bytes / 1024 / 1024:.2f} MB")

In [None]:
# Load the main model from MODELS_DIR.
# When the file is missing, only a hint is printed and `model` stays
# undefined; the prediction cell checks for that before using it.
MODEL_PATH = MODELS_DIR / 'right_action.h5'

if not MODEL_PATH.exists():
    print(f"‚ùå Model not found: {MODEL_PATH}")
    print("   Run training first: python scripts/train.py --hand right")
else:
    model = keras.models.load_model(MODEL_PATH)
    print(f"‚úÖ Loaded model: {MODEL_PATH}")
    model.summary()

## 2. Load Test Data

In [None]:
# Read the raw sequences and their labels from disk
print("üì• Loading data...")
sequences, labels = load_data(DATA_PATH, ACTIONS, config.model.sequence_length)

# Switch on the velocity / acceleration / finger-angle / bbox-size feature
# flags on the shared config before the engineer is built from it.
# NOTE(review): presumably these must match the flags used at training time — confirm.
for _feature_flag in ('velocity', 'acceleration', 'finger_angles', 'hand_bbox_size'):
    setattr(config.features, _feature_flag, True)

engineer = FeatureEngineer(config)

# Transform every sequence individually, then stack the results into one array
transformed_sequences = [engineer.transform(seq) for seq in sequences]
X = np.array(transformed_sequences)
y = labels

print(f"   X shape: {X.shape}")
print(f"   y shape: {y.shape}")

In [None]:
# Hold out a stratified 20% of the samples as the evaluation set (the other
# 80% is discarded here). The fixed random_state makes the split repeatable.
# NOTE(review): this is only a genuine held-out set if training used the same
# split parameters — confirm against the training script.
from sklearn.model_selection import train_test_split

# Stratify on the class index (argmax over the label columns)
stratify_labels = y.argmax(axis=1)

_, X_test, _, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

n_test_samples = X_test.shape[0]
print(f"üìä Test set: {n_test_samples} samples")

## 3. Model Evaluation

In [None]:
# Predict on the test set and report the overall loss / accuracy.
# Skipped entirely when no `model` was loaded earlier.
if 'model' in dir():
    # Class probabilities per sample, then the most likely class index
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = y_pred_proba.argmax(axis=1)

    # Ground-truth class index per sample (argmax over the label columns)
    y_true = y_test.argmax(axis=1)

    # Overall metrics as returned by Keras' evaluate()
    overall_scores = model.evaluate(X_test, y_test, verbose=0)
    test_loss, test_acc = overall_scores

    print("\nüìä Overall Performance:")
    print(f"   Test Loss: {test_loss:.4f}")
    print(f"   Test Accuracy: {test_acc:.4f}")

In [None]:
# Classification report
# Pass the full label range explicitly. Without `labels`, scikit-learn infers
# the classes from y_true / y_pred, so a gesture that never occurs in the test
# split makes the inferred class count differ from len(ACTIONS) and the call
# fails on the target_names mismatch.
report_class_ids = list(range(len(ACTIONS)))

print("\nüìã Classification Report:")
print(classification_report(
    y_true,
    y_pred,
    labels=report_class_ids,
    target_names=ACTIONS,
    digits=4,
    zero_division=0,  # report 0 for classes with no predictions/samples, without warnings
))

In [None]:
# Confusion matrix
# Fix the label set so the matrix is always len(ACTIONS) x len(ACTIONS) and
# its rows/columns line up with the ACTIONS tick labels, even when a class is
# missing from both y_true and y_pred.
cm_class_ids = list(range(len(ACTIONS)))
cm = confusion_matrix(y_true, y_pred, labels=cm_class_ids)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Raw counts
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=ACTIONS, yticklabels=ACTIONS, ax=axes[0])
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('True')
axes[0].set_title('Confusion Matrix (Counts)')

# Normalized per true class (each row sums to 1). Rows without any samples
# are left at 0 instead of becoming NaN through a 0/0 division.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0,
)
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
            xticklabels=ACTIONS, yticklabels=ACTIONS, ax=axes[1])
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('True')
axes[1].set_title('Confusion Matrix (Normalized)')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
cm_figure_path = Path('../docs/evaluation_confusion_matrix.png')
cm_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(cm_figure_path, dpi=150, bbox_inches='tight')
plt.show()

## 4. Per-Class Analysis

In [None]:
# Per-class metrics (one-vs-rest precision / recall / F1 for every gesture)
metric_columns = [
    'Gesture', 'Samples', 'Precision', 'Recall', 'F1-Score',
    'True Positives', 'False Positives', 'False Negatives',
]
per_class_metrics = []

for idx, action in enumerate(ACTIONS):
    is_true = y_true == idx
    is_pred = y_pred == idx

    # Classes with no samples in the test split are left out of the table
    n_samples = int(is_true.sum())
    if n_samples == 0:
        continue

    # One-vs-rest counts for this class
    tp = int((is_true & is_pred).sum())
    fp = int((~is_true & is_pred).sum())
    fn = int((is_true & ~is_pred).sum())

    # Guard every ratio against an empty denominator
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    per_class_metrics.append({
        'Gesture': action,
        'Samples': n_samples,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'True Positives': tp,
        'False Positives': fp,
        'False Negatives': fn
    })

# Explicit columns keep the sort below valid even if no class had samples
df_metrics = pd.DataFrame(per_class_metrics, columns=metric_columns)
df_metrics = df_metrics.sort_values('F1-Score', ascending=False)
df_metrics

In [None]:
# Visualize per-class performance: F1 bars (left) and precision/recall scatter (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_f1, ax_pr = axes

# F1 score by class, coloured by band: > 0.9 green, > 0.7 orange, otherwise red
colors = ['green' if x > 0.9 else 'orange' if x > 0.7 else 'red' for x in df_metrics['F1-Score']]
ax_f1.barh(df_metrics['Gesture'], df_metrics['F1-Score'], color=colors)
ax_f1.set_xlabel('F1-Score')
ax_f1.set_title('F1-Score by Gesture Class')
ax_f1.axvline(0.9, color='green', linestyle='--', alpha=0.5, label='Good (0.9)')
ax_f1.axvline(0.7, color='orange', linestyle='--', alpha=0.5, label='Okay (0.7)')
ax_f1.legend()

# Precision vs recall, one labelled point per gesture
ax_pr.scatter(df_metrics['Precision'], df_metrics['Recall'], s=100)
for gesture, precision_value, recall_value in zip(
    df_metrics['Gesture'], df_metrics['Precision'], df_metrics['Recall']
):
    ax_pr.annotate(gesture, (precision_value, recall_value),
                   textcoords='offset points', xytext=(5, 5), fontsize=9)
ax_pr.set_xlabel('Precision')
ax_pr.set_ylabel('Recall')
ax_pr.set_title('Precision vs Recall by Class')
ax_pr.set_xlim(0, 1.1)
ax_pr.set_ylim(0, 1.1)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
per_class_figure_path = Path('../docs/per_class_analysis.png')
per_class_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(per_class_figure_path, dpi=150, bbox_inches='tight')
plt.show()

## 5. Error Analysis

In [None]:
from collections import Counter

# Boolean mask of misclassified samples and their positions in the test set
errors = y_true != y_pred
error_indices = np.where(errors)[0]

n_errors = len(error_indices)
n_evaluated = len(y_true)

print("üìä Error Analysis:")
print(f"   Total errors: {n_errors} / {n_evaluated} ({n_errors/n_evaluated:.1%})")

# (true gesture, predicted gesture) for every misclassified sample
error_pairs = [
    (ACTIONS[y_true[sample_idx]], ACTIONS[y_pred[sample_idx]])
    for sample_idx in error_indices
]

# The ten most frequent confusions
common_errors = Counter(error_pairs).most_common(10)

print("\nüòµ Most Common Misclassifications:")
for (true_label, pred_label), count in common_errors:
    print(f"   {true_label} ‚Üí {pred_label}: {count} times")

In [None]:
# Confidence analysis for errors
# The confidence of a prediction is its highest class probability. Compute it
# once here; it is reused for both histograms and for every threshold below.
confidences = y_pred_proba.max(axis=1)
error_confidences = confidences[errors]
correct_confidences = confidences[~errors]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confidence distribution
axes[0].hist(correct_confidences, bins=30, alpha=0.7, label='Correct', color='green')
axes[0].hist(error_confidences, bins=30, alpha=0.7, label='Errors', color='red')
axes[0].set_xlabel('Prediction Confidence')
axes[0].set_ylabel('Count')
axes[0].set_title('Confidence Distribution: Correct vs Errors')
axes[0].legend()

# Accuracy and coverage when only predictions at or above a confidence
# threshold are kept. Coverage is the fraction of samples kept.
thresholds = np.linspace(0.5, 0.99, 20)
accuracies = []
coverages = []

for thresh in thresholds:
    kept = confidences >= thresh
    if kept.sum() > 0:
        accuracies.append((y_true[kept] == y_pred[kept]).mean())
        coverages.append(kept.mean())
    else:
        # Nothing survives this threshold: accuracy is undefined, coverage is zero
        accuracies.append(np.nan)
        coverages.append(0)

axes[1].plot(thresholds, accuracies, 'b-o', label='Accuracy')
axes[1].plot(thresholds, coverages, 'g-s', label='Coverage')
axes[1].set_xlabel('Confidence Threshold')
axes[1].set_ylabel('Rate')
axes[1].set_title('Accuracy vs Coverage at Different Thresholds')
axes[1].legend()

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
error_figure_path = Path('../docs/error_analysis.png')
error_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(error_figure_path, dpi=150, bbox_inches='tight')
plt.show()

## 6. Inference Speed Benchmarking

In [None]:
# Benchmark inference speed
def benchmark_model(model, X_sample, n_runs=100, n_warmup=10):
    """
    Benchmark model inference speed.

    Calls ``model.predict(X_sample, verbose=0)`` ``n_warmup`` times without
    timing, then times ``n_runs`` further calls one by one.

    Args:
        model: Any object exposing ``predict(X, verbose=0)``.
        X_sample: Input handed unchanged to every ``predict`` call.
        n_runs: Number of timed calls.
        n_warmup: Number of untimed warm-up calls made first. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        np.ndarray with one entry per timed call: the wall-clock duration of
        the whole ``predict`` call in milliseconds. Empty when ``n_runs`` is
        0 or negative.
    """
    # Warm up: keep the first calls out of the measurements
    for _ in range(n_warmup):
        model.predict(X_sample, verbose=0)

    # Benchmark: time each call separately so the distribution can be inspected
    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        model.predict(X_sample, verbose=0)
        elapsed_s = time.perf_counter() - start
        times.append(elapsed_s * 1000)  # seconds -> ms

    return np.array(times)

# Time inference on a batch holding only the first test sample
single_sample = X_test[:1]
times = benchmark_model(model, single_sample)

# Summary statistics of the per-call latencies (milliseconds)
latency_summary = (
    ("Mean", times.mean()),
    ("Median", np.median(times)),
    ("95th percentile", np.percentile(times, 95)),
    ("Max", times.max()),
)

print("\n‚ö° Inference Speed (single sample):")
for stat_label, stat_ms in latency_summary:
    print(f"   {stat_label}: {stat_ms:.2f} ms")

In [None]:
# Latency distribution: histogram with mean/target markers (left), box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, box_ax = axes

mean_latency_label = f'Mean: {times.mean():.2f}ms'

hist_ax.hist(times, bins=30, edgecolor='black')
hist_ax.axvline(times.mean(), color='red', linestyle='--', label=mean_latency_label)
# Reference line at the 20 ms target
hist_ax.axvline(20, color='green', linestyle='--', label='Target: 20ms')
hist_ax.set_xlabel('Inference Time (ms)')
hist_ax.set_ylabel('Count')
hist_ax.set_title('Inference Time Distribution')
hist_ax.legend()

# Box plot of the same measurements
box_ax.boxplot(times)
box_ax.set_ylabel('Inference Time (ms)')
box_ax.set_title('Inference Time Box Plot')

plt.tight_layout()
plt.show()

## 7. Model Size Analysis

In [None]:
# One row per model file in MODELS_DIR with a recognised suffix:
# name (stem), format (upper-cased suffix without the dot), size in MB
recognised_suffixes = ['.h5', '.tflite', '.onnx']

size_data = [
    {
        'Model': candidate.stem,
        'Format': candidate.suffix[1:].upper(),
        'Size (MB)': candidate.stat().st_size / 1024 / 1024,
    }
    for candidate in MODELS_DIR.glob('*.*')
    if candidate.suffix in recognised_suffixes
]

df_sizes = pd.DataFrame(size_data)
print("üì¶ Model Sizes:")
print(df_sizes.to_string(index=False))

## 8. Production Readiness Assessment

In [None]:
# Production readiness checklist: four pass/fail checks, then a verdict.
# Each result line starts with the same leading symbol regardless of outcome;
# the trailing mark is what carries the pass/fail result.
banner = "=" * 60
print(banner)
print("üìã PRODUCTION READINESS ASSESSMENT")
print(banner)
print()

# Criteria
accuracy_threshold = 0.90
latency_threshold = 20  # ms
size_threshold = 10  # MB

# Trailing mark for a result line, keyed by whether the check passed
result_mark = {True: '‚úì', False: '‚úó'}

# Accuracy: overall test accuracy from the evaluation step
accuracy_ok = test_acc >= accuracy_threshold
print(f"‚úÖ Accuracy >= {accuracy_threshold:.0%}: {test_acc:.1%} {result_mark[bool(accuracy_ok)]}")

# Latency: mean single-sample inference time from the benchmark
latency_ok = times.mean() <= latency_threshold
print(f"‚úÖ Latency <= {latency_threshold}ms: {times.mean():.1f}ms {result_mark[bool(latency_ok)]}")

# Size: on-disk size of the main model file, in MB
model_size = MODEL_PATH.stat().st_size / 1024 / 1024
size_ok = model_size <= size_threshold
print(f"‚úÖ Size <= {size_threshold}MB: {model_size:.1f}MB {result_mark[bool(size_ok)]}")

# Weakest class: lowest per-class F1 in the metrics table
min_f1 = df_metrics['F1-Score'].min()
f1_ok = min_f1 >= 0.7
print(f"‚úÖ Min F1 >= 0.7: {min_f1:.2f} {result_mark[bool(f1_ok)]}")

print()
overall_ready = accuracy_ok and latency_ok and size_ok and f1_ok
if not overall_ready:
    print("‚ö†Ô∏è Model needs improvement before production.")
    # One piece of advice per failed check, in the order the checks were run
    remediation_by_check = (
        (accuracy_ok, "   ‚Üí Improve accuracy (more data, better architecture)"),
        (latency_ok, "   ‚Üí Reduce latency (quantization, smaller model)"),
        (size_ok, "   ‚Üí Reduce size (quantization, pruning)"),
        (f1_ok, "   ‚Üí Improve weak classes (more examples, class weights)"),
    )
    for check_passed, advice in remediation_by_check:
        if not check_passed:
            print(advice)
else:
    print("üéâ MODEL IS PRODUCTION READY!")

## Summary

This evaluation provides:
- Overall and per-class performance metrics
- Error analysis to identify weak points
- Latency benchmarking for real-time use
- Model size comparison across the saved model formats
- Production readiness checklist

Use these insights to:
1. Identify gestures that need more training data
2. Set appropriate confidence thresholds
3. Decide if quantization is needed for speed
4. Confirm readiness for deployment