In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot appearance: seaborn's white-grid theme plus larger default
# figure and font sizes so the charts stay legible on slides.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Signed prediction errors (prediction - actual): 100 evenly spaced values
# covering [-5, 5].
errors = np.linspace(-5, 5, num=100)

# Penalty assigned to each error on its own (no averaging across samples).
mse_per_point = np.square(errors)    # squared error, e^2
mae_per_point = np.absolute(errors)  # absolute error, |e|

# RMSE is an aggregate metric: sqrt(mean(squared errors)). For one isolated
# error it reduces to sqrt(e^2) = |e|, which is exactly the MAE penalty, so the
# two metrics can only diverge once several errors are averaged together.

# 2x2 grid holding the four comparison panels
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: per-sample penalty of each loss function
ax1 = axes[0, 0]
ax1.plot(errors, mse_per_point, label='MSE (Squared Error)', linewidth=2.5, color='#e74c3c')
ax1.plot(errors, mae_per_point, label='MAE (Absolute Error)', linewidth=2.5, color='#2ecc71')
# Show the true single-error RMSE instead of an arbitrarily scaled line.
# It coincides with the MAE curve, so it is dashed to keep both visible.
ax1.plot(errors, np.sqrt(mse_per_point), label='RMSE of a single error (= |e|)', linewidth=2.5,
         color='#3498db', linestyle='--')
ax1.set_xlabel('Error (Prediction - Actual)', fontsize=12)
ax1.set_ylabel('Loss Contribution', fontsize=12)
ax1.set_title('Loss Functions: Error Contribution Comparison', fontsize=14, fontweight='bold')
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# Plot 2: derivative of each per-sample loss with respect to the error
ax2 = axes[0, 1]
mse_gradient = 2 * errors       # d(e^2)/de = 2e
mae_gradient = np.sign(errors)  # d|e|/de = sign(e)
for _curve, _colour, _name in (
    (mse_gradient, '#e74c3c', 'MSE gradient'),
    (mae_gradient, '#2ecc71', 'MAE gradient'),
):
    ax2.plot(errors, _curve, linewidth=3, color=_colour, label=_name)
ax2.set_xlabel('Error', fontsize=12)
ax2.set_ylabel('Gradient (penalty rate)', fontsize=12)
ax2.set_title('Loss Function Gradients', fontsize=14, fontweight='bold')
ax2.legend(fontsize=11)
ax2.grid(True, alpha=0.3)
# Annotation centred on error = 0, placed at 70% of the current upper y-limit
# (read after plotting so the limit reflects the drawn curves).
ax2.text(
    0,
    ax2.get_ylim()[1] * 0.7,
    'MSE gradient increases with error\nMAE gradient is constant',
    ha='center',
    fontsize=10,
    style='italic',
    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3),
)

# Plot 3: aggregate metrics evaluated on three synthetic error samples
ax3 = axes[1, 0]
np.random.seed(42)  # fixed seed so the bars are reproducible
scenario_labels = ['Few large\nerrors', 'Many small\nerrors', 'Mixed\nerrors']

# The draw order below (scenario 1's two draws, then 2, then 3) determines
# the sampled values, so it must not be rearranged.
# Scenario 1: 40 errors with sd 0.5 plus 10 errors with sd 3
errors1 = np.concatenate([np.random.normal(0, 0.5, 40), np.random.normal(0, 3, 10)])
# Scenario 2: 50 errors with sd 0.8
errors2 = np.random.normal(0, 0.8, 50)
# Scenario 3: 50 errors with sd 1.5
errors3 = np.random.normal(0, 1.5, 50)

# One value per scenario for each metric; RMSE is the square root of the MSE.
mse_vals = [np.mean(sample ** 2) for sample in (errors1, errors2, errors3)]
rmse_vals = [np.sqrt(mse) for mse in mse_vals]
mae_vals = [np.mean(np.abs(sample)) for sample in (errors1, errors2, errors3)]

# Grouped bars: three bars per scenario, shifted by one bar width each.
x = np.arange(len(scenario_labels))
width = 0.25
for _shift, _heights, _name, _colour in (
    (-width, mse_vals, 'MSE', '#e74c3c'),
    (0, rmse_vals, 'RMSE', '#3498db'),
    (width, mae_vals, 'MAE', '#2ecc71'),
):
    ax3.bar(x + _shift, _heights, width, label=_name, color=_colour, alpha=0.8)
ax3.set_xlabel('Error Pattern', fontsize=12)
ax3.set_ylabel('Loss Value', fontsize=12)
ax3.set_title('Loss Metrics on Different Error Patterns', fontsize=14, fontweight='bold')
ax3.set_xticks(x)
ax3.set_xticklabels(scenario_labels)
ax3.legend(fontsize=11)
ax3.grid(True, alpha=0.3, axis='y')

# Plot 4: RMSE and MAE recomputed on growing prefixes of a single error sample
ax4 = axes[1, 1]
sample_sizes = np.arange(1, 51)
cumulative_rmse = []
cumulative_mae = []

# 50 errors with sd 2; each metric is evaluated on the first n of them.
test_errors = np.random.normal(0, 2, 50)
for n in sample_sizes:
    prefix = test_errors[:n]
    cumulative_rmse.append(np.sqrt(np.mean(prefix ** 2)))
    cumulative_mae.append(np.mean(np.abs(prefix)))

for _series, _colour, _name in (
    (cumulative_rmse, '#3498db', 'RMSE'),
    (cumulative_mae, '#2ecc71', 'MAE'),
):
    ax4.plot(sample_sizes, _series, linewidth=3, color=_colour, label=_name)
ax4.set_xlabel('Number of samples', fontsize=12)
ax4.set_ylabel('Loss Value', fontsize=12)
ax4.set_title('RMSE vs MAE: Cumulative Behavior', fontsize=14, fontweight='bold')
ax4.legend(fontsize=11)
ax4.grid(True, alpha=0.3)
# Note centred at x = 25, placed at 80% of the current upper y-limit.
ax4.text(
    25,
    ax4.get_ylim()[1] * 0.8,
    'RMSE > MAE when large errors exist',
    ha='center',
    fontsize=10,
    style='italic',
    bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3),
)

# Finalise the 2x2 figure: tidy spacing, write a 300-dpi PNG, then display.
plt.tight_layout()
plt.savefig('loss_functions_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Loss Functions Comparison: MSE, RMSE, and MAE

This presentation notebook provides comprehensive visualizations comparing three fundamental regression loss functions.

## Comprehensive Visualization: Multiple Perspectives

This visualization shows:
1. Error contribution curves for different loss functions
2. Gradient comparison (how quickly loss changes with error)
3. Performance on different error patterns
4. Cumulative behavior over samples

## Simplified Presentation View

A clean, focused visualization showing how **individual errors** are penalized differently by MSE vs MAE.

**Key insight:** MSE squares errors, so large errors contribute quadratically more to the loss (doubling an error quadruples its contribution).

## Summary: When to Use Each Loss Function

### MSE (Mean Squared Error)
- **Formula**: $\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
- **Characteristics**: Squares the errors, heavily penalizing large errors
- **When to use**: When large errors are particularly undesirable (e.g., safety-critical applications)
- **Disadvantage**: Not in the same units as the original data (squared units)

### RMSE (Root Mean Squared Error)
- **Formula**: $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$
- **Characteristics**: Square root of MSE, returns error to original scale
- **When to use**: When you want MSE's outlier sensitivity but in interpretable units
- **Advantage**: Same units as the target variable, easier to interpret

### MAE (Mean Absolute Error)
- **Formula**: $\frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i|$
- **Characteristics**: Linear penalty for all errors
- **When to use**: When all errors should be weighted equally, or when outliers shouldn't dominate
- **Advantage**: More robust to outliers than MSE/RMSE

### Key Takeaway
**RMSE ≥ MAE** always, with equality only when all errors have the same magnitude. The gap between them indicates the presence of large errors/outliers.

In [None]:
# Simple, clean plot for presentation: the penalty each loss assigns to a
# single error (per-sample contribution, not an average over a dataset).
fig, ax = plt.subplots(figsize=(12, 7))

errors_range = np.linspace(-5, 5, 100)

# Per-sample penalty curves: quadratic for MSE, linear for MAE
sns.lineplot(x=errors_range, y=errors_range**2, label='MSE: Squared Error (e²)',
             linewidth=3, ax=ax, color='#e74c3c')
sns.lineplot(x=errors_range, y=np.abs(errors_range), label='MAE: Absolute Error (|e|)',
             linewidth=3, ax=ax, color='#2ecc71')

ax.set_xlabel('Error (Prediction - Actual)', fontsize=14, fontweight='bold')
ax.set_ylabel('Loss Contribution per Sample', fontsize=14, fontweight='bold')
ax.set_title('MSE vs MAE: How Individual Errors are Penalized', fontsize=16, fontweight='bold', pad=20)
ax.legend(fontsize=12, loc='upper center')
# Faint dashed reference lines through the origin
ax.axvline(x=0, color='black', linestyle='--', alpha=0.3, linewidth=1)
ax.axhline(y=0, color='black', linestyle='--', alpha=0.3, linewidth=1)

# Arrows point at the curves at error = 3: (3, 9) on MSE and (3, 3) on MAE
ax.annotate('Large errors penalized\nmuch more heavily', xy=(3, 9), xytext=(1.5, 15),
            arrowprops=dict(arrowstyle='->', color='#e74c3c', lw=2),
            fontsize=11, color='#e74c3c', fontweight='bold',
            bbox=dict(boxstyle='round', facecolor='white', edgecolor='#e74c3c', alpha=0.8))

ax.annotate('Linear penalty\nfor all errors', xy=(3, 3), xytext=(3.5, 7),
            arrowprops=dict(arrowstyle='->', color='#2ecc71', lw=2),
            fontsize=11, color='#2ecc71', fontweight='bold',
            bbox=dict(boxstyle='round', facecolor='white', edgecolor='#2ecc71', alpha=0.8))

# Text box explaining RMSE. RMSE = sqrt(MSE) is never below MAE, but it is
# smaller than MSE only when MSE > 1 (sqrt(x) < x requires x > 1), so it does
# not always lie between the two.
textstr = ('Note: RMSE = √(mean of squared errors over all samples)\n'
           'RMSE ≥ MAE always; RMSE < MSE only when MSE > 1\n'
           'See cell 3 for RMSE comparison!')
props = dict(boxstyle='round', facecolor='lightblue', alpha=0.3)
# Axes-fraction coordinates pin the box to the top-left corner of the plot
ax.text(0.02, 0.98, textstr, transform=ax.transAxes, fontsize=10,
        verticalalignment='top', bbox=props, style='italic')

plt.tight_layout()
plt.savefig('loss_functions_simple.png', dpi=300, bbox_inches='tight')
plt.show()

# Console note explaining why the plot above has no RMSE curve.
# The bullet and superscript characters were mis-encoded and are restored here.
print("\n" + "="*60)
print("Why only MSE and MAE are shown on this plot:")
print("="*60)
print("• This plot shows loss contribution for INDIVIDUAL errors")
print("• For a single error e:")
print("  - MSE contribution = e²")
print("  - MAE contribution = |e|")
print("  - RMSE is only meaningful when averaged over multiple errors!")
print("\n• RMSE = sqrt(mean(e²)), so it requires averaging first")
print("• See Cell 3 for RMSE vs MAE comparison with real data!")
print("="*60)

In [None]:
# Demonstration: why RMSE ≠ MAE (the key difference for students!)
import pandas as pd

print("=" * 70)
print("KEY CONCEPT: RMSE vs MAE difference appears when AVERAGING errors")
print("=" * 70)

# Example 1: two error sets with the same MAE but a different spread
print("\n📊 Example 1: Two models with different error patterns\n")

# Model A: the same small error on every prediction
errors_model_a = np.array([1, 1, 1, 1, 1])
# Model B: four perfect predictions, then a single large miss
errors_model_b = np.array([0, 0, 0, 0, 5])

# Print the same three-metric report for both models. Model B's heading
# carries a leading newline so the two reports are separated by a blank line.
for _heading, _model_errors in (
    ("Model A errors", errors_model_a),
    ("\nModel B errors", errors_model_b),
):
    _mse = np.mean(_model_errors**2)
    print(f"{_heading}: {_model_errors}")
    print(f"  MAE  = mean(|errors|) = {np.mean(np.abs(_model_errors)):.3f}")
    print(f"  MSE  = mean(errors²)  = {_mse:.3f}")
    print(f"  RMSE = sqrt(MSE)      = {np.sqrt(_mse):.3f}")

print("\n💡 Notice: Both have same MAE (1.0), but RMSE is much higher for Model B!")
print("   This is because RMSE penalizes the large error (5) much more heavily.")

# Three-panel figure for Example 1: raw errors, metric comparison, and a
# text panel with the formulas.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Plot 1: the individual errors of both models, side by side
ax1 = axes[0]
x_pos = np.arange(5)
width = 0.35
ax1.bar(x_pos - width/2, errors_model_a, width, label='Model A', color='#3498db', alpha=0.8)
ax1.bar(x_pos + width/2, errors_model_b, width, label='Model B', color='#e74c3c', alpha=0.8)
ax1.set_xlabel('Prediction Number', fontsize=12)
ax1.set_ylabel('Error Value', fontsize=12)
ax1.set_title('Error Distribution', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(True, alpha=0.3, axis='y')

# Plot 2: MAE / RMSE / MSE of each model (list order matches `metrics`)
ax2 = axes[1]
metrics = ['MAE', 'RMSE', 'MSE']
model_a_metrics = [
    np.mean(np.abs(errors_model_a)),
    np.sqrt(np.mean(errors_model_a**2)),
    np.mean(errors_model_a**2)
]
model_b_metrics = [
    np.mean(np.abs(errors_model_b)),
    np.sqrt(np.mean(errors_model_b**2)),
    np.mean(errors_model_b**2)
]

x_pos = np.arange(len(metrics))
width = 0.35
bars1 = ax2.bar(x_pos - width/2, model_a_metrics, width, label='Model A (consistent)',
                color='#3498db', alpha=0.8)
bars2 = ax2.bar(x_pos + width/2, model_b_metrics, width, label='Model B (1 outlier)',
                color='#e74c3c', alpha=0.8)

# Write each bar's value just above it, horizontally centred on the bar
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.2f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

ax2.set_ylabel('Loss Value', fontsize=12)
ax2.set_title('Loss Metrics Comparison', fontsize=14, fontweight='bold')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(metrics)
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

# Plot 3: text-only panel (axes hidden) with the formulas and the worked
# numbers for both models. The math symbols were mis-encoded and are restored.
ax3 = axes[2]
ax3.axis('off')
explanation = """
Why RMSE ≠ MAE:

MAE Formula:
MAE = (|e₁| + |e₂| + ... + |eₙ|) / n

RMSE Formula:
RMSE = √[(e₁² + e₂² + ... + eₙ²) / n]

Key Difference:
• Squaring before averaging emphasizes
  larger errors
• Square root brings back to original
  scale, but the emphasis remains

Model A: (1²+1²+1²+1²+1²)/5 = 1.0
         √1.0 = 1.0

Model B: (0²+0²+0²+0²+5²)/5 = 5.0
         √5.0 = 2.236

The large error (5) dominates RMSE!
"""
ax3.text(0.1, 0.5, explanation, fontsize=11, family='monospace',
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
         verticalalignment='center')

plt.tight_layout()
plt.savefig('rmse_vs_mae_explained.png', dpi=300, bbox_inches='tight')
plt.show()

# Example 2: the same metrics on simulated errors, with and without outliers
print("\n" + "=" * 70)
print("📊 Example 2: Realistic prediction scenario")
print("=" * 70)

np.random.seed(42)  # fixed seed so the printed table is reproducible
# Baseline: 20 standard-normal errors
normal_errors = np.random.normal(0, 1, 20)
# Contaminated: 18 standard-normal errors plus two hand-picked outliers
errors_with_outliers = np.concatenate([np.random.normal(0, 1, 18), np.array([5, -6])])

# One row per error sample, one column per metric
_samples = (normal_errors, errors_with_outliers)
comparison_df = pd.DataFrame({
    'Dataset': ['Normal errors', 'With 2 outliers'],
    'MAE': [np.mean(np.abs(sample)) for sample in _samples],
    'RMSE': [np.sqrt(np.mean(sample**2)) for sample in _samples],
    'MSE': [np.mean(sample**2) for sample in _samples],
})

print(comparison_df.to_string(index=False))
print("\n💡 RMSE increases more than MAE when outliers are present!")
print("   This makes RMSE more sensitive to large errors.")

## Key Differences for Students:

### MSE (Mean Squared Error)
- **Formula**: $\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
- **Characteristics**: Squares the errors, heavily penalizing large errors
- **When to use**: When large errors are particularly undesirable
- **Disadvantage**: Not in the same units as the original data

### RMSE (Root Mean Squared Error)
- **Formula**: $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$
- **Characteristics**: Square root of MSE, returns error to original scale
- **When to use**: When you want MSE's properties but in interpretable units
- **Advantage**: Same units as the target variable

### MAE (Mean Absolute Error)
- **Formula**: $\frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i|$
- **Characteristics**: Linear penalty for all errors
- **When to use**: When all errors should be weighted equally
- **Advantage**: More robust to outliers than MSE/RMSE