# Part 1.1: Overfitting and Underfitting Analysis

This notebook demonstrates the concepts of overfitting and underfitting using synthetic regression data.

## Objective
- Generate synthetic data with 50 training and 50 test points
- Fit polynomial regression models of degrees 1, 3, and 15
- Compare training and test MSE for each model
- Identify underfitting, good fit, and overfitting scenarios

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd
import seaborn as sns

# Set random seed for reproducibility
# Seeds NumPy's global generator; the data-generation cell draws from it via np.random.randn.
np.random.seed(42)
# Apply the matplotlib style sheet named 'seaborn-v0_8' to all subsequent figures.
plt.style.use('seaborn-v0_8')
# Set seaborn's default colour palette to "husl".
sns.set_palette("husl")

In [None]:
# Generate synthetic dataset
def generate_synthetic_data(n_train=50, n_test=50, noise_level=0.3):
    """
    Create a noisy one-dimensional regression problem with train and test splits.

    The noise-free target is f(x) = 1.5 * x**2 + 0.3 * sin(15 * x).
    Training inputs form an evenly spaced grid on [0, 1]. Test inputs use the
    same kind of grid with a small Gaussian jitter (standard deviation 0.005)
    added, so they differ from the training inputs and a few may fall
    marginally outside [0, 1]. Observed targets are f(x) plus Gaussian noise
    scaled by ``noise_level``.

    All randomness comes from NumPy's global generator (``np.random``); seed
    it before calling for reproducible output. Draw order is: training noise,
    test-input jitter, test noise.

    Parameters
    ----------
    n_train : int
        Number of training points.
    n_test : int
        Number of test points.
    noise_level : float
        Standard deviation of the noise added to the targets.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, y_train_true, y_test_true)``.
        The X arrays have shape (n, 1); the y arrays have shape (n,).
    """
    def noiseless_target(inputs):
        # Evaluate the underlying signal on a flattened copy of the (n, 1) inputs
        flat = inputs.flatten()
        return 1.5 * flat**2 + 0.3 * np.sin(15 * flat)

    # Training split: regular grid, targets perturbed by the first random draw
    X_train = np.linspace(0, 1, n_train)[:, np.newaxis]
    y_train_true = noiseless_target(X_train)
    train_noise = np.random.randn(n_train)
    y_train = y_train_true + noise_level * train_noise

    # Test split: jittered grid (second draw), then noisy targets (third draw)
    input_jitter = np.random.randn(n_test, 1)
    X_test = np.linspace(0, 1, n_test)[:, np.newaxis] + 0.005 * input_jitter
    y_test_true = noiseless_target(X_test)
    test_noise = np.random.randn(n_test)
    y_test = y_test_true + noise_level * test_noise

    return X_train, y_train, X_test, y_test, y_train_true, y_test_true

# Build the train/test splits with the default sizes and noise level
dataset = generate_synthetic_data()
X_train, y_train, X_test, y_test, y_train_true, y_test_true = dataset

# Quick sanity summary: split sizes first, then the input range covered by each split
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training data range: X ∈ [{X_train.min():.3f}, {X_train.max():.3f}]")
print(f"Test data range: X ∈ [{X_test.min():.3f}, {X_test.max():.3f}]")

In [None]:
# Visualize the generated data: both splits (left) and training data against the true curve (right)
fig, (ax_splits, ax_truth) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training and test samples together
ax_splits.scatter(X_train, y_train, alpha=0.6, color='blue', label='Training data', s=30)
ax_splits.scatter(X_test, y_test, alpha=0.6, color='red', label='Test data', s=30)
ax_splits.set_title('Synthetic Dataset')

# Right panel: noise-free target on a dense grid over [0, 1], with the training samples on top
X_dense = np.linspace(0, 1, 200).reshape(-1, 1)
y_dense_true = 1.5 * X_dense.flatten()**2 + 0.3 * np.sin(15 * X_dense.flatten())
ax_truth.plot(X_dense, y_dense_true, 'g-', linewidth=2, label='True function')
ax_truth.scatter(X_train, y_train, alpha=0.6, color='blue', label='Training data', s=30)
ax_truth.set_title('Data with True Function')

# Decorations shared by both panels
for ax in (ax_splits, ax_truth):
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Train polynomial models of different degrees
degrees = [1, 3, 15]
models = {}        # degree -> fitted Pipeline
predictions = {}   # degree -> {'train', 'test', 'dense'} prediction arrays
results = []       # one metrics record per degree; turned into a DataFrame later

# Dense points for smooth plotting. The grid extends slightly beyond the
# training range [0, 1], so the curves also show a little extrapolation.
X_dense = np.linspace(-0.1, 1.1, 300).reshape(-1, 1)

for degree in degrees:
    print(f"\nTraining polynomial model of degree {degree}...")

    # LinearRegression fits its own intercept by default, so the polynomial
    # expansion must not also emit a constant column: that column would be
    # redundant, add a dead entry to coef_, and make the least-squares
    # problem more rank-deficient than necessary at high degree.
    poly_model = Pipeline([
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
        ('linear', LinearRegression())
    ])

    # Fit on the training split only
    poly_model.fit(X_train, y_train)
    models[degree] = poly_model

    # Predictions on both splits and on the dense plotting grid
    y_train_pred = poly_model.predict(X_train)
    y_test_pred = poly_model.predict(X_test)
    y_dense_pred = poly_model.predict(X_dense)

    predictions[degree] = {
        'train': y_train_pred,
        'test': y_test_pred,
        'dense': y_dense_pred
    }

    # Mean squared error per split; the gap is test minus train
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    generalization_gap = test_mse - train_mse

    results.append({
        'Degree': degree,
        'Training MSE': train_mse,
        'Test MSE': test_mse,
        'Generalization Gap': generalization_gap
    })

    print(f"Training MSE: {train_mse:.4f}")
    print(f"Test MSE: {test_mse:.4f}")
    print(f"Generalization Gap: {generalization_gap:.4f}")

In [None]:
# Collect the per-degree metrics into a table and print it
results_df = pd.DataFrame(results)
table_rule = "=" * 60
print("\n" + table_rule)
print("POLYNOMIAL REGRESSION RESULTS COMPARISON")
print(table_rule)
print(results_df.to_string(index=False, float_format='%.4f'))

# Attach a fixed interpretation per degree; any degree other than 1 or 3
# is labelled as overfitting (degree 15 in this notebook)
named_interpretations = {
    1: 'UNDERFITTING: High bias, low variance',
    3: 'GOOD FIT: Balanced bias-variance tradeoff',
}
fallback_interpretation = 'OVERFITTING: Low bias, high variance'
interpretations = [
    named_interpretations.get(int(degree_value), fallback_interpretation)
    if degree_value in named_interpretations else fallback_interpretation
    for degree_value in results_df['Degree']
]
results_df['Interpretation'] = interpretations

# Per-degree breakdown
detail_rule = "=" * 80
print("\n" + detail_rule)
print("DETAILED ANALYSIS")
print(detail_rule)
for record in results_df.to_dict('records'):
    print(f"Degree {int(record['Degree'])}: {record['Interpretation']}")
    print(f"  → Training MSE: {record['Training MSE']:.4f}")
    print(f"  → Test MSE: {record['Test MSE']:.4f}")
    print(f"  → Gap: {record['Generalization Gap']:.4f}")
    print()

In [None]:
# Plot all models together: one overview panel plus one panel per degree
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
panels = axes.flatten()  # row-major: overview first, then one panel per degree

colors = ['red', 'green', 'purple']
labels = ['Degree 1 (Underfitting)', 'Degree 3 (Good Fit)', 'Degree 15 (Overfitting)']

# Noise-free target evaluated on the same dense grid as the model predictions
y_dense_true = 1.5 * X_dense.flatten()**2 + 0.3 * np.sin(15 * X_dense.flatten())

# The dense grid reaches past the training range, where a high-degree
# polynomial can take very large values. Autoscaling would include those
# values even though xlim hides them, squashing the data into a flat line,
# so the y-axis is fixed from the observed targets plus a margin.
y_observed = np.concatenate([np.ravel(y_train), np.ravel(y_test)])
y_margin = 0.25 * (y_observed.max() - y_observed.min())
y_limits = (y_observed.min() - y_margin, y_observed.max() + y_margin)

# Index the metrics by degree once instead of filtering the frame per lookup
mse_by_degree = results_df.set_index('Degree')

# Main comparison plot
overview = panels[0]
overview.plot(X_dense, y_dense_true, 'k--', linewidth=2, label='True function', alpha=0.7)

for i, degree in enumerate(degrees):
    overview.plot(X_dense, predictions[degree]['dense'],
                  color=colors[i], linewidth=2, label=labels[i])

overview.scatter(X_train, y_train, alpha=0.6, color='blue', label='Training data', s=40, zorder=5)
overview.scatter(X_test, y_test, alpha=0.6, color='orange', label='Test data', s=40, zorder=5)

overview.set_xlabel('X')
overview.set_ylabel('Y')
overview.set_title('Polynomial Models Comparison\n(All Degrees Together)', fontsize=12, fontweight='bold')
overview.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
overview.grid(True, alpha=0.3)
overview.set_xlim(-0.05, 1.05)
overview.set_ylim(*y_limits)

# Individual model plots
for i, degree in enumerate(degrees):
    ax = panels[i + 1]

    # True function and fitted curve
    ax.plot(X_dense, y_dense_true, 'k--', linewidth=2, label='True function', alpha=0.7)
    ax.plot(X_dense, predictions[degree]['dense'],
            color=colors[i], linewidth=3, label=f'Polynomial Degree {degree}')

    # Observed data
    ax.scatter(X_train, y_train, alpha=0.7, color='blue', label='Training data', s=50)
    ax.scatter(X_test, y_test, alpha=0.7, color='orange', label='Test data', s=50)

    # MSE annotation in axes coordinates (top-left corner)
    train_mse = mse_by_degree.loc[degree, 'Training MSE']
    test_mse = mse_by_degree.loc[degree, 'Test MSE']
    ax.text(0.02, 0.98, f'Train MSE: {train_mse:.4f}\nTest MSE: {test_mse:.4f}',
            transform=ax.transAxes, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_title(f'{labels[i]}', fontweight='bold')
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)
    ax.set_xlim(-0.05, 1.05)
    ax.set_ylim(*y_limits)

fig.tight_layout()
plt.show()

In [None]:
# MSE comparison visualization: grouped bars (left) and error-vs-degree curves (right)
fig, (ax_bars, ax_trend) = plt.subplots(1, 2, figsize=(12, 5))

# Pull the per-degree errors out of the results table, in the order of `degrees`
metrics_by_degree = results_df.set_index('Degree')
train_mses = metrics_by_degree.loc[degrees, 'Training MSE'].tolist()
test_mses = metrics_by_degree.loc[degrees, 'Test MSE'].tolist()

# Bar plot of MSEs: one pair of bars per degree
x = np.arange(len(degrees))
width = 0.35
ax_bars.bar(x - width/2, train_mses, width, label='Training MSE', alpha=0.8, color='skyblue')
ax_bars.bar(x + width/2, test_mses, width, label='Test MSE', alpha=0.8, color='lightcoral')

ax_bars.set_xlabel('Polynomial Degree')
ax_bars.set_ylabel('Mean Squared Error')
ax_bars.set_title('Training vs Test MSE Comparison')
ax_bars.set_xticks(x)
ax_bars.set_xticklabels([f'Degree {d}' for d in degrees])
ax_bars.legend()
ax_bars.grid(True, alpha=0.3)

# Value labels just above each bar
for position, train_value, test_value in zip(x, train_mses, test_mses):
    ax_bars.text(position - width/2, train_value + 0.001, f'{train_value:.3f}',
                 ha='center', va='bottom', fontsize=9)
    ax_bars.text(position + width/2, test_value + 0.001, f'{test_value:.3f}',
                 ha='center', va='bottom', fontsize=9)

# Line plot showing the trend of both errors as the degree grows
ax_trend.plot(degrees, train_mses, 'o-', linewidth=2, markersize=8, label='Training MSE', color='blue')
ax_trend.plot(degrees, test_mses, 's-', linewidth=2, markersize=8, label='Test MSE', color='red')

ax_trend.set_xlabel('Polynomial Degree')
ax_trend.set_ylabel('Mean Squared Error')
ax_trend.set_title('MSE vs Model Complexity')
ax_trend.legend()
ax_trend.grid(True, alpha=0.3)
ax_trend.set_xticks(degrees)

# Highlight the degree with the smallest test error
optimal_idx = np.argmin(test_mses)
best_degree = degrees[optimal_idx]
best_test_mse = test_mses[optimal_idx]
ax_trend.annotate('Optimal Complexity',
                  xy=(best_degree, best_test_mse),
                  xytext=(best_degree + 2, best_test_mse + 0.02),
                  arrowprops=dict(arrowstyle='->', color='green', lw=2),
                  fontsize=12, fontweight='bold', color='green')

fig.tight_layout()
plt.show()

In [None]:
# Model coefficients analysis: print the fitted parameters of each pipeline
print("\n" + "="*60)
print("MODEL COEFFICIENTS ANALYSIS")
print("="*60)

for degree in degrees:
    model = models[degree]
    poly_step = model.named_steps['poly']
    linear_step = model.named_steps['linear']
    coeffs = linear_step.coef_
    intercept = linear_step.intercept_

    # Count free parameters. When the polynomial step emits a constant column
    # AND the regression fits an intercept, that column's coefficient is
    # redundant with the intercept and must not be counted a second time.
    has_intercept = bool(linear_step.fit_intercept)
    redundant_bias = int(bool(poly_step.include_bias) and has_intercept)
    n_parameters = len(coeffs) + int(has_intercept) - redundant_bias

    # Largest absolute coefficient, computed once and reused below
    max_magnitude = np.max(np.abs(coeffs))

    print(f"\nDegree {degree} polynomial:")
    print(f"  Intercept: {intercept:.4f}")
    print(f"  Coefficients: {coeffs}")
    print(f"  Number of parameters: {n_parameters}")
    print(f"  Max coefficient magnitude: {max_magnitude:.4f}")

    if degree == 15:
        print(f"  Warning: High degree polynomial shows coefficient magnitude of {max_magnitude:.2f}")
        print(f"  This indicates potential overfitting with extreme parameter values")

## Written Analysis

### Results Summary

The experiment clearly demonstrates the bias-variance tradeoff in polynomial regression:

1. **Degree 1 (Linear) - UNDERFITTING**:
   - High training and test MSE
   - Small generalization gap
   - Cannot capture the underlying non-linear relationship
   - High bias, low variance

2. **Degree 3 (Cubic) - GOOD FIT**:
   - Lowest test MSE
   - Reasonable generalization gap
   - Captures the main trend without excessive complexity
   - Balanced bias-variance tradeoff

3. **Degree 15 (High-order) - OVERFITTING**:
   - Very low training MSE
   - High test MSE and large generalization gap
   - Fits training data noise, poor generalization
   - Low bias, high variance

### Key Observations

- **Optimal Model**: Of the three degrees tested, the degree 3 polynomial provides the best balance
- **Generalization**: The gap between training and test error increases with model complexity
- **Coefficient Magnitudes**: Higher degree models show extreme coefficient values
- **Practical Implication**: More complex models don't always perform better on unseen data

### Conclusion

This analysis confirms the fundamental machine learning principle that model complexity must be carefully balanced. Of the three models tested, the degree 3 polynomial performs best: it achieves the lowest test error while maintaining a reasonable training error, which showcases the importance of the bias-variance tradeoff in model selection.