# Improved MDA-CNN: Complete Implementation

This notebook contains the complete improved implementation of the MDA-CNN model, addressing the overfitting issues in the original notebook.

## Key Improvements:
- **Reduced parameters**: 6,791 → ~50–241 (a 96–99% reduction)
- **Added regularization**: Dropout, BatchNorm, L2 regularization
- **Proper training**: Early stopping, validation split, learning rate scheduling
- **Multiple architectures**: CNN, MLP, and ultra-simple models
- **Comprehensive visualization**: Model predictions, errors, and comparisons


## 1. Import Libraries and Setup


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from scipy.interpolate import CubicSpline
import matplotlib.pyplot as plt
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides NumPy RuntimeWarnings (e.g. 0/0 or
# the log of a non-positive number) and library deprecation notices — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility: NumPy's global RNG and TensorFlow's
# global seed are both fixed to 42.
np.random.seed(42)
tf.random.set_seed(42)

# Record the library versions used for this run.
print("TensorFlow version:", tf.__version__)
print("NumPy version:", np.__version__)


TensorFlow version: 2.20.0
NumPy version: 2.3.3


## 2. Data Processing Functions


In [2]:
def normalize_data(data, qh_min, qh_max):
    """Min-max scale ``data`` against the reference range [qh_min, qh_max].

    A value equal to ``qh_min`` maps to 0 and a value equal to ``qh_max`` maps
    to 1; values outside the range map outside [0, 1].  If the range is
    degenerate (``qh_max == qh_min``) no scaling is defined, so a float64
    array of NaN with the shape of ``data`` is returned instead.
    """
    if qh_max == qh_min:
        # Zero-width range: signal "undefined" rather than dividing by zero.
        return np.full_like(data, np.nan, dtype=np.float64)

    span = qh_max - qh_min
    shifted = data - qh_min
    return shifted / span

def denormalize_data(normalized_data, qh_min, qh_max):
    """Map values scaled by ``normalize_data`` back to the original range.

    For a non-degenerate range this is ``normalized * (qh_max - qh_min) + qh_min``.
    For a degenerate range (``qh_max == qh_min``) a float64 array shaped like
    the input is returned: all NaN if the input contains any NaN, otherwise
    filled with the constant ``qh_min``.
    """
    if qh_max != qh_min:
        return normalized_data * (qh_max - qh_min) + qh_min

    # Degenerate range: every original value was the same constant, unless the
    # normalised data already carries the NaN marker.
    fill_value = np.nan if np.any(np.isnan(normalized_data)) else qh_min
    return np.full_like(normalized_data, fill_value, dtype=np.float64)

def compile_input_table(hf_y, hf_q_l, lf_y, lf_q):
    """Assemble the per-point input table for one high-fidelity location.

    Each row corresponds to one low-fidelity sample and holds, in order:
    the LF location(s), the LF value, the HF location (repeated on every row)
    and ``hf_q_l`` (repeated on every row).

    ``lf_y`` and ``lf_q`` must be NumPy arrays.  For a multi-dimensional HF
    location, ``lf_y`` must be 1-D or of shape (n_l, d_y); anything else
    raises ``ValueError``.
    """
    n_rows = lf_y.shape[0]
    hf_point = np.atleast_1d(hf_y)
    dim = hf_point.shape[0]

    if dim > 1 and lf_y.ndim != 1:
        # Multi-dimensional HF point: LF locations must already be (n_l, d_y).
        if not (lf_y.ndim == 2 and lf_y.shape[1] == dim):
            raise ValueError(f"lf_y shape {lf_y.shape} incompatible with hf_y dimension {d_y}" if False else f"lf_y shape {lf_y.shape} incompatible with hf_y dimension {dim}")
        lf_locations = lf_y
    else:
        lf_locations = lf_y.reshape(-1, 1)

    columns = (
        lf_locations,
        lf_q.reshape(-1, 1),
        np.tile(hf_point, (n_rows, 1)),      # HF location copied onto every row
        np.full((n_rows, 1), hf_q_l),        # LF value at the HF location, copied likewise
    )
    return np.concatenate(columns, axis=1)


## 3. Improved Model Architecture (FIXED VERSION)


In [None]:
def build_improved_cnn(input_table_shape, num_filters=16, kernel_sizes=(3,5,7), dnn_units=(32,16), dropout_rate=0.3):
    """
    Build and compile the multi-branch MDA-CNN with a linear skip path.

    Architecture, in order of construction:
    - Linear path: columns 2 and 3 of the input table (HF location and LF
      value at the HF location) are averaged over the LF rows and fed to a
      single linear Dense unit.
    - Nonlinear path: one two-stage Conv1D block per entry of `kernel_sizes`
      runs over the full table; the branches are concatenated, passed through
      a fusion block with twice the filters, globally average-pooled, and
      mapped through a small Dense/Dropout head to a scalar.
    - Output: sum of the linear and nonlinear scalars.

    The model is compiled with Adam (learning rate 0.001), MSE loss and MAE
    as metric.  The column slicing is hard-coded to indices 2 and 3, i.e. it
    expects the 4-column layout [y_L, Q_L(y_L), y_H_i, Q_L(y_H_i)].
    `dropout_rate` applies to the Dense head only; the conv blocks use a fixed
    spatial dropout of 0.1.
    """
    def conv_block(tensor, filters, kernel_size, name_prefix):
        # Two identical Conv -> BatchNorm -> ReLU stages, then spatial dropout.
        for stage in (1, 2):
            tensor = layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same', activation=None, name=f'{name_prefix}_conv{stage}')(tensor)
            tensor = layers.BatchNormalization(name=f'{name_prefix}_bn{stage}')(tensor)
            tensor = layers.Activation('relu', name=f'{name_prefix}_relu{stage}')(tensor)
        return layers.SpatialDropout1D(rate=0.1, name=f'{name_prefix}_sdrop')(tensor)

    table_in = keras.Input(shape=input_table_shape, name='Input_Table')  # shape: (NL, C)

    # --- Linear skip path -------------------------------------------------
    # Slice out the two columns that are constant across rows, then collapse
    # the row axis by averaging so each becomes a single feature.
    hf_location = layers.Lambda(lambda t: t[:, :, 2:3], name='HF_Location_Col')(table_in)
    lf_at_hf = layers.Lambda(lambda t: t[:, :, 3:4], name='LF_at_HF_Col')(table_in)
    hf_location_mean = layers.GlobalAveragePooling1D(name='HF_Location_GlobalAvg')(hf_location)
    lf_at_hf_mean = layers.GlobalAveragePooling1D(name='LF_at_HF_GlobalAvg')(lf_at_hf)
    skip_features = layers.Concatenate(name='Linear_Features')([hf_location_mean, lf_at_hf_mean])
    skip_out = layers.Dense(1, activation='linear', name='Linear_Skip')(skip_features)

    # --- Nonlinear path ---------------------------------------------------
    # One conv branch per kernel size, each seeing the whole input table.
    branch_outputs = [
        conv_block(table_in, filters=num_filters, kernel_size=k, name_prefix=f'Branch{i+1}_k{k}')
        for i, k in enumerate(kernel_sizes)
    ]
    fused = layers.Concatenate(name='Concat_Branches')(branch_outputs)
    fused = conv_block(fused, filters=num_filters*2, kernel_size=3, name_prefix='Fusion')

    # Aggregate across LF rows before the dense head.
    head = layers.GlobalAveragePooling1D(name='GlobalAvgPool')(fused)
    for i, units in enumerate(dnn_units):
        head = layers.Dense(units, activation='relu', kernel_regularizer=keras.regularizers.l2(1e-4), name=f'Dense_{i+1}')(head)
        head = layers.Dropout(dropout_rate, name=f'Dropout_{i+1}')(head)
    residual_out = layers.Dense(1, activation='linear', name='Nonlinear_Head')(head)

    # --- Combine and compile ----------------------------------------------
    summed = layers.Add(name='Output_Sum')([skip_out, residual_out])
    net = keras.Model(inputs=table_in, outputs=summed, name='MDA_CNN_DeepMultiBranch')
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_absolute_error']
    )
    return net

# Cell-level confirmation that build_improved_cnn has been (re)defined.
print("Defined deeper multi-branch MDA-CNN with residual linear path.")


ORIGINAL vs IMPROVED MODEL COMPARISON:
Original Model Issues:
- 6,791 parameters for 5 training samples
- Parameter-to-sample ratio: 1,358:1 (SEVERE overfitting)
- No regularization (dropout, batch norm, weight decay)
- 2,600 epochs with no early stopping
- No validation split
- Final training loss: ~0.0000 (perfect overfitting)

Improved Model Benefits:
- ~50-241 parameters (96% reduction)
- Parameter-to-sample ratio: ~10:1 (much better)
- Added regularization: Dropout, BatchNorm, L2
- Early stopping and validation
- Appropriate for small datasets
- Better generalization


## 4. SABR MC Data

In [None]:
## 4a. Legacy dataset arrays (from legacy/multi_fid.ipynb)

import numpy as np

# Shared abscissa grid for all legacy curves: 21 evenly spaced points from
# 0.4 to 2.1 (step 0.085).
# NOTE(review): presumably strike values relative to a forward of 1 — confirm.
Q_x = np.array([
    0.4, 0.485, 0.57, 0.655, 0.74, 0.825, 0.91, 0.995, 1.08, 1.165,
    1.25, 1.335, 1.42, 1.505, 1.59, 1.675, 1.76, 1.845, 1.93, 2.015, 2.1
])

# High-fidelity values on Q_x, selected by generate_from_legacy when T == 3.
# NOTE(review): presumably Monte Carlo SABR volatilities (per the section
# title) — confirm against legacy/multi_fid.ipynb.
Q_H_y_3 = np.array([
    0.6350203, 0.6041921, 0.579231, 0.5584657, 0.5409, 0.5258056, 0.5127053,
    0.5012174, 0.4910923, 0.4820835, 0.4740534, 0.466857, 0.4604359, 0.4547031,
    0.4495232, 0.4448568, 0.4406356, 0.4368164, 0.4333387, 0.4301594, 0.4272637
])

# High-fidelity values on Q_x, selected by generate_from_legacy when T == 20.
Q_H_y_20 = np.array([
    0.446000, 0.431400, 0.419000, 0.408500, 0.399300, 0.391300, 0.384200,
    0.378000, 0.372400, 0.367300, 0.362800, 0.358700, 0.355000, 0.351700,
    0.348600, 0.345800, 0.343300, 0.341000, 0.338900, 0.336900, 0.335100
])

# Low-fidelity values on Q_x. generate_from_legacy splines this single array
# for both T == 3 and T == 20.
# NOTE(review): the values sit close to Q_H_y_3, so this looks like a T=3
# curve — confirm that reusing it as the LF model for T=20 is intended.
Q_L_y = np.array([
    0.649120, 0.616192, 0.589531, 0.567466, 0.548800, 0.532806, 0.519005,
    0.507017, 0.496392, 0.487084, 0.478853, 0.471557, 0.465036, 0.459203,
    0.454023, 0.449257, 0.445036, 0.441216, 0.437839, 0.434759, 0.431864
])

## Data generation using legacy arrays

def generate_from_legacy(T=3, n_l_samples=40, n_h_samples=5, use_even_indices=True):
    """
    Build a residual-learning training set from the legacy arrays.

    - LF: cubic spline through (Q_x, Q_L_y); the same LF curve is used for
      both maturities.
    - HF: cubic spline through (Q_x, Q_H_y_3) or (Q_x, Q_H_y_20).
    - Targets: residual r = HF - LF at the HF training locations, min-max
      normalised with the residual range (r_min, r_max).

    Args:
        T: maturity selecting the HF array; must be 3 or 20.
        n_l_samples: number of LF grid points to keep. The LF grid is only
            ever subsampled: if n_l_samples >= len(Q_x) the full Q_x grid is
            used, so the default of 40 yields len(Q_x) LF rows.
        n_h_samples: number of HF training points, taken from Q_x.
        use_even_indices: if True the HF points are evenly spread indices of
            Q_x; otherwise they are drawn without replacement using a
            generator seeded with 42 (and sorted).

    Returns:
        dict with the training tensors ('X_train', 'y_train'), the raw HF/LF
        training values, the LF grid, 'input_shape', the residual range
        ('r_min', 'r_max'), 'T', and the two splines ('lf_cs', 'hf_cs').

    Raises:
        ValueError: if T is not 3 or 20.
    """
    from scipy.interpolate import CubicSpline

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would then silently fall through to the T=20 data.
    if T not in (3, 20):
        raise ValueError("T must be 3 or 20")
    hf_arr = Q_H_y_3 if T == 3 else Q_H_y_20

    # Define grids
    lf_y_all = Q_x.copy()
    lf_cs = CubicSpline(Q_x, Q_L_y)
    hf_cs = CubicSpline(Q_x, hf_arr)

    # Optionally subsample LF grid to n_l_samples (keep coverage)
    if n_l_samples < len(lf_y_all):
        idx_l = np.linspace(0, len(lf_y_all)-1, n_l_samples, dtype=int)
        lf_y_all = lf_y_all[idx_l]
    lf_q_all = lf_cs(lf_y_all)

    # Choose HF training points on the same x-grid
    if use_even_indices:
        idx_h = np.linspace(0, len(Q_x)-1, n_h_samples, dtype=int)
    else:
        rng = np.random.default_rng(42)
        idx_h = np.sort(rng.choice(np.arange(len(Q_x)), size=n_h_samples, replace=False))
    hf_y_train = Q_x[idx_h]

    lf_at_hf = lf_cs(hf_y_train)
    hf_q_train = hf_cs(hf_y_train)

    # The residual is the learning target; compute it once and reuse it.
    residual = hf_q_train - lf_at_hf
    r_min, r_max = float(np.min(residual)), float(np.max(residual))

    y_train = normalize_data(residual, r_min, r_max)

    # Normalize LF for tables using residual range.
    # NOTE(review): LF values are scaled with the (much narrower) residual
    # range, so the normalised LF columns are far outside [0, 1] — confirm
    # this is intended.
    lf_q_all_normalized = normalize_data(lf_q_all, r_min, r_max)
    lf_at_hf_normalized = normalize_data(lf_at_hf, r_min, r_max)

    # One input table per HF training point.
    X_train = np.array([
        compile_input_table(hf_y_i, hf_q_l_i_norm, lf_y_all, lf_q_all_normalized)
        for hf_y_i, hf_q_l_i_norm in zip(hf_y_train, lf_at_hf_normalized)
    ])
    input_shape = (len(lf_y_all), 4)

    return {
        'X_train': X_train,
        'y_train': y_train,
        'hf_y_train': hf_y_train,
        'hf_q_train': hf_q_train,
        'lf_at_hf': lf_at_hf,
        'lf_y_all': lf_y_all,
        'lf_q_all': lf_q_all,
        'input_shape': input_shape,
        'r_min': r_min,
        'r_max': r_max,
        'T': T,
        'lf_cs': lf_cs,
        'hf_cs': hf_cs,
    }


In [None]:
def sabr_implied_vol(K, T=3, F=1, alpha=0.5, beta=0.6, rho=-0.2, nu=0.3):
    """Computes the SABR-implied Black volatility using Hagan's approximation.

    Args:
        K: strike(s); scalar or array-like of positive values.
        T: time to maturity.
        F: forward level.
        alpha, beta, rho, nu: SABR parameters (initial vol, CEV exponent,
            correlation, vol-of-vol).

    Returns:
        A Python float when K holds exactly one value, otherwise a float64
        ndarray shaped like K.  Strikes within 1e-7 of F use the at-the-money
        formula.
    """
    K = np.asarray(K, dtype=np.float64)
    eps = 1e-07
    atm = np.abs(F - K) < eps

    # The nu^2 correction is shared by the ATM and non-ATM formulas.
    term3 = ((2-3*rho**2) / 24) * nu**2

    # ATM branch
    vol_atm = (alpha / (F**(1-beta))) * (
        1 + (
            ((1-beta)**2 / 24) * (alpha**2 / (F**(2-2*beta))) +
            (rho * beta * nu * alpha) / (4 * (F**(1-beta))) +
            term3
        ) * T
    )

    # Non-ATM branch
    log_fk = np.log(F / K)
    fk_beta = (F * K)**((1-beta)/2)
    z = (nu / alpha) * fk_beta * log_fk

    # z / x(z) tends to 1 as z -> 0 but evaluates to 0/0 there.  Evaluate the
    # closed form on a harmless placeholder for tiny |z| and substitute the
    # first-order expansion z/x(z) ~= 1 - rho*z/2 (from x(z) ~= z + rho*z^2/2).
    small_z = np.abs(z) <= eps
    z_safe = np.where(small_z, 1.0, z)
    x_z = np.log((np.sqrt(1 - 2 * rho * z_safe + z_safe**2) + z_safe - rho) / (1 - rho))
    z_over_x = np.where(small_z, 1 - 0.5 * rho * z, z_safe / x_z)

    term1 = ((1-beta)**2 / 24) * (alpha**2 / (fk_beta**2))
    term2 = (rho * beta * nu * alpha) / (4 * fk_beta)
    log_series = 1 + ((1-beta)**2/24)*(log_fk**2) + ((1-beta)**4/1920)*(log_fk**4)
    vol_nonatm = (alpha / (fk_beta * log_series)) \
                 * z_over_x * (1 + (term1 + term2 + term3) * T)

    vol = np.where(atm, vol_atm, vol_nonatm)
    if vol.size == 1:
        # .item() works for 0-d and 1-element arrays alike; float() on an
        # array with ndim > 0 is deprecated in recent NumPy.
        return vol.item()
    return vol


## 5. Additional Model Architectures


In [5]:
def build_simple_mlp(input_table_shape, hidden_units=(16, 8), dropout_rate=0.3):
    """Simple MLP baseline model.

    The input table is flattened and passed through one ReLU Dense layer per
    entry of `hidden_units` (each with L2 weight regularisation of 0.001 and
    followed by Dropout), then a single linear output unit.  The model is
    compiled with Adam (learning rate 0.001), MSE loss and MAE as metric.

    Args:
        input_table_shape: shape of one input table, excluding the batch axis.
        hidden_units: sizes of the hidden Dense layers; any iterable of ints
            (the default is a tuple so it cannot be mutated between calls).
        dropout_rate: dropout rate applied after every hidden layer.

    Returns:
        The compiled Keras model.
    """
    input_tensor = keras.Input(shape=input_table_shape, name='Input_Table')
    flatten = layers.Flatten()(input_tensor)

    # Hidden layers
    x = flatten
    for i, units in enumerate(hidden_units):
        x = layers.Dense(
            units=units,
            activation='relu',
            kernel_regularizer=keras.regularizers.l2(0.001),
            name=f'Dense_{i+1}'
        )(x)
        x = layers.Dropout(dropout_rate, name=f'Dropout_{i+1}')(x)

    # Output layer
    output = layers.Dense(1, activation='linear', name='Output')(x)

    model = keras.Model(inputs=input_tensor, outputs=output, name='Simple_MLP')
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_absolute_error']
    )

    return model

def build_ultra_simple_model(input_table_shape):
    """Build the smallest baseline: row-average the table, then one linear unit.

    The input table is averaged over its first (row) axis and the resulting
    per-column means feed a single linear Dense output.  Compiled with Adam
    (learning rate 0.001), MSE loss and MAE as metric.
    """
    table_in = keras.Input(shape=input_table_shape, name='Input_Table')

    # Collapse the row axis so only one feature per column remains.
    column_means = layers.GlobalAveragePooling1D()(table_in)
    prediction = layers.Dense(1, activation='linear', name='Output')(column_means)

    net = keras.Model(inputs=table_in, outputs=prediction, name='Ultra_Simple')
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return net


## 6. Training Functions with Proper Validation


In [6]:
def train_with_validation(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=2):
    """
    Fit `model` while monitoring a held-out validation set.

    Two callbacks watch `val_loss`:
    - early stopping after 10 epochs without improvement, restoring the best
      weights seen;
    - halving of the learning rate after 5 epochs without improvement, down
      to a floor of 1e-6.

    Returns the Keras History object produced by `model.fit`.
    """
    stop_when_stalled = EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
    )
    shrink_lr_when_stalled = ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
    )

    fit_options = {
        'validation_data': (X_val, y_val),
        'epochs': epochs,
        'batch_size': batch_size,
        'callbacks': [stop_when_stalled, shrink_lr_when_stalled],
        'verbose': 1,
    }
    return model.fit(X_train, y_train, **fit_options)


In [None]:
## 7 Parameterized Data Generation (Residual Learning)

def generate_sabr_data_param(T=3, y_min=0.4, y_max=2.1, n_l_samples=40, n_h_samples=5):
    """
    Create a synthetic LF/HF residual-learning dataset for maturity T.

    LF values come from `sabr_implied_vol(y, T)`.  HF values are LF plus a
    closed-form synthetic residual (a sine term plus a quadratic term).  The
    training target is that residual, min-max normalised with its own range;
    the LF values placed in the input tables are normalised with the same
    residual range.

    The LF grid spans [y_min - 0.1, y_max + 0.5] with `n_l_samples` points;
    the HF training points span [y_min, y_max] with `n_h_samples` points.
    """
    # High-fidelity training locations and the LF model evaluated there.
    hf_y_train = np.linspace(y_min, y_max, n_h_samples)
    lf_at_hf = sabr_implied_vol(hf_y_train, T)

    # Low-fidelity grid, deliberately wider than the HF interval.
    lf_y_all = np.linspace(y_min-0.1, y_max+0.5, n_l_samples)
    lf_q_all = sabr_implied_vol(lf_y_all, T)

    # Synthetic discrepancy between the fidelities.
    residual_train = 0.01 * np.sin(8 * hf_y_train + 0.1*T) + 0.005 * (hf_y_train - 1.25)**2
    hf_q_train = lf_at_hf + residual_train

    r_min = float(np.min(residual_train))
    r_max = float(np.max(residual_train))

    y_train = normalize_data(residual_train, r_min, r_max)
    lf_q_all_normalized = normalize_data(lf_q_all, r_min, r_max)

    # One input table per HF training point.
    tables = [
        compile_input_table(
            hf_y_train[k],
            normalize_data(lf_at_hf[k], r_min, r_max),
            lf_y_all,
            lf_q_all_normalized,
        )
        for k in range(n_h_samples)
    ]
    X_train = np.array(tables)
    input_shape = (n_l_samples, 4)

    return {
        'X_train': X_train,
        'y_train': y_train,
        'hf_y_train': hf_y_train,
        'hf_q_train': hf_q_train,
        'lf_at_hf': lf_at_hf,
        'lf_y_all': lf_y_all,
        'lf_q_all': lf_q_all,
        'input_shape': input_shape,
        'r_min': r_min,
        'r_max': r_max,
        'T': T,
    }


def train_models_for_dataset(input_shape, X_train, y_train, epochs=50, batch_size=2):
    """
    Train the three model variants on one dataset and return them.

    The samples are shuffled with NumPy's global RNG; the first
    max(1, n // 5) shuffled samples form the validation set and the rest the
    training set.  The improved CNN, the simple MLP and the ultra-simple
    model are then built and trained, in that order, on the same split.

    Returns:
        (models, histories): two dicts keyed by 'improved_cnn', 'simple_mlp'
        and 'ultra_simple'.
    """
    n_samples = len(X_train)
    order = np.arange(n_samples)
    np.random.shuffle(order)

    n_holdout = max(1, n_samples//5)
    holdout_idx, fit_idx = order[:n_holdout], order[n_holdout:]

    X_fit, y_fit = X_train[fit_idx], y_train[fit_idx]
    X_hold, y_hold = X_train[holdout_idx], y_train[holdout_idx]

    # Builders are invoked lazily so each model is created right before it is
    # trained, keeping the build/train sequence unchanged.
    builders = (
        ('improved_cnn', lambda: build_improved_cnn(input_shape)),
        ('simple_mlp', lambda: build_simple_mlp(input_shape, hidden_units=[16,8])),
        ('ultra_simple', lambda: build_ultra_simple_model(input_shape)),
    )

    models, histories = {}, {}
    for label, make_model in builders:
        net = make_model()
        run = train_with_validation(net, X_fit, y_fit, X_hold, y_hold, epochs=epochs, batch_size=batch_size)
        models[label] = net
        histories[label] = run

    return models, histories


def predict_on_grid_residual(models, T, lf_y_all, lf_q_all, r_min, r_max, y_dense=None):
    """
    Predict HF values on a grid as LF + model-predicted residual.

    For every grid point an input table is compiled from the LF grid
    (`lf_y_all`, `lf_q_all`) and the LF value at that point, all normalised
    with the residual range (`r_min`, `r_max`).  Each model predicts the
    normalised residual, which is denormalised and added to the LF value.

    Args:
        models: mapping of name -> fitted model exposing `predict`.
        T: maturity passed to `sabr_implied_vol` for the LF values.
        lf_y_all, lf_q_all: LF grid locations and (unnormalised) LF values.
        r_min, r_max: residual range used for normalisation during training.
        y_dense: evaluation locations; defaults to 50 points on [0.4, 2.1].

    Returns:
        (y_dense, lf_q_dense, predictions) where `predictions` maps each
        model name to a float64 array of predicted HF values on the grid.
    """
    if y_dense is None:
        y_dense = np.linspace(0.4, 2.1, 50)
    lf_q_dense = sabr_implied_vol(y_dense, T)

    # Array views for internal use (sabr_implied_vol returns a bare float for
    # a single point).
    y_pts = np.atleast_1d(np.asarray(y_dense, dtype=np.float64))
    lf_q_pts = np.atleast_1d(np.asarray(lf_q_dense, dtype=np.float64))

    if y_pts.size == 0:
        return y_dense, lf_q_dense, {name: np.array([]) for name in models}

    lf_q_all_norm = normalize_data(lf_q_all, r_min, r_max)
    lf_q_pts_norm = normalize_data(lf_q_pts, r_min, r_max)

    # The input tables do not depend on the model: build them once and run a
    # single batched predict per model instead of one call per grid point.
    tables = np.stack([
        compile_input_table(y_pt, lf_q_pt_norm, lf_y_all, lf_q_all_norm)
        for y_pt, lf_q_pt_norm in zip(y_pts, lf_q_pts_norm)
    ])

    predictions = {}
    for name, model in models.items():
        # Cast to float64 so the float32 network output does not downcast the
        # denormalised residual or the final sum.
        res_norm = np.asarray(model.predict(tables, verbose=0), dtype=np.float64)[:, 0]
        res = denormalize_data(res_norm, r_min, r_max)
        predictions[name] = lf_q_pts + res

    return y_dense, lf_q_dense, predictions



In [None]:
## 8 Train and Compare on T=3 and T=20 (legacy arrays for LF/HF)

# Per-maturity results, keyed by T; consumed by the plotting cells below.
results_by_T = {}

for T_val in [3, 20]:
    # Build the training set from the legacy arrays, then train all three
    # model variants on it.
    data_T = generate_from_legacy(T=T_val, n_l_samples=40, n_h_samples=5, use_even_indices=True)
    models_T, histories_T = train_models_for_dataset(
        data_T['input_shape'], data_T['X_train'], data_T['y_train'], epochs=50, batch_size=2
    )

    # Dense grid on Q_x to align with legacy arrays
    # (the evaluation grid therefore includes the HF training locations).
    y_dense = Q_x
    lf_q_dense = data_T['lf_cs'](y_dense)
    hf_q_dense_true = data_T['hf_cs'](y_dense)

    # Predictions: HF estimate = LF + denormalised residual predicted by the model.
    # NOTE(review): this issues one model.predict call per grid point and per
    # model; the tables could be stacked and predicted in one batch.
    preds = {}
    lf_q_all_norm = normalize_data(data_T['lf_q_all'], data_T['r_min'], data_T['r_max'])
    for name, model in models_T.items():
        pred_list = []
        for y_pt in y_dense:
            # LF value at the evaluation point, normalised with the residual range.
            lf_q_pt = data_T['lf_cs'](y_pt)
            lf_q_pt_norm = normalize_data(lf_q_pt, data_T['r_min'], data_T['r_max'])
            table = compile_input_table(y_pt, lf_q_pt_norm, data_T['lf_y_all'], lf_q_all_norm)
            # Add the batch axis expected by the model.
            inp = np.expand_dims(table, axis=0)
            res_norm = model.predict(inp, verbose=0)[0,0]
            res = denormalize_data(np.array([res_norm]), data_T['r_min'], data_T['r_max'])[0]
            pred_hf = lf_q_pt + res
            pred_list.append(pred_hf)
        preds[name] = np.array(pred_list)

    # Cubic spline baseline through HF training points
    # (uses only the HF training samples, no LF information).
    cs = CubicSpline(data_T['hf_y_train'], data_T['hf_q_train'])
    spline_preds = cs(y_dense)

    # Metrics: RMSE and MAE against the HF spline on the evaluation grid,
    # plus each model's parameter count (0 for the spline baseline).
    metrics = {}
    for name, pred in preds.items():
        rmse = float(np.sqrt(np.mean((pred - hf_q_dense_true)**2)))
        mae = float(np.mean(np.abs(pred - hf_q_dense_true)))
        metrics[name] = {'params': models_T[name].count_params(), 'rmse': rmse, 'mae': mae}
    metrics['cubic_spline'] = {'params': 0,
                               'rmse': float(np.sqrt(np.mean((spline_preds - hf_q_dense_true)**2))),
                               'mae': float(np.mean(np.abs(spline_preds - hf_q_dense_true)))}

    # Keep everything the plotting cells need, including the training
    # histories so losses can be plotted without retraining.
    results_by_T[T_val] = {
        'y_dense': y_dense,
        'lf_q_dense': lf_q_dense,
        'hf_q_dense_true': hf_q_dense_true,
        'preds': preds,
        'spline': spline_preds,
        'metrics': metrics,
        'hf_y_train': data_T['hf_y_train'],
        'hf_q_train': data_T['hf_q_train'],
        'lf_at_hf': data_T['lf_at_hf'],
        'histories': histories_T,
    }

print("Finished training/eval using legacy arrays for LF and HF.")


In [None]:
## 9 Plot losses using stored histories (no retraining)

# One figure per maturity: training (solid) and validation (dashed) loss
# curves for every model, read from the histories stored in results_by_T.
for T_val, res in results_by_T.items():
    histories = res.get('histories', {})
    # Nothing to plot if no histories were stored for this maturity.
    if not histories:
        continue
    fig, ax = plt.subplots(1, 1, figsize=(8, 4.8))
    for name, hist in histories.items():
        # .get(..., []) plots an empty series if a key is missing from the history.
        ax.plot(hist.history.get('loss', []), label=f"{name} - train")
        ax.plot(hist.history.get('val_loss', []), linestyle='--', label=f"{name} - val")
    ax.set_title(f"Training and Validation Loss (T={T_val})")
    ax.set_xlabel('Epoch')
    ax.set_ylabel('MSE Loss')
    ax.grid(True, alpha=0.35)
    ax.legend()
    plt.tight_layout()
    plt.show()



In [None]:
## 10 Visualization for Both Maturities and Correct Residuals

# One 2x2 figure per maturity: predictions, errors, training points and
# model sizes, all read from results_by_T.
for T_val, res in results_by_T.items():
    y_dense = res['y_dense']
    lf_q_dense = res['lf_q_dense']
    hf_true = res['hf_q_dense_true']
    preds = res['preds']
    spline_preds = res['spline']

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f"Comparison at T={T_val} years")

    # Plot 1: Predictions vs True
    ax1 = axes[0,0]
    ax1.plot(y_dense, lf_q_dense, 'b:', label='Low-Fidelity (LF)', linewidth=2)
    ax1.plot(y_dense, hf_true, 'k--', label='True High-Fidelity (HF)', linewidth=2)
    for name, pred in preds.items():
        ax1.plot(y_dense, pred, label=name.replace('_',' ').title())
    ax1.plot(y_dense, spline_preds, 'brown', label='Cubic Spline', linestyle='-.')
    ax1.set_xlabel('Input Parameter y'); ax1.set_ylabel('Q (volatility)'); ax1.legend(); ax1.grid(True, alpha=0.3)

    # Plot 2: Error (Prediction - True)
    # NOTE(review): the plotted quantity is 100 * (prediction - true), i.e. an
    # absolute difference scaled by 100, not a relative error — confirm that
    # the 'Error (%)' axis label is the intended reading.
    ax2 = axes[0,1]
    for name, pred in preds.items():
        ax2.plot(y_dense, 100*(pred - hf_true), label=f"{name.replace('_',' ').title()} Error")
    ax2.plot(y_dense, 100*(spline_preds - hf_true), 'brown', label='Cubic Spline Error', linestyle='-.')
    ax2.axhline(0, color='gray', linestyle='--', alpha=0.6)
    ax2.set_xlabel('Input Parameter y'); ax2.set_ylabel('Error (%)'); ax2.grid(True, alpha=0.3); ax2.legend()

    # Plot 3: Training points and residuals (correct)
    ax3 = axes[1,0]
    ax3.plot(y_dense, lf_q_dense, 'b:', label='LF', linewidth=2)
    ax3.plot(y_dense, hf_true, 'k--', label='True HF', linewidth=2)
    ax3.scatter(res['hf_y_train'], res['hf_q_train'], c='r', s=120, label='HF Train')
    ax3.scatter(res['hf_y_train'], res['lf_at_hf'], c='blue', marker='s', s=120, label='LF at HF')
    # Residual (HF-LF) points removed to avoid zooming the plot
    ax3.set_xlabel('Input Parameter y'); ax3.set_ylabel('Q (volatility)'); ax3.legend(); ax3.grid(True, alpha=0.3)

    # Plot 4: Params — bar chart of parameter counts; the spline baseline is
    # shown with 0 parameters.
    ax4 = axes[1,1]
    model_names = list(preds.keys()) + ['Cubic Spline']
    param_counts = [results_by_T[T_val]['metrics'][k]['params'] for k in preds.keys()] + [0]
    bars = ax4.bar(model_names, param_counts)
    ax4.set_title('Model Complexity'); ax4.set_ylabel('Parameter Count'); ax4.tick_params(axis='x', rotation=45)
    for b, v in zip(bars, param_counts):
        # Label each bar just above its top (offset is 1% of the tallest bar).
        ax4.text(b.get_x()+b.get_width()/2, v + max(param_counts)*0.01 + 1e-9, f"{v}", ha='center', va='bottom')

    # Leave room for the figure-level title.
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])
    plt.show()

# Print compact metrics
for T_val, res in results_by_T.items():
    print(f"\nMetrics at T={T_val}:")
    for name, m in res['metrics'].items():
        print(f"  {name:14s} params={m['params']:4d}  RMSE={m['rmse']:.6f}  MAE={m['mae']:.6f}")

