In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.optimize import minimize
from scipy import stats
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides scikit-learn convergence warnings raised by the
# models fitted in later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG for reproducibility.
# Estimators in later cells that accept random_state are seeded separately (random_state=42).
np.random.seed(42)

# Confirmation that the import/configuration cell ran to completion.
print("Libraries imported successfully!")

In [None]:
# Cell 2: Load and Explore Data
# Read the 'Cleaned Data' worksheet of the study workbook into the working frame
df = pd.read_excel('FacoDMEK.xlsx', sheet_name='Cleaned Data')

# Report dimensions and column names in one call, then preview the first rows
# as the cell's rich output
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    f"\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Cell 3: Data Preprocessing - Using Keratometry Values
# Fetch the two keratometer readings once; both derived K columns use them
ks = df['Keratometric Ks']
kf = df['Keratometric Kf']

# Average keratometry from the KERATOMETRY readings (not biometry)
df['K_avg_Kerato'] = (ks + kf) / 2

# "True" IOL power that would have achieved emmetropia: implanted power
# minus the postoperative spherical equivalent
df['True_IOL'] = df['IOL Power'] - df['PostOP Spherical Equivalent']

# Additional keratometry-based features
df['K_Astigmatism_Kerato'] = ks - kf
df['Post_Ant_Ratio'] = df['Posterior Km'] / df['Anterior Km']

# NOTE: no manual normalization here; RobustScaler is applied in later cells

# Column groups used by the three reports below
summary_columns = ['Bio-AL', 'K_avg_Kerato', 'IOL Power', 'PostOP Spherical Equivalent',
                   'True_IOL', 'CCT', 'Posterior Km']
audit_columns = ['Bio-AL', 'K_avg_Kerato', 'IOL Power',
                 'PostOP Spherical Equivalent', 'True_IOL', 'CCT',
                 'A-Constant', 'Posterior Km']
essential_columns = ['Bio-AL', 'K_avg_Kerato', 'IOL Power',
                     'PostOP Spherical Equivalent', 'A-Constant']

# Descriptive statistics of the key variables
print("Summary of key variables:")
print(df[summary_columns].describe())

# Per-column count of missing entries
print("\nMissing values in key columns:")
missing_counts = df[audit_columns].isna().sum()
print(missing_counts)

# Rows in which none of the essential inputs is missing
has_gap = df[essential_columns].isna().any(axis=1)
complete_cases = (~has_gap).sum()
print(f"\nComplete cases for analysis: {complete_cases} out of {len(df)}")

In [None]:
# Cell 4: Implement SRK/T Formula with Keratometry K Values
def calculate_SRKT(AL, K, A_const, nc=1.333):
    """
    Calculate the emmetropic IOL power with the SRK/T formula.

    Parameters
    ----------
    AL : float
        Axial length (plotted in mm elsewhere in this notebook).
    K : float
        Average keratometry value; converted to a corneal radius via 337.5 / K.
    A_const : float
        Lens A-constant, converted internally to an ACD constant.
    nc : float, default 1.333
        Corneal refractive index used in the final vergence expression.

    Returns
    -------
    float
        IOL power for emmetropia, or NaN when any input is missing, when
        AL or K is not positive, when the corneal-height square root would
        be taken of a negative number, when the vergence denominator is
        zero, or when the arithmetic fails (e.g. non-numeric input inside
        the calculation).

    Notes
    -----
    Only arithmetic/type/value errors are converted to NaN. Other exceptions
    (including KeyboardInterrupt) propagate, so a long ``DataFrame.apply``
    can still be interrupted.
    """
    # Missing or non-positive biometry cannot be evaluated
    if pd.isna(AL) or pd.isna(K) or pd.isna(A_const) or K <= 0 or AL <= 0:
        return np.nan

    try:
        # Refractive index constant of the formula
        na = 1.336

        # Corneal radius from keratometry K
        r = 337.5 / K

        # Axial length correction for long eyes - USING NEGATIVE 3.446 AS PER EXCEL FORMULA
        if AL > 24.2:
            LCOR = -3.446 + 1.716 * AL - 0.0237 * AL**2
        else:
            LCOR = AL

        # Corneal width
        Cw = -5.41 + 0.58412 * LCOR + 0.098 * K

        # Corneal height needs a non-negative radicand
        radicand = r**2 - (Cw**2 / 4)
        if radicand < 0:
            return np.nan
        H = r - np.sqrt(radicand)

        # ACD constant from A-constant, then the offset and the estimated
        # postoperative ACD
        ACDconst = 0.62467 * A_const - 68.747
        offset = ACDconst - 3.336
        ACDest = H + offset

        # Retinal thickness correction gives the optical axial length
        RETHICK = 0.65696 - 0.02029 * AL
        LOPT = AL + RETHICK

        # IOL power for emmetropia
        ncm1 = nc - 1
        denominator = (LOPT - ACDest) * (na * r - ncm1 * ACDest)
        # NumPy floats would yield inf/NaN with only a (suppressed) warning
        # here, while Python floats raise; treat both the same way.
        if denominator == 0:
            return np.nan

        return (1000 * na * (na * r - ncm1 * LOPT)) / denominator
    except (TypeError, ValueError, ArithmeticError):
        # ArithmeticError covers ZeroDivisionError and OverflowError
        return np.nan

# Calculate SRK/T predictions using keratometry K values
def _srkt_from_row(row):
    """Evaluate SRK/T for one record from its axial length, mean K and A-constant."""
    return calculate_SRKT(row['Bio-AL'], row['K_avg_Kerato'], row['A-Constant'])

df['SRKT_Prediction'] = df.apply(_srkt_from_row, axis=1)
df['SRKT_Error'] = df['SRKT_Prediction'] - df['True_IOL']

# Boolean mask of rows whose error could be computed
valid_cases = df['SRKT_Error'].notna()
valid_count = valid_cases.sum()

print("SRK/T with Keratometry K Values:")
print(f"Valid predictions: {valid_count} out of {len(df)}")
if valid_count > 0:
    # Signed errors of the usable rows; bias, MAE and spread are all taken from these
    srkt_errors_valid = df.loc[valid_cases, 'SRKT_Error']
    print(f"Mean Error: {srkt_errors_valid.mean():.3f} D")
    print(f"Mean Absolute Error: {srkt_errors_valid.abs().mean():.3f} D")
    print(f"Standard Deviation: {srkt_errors_valid.std():.3f} D")

In [None]:
# Cell 5: Visualize SRK/T Performance with Keratometry
# Restrict plotting to rows that have an SRK/T error
df_valid = df[df['SRKT_Error'].notna()].copy()

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

if len(df_valid) > 0:
    # Panel 1: distribution of the signed error
    hist_ax = axes[0, 0]
    hist_ax.hist(df_valid['SRKT_Error'], bins=20, edgecolor='black', alpha=0.7)
    hist_ax.axvline(x=0, color='red', linestyle='--')
    hist_ax.set_xlabel('SRK/T Prediction Error (D)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'Error Distribution (n={len(df_valid)})')

    # The CCT panel uses only rows with a CCT measurement
    valid_cct = df_valid[df_valid['CCT'].notna()]

    # Panels 2-4: error against axial length, posterior corneal power and CCT
    scatter_panels = [
        (axes[0, 1], df_valid, 'Bio-AL', 'Axial Length (mm)', 'Error vs Axial Length'),
        (axes[1, 0], df_valid, 'Posterior Km', 'Posterior Corneal Power (D)', 'Error vs Posterior K'),
        (axes[1, 1], valid_cct, 'CCT', 'Central Corneal Thickness (μm)', 'Error vs CCT'),
    ]
    for panel_ax, panel_data, x_column, x_label, panel_title in scatter_panels:
        panel_ax.scatter(panel_data[x_column], panel_data['SRKT_Error'], alpha=0.6)
        panel_ax.set_xlabel(x_label)
        panel_ax.set_ylabel('SRK/T Error (D)')
        panel_ax.set_title(panel_title)
        panel_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

# Share of eyes whose absolute error falls within each tolerance
if len(df_valid) > 0:
    abs_srkt_error = df_valid['SRKT_Error'].abs()
    within_025 = (abs_srkt_error <= 0.25).sum() / len(df_valid) * 100
    within_050 = (abs_srkt_error <= 0.50).sum() / len(df_valid) * 100
    within_100 = (abs_srkt_error <= 1.00).sum() / len(df_valid) * 100

    print(f"\nPercentage of eyes within target:")
    print(f"±0.25 D: {within_025:.1f}%")
    print(f"±0.50 D: {within_050:.1f}%")
    print(f"±1.00 D: {within_100:.1f}%")

In [None]:
# Cell 6: Optimization Approach 1 - Optimize Corneal Refractive Index
# Work on the rows that already have an SRK/T error
df_complete = df[df['SRKT_Error'].notna()].copy()

def objective_nc(nc_value):
    """Return the SRK/T mean absolute error against True_IOL for the trial index nc_value[0]."""
    trial_nc = nc_value[0]

    def srkt_at_trial_nc(row):
        return calculate_SRKT(row['Bio-AL'], row['K_avg_Kerato'], row['A-Constant'], nc=trial_nc)

    errors = df_complete.apply(srkt_at_trial_nc, axis=1) - df_complete['True_IOL']
    # Rows for which the formula returned NaN are left out of the average
    valid_errors = errors.dropna()
    if valid_errors.empty:
        return 999  # large penalty when nothing could be predicted
    return np.mean(np.abs(valid_errors))

if len(df_complete) == 0:
    print("Not enough complete cases for optimization")
else:
    # Bounded search for nc, started from the formula's default value
    result_nc = minimize(objective_nc, x0=[1.333], bounds=[(1.330, 1.340)], method='L-BFGS-B')
    optimal_nc = result_nc.x[0]

    # Re-run SRK/T with the fitted index and store prediction and error
    df_complete['SRKT_Optimized_nc'] = df_complete.apply(
        lambda row: calculate_SRKT(row['Bio-AL'], row['K_avg_Kerato'], row['A-Constant'], nc=optimal_nc),
        axis=1
    )
    df_complete['SRKT_Error_Optimized_nc'] = df_complete['SRKT_Optimized_nc'] - df_complete['True_IOL']

    print(f"Optimal corneal refractive index: {optimal_nc:.4f}")
    print(f"Original MAE: {df_complete['SRKT_Error'].abs().mean():.3f} D")
    print(f"Optimized MAE: {df_complete['SRKT_Error_Optimized_nc'].abs().mean():.3f} D")

In [None]:
# Cell 7: Linear Correction Model with RobustScaler
# Predictors offered to the error-correction regression
feature_cols = ['Posterior Km', 'CCT', 'Post_Ant_Ratio', 'K_Astigmatism_Kerato']

# Keep only rows where every predictor and the SRK/T error are present
required_cols = feature_cols + ['SRKT_Error']
df_ml = df.dropna(subset=required_cols).copy()

print(f"Complete cases for ML: {len(df_ml)} out of {len(df)}")

if len(df_ml) > 10:  # proceed only with more than 10 complete cases
    X = df_ml[feature_cols]
    y = df_ml['SRKT_Error']

    # Scaler used for the full-data coefficient fit at the end of this cell
    scaler = RobustScaler()

    # Leave-one-out: each eye is predicted by a model trained on all the others
    loo = LeaveOneOut()
    predictions_linear = []
    true_values = []

    for train_index, test_index in loo.split(X):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Scaling statistics come from the training fold only
        scaler_cv = RobustScaler()
        X_train_scaled = scaler_cv.fit_transform(X_train)
        X_test_scaled = scaler_cv.transform(X_test)

        # The regression predicts the SRK/T error of the held-out eye
        model = LinearRegression()
        model.fit(X_train_scaled, y_train)
        correction = model.predict(X_test_scaled)[0]

        # Subtract the predicted error from the held-out eye's SRK/T prediction
        held_out = df_ml.iloc[test_index]
        srkt_pred = held_out['SRKT_Prediction'].values[0]
        corrected_pred = srkt_pred - correction

        predictions_linear.append(corrected_pred)
        true_values.append(held_out['True_IOL'].values[0])

    # Compare uncorrected and corrected mean absolute error
    mae_original = df_ml['SRKT_Error'].abs().mean()
    mae_linear = mean_absolute_error(true_values, predictions_linear)

    print(f"Original SRK/T MAE: {mae_original:.3f} D")
    print(f"Linear Correction Model MAE: {mae_linear:.3f} D")
    print(f"Improvement: {mae_original - mae_linear:.3f} D")

    # Coefficients of a model fitted on all rows (scaled), ranked by magnitude
    X_scaled_full = scaler.fit_transform(X)
    model_full = LinearRegression()
    model_full.fit(X_scaled_full, y)

    full_coefficients = model_full.coef_
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': full_coefficients,
        'Abs_Coefficient': np.abs(full_coefficients)
    }).sort_values('Abs_Coefficient', ascending=False)

    print("\nFeature Importance (Linear Model with RobustScaler):")
    print(feature_importance)
else:
    print("Not enough complete cases for machine learning analysis")

In [None]:
# Cell 8: Machine Learning Models Comparison with RobustScaler
if len(df_ml) > 10:
    # Biometry, keratometry and lens-constant inputs for direct IOL prediction
    features_ml = ['Bio-AL', 'K_avg_Kerato', 'Posterior Km', 'CCT', 'Post_Ant_Ratio',
                   'K_Astigmatism_Kerato', 'A-Constant']

    # Drop any row with a gap in the inputs or the target
    df_ml_full = df_ml[features_ml + ['True_IOL']].dropna()
    X_ml = df_ml_full[features_ml]
    y_ml = df_ml_full['True_IOL']

    print(f"Cases for ML comparison: {len(X_ml)}")

    # Candidate regressors
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
    }

    # Leave-one-out MAE per model, reusing the `loo` splitter created in Cell 7
    results = {}

    for name, model in models.items():
        predictions = []
        actuals = []

        for train_idx, test_idx in loo.split(X_ml):
            X_train, y_train = X_ml.iloc[train_idx], y_ml.iloc[train_idx]
            X_test, y_test = X_ml.iloc[test_idx], y_ml.iloc[test_idx]

            # RobustScaler is refitted on each training fold
            scaler = RobustScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            model.fit(X_train_scaled, y_train)
            pred = model.predict(X_test_scaled)[0]

            predictions.append(pred)
            actuals.append(y_test.values[0])

        mae = mean_absolute_error(actuals, predictions)
        results[name] = mae
        print(f"{name} MAE: {mae:.3f} D")

    # SRK/T baseline on the same rows: join the stored errors by index
    srkt_on_ml_rows = df_ml_full.merge(df[['SRKT_Error']], left_index=True, right_index=True)
    original_mae = srkt_on_ml_rows['SRKT_Error'].abs().mean()
    print(f"\nOriginal SRK/T MAE: {original_mae:.3f} D")

    # Lowest-MAE entry of the comparison
    best_name = min(results, key=results.get)
    print(f"Best ML Model: {best_name} with MAE: {results[best_name]:.3f} D")

In [None]:
# Cell 9: Create Final Optimized Formula for FacoDMEK with RobustScaler
# Fits (1) a gradient-boosting model on all rows to rank features and (2) a
# three-feature linear correction to SRK/T whose scaling parameters and
# coefficients are printed as a formula.
# Depends on X_ml, y_ml and features_ml from Cell 8; exports top_features,
# scaler_simple, model_simple and df_simple, which Cell 10 reads.
if len(df_ml) > 10:
    # Prepare and scale all data (the scaler is fitted on every row)
    scaler_final = RobustScaler()
    X_ml_scaled = scaler_final.fit_transform(X_ml)
    
    # Train final model on all scaled data.
    # NOTE(review): the estimator is fixed to gradient boosting here; it is not
    # chosen from the `results` dictionary computed in Cell 8.
    best_model = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
    best_model.fit(X_ml_scaled, y_ml)
    
    # Feature importance reported by the fitted gradient-boosting model
    feature_importance_gb = pd.DataFrame({
        'Feature': features_ml,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_gb['Feature'], feature_importance_gb['Importance'])
    plt.xlabel('Feature Importance')
    plt.title('Feature Importance in IOL Power Prediction Model (with RobustScaler)')
    plt.tight_layout()
    plt.show()
    
    print("Feature Importance:")
    print(feature_importance_gb)
    
    # Create simplified correction formula based on key features.
    # NOTE(review): this list is hard-coded; it is not derived from
    # feature_importance_gb above.
    top_features = ['Posterior Km', 'CCT', 'Bio-AL']
    
    # Ensure we have complete data for these features
    df_simple = df_ml[top_features + ['True_IOL', 'SRKT_Prediction']].dropna()
    X_simple = df_simple[top_features]
    # Target is the amount to ADD to SRK/T (True_IOL - SRKT_Prediction),
    # i.e. the negative of SRKT_Error as defined in Cell 4.
    y_correction = df_simple['True_IOL'] - df_simple['SRKT_Prediction']
    
    # Scale the features
    scaler_simple = RobustScaler()
    X_simple_scaled = scaler_simple.fit_transform(X_simple)
    
    model_simple = LinearRegression()
    model_simple.fit(X_simple_scaled, y_correction)
    
    # Get the scaling parameters for clinical use
    # (center_ and scale_ are printed under the labels "Medians" and "IQRs")
    print(f"\nRobustScaler Parameters (for clinical implementation):")
    print(f"Medians: {scaler_simple.center_}")
    print(f"IQRs: {scaler_simple.scale_}")
    
    print(f"\nFacoDMEK Correction Formula (with RobustScaler):")
    print(f"Step 1: Scale features using RobustScaler")
    for i, feat in enumerate(top_features):
        print(f"  {feat}_scaled = ({feat} - {scaler_simple.center_[i]:.3f}) / {scaler_simple.scale_[i]:.3f}")
    
    print(f"\nStep 2: Calculate correction")
    print(f"Correction = {model_simple.intercept_:.3f}")
    for feat, coef in zip(top_features, model_simple.coef_):
        print(f"           + {coef:.4f} × {feat}_scaled")
    
    # Calculate final performance
    # NOTE(review): predictions are made on the same rows the model was fitted
    # on, so this MAE is an in-sample figure, not a cross-validated one.
    df_simple['Predicted_Correction'] = model_simple.predict(X_simple_scaled)
    df_simple['Final_Prediction'] = df_simple['SRKT_Prediction'] + df_simple['Predicted_Correction']
    final_mae = mean_absolute_error(df_simple['True_IOL'], df_simple['Final_Prediction'])
    
    print(f"\nFinal formula MAE: {final_mae:.3f} D")

In [None]:
# Cell 10: Final Validation and Results Summary
# Prints the baseline SRK/T statistics and the correction formula fitted in
# Cell 9, writes per-eye results plus scaling parameters to an Excel workbook,
# and draws a Bland-Altman style plot of the corrected predictions.
# Depends on top_features, scaler_simple, model_simple and df_simple from Cell 9.
if len(df_ml) > 10:
    # Summary statistics
    print("FINAL RESULTS SUMMARY - KERATOMETRY-BASED ANALYSIS WITH ROBUSTSCALER")
    print("="*60)
    print(f"Original data points: {len(df)}")
    print(f"Complete cases for analysis: {len(df_ml)}")
    print(f"\nOriginal SRK/T Performance (Keratometry K):")
    print(f"  MAE: {df_ml['SRKT_Error'].abs().mean():.3f} D")
    print(f"  Mean Error: {df_ml['SRKT_Error'].mean():.3f} D")
    print(f"  STD: {df_ml['SRKT_Error'].std():.3f} D")
    
    # optimal_nc exists only if the optimisation branch of Cell 6 ran; at cell
    # top level locals() is the notebook namespace.
    if 'optimal_nc' in locals():
        print(f"\nOptimized corneal refractive index: {optimal_nc:.4f}")
    
    print("\n" + "="*60)
    print("RECOMMENDED FACODMEK FORMULA (WITH ROBUSTSCALER):")
    print("="*60)
    print("Modified_IOL = SRK/T_Keratometry + Correction")
    print("\nWhere Correction is calculated after RobustScaler normalization")
    
    # Create a practical clinical formula card
    print("\n" + "="*60)
    print("CLINICAL IMPLEMENTATION:")
    print("="*60)
    print("1. Calculate standard SRK/T using keratometry K")
    print("2. Normalize features using RobustScaler parameters from your population:")
    for i, feat in enumerate(top_features):
        print(f"   {feat}_scaled = ({feat} - {scaler_simple.center_[i]:.2f}) / {scaler_simple.scale_[i]:.2f}")
    # The PostK / CCT / AL labels below are positional: they match the order
    # ['Posterior Km', 'CCT', 'Bio-AL'] assigned to top_features in Cell 9.
    print(f"3. Correction = {model_simple.intercept_:.3f} + "
          f"{model_simple.coef_[0]:.3f}×PostK_scaled + "
          f"{model_simple.coef_[1]:.4f}×CCT_scaled + "
          f"{model_simple.coef_[2]:.3f}×AL_scaled")
    print("4. Modified IOL = SRK/T + Correction")
    
    print("\nNote: RobustScaler ensures robustness to outliers common in FacoDMEK eyes")
    
    # Save results with scaling parameters
    # NOTE(review): the export includes the 'ID', 'Patient' and 'Eye' columns —
    # confirm the workbook may carry these identifiers before sharing it.
    results_df = df_ml[['ID', 'Patient', 'Eye', 'Bio-AL', 'K_avg_Kerato', 'IOL Power', 
                        'PostOP Spherical Equivalent', 'True_IOL', 'SRKT_Prediction', 
                        'SRKT_Error', 'Posterior Km', 'CCT']].copy()
    
    # Add scaling parameters to the Excel file
    scaling_params_df = pd.DataFrame({
        'Feature': top_features,
        'Median': scaler_simple.center_,
        'IQR': scaler_simple.scale_
    })
    
    # Save both dataframes to Excel (one sheet each; the context manager
    # closes the file when the block exits)
    with pd.ExcelWriter('FacoDMEK_RobustScaler_Results.xlsx') as writer:
        results_df.to_excel(writer, sheet_name='Results', index=False)
        scaling_params_df.to_excel(writer, sheet_name='Scaling_Parameters', index=False)
    
    print("\nResults saved to 'FacoDMEK_RobustScaler_Results.xlsx'")
    
    # Create visualization of improvement
    plt.figure(figsize=(10, 6))
    
    # Bland-Altman plot: difference (predicted - true) against the pair mean.
    # Final_Prediction is the in-sample corrected prediction stored in Cell 9.
    mean_values = (df_simple['True_IOL'] + df_simple['Final_Prediction']) / 2
    diff_values = df_simple['Final_Prediction'] - df_simple['True_IOL']
    
    plt.scatter(mean_values, diff_values, alpha=0.6)
    # Red line: zero difference; dashed blue: mean difference;
    # dotted blue: mean difference ± 1.96 standard deviations
    plt.axhline(y=0, color='red', linestyle='-', linewidth=2)
    plt.axhline(y=diff_values.mean(), color='blue', linestyle='--')
    plt.axhline(y=diff_values.mean() + 1.96*diff_values.std(), color='blue', linestyle=':')
    plt.axhline(y=diff_values.mean() - 1.96*diff_values.std(), color='blue', linestyle=':')
    
    plt.xlabel('Mean of True and Predicted IOL (D)')
    plt.ylabel('Predicted - True IOL (D)')
    plt.title('Bland-Altman Plot: FacoDMEK Formula Performance (with RobustScaler)')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
else:
    print("Insufficient data for complete analysis")


In [None]:
# Cell 11: Extract Formula from Ridge Regression Model with RobustScaler
# Fits a Ridge correction to SRK/T on all features, prints its coefficients
# with the matching scaling parameters, then refits on the three features with
# the largest absolute coefficients and prints that shorter formula.
# Depends on df_ml (Cell 7) and features_ml (Cell 8).
if len(df_ml) > 10:
    # Prepare data
    df_ridge = df_ml[features_ml + ['True_IOL', 'SRKT_Prediction']].dropna()
    X_ridge = df_ridge[features_ml]
    
    # Calculate the correction needed (True IOL - SRK/T prediction).
    # This rebinds the y_correction name also assigned in Cell 9.
    y_correction = df_ridge['True_IOL'] - df_ridge['SRKT_Prediction']
    
    # Apply RobustScaler (fitted on every row)
    scaler_ridge = RobustScaler()
    X_ridge_scaled = scaler_ridge.fit_transform(X_ridge)
    
    # Train Ridge Regression for correction
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X_ridge_scaled, y_correction)
    
    # Extract coefficients
    print("RIDGE REGRESSION CORRECTION FORMULA (WITH ROBUSTSCALER):")
    print("="*60)
    print(f"Correction = {ridge_model.intercept_:.4f}")
    
    # Create a coefficient table. Sorting keeps the original row labels
    # (0..len(features_ml)-1), which the top-3 selection below relies on.
    coef_df = pd.DataFrame({
        'Feature': features_ml,
        'Coefficient': ridge_model.coef_,
        'Abs_Coefficient': np.abs(ridge_model.coef_),
        'Median': scaler_ridge.center_,
        'IQR': scaler_ridge.scale_
    }).sort_values('Abs_Coefficient', ascending=False)
    
    print("\nCoefficients and Scaling Parameters:")
    print(coef_df)
    
    # Create the practical formula
    print("\n" + "="*60)
    print("PRACTICAL FACODMEK FORMULA FROM RIDGE REGRESSION:")
    print("="*60)
    print("Modified_IOL = Standard_SRK/T + Correction")
    print(f"\nWhere Correction = {ridge_model.intercept_:.3f}")
    
    print("\nAfter scaling each feature:")
    for i, (feat, coef) in enumerate(zip(features_ml, ridge_model.coef_)):
        if abs(coef) > 0.001:  # Only show meaningful coefficients
            print(f"    + ({coef:.4f} × [{feat} - {scaler_ridge.center_[i]:.2f}] / {scaler_ridge.scale_[i]:.2f})")
    
    # Simplified version using only the most important features
    print("\n" + "="*60)
    print("SIMPLIFIED FORMULA (Top 3 Features):")
    print("="*60)
    
    # Get top 3 features by absolute coefficient value.
    # The index labels returned here are positions in features_ml because
    # coef_df was built with a default index and sorted without resetting it.
    top_3_indices = coef_df.nlargest(3, 'Abs_Coefficient').index
    top_3_features = [features_ml[i] for i in top_3_indices]
    # NOTE(review): top_3_coefs is not read again in this cell; the formula
    # printed below uses the coefficients of the refitted ridge_simple model.
    top_3_coefs = [ridge_model.coef_[i] for i in top_3_indices]
    
    # Re-fit with only top 3 features for a cleaner formula
    X_simple_ridge = df_ridge[top_3_features]
    scaler_simple_ridge = RobustScaler()
    X_simple_ridge_scaled = scaler_simple_ridge.fit_transform(X_simple_ridge)
    
    ridge_simple = Ridge(alpha=1.0)
    ridge_simple.fit(X_simple_ridge_scaled, y_correction)
    
    print("Modified_IOL = Standard_SRK/T + Correction")
    print(f"\nWhere Correction = {ridge_simple.intercept_:.3f}")
    
    print("\nScaling parameters for clinical use:")
    for i, feat in enumerate(top_3_features):
        print(f"{feat}: median={scaler_simple_ridge.center_[i]:.2f}, IQR={scaler_simple_ridge.scale_[i]:.2f}")
    
    print("\nFinal correction formula:")
    for i, (feat, coef) in enumerate(zip(top_3_features, ridge_simple.coef_)):
        print(f"    + ({coef:.4f} × [{feat} - {scaler_simple_ridge.center_[i]:.2f}] / {scaler_simple_ridge.scale_[i]:.2f})")
    
    # Validate the simplified formula
    # NOTE(review): evaluated on the same rows used for fitting, so this MAE
    # is in-sample rather than cross-validated.
    y_pred_simple = ridge_simple.predict(X_simple_ridge_scaled)
    final_iol_simple = df_ridge['SRKT_Prediction'] + y_pred_simple
    mae_simple = mean_absolute_error(df_ridge['True_IOL'], final_iol_simple)
    
    print(f"\nSimplified formula MAE: {mae_simple:.3f} D")
    
    # Create a clinically usable formula card
    print("\n" + "="*60)
    print("CLINICAL FORMULA CARD - FACODMEK IOL CALCULATION")
    print("="*60)
    print("Step 1: Calculate standard SRK/T using keratometry K")
    print("Step 2: Normalize features using population-specific parameters")
    print("Step 3: Apply correction formula")
    print("Step 4: Modified IOL = SRK/T + Correction")
    print("\nIMPORTANT: RobustScaler parameters must be derived from YOUR population")

In [None]:
# Cell 12: Advanced Machine Learning Models for Better Performance
# Add this cell after Cell 11 in your notebook

import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression, RFECV
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                            ExtraTreesRegressor, VotingRegressor, StackingRegressor)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern
from sklearn.linear_model import (ElasticNet, HuberRegressor, RANSACRegressor, 
                                TheilSenRegressor, BayesianRidge, ARDRegression)
import xgboost as xgb
import lightgbm as lgb
from sklearn.kernel_ridge import KernelRidge
import warnings
warnings.filterwarnings('ignore')

# Advanced model comparison: every candidate is scored by leave-one-out MAE on
# the same rows, and compared against a Ridge + RobustScaler baseline that is
# computed in this cell (not hard-coded).
def _loo_mae(estimator, X, y, scaler=None):
    """Return the leave-one-out mean absolute error of `estimator` on (X, y).

    Parameters
    ----------
    estimator : scikit-learn regressor
        Refitted on every training fold (the instance itself is fitted, so it
        is left fitted on the last fold afterwards).
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix; rows are selected with .iloc for DataFrames and plain
        indexing otherwise.
    y : pandas.Series
        Target values aligned row-by-row with X.
    scaler : scikit-learn transformer, optional
        When given, it is chained in front of the estimator in a Pipeline so
        its statistics are learned from the training fold only; the held-out
        row never influences the scaling.

    Returns
    -------
    float
        Mean absolute error over all held-out predictions.
    """
    model = estimator if scaler is None else Pipeline([('scaler', scaler), ('model', estimator)])
    is_frame = hasattr(X, 'iloc')
    predictions = []
    actuals = []
    for train_idx, test_idx in LeaveOneOut().split(X):
        X_train = X.iloc[train_idx] if is_frame else X[train_idx]
        X_test = X.iloc[test_idx] if is_frame else X[test_idx]
        model.fit(X_train, y.iloc[train_idx])
        predictions.append(model.predict(X_test)[0])
        actuals.append(y.iloc[test_idx].values[0])
    return mean_absolute_error(actuals, predictions)


# Ensure we have the data ready
if len(df_ml) > 10:
    # Prepare features
    features_ml = ['Bio-AL', 'K_avg_Kerato', 'Posterior Km', 'CCT', 'Post_Ant_Ratio',
                   'K_Astigmatism_Kerato', 'A-Constant']

    # Get complete cases
    df_ml_full = df_ml[features_ml + ['True_IOL']].dropna()
    X_ml = df_ml_full[features_ml]
    y_ml = df_ml_full['True_IOL']

    print(f"Cases for advanced ML analysis: {len(X_ml)}")
    print("="*80)

    # Kept as a notebook-level name for cells that reuse the splitter
    loo = LeaveOneOut()

    # Dictionary to store results
    advanced_results = {}

    # Baseline: the Ridge + RobustScaler configuration of the earlier model
    # comparison, recomputed here on the same rows so that the "improvement"
    # figures below can never go stale.
    baseline_ridge_mae = _loo_mae(Ridge(alpha=1.0), X_ml, y_ml, scaler=RobustScaler())
    print(f"Baseline Ridge (RobustScaler, LOO) MAE: {baseline_ridge_mae:.3f} D")

    # Degree-2 polynomial / interaction expansion. PolynomialFeatures learns no
    # statistics from the data, so expanding before the split leaks nothing.
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly_features.fit_transform(X_ml)

    # Candidate estimators (same hyper-parameters as before)
    ridge_poly = Ridge(alpha=1.0)
    huber = HuberRegressor(epsilon=1.35)          # robust to outliers
    ransac = RANSACRegressor(random_state=42)     # fits on an inlier subset
    bayesian_ridge = BayesianRidge()
    svr_rbf = SVR(kernel='rbf', C=10, gamma='scale')
    xgb_model = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    lgb_model = lgb.LGBMRegressor(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        num_leaves=31,
        random_state=42
    )
    mlp = MLPRegressor(
        hidden_layer_sizes=(50, 30),
        activation='relu',
        solver='adam',
        alpha=0.01,
        max_iter=1000,
        random_state=42
    )
    voting_regressor = VotingRegressor([
        ('ridge', Ridge(alpha=1.0)),
        ('huber', HuberRegressor()),
        ('gb', GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42))
    ])
    kernel = RBF(length_scale=1.0) + WhiteKernel(noise_level=1e-2)
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=5, random_state=42)
    elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
    kr = KernelRidge(alpha=1.0, kernel='rbf', gamma=0.1)

    # (section heading, [(results key, printed label, estimator, features, standardize?)])
    # Models flagged True are standardized INSIDE each fold; previously the
    # StandardScaler was fitted on all rows before splitting, which let the
    # held-out row influence its own scaling.
    experiments = [
        ("1. Testing with Polynomial Features...", [
            ('Ridge + Polynomial Features', 'Ridge + Polynomial Features', ridge_poly, X_poly, False),
        ]),
        ("\n2. Testing Robust Regression Methods...", [
            ('Huber Regressor', 'Huber Regressor', huber, X_ml, False),
            ('RANSAC', 'RANSAC', ransac, X_ml, False),
        ]),
        ("\n3. Testing Bayesian Methods...", [
            ('Bayesian Ridge', 'Bayesian Ridge', bayesian_ridge, X_ml, False),
        ]),
        ("\n4. Testing Support Vector Regression...", [
            ('SVR (RBF)', 'SVR (RBF kernel)', svr_rbf, X_ml, True),
        ]),
        ("\n5. Testing Advanced Gradient Boosting...", [
            ('XGBoost', 'XGBoost', xgb_model, X_ml, False),
            ('LightGBM', 'LightGBM', lgb_model, X_ml, False),
        ]),
        ("\n6. Testing Neural Network...", [
            ('Neural Network', 'Neural Network', mlp, X_ml, True),
        ]),
        ("\n7. Testing Ensemble Methods...", [
            ('Voting Ensemble', 'Voting Ensemble', voting_regressor, X_ml, False),
        ]),
        ("\n8. Testing Gaussian Process Regression...", [
            ('Gaussian Process', 'Gaussian Process', gpr, X_ml, True),
        ]),
        ("\n9. Testing Elastic Net with optimization...", [
            ('Elastic Net', 'Elastic Net', elastic_net, X_ml, False),
        ]),
        ("\n10. Testing Kernel Ridge Regression...", [
            ('Kernel Ridge', 'Kernel Ridge', kr, X_ml, True),
        ]),
    ]

    for heading, runs in experiments:
        print(heading)
        for result_key, label, estimator, X_input, standardize in runs:
            fold_scaler = StandardScaler() if standardize else None
            mae = _loo_mae(estimator, X_input, y_ml, scaler=fold_scaler)
            advanced_results[result_key] = mae
            print(f"{label} MAE: {mae:.3f} D")

    # RESULTS SUMMARY
    print("\n" + "="*80)
    print("ADVANCED MODEL RESULTS SUMMARY")
    print("="*80)

    # Sort results by MAE (best first)
    sorted_results = sorted(advanced_results.items(), key=lambda item: item[1])

    print("\nAll models ranked by performance:")
    for rank, (model_name, model_mae) in enumerate(sorted_results, 1):
        print(f"{rank}. {model_name}: {model_mae:.3f} D")

    best_model_name, best_mae = sorted_results[0]

    print(f"\nBEST MODEL: {best_model_name} with MAE: {best_mae:.3f} D")
    print(f"Improvement over Ridge: {baseline_ridge_mae - best_mae:.3f} D")

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Bar plot of all models. `model_names` is used instead of `models` so the
    # estimator dictionary of the earlier comparison cell is not overwritten.
    model_names = [name for name, _ in sorted_results]
    maes = [value for _, value in sorted_results]
    colors = ['green' if value < baseline_ridge_mae else 'red' for value in maes]

    ax1.barh(model_names, maes, color=colors)
    ax1.axvline(x=baseline_ridge_mae, color='blue', linestyle='--', label='Original Ridge MAE')
    ax1.set_xlabel('Mean Absolute Error (D)')
    ax1.set_title('Model Performance Comparison')
    ax1.legend()

    # Improvement plot (positive = better than the Ridge baseline)
    improvements = [baseline_ridge_mae - value for value in maes]
    ax2.barh(model_names, improvements, color=['green' if imp > 0 else 'red' for imp in improvements])
    ax2.axvline(x=0, color='black', linestyle='-')
    ax2.set_xlabel('Improvement over Ridge (D)')
    ax2.set_title('Improvement Analysis')

    plt.tight_layout()
    plt.show()

else:
    print("Not enough data for advanced analysis")

In [None]:
# Cell 12: Advanced Machine Learning Models for Better Performance
# Add this cell after Cell 11 in your notebook

import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression, RFECV
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
                            ExtraTreesRegressor, VotingRegressor, StackingRegressor)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, Matern
from sklearn.linear_model import (ElasticNet, HuberRegressor, RANSACRegressor, 
                                TheilSenRegressor, BayesianRidge, ARDRegression)
import xgboost as xgb
import lightgbm as lgb
from sklearn.kernel_ridge import KernelRidge
import warnings
warnings.filterwarnings('ignore')

# Ensure we have the data ready
if len(df_ml) > 10:
    # LOO MAE (D) of the original Ridge model; every result below is compared to it.
    # NOTE(review): hard-coded from an earlier run - confirm it matches the current data.
    baseline_ridge_mae = 1.455

    # Prepare features
    features_ml = ['Bio-AL', 'K_avg_Kerato', 'Posterior Km', 'CCT', 'Post_Ant_Ratio',
                   'K_Astigmatism_Kerato', 'A-Constant']

    # Get complete cases
    df_ml_full = df_ml[features_ml + ['True_IOL']].dropna()
    X_ml = df_ml_full[features_ml]
    y_ml = df_ml_full['True_IOL']

    print(f"Cases for advanced ML analysis: {len(X_ml)}")
    print("="*80)

    # Initialize LOO cross-validation
    loo = LeaveOneOut()

    # Dictionary to store results (model name -> LOO MAE)
    advanced_results = {}

    def _with_scaling(estimator):
        """Wrap `estimator` so StandardScaler is fit on each training fold only.

        Scaling the whole matrix before cross-validation lets the held-out
        eye influence the scaler; fitting it inside the pipeline avoids that.
        """
        return Pipeline([('scaler', StandardScaler()), ('model', estimator)])

    def _loo_mae(name, estimator, X, label=None):
        """Run leave-one-out CV for `estimator` on (X, y_ml) and record its MAE.

        Parameters:
            name: key under which the MAE is stored in `advanced_results`.
            estimator: object with fit/predict; it is refit on every fold.
            X: feature matrix, either a DataFrame or a NumPy array, row-aligned with y_ml.
            label: text used in the printed line; defaults to `name`.

        Returns:
            The mean absolute error over all held-out cases.
        """
        is_frame = hasattr(X, 'iloc')
        predictions = []
        actuals = []
        for train_idx, test_idx in loo.split(X):
            X_train = X.iloc[train_idx] if is_frame else X[train_idx]
            X_test = X.iloc[test_idx] if is_frame else X[test_idx]
            estimator.fit(X_train, y_ml.iloc[train_idx])
            predictions.append(estimator.predict(X_test)[0])
            actuals.append(y_ml.iloc[test_idx].values[0])
        mae = mean_absolute_error(actuals, predictions)
        advanced_results[name] = mae
        print(f"{label or name} MAE: {mae:.3f} D")
        return mae

    # 1. FEATURE ENGINEERING - Create polynomial and interaction features
    print("1. Testing with Polynomial Features...")
    poly_features = PolynomialFeatures(degree=2, include_bias=False)
    # The expansion learns no statistics from the rows, so doing it up front does not leak
    X_poly = poly_features.fit_transform(X_ml)

    # Ridge with polynomial features
    ridge_poly = Ridge(alpha=1.0)
    mae_poly = _loo_mae('Ridge + Polynomial Features', ridge_poly, X_poly)

    # 2. ROBUST REGRESSION METHODS
    print("\n2. Testing Robust Regression Methods...")

    # Huber Regressor - robust to outliers
    huber = HuberRegressor(epsilon=1.35)
    mae_huber = _loo_mae('Huber Regressor', huber, X_ml)

    # RANSAC - removes outliers automatically
    ransac = RANSACRegressor(random_state=42)
    mae_ransac = _loo_mae('RANSAC', ransac, X_ml)

    # 3. BAYESIAN METHODS
    print("\n3. Testing Bayesian Methods...")

    bayesian_ridge = BayesianRidge()
    mae_bayesian = _loo_mae('Bayesian Ridge', bayesian_ridge, X_ml)

    # 4. SUPPORT VECTOR REGRESSION with different kernels
    print("\n4. Testing Support Vector Regression...")

    # SVR with RBF kernel (features standardized inside each fold)
    svr_rbf = SVR(kernel='rbf', C=10, gamma='scale')
    mae_svr_rbf = _loo_mae('SVR (RBF)', _with_scaling(svr_rbf), X_ml,
                           label='SVR (RBF kernel)')

    # 5. GRADIENT BOOSTING VARIANTS
    print("\n5. Testing Advanced Gradient Boosting...")

    # XGBoost
    xgb_model = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    mae_xgb = _loo_mae('XGBoost', xgb_model, X_ml)

    # LightGBM
    lgb_model = lgb.LGBMRegressor(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        num_leaves=31,
        random_state=42
    )
    mae_lgb = _loo_mae('LightGBM', lgb_model, X_ml)

    # 6. NEURAL NETWORK
    print("\n6. Testing Neural Network...")

    mlp = MLPRegressor(
        hidden_layer_sizes=(50, 30),
        activation='relu',
        solver='adam',
        alpha=0.01,
        max_iter=1000,
        random_state=42
    )
    mae_mlp = _loo_mae('Neural Network', _with_scaling(mlp), X_ml)

    # 7. ENSEMBLE METHODS
    print("\n7. Testing Ensemble Methods...")

    # Voting Regressor - combines multiple models
    voting_regressor = VotingRegressor([
        ('ridge', Ridge(alpha=1.0)),
        ('huber', HuberRegressor()),
        ('gb', GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42))
    ])
    mae_voting = _loo_mae('Voting Ensemble', voting_regressor, X_ml)

    # 8. GAUSSIAN PROCESS REGRESSION
    print("\n8. Testing Gaussian Process Regression...")

    kernel = RBF(length_scale=1.0) + WhiteKernel(noise_level=1e-2)
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=5, random_state=42)
    mae_gpr = _loo_mae('Gaussian Process', _with_scaling(gpr), X_ml)

    # 9. ELASTIC NET with optimization
    print("\n9. Testing Elastic Net with optimization...")

    elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
    mae_elastic = _loo_mae('Elastic Net', elastic_net, X_ml)

    # 10. KERNEL RIDGE REGRESSION
    print("\n10. Testing Kernel Ridge Regression...")

    kr = KernelRidge(alpha=1.0, kernel='rbf', gamma=0.1)
    mae_kr = _loo_mae('Kernel Ridge', _with_scaling(kr), X_ml)

    # RESULTS SUMMARY
    print("\n" + "="*80)
    print("ADVANCED MODEL RESULTS SUMMARY")
    print("="*80)

    # Sort results by MAE (best first)
    sorted_results = sorted(advanced_results.items(), key=lambda x: x[1])

    print("\nAll models ranked by performance:")
    for i, (model, mae) in enumerate(sorted_results, 1):
        print(f"{i}. {model}: {mae:.3f} D")

    best_model_name = sorted_results[0][0]
    best_mae = sorted_results[0][1]

    print(f"\nBEST MODEL: {best_model_name} with MAE: {best_mae:.3f} D")
    print(f"Improvement over Ridge: {baseline_ridge_mae - best_mae:.3f} D")

    # Create visualization (plt is imported in Cell 1)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Bar plot of all models; green beats the baseline, red does not
    models = [x[0] for x in sorted_results]
    maes = [x[1] for x in sorted_results]
    colors = ['green' if mae < baseline_ridge_mae else 'red' for mae in maes]

    ax1.barh(models, maes, color=colors)
    ax1.axvline(x=baseline_ridge_mae, color='blue', linestyle='--', label='Original Ridge MAE')
    ax1.set_xlabel('Mean Absolute Error (D)')
    ax1.set_title('Model Performance Comparison')
    ax1.legend()

    # Improvement plot (positive = better than baseline)
    improvements = [baseline_ridge_mae - mae for mae in maes]
    ax2.barh(models, improvements, color=['green' if imp > 0 else 'red' for imp in improvements])
    ax2.axvline(x=0, color='black', linestyle='-')
    ax2.set_xlabel('Improvement over Ridge (D)')
    ax2.set_title('Improvement Analysis')

    plt.tight_layout()
    plt.show()

else:
    print("Not enough data for advanced analysis")

In [None]:
# Cell 13: Feature Selection and Hyperparameter Optimization - DEBUG VERSION
# This version shows progress and runs faster

import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # For progress bars

if len(df_ml) > 10:
    # LOO MAE (D) of the original Ridge model used as the comparison baseline.
    # NOTE(review): hard-coded from an earlier run - confirm against the current data.
    baseline_ridge_mae = 1.455

    # Prepare data
    features_ml = ['Bio-AL', 'K_avg_Kerato', 'Posterior Km', 'CCT', 'Post_Ant_Ratio',
                   'K_Astigmatism_Kerato', 'A-Constant']

    df_ml_full = df_ml[features_ml + ['True_IOL']].dropna()
    X_ml = df_ml_full[features_ml]
    y_ml = df_ml_full['True_IOL']

    print(f"Optimizing with {len(X_ml)} cases")
    print("="*80)

    # 1. FEATURE IMPORTANCE ANALYSIS
    # Descriptive only: the three scores are computed on all cases and are
    # not used to pick features for the cross-validated models below.
    print("1. FEATURE IMPORTANCE ANALYSIS")
    print("-"*40)

    # Random Forest importance
    rf_temp = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_temp.fit(X_ml, y_ml)
    rf_importance = rf_temp.feature_importances_

    # Mutual Information
    mi_scores = mutual_info_regression(X_ml, y_ml, random_state=42)

    # F-statistic
    f_scores = SelectKBest(score_func=f_regression, k='all').fit(X_ml, y_ml).scores_

    importance_df = pd.DataFrame({
        'Feature': features_ml,
        'RF_Importance': rf_importance,
        'MI_Score': mi_scores,
        'F_Score': f_scores
    })

    # Normalize each score by its maximum so the three are comparable
    score_columns = ['RF_Importance', 'MI_Score', 'F_Score']
    for col in score_columns:
        importance_df[f'{col}_norm'] = importance_df[col] / importance_df[col].max()

    # Average normalized importance, highest first
    importance_df['Avg_Importance'] = importance_df[
        ['RF_Importance_norm', 'MI_Score_norm', 'F_Score_norm']
    ].mean(axis=1)
    importance_df = importance_df.sort_values('Avg_Importance', ascending=False)

    print(importance_df[['Feature', 'Avg_Importance']])

    # 2. FEATURE ENGINEERING
    print("\n2. ADVANCED FEATURE ENGINEERING")
    print("-"*40)

    # Original columns plus interaction, ratio and squared terms
    X_engineered = X_ml.assign(
        AL_K_interaction=X_ml['Bio-AL'] * X_ml['K_avg_Kerato'],
        Posterior_CCT_ratio=X_ml['Posterior Km'] / X_ml['CCT'],
        AL_CCT_ratio=X_ml['Bio-AL'] / X_ml['CCT'],
        K_diff_CCT=X_ml['K_Astigmatism_Kerato'] * X_ml['CCT'],
        AL_squared=X_ml['Bio-AL'] ** 2,
        K_squared=X_ml['K_avg_Kerato'] ** 2,
        Posterior_squared=X_ml['Posterior Km'] ** 2,
    )

    print(f"Total features after engineering: {X_engineered.shape[1]}")

    # 3. SIMPLIFIED HYPERPARAMETER OPTIMIZATION
    print("\n3. SIMPLIFIED HYPERPARAMETER OPTIMIZATION")
    print("-"*40)

    # Test fewer configurations for speed
    param_grids = {
        'Ridge': {
            'model__alpha': [0.1, 1.0, 10.0]
        },
        'Huber': {
            'model__epsilon': [1.2, 1.35, 1.5],
            'model__alpha': [0.001, 0.01]
        }
    }

    # Test only RobustScaler
    scaler = RobustScaler()

    results_optimized = {}

    # LOO cross-validation
    loo = LeaveOneOut()

    print("\nTesting Ridge with different feature counts...")
    for n_features in [5, 7, X_engineered.shape[1]]:
        print(f"\nTesting with {n_features} features...")

        # Feature selection is only added when it would actually drop columns
        steps = [('scaler', scaler)]
        if n_features < X_engineered.shape[1]:
            steps.append(('feature_selection', SelectKBest(f_regression, k=n_features)))
        steps.append(('model', Ridge()))
        pipeline = Pipeline(steps)

        predictions = []
        actuals = []

        print(f"Running LOO cross-validation ({len(X_engineered)} iterations)...")
        for i, (train_idx, test_idx) in enumerate(loo.split(X_engineered)):
            if i % 20 == 0:
                print(f"Progress: {i}/{len(X_engineered)}")

            X_train, X_test = X_engineered.iloc[train_idx], X_engineered.iloc[test_idx]
            y_train, y_test = y_ml.iloc[train_idx], y_ml.iloc[test_idx]

            # Inner 3-fold grid search tunes alpha using the training fold only
            grid = GridSearchCV(pipeline, param_grids['Ridge'], cv=3,
                                scoring='neg_mean_absolute_error')
            grid.fit(X_train, y_train)

            predictions.append(grid.predict(X_test)[0])
            actuals.append(y_test.values[0])

        mae = mean_absolute_error(actuals, predictions)
        config_name = f"RobustScaler_{n_features}_Features"
        results_optimized[config_name] = mae
        print(f"MAE: {mae:.3f} D")

    # 4. TEST OTHER MODELS (SIMPLIFIED)
    print("\n4. TESTING OTHER MODELS")
    print("-"*40)

    # Select best feature count (usually 7 works well)
    n_best_features = 7
    selector_best = SelectKBest(f_regression, k=n_best_features)

    # Full-data fit is used ONLY to report which features get selected.
    # It must not feed the evaluation: the selector sees every target value.
    X_scaled = scaler.fit_transform(X_engineered)
    X_selected = selector_best.fit_transform(X_scaled, y_ml)

    selected_features = X_engineered.columns[selector_best.get_support()]
    print(f"\nSelected top {n_best_features} features: {list(selected_features)}")

    # Test a few key models
    simple_models = {
        'Ridge_Optimized': Ridge(alpha=1.0),
        'Huber_Optimized': HuberRegressor(epsilon=1.35),
        'BayesianRidge_Optimized': BayesianRidge()
    }

    for name, model in simple_models.items():
        print(f"\nTesting {name}...")
        predictions = []
        actuals = []

        # Scaling and selection are refit inside every fold so the held-out
        # eye never influences preprocessing or which features are kept.
        fold_pipeline = Pipeline([
            ('scaler', RobustScaler()),
            ('feature_selection', SelectKBest(f_regression, k=n_best_features)),
            ('model', model)
        ])

        for train_idx, test_idx in loo.split(X_engineered):
            X_train, X_test = X_engineered.iloc[train_idx], X_engineered.iloc[test_idx]
            y_train, y_test = y_ml.iloc[train_idx], y_ml.iloc[test_idx]

            fold_pipeline.fit(X_train, y_train)
            predictions.append(fold_pipeline.predict(X_test)[0])
            actuals.append(y_test.values[0])

        mae = mean_absolute_error(actuals, predictions)
        results_optimized[name] = mae
        print(f"MAE: {mae:.3f} D")

    # 5. RESULTS SUMMARY
    print("\n" + "="*80)
    print("OPTIMIZATION RESULTS SUMMARY")
    print("="*80)

    # Sort and display results (best first)
    sorted_results = sorted(results_optimized.items(), key=lambda x: x[1])

    print("\nAll configurations tested:")
    for i, (config, mae) in enumerate(sorted_results, 1):
        improvement = baseline_ridge_mae - mae
        print(f"{i}. {config}: MAE = {mae:.3f} D (Improvement: {improvement:+.3f} D)")

    best_config = sorted_results[0][0]
    best_mae = sorted_results[0][1]

    print(f"\nBEST CONFIGURATION: {best_config}")
    print(f"MAE: {best_mae:.3f} D")
    print(f"Improvement over original Ridge: {baseline_ridge_mae - best_mae:.3f} D")

    # 6. SIMPLE VISUALIZATION
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Feature importance plot
    ax1.barh(importance_df['Feature'][:7], importance_df['Avg_Importance'][:7])
    ax1.set_xlabel('Average Normalized Importance')
    ax1.set_title('Top 7 Feature Importance')

    # Model comparison; full configuration names keep the tick labels distinct
    top_results = sorted_results[:5]
    ax2.bar(range(len(top_results)), [x[1] for x in top_results])
    ax2.axhline(y=baseline_ridge_mae, color='red', linestyle='--', label='Original Ridge')
    ax2.set_xticks(range(len(top_results)))
    ax2.set_xticklabels([x[0] for x in top_results], rotation=45, ha='right')
    ax2.set_ylabel('MAE (D)')
    ax2.set_title('Top 5 Model Configurations')
    ax2.legend()

    plt.tight_layout()
    plt.show()

    print("\nOptimization complete!")

else:
    print("Not enough data for optimization")

In [None]:
# Cell 14: Specialized Approaches for FacoDMEK Eyes - CORRECTED VERSION
# Fixed index alignment issues in LOO cross-validation

import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_absolute_error
from scipy import stats
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import seaborn as sns

if len(df_ml) > 10:
    # Base feature set shared with the earlier ML cells
    features_ml = ['Bio-AL', 'K_avg_Kerato', 'Posterior Km', 'CCT', 'Post_Ant_Ratio',
                   'K_Astigmatism_Kerato', 'A-Constant']

    # Keep only eyes where every feature and every SRK/T column is present
    analysis_columns = features_ml + ['True_IOL', 'SRKT_Prediction', 'SRKT_Error']
    df_analysis = df_ml[analysis_columns].dropna().copy()

    print(f"Analyzing {len(df_analysis)} FacoDMEK eyes")
    print("="*80)

    # 1. ANALYZE SYSTEMATIC BIAS IN SRK/T
    print("1. SYSTEMATIC BIAS ANALYSIS")
    print("-"*40)

    # Bin eyes by axial length
    al_bin_edges = [0, 22, 24, 26, 100]
    al_bin_labels = ['Short', 'Normal', 'Long', 'Very Long']
    df_analysis['AL_group'] = pd.cut(df_analysis['Bio-AL'],
                                     bins=al_bin_edges,
                                     labels=al_bin_labels)

    # Bin eyes by posterior keratometry
    postk_bin_edges = [-10, -7, -6, -5, 0]
    postk_bin_labels = ['Very Steep', 'Steep', 'Normal', 'Flat']
    df_analysis['PostK_group'] = pd.cut(df_analysis['Posterior Km'],
                                        bins=postk_bin_edges,
                                        labels=postk_bin_labels)

    # Mean, spread and count of the SRK/T error within each bin
    bias_stats = ['mean', 'std', 'count']

    print("\nMean SRK/T Error by Axial Length:")
    al_bias = df_analysis.groupby('AL_group')['SRKT_Error'].agg(bias_stats)
    print(al_bias)

    print("\nMean SRK/T Error by Posterior K:")
    postk_bias = df_analysis.groupby('PostK_group')['SRKT_Error'].agg(bias_stats)
    print(postk_bias)
    
    # 2. SEGMENTED CORRECTION MODELS - FIXED VERSION
    # For each held-out eye, fit a Ridge model of SRK/T error on training eyes
    # from a matching axial-length segment, then subtract the predicted error
    # from that eye's SRK/T prediction.
    print("\n2. SEGMENTED CORRECTION APPROACH")
    print("-"*40)

    # Predictors of the SRK/T error used by every segment model
    segment_features = ['Posterior Km', 'CCT', 'Post_Ant_Ratio']

    loo = LeaveOneOut()
    segmented_predictions = []
    actuals = []
    model_used = []

    # Reset index so positional (iloc) and label (loc) indexing agree
    df_analysis_reset = df_analysis.reset_index(drop=True)

    for train_idx, test_idx in loo.split(df_analysis_reset):
        train_data = df_analysis_reset.iloc[train_idx]
        test_data = df_analysis_reset.iloc[test_idx]

        test_al = test_data['Bio-AL'].values[0]

        # Determine which segment the test eye belongs to. The training
        # thresholds (23.5 / 24.5) are looser than the test thresholds (23 / 25).
        if test_al < 23:
            segment = 'short'
            segment_indices = train_data[train_data['Bio-AL'] < 23.5].index
        elif test_al > 25:
            segment = 'long'
            segment_indices = train_data[train_data['Bio-AL'] > 24.5].index
        else:
            segment = 'normal'
            segment_indices = train_data.index

        # If not enough eyes in segment, use all
        if len(segment_indices) < 5:
            segment_indices = train_data.index
            segment = 'all'

        # Training data for this segment's correction model
        X_segment = train_data.loc[segment_indices, segment_features]
        y_segment = train_data.loc[segment_indices, 'SRKT_Error']

        scaler_segment = RobustScaler()

        if len(X_segment) > 3:
            X_segment_scaled = scaler_segment.fit_transform(X_segment)
            model = Ridge(alpha=1.0)
            model.fit(X_segment_scaled, y_segment)

            # Transform the one-row DataFrame directly so its column names
            # match the ones the scaler was fitted with
            X_test = test_data[segment_features]
            X_test_scaled = scaler_segment.transform(X_test)
            correction = model.predict(X_test_scaled)[0]
        else:
            # Too few eyes to fit a model: fall back to the mean error
            correction = y_segment.mean()

        # Apply correction
        srkt_pred = test_data['SRKT_Prediction'].values[0]
        final_pred = srkt_pred - correction

        segmented_predictions.append(final_pred)
        actuals.append(test_data['True_IOL'].values[0])
        model_used.append(segment)

    mae_segmented = mean_absolute_error(actuals, segmented_predictions)
    print(f"Segmented Model MAE: {mae_segmented:.3f} D")
    
    # 3. POSTERIOR K-WEIGHTED CORRECTION
    print("\n3. POSTERIOR K-WEIGHTED CORRECTION")
    print("-"*40)

    # The idea: weight the correction based on how abnormal the posterior K is.
    # Cohort-level statistics are kept for reference; the cross-validation loop
    # uses statistics of each training fold so the held-out eye cannot
    # influence its own weight.
    mean_post_k = df_analysis_reset['Posterior Km'].mean()
    std_post_k = df_analysis_reset['Posterior Km'].std()

    weighted_features = ['Posterior Km', 'CCT']
    weighted_predictions = []

    for train_idx, test_idx in loo.split(df_analysis_reset):
        train_data = df_analysis_reset.iloc[train_idx]
        test_data = df_analysis_reset.iloc[test_idx]

        # How abnormal the test eye's posterior K is, relative to the training fold
        fold_mean_post_k = train_data['Posterior Km'].mean()
        fold_std_post_k = train_data['Posterior Km'].std()
        test_post_k = test_data['Posterior Km'].values[0]
        z_score = abs((test_post_k - fold_mean_post_k) / fold_std_post_k)

        # Weight factor (higher weight for more abnormal eyes)
        weight = 1 + (z_score * 0.5)  # Adjust multiplier as needed

        # Train the error model with RobustScaler on the training fold
        X_train = train_data[weighted_features]
        y_train = train_data['SRKT_Error']

        scaler_weighted = RobustScaler()
        X_train_scaled = scaler_weighted.fit_transform(X_train)

        model = Ridge(alpha=0.5)
        model.fit(X_train_scaled, y_train)

        # Predict the error for the held-out eye and scale it by the weight
        X_test = test_data[weighted_features]
        X_test_scaled = scaler_weighted.transform(X_test)
        correction = model.predict(X_test_scaled)[0] * weight

        srkt_pred = test_data['SRKT_Prediction'].values[0]
        final_pred = srkt_pred - correction

        weighted_predictions.append(final_pred)

    # LeaveOneOut holds out rows in order, so predictions align with the column
    mae_weighted = mean_absolute_error(df_analysis_reset['True_IOL'], weighted_predictions)
    print(f"Posterior K-Weighted Model MAE: {mae_weighted:.3f} D")
    
    # 4. RATIO-BASED CORRECTION MODEL
    print("\n4. RATIO-BASED CORRECTION MODEL")
    print("-"*40)

    # Derive ratio features on a copy of the analysis frame; these predict
    # True_IOL directly rather than correcting SRK/T.
    df_ratios = df_analysis_reset.assign(
        K_to_AL_ratio=df_analysis_reset['K_avg_Kerato'] / df_analysis_reset['Bio-AL'],
        PostK_to_K_ratio=df_analysis_reset['Posterior Km'] / df_analysis_reset['K_avg_Kerato'],
        CCT_to_AL_ratio=df_analysis_reset['CCT'] / df_analysis_reset['Bio-AL'],
        # Deviation of average K from 43.5, scaled by 0.8
        Corneal_contribution=(df_analysis_reset['K_avg_Kerato'] - 43.5) * 0.8,
    )

    ratio_features = ['K_to_AL_ratio', 'PostK_to_K_ratio', 'CCT_to_AL_ratio',
                     'Corneal_contribution', 'Post_Ant_Ratio']

    ratio_predictions = []

    for train_idx, test_idx in loo.split(df_ratios):
        train_rows = df_ratios.iloc[train_idx]
        X_train = train_rows[ratio_features]
        y_train = train_rows['True_IOL']
        X_test = df_ratios.iloc[test_idx][ratio_features]

        # Scaler is fit on the training fold, then applied to the single test row
        scaler_ratio = RobustScaler()
        X_train_scaled = scaler_ratio.fit_transform(X_train)
        X_test_scaled = scaler_ratio.transform(X_test.to_numpy())

        model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

        pred = model.predict(X_test_scaled)[0]
        ratio_predictions.append(pred)

    mae_ratio = mean_absolute_error(actuals, ratio_predictions)
    print(f"Ratio-Based Model MAE: {mae_ratio:.3f} D")
    
    # 5. CORRECTION FORMULA OPTIMIZATION
    print("\n5. OPTIMIZED CORRECTION FORMULA")
    print("-"*40)

    # Optimize a simple correction formula:
    #   IOL = SRK/T - (a*PostK + b*CCT + c*AL + d)
    def correction_formula(params, X, y_true, srkt_pred):
        """Objective for the optimizer: MAE of the corrected SRK/T prediction.

        Parameters:
            params: sequence (a, b, c, d) of formula coefficients.
            X: 2-D array whose columns 0, 1, 2 are multiplied by a, b, c.
            y_true: target values the corrected prediction is compared to.
            srkt_pred: uncorrected SRK/T predictions, one per row of X.

        Returns:
            Mean absolute error between y_true and srkt_pred - correction.
        """
        a, b, c, d = params
        correction = a * X[:, 0] + b * X[:, 1] + c * X[:, 2] + d
        predictions = srkt_pred - correction
        return mean_absolute_error(y_true, predictions)

    # Prepare data for optimization (column order fixes the meaning of a, b, c)
    formula_features = ['Posterior Km', 'CCT', 'Bio-AL']
    X_opt = df_analysis_reset[formula_features].values
    y_true = df_analysis_reset['True_IOL'].values
    srkt_pred = df_analysis_reset['SRKT_Prediction'].values

    # Fit on all eyes: these coefficients are the ones printed below
    initial_params = [0.1, 0.001, 0.1, 0]
    result = minimize(correction_formula, initial_params,
                     args=(X_opt, y_true, srkt_pred),
                     method='Nelder-Mead')

    optimal_params = result.x

    # Test with LOO: coefficients are re-optimized without the held-out eye
    optimized_predictions = []

    for train_idx, test_idx in loo.split(df_analysis_reset):
        X_train = df_analysis_reset.iloc[train_idx][formula_features].values
        y_train = df_analysis_reset.iloc[train_idx]['True_IOL'].values
        srkt_train = df_analysis_reset.iloc[train_idx]['SRKT_Prediction'].values

        # Re-optimize on training set
        result_cv = minimize(correction_formula, initial_params,
                           args=(X_train, y_train, srkt_train),
                           method='Nelder-Mead')

        # Apply to test
        test_data = df_analysis_reset.iloc[test_idx]
        X_test = test_data[formula_features].values

        a, b, c, d = result_cv.x
        # X_test has shape (1, 3): unpack its single row into scalars.
        # Indexing X_test[1] / X_test[2] directly would be out of bounds.
        test_post_k, test_cct, test_al = X_test[0]
        correction = a * test_post_k + b * test_cct + c * test_al + d

        final_pred = test_data['SRKT_Prediction'].values[0] - correction
        optimized_predictions.append(final_pred)

    # LeaveOneOut holds out rows in order, so predictions align with y_true
    mae_optimized = mean_absolute_error(y_true, optimized_predictions)
    print(f"Optimized Formula MAE: {mae_optimized:.3f} D")

    print(f"\nOptimized Correction Formula:")
    print(f"IOL = SRK/T - ({optimal_params[0]:.4f} × Posterior_K + "
          f"{optimal_params[1]:.6f} × CCT + {optimal_params[2]:.4f} × AL + {optimal_params[3]:.3f})")
    
    # 6. RESULTS COMPARISON
    print("\n" + "="*80)
    print("SPECIALIZED MODELS COMPARISON")
    print("="*80)

    # MAE of every approach; uncorrected SRK/T is the mean absolute SRK/T error
    results_specialized = dict([
        ('Original SRK/T', df_analysis_reset['SRKT_Error'].abs().mean()),
        ('Segmented Model', mae_segmented),
        ('Posterior K-Weighted', mae_weighted),
        ('Ratio-Based Model', mae_ratio),
        ('Optimized Formula', mae_optimized),
    ])

    # Rank from lowest to highest MAE
    sorted_specialized = sorted(results_specialized.items(), key=lambda entry: entry[1])

    print("\nModel Performance:")
    for model, mae in sorted_specialized:
        # Positive improvement means better than the 1.455 D reference
        improvement = 1.455 - mae
        print(f"{model}: MAE = {mae:.3f} D (Improvement: {improvement:+.3f} D)")
    
    # Visualization: 2x2 grid of diagnostic plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Top-left: SRK/T error per axial-length group
    ax1 = axes[0, 0]
    df_analysis_reset.boxplot(column='SRKT_Error', by='AL_group', ax=ax1)
    ax1.set(xlabel='Axial Length Group',
            ylabel='SRK/T Error (D)',
            title='SRK/T Error Distribution by AL Group')
    ax1.axhline(y=0, color='red', linestyle='--')

    # Top-right: SRK/T error against posterior K, colored by axial length
    ax2 = axes[0, 1]
    scatter = ax2.scatter(
        df_analysis_reset['Posterior Km'],
        df_analysis_reset['SRKT_Error'],
        c=df_analysis_reset['Bio-AL'],
        cmap='viridis',
        alpha=0.6,
    )
    ax2.set(xlabel='Posterior K (D)',
            ylabel='SRK/T Error (D)',
            title='Error vs Posterior K (colored by AL)')
    ax2.axhline(y=0, color='red', linestyle='--')
    plt.colorbar(scatter, ax=ax2, label='Axial Length (mm)')

    # Bottom-left: MAE of each specialized model, in dictionary order
    ax3 = axes[1, 0]
    models = list(results_specialized.keys())
    maes = list(results_specialized.values())
    bar_positions = range(len(models))
    bars = ax3.bar(bar_positions, maes,
                   color=['red', 'orange', 'yellow', 'lightgreen', 'green'])
    ax3.set_xticks(bar_positions)
    ax3.set_xticklabels(models, rotation=45, ha='right')
    ax3.set(ylabel='MAE (D)', title='Specialized Model Comparison')
    ax3.axhline(y=1.455, color='blue', linestyle='--', label='Original Ridge')
    ax3.legend()

    # Bottom-right: error histogram of the optimized formula
    # (always this model, whichever one ranked best)
    ax4 = axes[1, 1]
    best_errors = np.array(optimized_predictions) - np.array(actuals)
    ax4.hist(best_errors, bins=15, edgecolor='black', alpha=0.7, color='green')
    ax4.axvline(x=0, color='red', linestyle='--')
    ax4.set(xlabel='Prediction Error (D)',
            ylabel='Frequency',
            title=f'Optimized Formula Error Distribution (MAE: {mae_optimized:.3f} D)')

    # Normal-fit overlay, scaled by count x bin width to match the histogram
    mu, std = stats.norm.fit(best_errors)
    x = np.linspace(best_errors.min(), best_errors.max(), 100)
    ax4.plot(
        x,
        stats.norm.pdf(x, mu, std) * len(best_errors) * (best_errors.max() - best_errors.min()) / 15,
        'r-',
        linewidth=2,
        label=f'Normal fit (μ={mu:.2f}, σ={std:.2f})',
    )
    ax4.legend()

    plt.tight_layout()
    plt.show()
    
    # Final recommendation
    print("\n" + "="*80)
    print("RECOMMENDATION FOR CLINICAL USE")
    print("="*80)

    # sorted_specialized is ordered by ascending MAE, so the first entry wins.
    best_model, best_mae = sorted_specialized[0]

    print(f"Best performing model: {best_model} with MAE: {best_mae:.3f} D")

    if best_model == "Optimized Formula":
        # Spell out the correction using the coefficients in optimal_params.
        print("\nClinical Formula:")
        print("1. Calculate standard SRK/T using keratometry K values")
        print("2. Apply correction:")
        print(f"   Correction = {optimal_params[0]:.3f} × Posterior_K + "
              f"{optimal_params[1]:.5f} × CCT + {optimal_params[2]:.3f} × AL + {optimal_params[3]:.2f}")
        print("3. Modified IOL = SRK/T - Correction")

    # Calculate final accuracy metrics.
    # best_errors is built from optimized_predictions, so these percentages
    # always describe the Optimized Formula. Label them that way instead of with
    # best_model, which would attribute them to a different model whenever the
    # Optimized Formula is not the lowest-MAE entry.
    errors_abs = np.abs(best_errors)
    print("\nAccuracy with Optimized Formula:")
    if best_model != "Optimized Formula":
        print(f"(Note: best model by MAE is {best_model}; the thresholds below are "
              f"computed from the Optimized Formula's errors)")
    for threshold in (0.25, 0.50, 1.00):
        within_pct = (errors_abs <= threshold).sum() / len(errors_abs) * 100
        print(f"Within ±{threshold:.2f} D: {within_pct:.1f}%")
    
else:
    print("Not enough data for specialized analysis")