In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.optimize import minimize
from scipy import stats
import warnings
# NOTE(review): this suppresses every warning category for the rest of the kernel session,
# which also hides pandas deprecation warnings and NumPy RuntimeWarnings (e.g. invalid
# value in sqrt / divide) raised by later cells - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility (seeds NumPy's global legacy RNG)
np.random.seed(42)

print("Libraries imported successfully!")

In [None]:
# Cell 2: Load and Explore Data
# Read the cleaned sheet of the study workbook into the working DataFrame
df = pd.read_excel('FacoDMEK.xlsx', sheet_name='Cleaned Data')

# Overview: shape, column names, then a preview of the first rows.
# A single newline-separated print call emits the same three lines as before.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    f"\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Cell 3: Data Preprocessing with Better NaN Handling
# Calculate average keratometry - BIOMETRY VERSION
# (plain arithmetic on purpose: a missing Ks or Kf propagates as NaN)
df['K_avg_Bio'] = (df['Bio-Ks'] + df['Bio-Kf']) / 2

# Calculate average keratometry - TOPOGRAPHY VERSION
df['K_avg_Topo'] = (df['Keratometric Ks'] + df['Keratometric Kf']) / 2

# Back-calculate the IOL power that would have achieved emmetropia.
# A myopic outcome (negative spherical equivalent) means the implanted lens was too
# strong, so the emmetropic power is LOWER than the implanted one; a hyperopic outcome
# (positive SE) means it was too weak. The signed SE is therefore ADDED to the implanted
# power (the previous `IOL Power - SE` moved the estimate in the wrong direction).
# NOTE(review): this keeps the notebook's 1:1 conversion between spectacle-plane
# refraction and IOL-plane power; a vergence-based factor may be more appropriate -
# TODO confirm with the clinical team.
df['True_IOL'] = df['IOL Power'] + df['PostOP Spherical Equivalent']

# Feature engineering
df['Post_Ant_Ratio'] = df['Posterior Km'] / df['Anterior Km']
df['K_Astigmatism_Bio'] = df['Bio-Ks'] - df['Bio-Kf']
df['K_Astigmatism_Topo'] = df['Keratometric Ks'] - df['Keratometric Kf']
df['CCT_Normalized'] = df['CCT'] / 550  # Using OCT CCT only
df['K_Diff_Bio_Topo'] = df['K_avg_Bio'] - df['K_avg_Topo']

# Display summary statistics
print("Summary of key variables:")
print(df[['Bio-AL', 'K_avg_Bio', 'IOL Power', 'PostOP Spherical Equivalent', 'True_IOL', 'CCT']].describe())

# Check for missing values
print("\nMissing values in key columns:")
missing_counts = df[['Bio-AL', 'K_avg_Bio', 'K_avg_Topo', 'IOL Power',
                     'PostOP Spherical Equivalent', 'True_IOL', 'CCT', 'A-Constant']].isnull().sum()
print(missing_counts)

# Count how many rows have every input needed for the SRK/T comparison
complete_cases = df[['Bio-AL', 'K_avg_Bio', 'IOL Power', 'PostOP Spherical Equivalent', 'A-Constant']].notna().all(axis=1).sum()
print(f"\nComplete cases for analysis: {complete_cases} out of {len(df)}")

In [None]:
# Cell 4: Implement SRK/T Formula with NaN Handling
def calculate_SRKT(AL, K, A_const, nc=1.333):
    """
    Calculate the IOL power for emmetropia using the SRK/T formula.

    Parameters
    ----------
    AL : float
        Axial length (the formula's 24.2 threshold assumes millimetres).
    K : float
        Mean keratometry in dioptres (converted to a radius via 337.5 / K).
    A_const : float
        Lens A-constant, converted internally to an ACD constant.
    nc : float, default 1.333
        Corneal refractive index used in the vergence calculation.

    Returns
    -------
    float
        Emmetropic IOL power, or NaN when an input is missing / non-positive,
        when the corneal-height square root is undefined, or when the result
        is not a finite number.
    """
    # Check for valid inputs
    if pd.isna(AL) or pd.isna(K) or pd.isna(A_const) or K <= 0 or AL <= 0:
        return np.nan

    na = 1.336  # refractive index of aqueous / vitreous

    try:
        # Corneal radius of curvature from keratometry
        r = 337.5 / K

        # Axial length correction for long eyes. The constant term is NEGATIVE
        # (-3.446); with it the correction is continuous at AL = 24.2
        # (LCOR(24.2) ~= 24.2016). A positive constant inflates LCOR by ~6.9 mm.
        if AL <= 24.2:
            LCOR = AL
        else:
            LCOR = -3.446 + 1.716 * AL - 0.0237 * AL**2

        # Computed corneal width
        Cw = -5.41 + 0.58412 * LCOR + 0.098 * K

        # Corneal height; undefined when the chord is wider than the sphere
        radicand = r**2 - (Cw**2 / 4)
        if radicand < 0:
            return np.nan
        H = r - np.sqrt(radicand)

        # ACD constant from A-constant, and its offset
        ACDconst = 0.62467 * A_const - 68.747
        offset = ACDconst - 3.336

        # Estimated postoperative ACD
        ACDest = H + offset

        # Retinal thickness correction -> optical axial length
        RETHICK = 0.65696 - 0.02029 * AL
        LOPT = AL + RETHICK

        # IOL power for emmetropia
        ncm1 = nc - 1
        denominator = (LOPT - ACDest) * (na * r - ncm1 * ACDest)
        if denominator == 0:
            return np.nan
        IOL = (1000 * na * (na * r - ncm1 * LOPT)) / denominator
    except (TypeError, ValueError, ArithmeticError):
        # Non-numeric or numerically invalid input: report "no prediction"
        # without swallowing unrelated errors such as KeyboardInterrupt.
        return np.nan

    return IOL if np.isfinite(IOL) else np.nan

# Calculate SRK/T predictions for both keratometry sources.
# Axial length and A-constant are shared; only the mean-K column differs.
for k_column, prediction_column, error_column in (
    ('K_avg_Bio', 'SRKT_Prediction_Bio', 'SRKT_Error_Bio'),      # BIOMETRY VERSION
    ('K_avg_Topo', 'SRKT_Prediction_Topo', 'SRKT_Error_Topo'),   # TOPOGRAPHY VERSION
):
    df[prediction_column] = df.apply(
        lambda row, k_column=k_column: calculate_SRKT(row['Bio-AL'], row[k_column], row['A-Constant']),
        axis=1
    )
    # Error = formula prediction minus back-calculated emmetropic power
    df[error_column] = df[prediction_column] - df['True_IOL']

# Rows where an error could actually be computed
valid_bio = df['SRKT_Error_Bio'].notna()
valid_topo = df['SRKT_Error_Topo'].notna()

# Same three summary statistics for each version
for header, valid_mask, error_column in (
    ("BIOMETRY VERSION:", valid_bio, 'SRKT_Error_Bio'),
    ("\nTOPOGRAPHY VERSION:", valid_topo, 'SRKT_Error_Topo'),
):
    print(header)
    print(f"Valid predictions: {valid_mask.sum()} out of {len(df)}")
    if valid_mask.sum() > 0:
        valid_errors = df.loc[valid_mask, error_column]
        print(f"Mean Error: {valid_errors.mean():.3f} D")
        print(f"Mean Absolute Error: {valid_errors.abs().mean():.3f} D")
        print(f"Standard Deviation: {valid_errors.std():.3f} D")

In [None]:
# Cell 5: Visualize SRK/T Performance (with NaN handling)
# Keep only the rows where the respective SRK/T error exists
df_valid_bio = df.loc[df['SRKT_Error_Bio'].notna()].copy()
df_valid_topo = df.loc[df['SRKT_Error_Topo'].notna()].copy()

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_hist_bio, ax_hist_topo), (ax_posterior, ax_cct) = axes

has_bio_rows = len(df_valid_bio) > 0
has_topo_rows = len(df_valid_topo) > 0

if has_bio_rows:
    # Plot 1: Biometry Error Distribution
    ax_hist_bio.hist(df_valid_bio['SRKT_Error_Bio'], bins=20, edgecolor='black', alpha=0.7)
    ax_hist_bio.axvline(x=0, color='red', linestyle='--')
    ax_hist_bio.set(
        xlabel='SRK/T Prediction Error (D)',
        ylabel='Frequency',
        title=f'Biometry Error Distribution (n={len(df_valid_bio)})',
    )

if has_topo_rows:
    # Plot 2: Topography Error Distribution
    ax_hist_topo.hist(df_valid_topo['SRKT_Error_Topo'], bins=20, edgecolor='black', alpha=0.7, color='green')
    ax_hist_topo.axvline(x=0, color='red', linestyle='--')
    ax_hist_topo.set(
        xlabel='SRK/T Prediction Error (D)',
        ylabel='Frequency',
        title=f'Topography Error Distribution (n={len(df_valid_topo)})',
    )

if has_bio_rows:
    # Plot 3: Error vs Posterior Corneal Power (Biometry)
    ax_posterior.scatter(df_valid_bio['Posterior Km'], df_valid_bio['SRKT_Error_Bio'], alpha=0.6)
    ax_posterior.set(
        xlabel='Posterior Corneal Power (D)',
        ylabel='SRK/T Error (D)',
        title='Biometry: Error vs Posterior K',
    )

    # Plot 4: Error vs CCT (rows without a CCT measurement are left out)
    valid_cct = df_valid_bio[df_valid_bio['CCT'].notna()]
    ax_cct.scatter(valid_cct['CCT'], valid_cct['SRKT_Error_Bio'], alpha=0.6)
    ax_cct.set(
        xlabel='Central Corneal Thickness (μm)',
        ylabel='SRK/T Error (D)',
        title='Error vs CCT',
    )

plt.tight_layout()
plt.show()

# Share of eyes whose absolute biometry error falls inside each target band
if has_bio_rows:
    abs_error_bio = df_valid_bio['SRKT_Error_Bio'].abs()
    n_bio_rows = len(df_valid_bio)
    within_025 = (abs_error_bio <= 0.25).sum() / n_bio_rows * 100
    within_050 = (abs_error_bio <= 0.50).sum() / n_bio_rows * 100
    within_100 = (abs_error_bio <= 1.00).sum() / n_bio_rows * 100

    print(f"\nBiometry - Percentage of eyes within target:")
    print(f"±0.25 D: {within_025:.1f}%")
    print(f"±0.50 D: {within_050:.1f}%")
    print(f"±1.00 D: {within_100:.1f}%")

In [None]:
# Cell 6: Optimization Approach 1 - Optimize Corneal Refractive Index
# Work on the rows that already have a biometry-based SRK/T error
df_complete = df.loc[df['SRKT_Error_Bio'].notna()].copy()

def objective_nc(nc_value):
    """
    Mean absolute SRK/T error over df_complete for a candidate corneal index.

    nc_value is the one-element parameter vector supplied by scipy's minimize;
    999 is returned as a penalty when no row yields a usable prediction.
    """
    candidate_nc = nc_value[0]
    predicted = df_complete.apply(
        lambda row: calculate_SRKT(row['Bio-AL'], row['K_avg_Bio'], row['A-Constant'], nc=candidate_nc),
        axis=1
    )
    residuals = (predicted - df_complete['True_IOL']).dropna()
    if residuals.empty:
        return 999  # large penalty: nothing could be predicted
    return np.mean(np.abs(residuals))

if df_complete.empty:
    print("Not enough complete cases for optimization")
else:
    # Bounded 1-D search for nc, starting from the SRK/T default
    result_nc = minimize(objective_nc, x0=[1.333], bounds=[(1.330, 1.340)], method='L-BFGS-B')
    optimal_nc = result_nc.x[0]

    # Re-run the formula with the fitted index and recompute the error
    df_complete['SRKT_Optimized_nc'] = df_complete.apply(
        lambda row: calculate_SRKT(row['Bio-AL'], row['K_avg_Bio'], row['A-Constant'], nc=optimal_nc),
        axis=1
    )
    df_complete['SRKT_Error_Optimized_nc'] = df_complete['SRKT_Optimized_nc'] - df_complete['True_IOL']

    mae_before = df_complete['SRKT_Error_Bio'].abs().mean()
    mae_after = df_complete['SRKT_Error_Optimized_nc'].abs().mean()
    print(f"Optimal corneal refractive index: {optimal_nc:.4f}")
    print(f"Original MAE: {mae_before:.3f} D")
    print(f"Optimized MAE: {mae_after:.3f} D")

In [None]:
# Cell 6.5: Missing Data Analysis and Imputation (CORRECTED)
# Analyze missing data patterns
print("MISSING DATA ANALYSIS")
print("="*50)

# Check missing values in ALL relevant columns
all_columns = ['Bio-AL', 'Bio-Ks', 'Bio-Kf', 'K_avg_Bio', 'Posterior Km', 'CCT',
               'A-Constant', 'Post_Ant_Ratio', 'K_Astigmatism_Bio',
               'Keratometric Ks', 'Keratometric Kf', 'K_avg_Topo',
               'Anterior Km', 'Anterior Ks', 'Anterior Kf']

# Filter the column list ONCE so all three summary columns have the same length;
# filtering only the counts (as before) raised a length-mismatch ValueError whenever
# one of the listed columns was absent from the sheet.
present_columns = [col for col in all_columns if col in df.columns]
absent_columns = [col for col in all_columns if col not in df.columns]
if absent_columns:
    print(f"Columns not found in dataset (skipped): {absent_columns}")

missing_by_column = df[present_columns].isna().sum()
missing_summary = pd.DataFrame({
    'Column': present_columns,
    'Missing': missing_by_column.to_numpy(),
    'Percentage': (missing_by_column / len(df) * 100).to_numpy()
})
# Show only columns that actually have gaps, worst first
missing_summary = missing_summary[missing_summary['Missing'] > 0].sort_values('Missing', ascending=False)
print(missing_summary)

# Work on a copy so the original frame stays untouched
df_imputed = df.copy()

# STRATEGY 1: Impute Bio-Ks and Bio-Kf from Keratometric values
print("\nSTRATEGY 1: Imputing Bio-Ks and Bio-Kf from Topography")
# Where is biometry K missing, and where is a topography value available instead?
bio_ks_missing = df_imputed['Bio-Ks'].isna()
bio_kf_missing = df_imputed['Bio-Kf'].isna()
topo_ks_available = df_imputed['Keratometric Ks'].notna()
topo_kf_available = df_imputed['Keratometric Kf'].notna()

# Mean device offset, estimated on rows where both Ks measurements exist.
# NOTE(review): the Kf offset is averaged over this same Ks-based subset (NaN pairs are
# skipped by mean()), not over all rows with both Kf values - confirm this is intended.
both_ks_measured = df_imputed['Bio-Ks'].notna() & df_imputed['Keratometric Ks'].notna()
complete_k_cases = df_imputed[both_ks_measured]
if len(complete_k_cases) > 0:
    ks_diff = (complete_k_cases['Bio-Ks'] - complete_k_cases['Keratometric Ks']).mean()
    kf_diff = (complete_k_cases['Bio-Kf'] - complete_k_cases['Keratometric Kf']).mean()
    print(f"Average difference Bio-Ks - Keratometric Ks: {ks_diff:.3f} D")
    print(f"Average difference Bio-Kf - Keratometric Kf: {kf_diff:.3f} D")

    # Impute Bio-Ks: topography value shifted by the mean offset
    can_impute_ks = bio_ks_missing & topo_ks_available
    if can_impute_ks.any():
        shifted_ks = df_imputed.loc[can_impute_ks, 'Keratometric Ks'] + ks_diff
        df_imputed.loc[can_impute_ks, 'Bio-Ks'] = shifted_ks
        print(f"Imputed {can_impute_ks.sum()} Bio-Ks values")

    # Impute Bio-Kf the same way
    can_impute_kf = bio_kf_missing & topo_kf_available
    if can_impute_kf.any():
        shifted_kf = df_imputed.loc[can_impute_kf, 'Keratometric Kf'] + kf_diff
        df_imputed.loc[can_impute_kf, 'Bio-Kf'] = shifted_kf
        print(f"Imputed {can_impute_kf.sum()} Bio-Kf values")

# Recalculate K_avg_Bio from the (possibly imputed) steep and flat values
df_imputed['K_avg_Bio'] = (df_imputed['Bio-Ks'] + df_imputed['Bio-Kf']) / 2

# STRATEGY 2: Impute remaining Bio-K values using anterior corneal power
print("\nSTRATEGY 2: Imputing remaining Bio-K from Anterior Corneal Power")
bio_k_still_missing = df_imputed['K_avg_Bio'].isna()
anterior_available = df_imputed['Anterior Km'].notna()

# Proceed only if at least one gap can be filled from an anterior Km value
fillable_rows = bio_k_still_missing & anterior_available
if fillable_rows.any():
    # Mean offset between biometry mean K and anterior Km on rows that have both
    has_k_and_anterior = df_imputed['K_avg_Bio'].notna() & df_imputed['Anterior Km'].notna()
    complete_anterior = df_imputed[has_k_and_anterior]
    if len(complete_anterior) > 5:
        k_anterior_diff = (complete_anterior['K_avg_Bio'] - complete_anterior['Anterior Km']).mean()
        can_use_anterior = fillable_rows
        # Only K_avg_Bio is filled here; Bio-Ks / Bio-Kf stay missing for these rows
        df_imputed.loc[can_use_anterior, 'K_avg_Bio'] = (
            df_imputed.loc[can_use_anterior, 'Anterior Km'] + k_anterior_diff
        )
        print(f"Imputed {can_use_anterior.sum()} K_avg_Bio values from Anterior K")

# STRATEGY 3: Regression imputation for Posterior Km
print("\nSTRATEGY 3: Regression for Posterior Corneal Power")
# Fit on rows where both predictors and the target are present
posterior_complete = df_imputed[['Anterior Km', 'CCT', 'Posterior Km']].dropna()
if len(posterior_complete) > 10 and df_imputed['Posterior Km'].isna().any():
    X_post = posterior_complete[['Anterior Km', 'CCT']]
    y_post = posterior_complete['Posterior Km']
    reg_posterior = LinearRegression()
    reg_posterior.fit(X_post, y_post)

    # Rows to fill: target missing, both predictors available
    posterior_missing = df_imputed['Posterior Km'].isna()
    anterior_cct_available = df_imputed[['Anterior Km', 'CCT']].notna().all(axis=1)
    can_impute_posterior = posterior_missing & anterior_cct_available

    if can_impute_posterior.any():
        X_pred = df_imputed.loc[can_impute_posterior, ['Anterior Km', 'CCT']]
        df_imputed.loc[can_impute_posterior, 'Posterior Km'] = reg_posterior.predict(X_pred)
        print(f"Imputed {can_impute_posterior.sum()} Posterior Km values")

# STRATEGY 4: Simple imputation for remaining variables
print("\nSTRATEGY 4: Median/Mode imputation")

# CCT - median
# The filled column is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# operates on a column selection, which is deprecated and does not update the frame
# when pandas Copy-on-Write is active.
n_cct = df_imputed['CCT'].isna().sum()
if n_cct > 0:
    cct_median = df_imputed['CCT'].median()
    if pd.notna(cct_median):
        df_imputed['CCT'] = df_imputed['CCT'].fillna(cct_median)
        print(f"Imputed {n_cct} CCT values with median: {cct_median:.0f} μm")
    else:
        # Every CCT value is missing, so there is no median to impute with
        print(f"WARNING: {n_cct} CCT values are missing and no median is available")

# A-Constant - mode
n_a = df_imputed['A-Constant'].isna().sum()
if n_a > 0:
    a_constant_modes = df_imputed['A-Constant'].mode()
    if not a_constant_modes.empty:
        a_constant_mode = a_constant_modes.iloc[0]
        df_imputed['A-Constant'] = df_imputed['A-Constant'].fillna(a_constant_mode)
        print(f"Imputed {n_a} A-Constant values with mode: {a_constant_mode:.2f}")
    else:
        # mode() is empty when the column holds no values at all
        print(f"WARNING: {n_a} A-Constant values are missing and no mode is available")

# Bio-AL - NO imputation (too critical)
if df_imputed['Bio-AL'].isna().sum() > 0:
    print(f"\nWARNING: {df_imputed['Bio-AL'].isna().sum()} Bio-AL values are missing and will NOT be imputed")

# STRATEGY 5: Recalculate derived variables
print("\nSTRATEGY 5: Recalculating derived variables")

# K_Astigmatism_Bio from the (possibly imputed) steep and flat K
df_imputed['K_Astigmatism_Bio'] = df_imputed['Bio-Ks'] - df_imputed['Bio-Kf']

# Post_Ant_Ratio: refreshed only on rows where both corneal powers are present
has_both = df_imputed[['Posterior Km', 'Anterior Km']].notna().all(axis=1)
posterior_over_anterior = df_imputed['Posterior Km'] / df_imputed['Anterior Km']
df_imputed.loc[has_both, 'Post_Ant_Ratio'] = posterior_over_anterior[has_both]

# Recalculate SRK/T predictions with imputed data.
# calculate_SRKT itself returns NaN when any of its three inputs is missing,
# so no separate per-row NaN guard is needed around the call.
print("\nRecalculating SRK/T with imputed data...")
df_imputed['SRKT_Prediction_Bio_Imputed'] = df_imputed.apply(
    lambda row: calculate_SRKT(row['Bio-AL'], row['K_avg_Bio'], row['A-Constant']),
    axis=1
)
df_imputed['SRKT_Error_Bio_Imputed'] = df_imputed['SRKT_Prediction_Bio_Imputed'] - df_imputed['True_IOL']

# Final summary: usable rows before vs after imputation
n_complete_before = int(df['SRKT_Error_Bio'].notna().sum())
n_complete_after = int(df_imputed['SRKT_Error_Bio_Imputed'].notna().sum())
print("\n" + "="*50)
print("IMPUTATION RESULTS SUMMARY")
print("="*50)
print(f"Original complete cases: {n_complete_before}")
print(f"Complete cases after imputation: {n_complete_after}")
print(f"Additional cases gained: {n_complete_after - n_complete_before}")

# Report every ML feature that still has gaps after imputation
ml_features = ['Bio-AL', 'K_avg_Bio', 'Posterior Km', 'CCT', 'Post_Ant_Ratio', 'K_Astigmatism_Bio', 'A-Constant']
for feat in ml_features:
    n_missing = df_imputed[feat].isna().sum()
    if n_missing > 0:
        print(f"  - {feat}: still has {n_missing} missing values")

In [None]:
# Cell 7: Optimization Approach 2 - Linear Correction Model (Using Imputed Data)
# Prefer the imputed frame when the imputation cell has produced its error column
imputed_ready = 'df_imputed' in locals() and 'SRKT_Error_Bio_Imputed' in df_imputed.columns
if imputed_ready:
    df_ml = df_imputed.copy()
    # Route the imputed prediction / error into the standard column names
    df_ml['SRKT_Prediction_Bio'] = df_ml['SRKT_Prediction_Bio_Imputed']
    df_ml['SRKT_Error_Bio'] = df_ml['SRKT_Error_Bio_Imputed']
    print("Using imputed data for analysis")
else:
    df_ml = df.copy()
    print("Using original data for analysis")

# Prepare features for correction model (Age removed)
feature_cols = ['Posterior Km', 'CCT', 'Post_Ant_Ratio', 'K_Astigmatism_Bio']

# Keep only rows with every feature and the SRK/T error present
required_cols = feature_cols + ['SRKT_Error_Bio']
df_ml = df_ml.dropna(subset=required_cols).copy()

print(f"Complete cases for ML: {len(df_ml)} out of {len(df)}")

if len(df_ml) <= 10:  # Need more than 10 cases for meaningful analysis
    print("Not enough complete cases for machine learning analysis")
else:
    X = df_ml[feature_cols]
    y = df_ml['SRKT_Error_Bio']

    # Leave-one-out: fit the error model on n-1 eyes, correct the held-out eye
    loo = LeaveOneOut()
    predictions_linear = []
    true_values = []
    srkt_predictions = df_ml['SRKT_Prediction_Bio'].to_numpy()
    emmetropic_powers = df_ml['True_IOL'].to_numpy()

    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train linear model that predicts the SRK/T error
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predicted error is subtracted from the SRK/T prediction
        correction = model.predict(X_test)[0]
        held_out = test_index[0]
        srkt_pred = srkt_predictions[held_out]
        corrected_pred = srkt_pred - correction

        predictions_linear.append(corrected_pred)
        true_values.append(emmetropic_powers[held_out])

    # Calculate performance
    mae_original = df_ml['SRKT_Error_Bio'].abs().mean()
    mae_linear = mean_absolute_error(true_values, predictions_linear)

    print(f"Original SRK/T MAE: {mae_original:.3f} D")
    print(f"Linear Correction Model MAE: {mae_linear:.3f} D")
    print(f"Improvement: {mae_original - mae_linear:.3f} D")

    # Coefficients of a model fitted on all rows, ranked by absolute size.
    # NOTE(review): features are not standardised, so coefficient magnitude
    # depends on each feature's units - interpret the ranking with care.
    model_full = LinearRegression()
    model_full.fit(X, y)
    coefficients = model_full.coef_
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': coefficients,
        'Abs_Coefficient': np.abs(coefficients)
    }).sort_values('Abs_Coefficient', ascending=False)

    print("\nFeature Importance (Linear Model):")
    print(feature_importance)

In [None]:
# Cell 8: Machine Learning Models Comparison (Using Imputed Data)
if len(df_ml) > 10:
    # Prepare features (Age removed)
    features_ml = ['Bio-AL', 'K_avg_Bio', 'Posterior Km', 'CCT', 'Post_Ant_Ratio',
                   'K_Astigmatism_Bio', 'A-Constant']

    # Complete cases only: every feature and the target must be present
    df_ml_full = df_ml[features_ml + ['True_IOL']].dropna()
    X_ml = df_ml_full[features_ml]
    y_ml = df_ml_full['True_IOL']

    print(f"Cases for ML comparison: {len(X_ml)}")

    # Candidate regressors predicting the emmetropic IOL power directly
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
    }

    # Leave-one-out MAE per model (reuses the `loo` splitter created in Cell 7)
    results = {}

    for name, model in models.items():
        predictions, actuals = [], []

        for train_idx, test_idx in loo.split(X_ml):
            X_train, X_test = X_ml.iloc[train_idx], X_ml.iloc[test_idx]
            y_train, y_test = y_ml.iloc[train_idx], y_ml.iloc[test_idx]

            # Refit on the training fold, predict the single held-out eye
            model.fit(X_train, y_train)
            pred = model.predict(X_test)[0]

            predictions.append(pred)
            actuals.append(y_test.values[0])

        mae = mean_absolute_error(actuals, predictions)
        results[name] = mae
        print(f"{name} MAE: {mae:.3f} D")

    # Compare with original SRK/T on the same rows (index-aligned join)
    srkt_on_ml_rows = df_ml_full.merge(df_ml[['SRKT_Error_Bio']], left_index=True, right_index=True)
    original_mae = srkt_on_ml_rows['SRKT_Error_Bio'].abs().mean()
    best_name = min(results, key=results.get)
    print(f"\nOriginal SRK/T MAE: {original_mae:.3f} D")
    print(f"Best ML Model: {best_name} with MAE: {results[best_name]:.3f} D")

In [None]:
# Cell 9: Create Final Optimized Formula
if len(df_ml) > 10:
    # Train final model on all data
    best_model = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
    best_model.fit(X_ml, y_ml)

    # Feature importance
    feature_importance_gb = pd.DataFrame({
        'Feature': features_ml,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_gb['Feature'], feature_importance_gb['Importance'])
    plt.xlabel('Feature Importance')
    plt.title('Feature Importance in Gradient Boosting Model')
    plt.tight_layout()
    plt.show()

    print("Feature Importance:")
    print(feature_importance_gb)

    # Simplified correction formula on a fixed set of three features.
    # NOTE(review): this list is hard-coded and is NOT derived from
    # feature_importance_gb above - compare it with the printed ranking.
    top_features = ['Posterior Km', 'CCT', 'Bio-AL']

    # Ensure we have complete data for these features
    df_simple = df_ml[top_features + ['True_IOL', 'SRKT_Prediction_Bio']].dropna()
    X_simple = df_simple[top_features]
    # Target: the additive correction that maps SRK/T onto the emmetropic power
    y_correction = df_simple['True_IOL'] - df_simple['SRKT_Prediction_Bio']

    model_simple = LinearRegression()
    model_simple.fit(X_simple, y_correction)

    print(f"\nSimplified Correction Formula:")
    print(f"Correction = {model_simple.intercept_:.3f}")
    for feat, coef in zip(top_features, model_simple.coef_):
        print(f"           + {coef:.4f} × {feat}")

    # In-sample fit error: the model is scored on the rows it was trained on,
    # so this figure is optimistic and not comparable with leave-one-out MAEs.
    df_simple['Predicted_Correction'] = model_simple.predict(X_simple)
    df_simple['Final_Prediction'] = df_simple['SRKT_Prediction_Bio'] + df_simple['Predicted_Correction']
    final_mae = mean_absolute_error(df_simple['True_IOL'], df_simple['Final_Prediction'])

    print(f"\nFinal simplified formula MAE (in-sample): {final_mae:.3f} D")

    # Leave-one-out estimate. The error of the corrected IOL prediction equals the
    # error of the predicted correction, so scoring the correction model is enough.
    loo_scores = cross_val_score(
        LinearRegression(), X_simple, y_correction,
        cv=LeaveOneOut(), scoring='neg_mean_absolute_error'
    )
    final_mae_loo = -loo_scores.mean()
    print(f"Final simplified formula MAE (leave-one-out): {final_mae_loo:.3f} D")

In [None]:
# Cell 10: Final Validation and Results Summary
if len(df_ml) <= 10:
    print("Insufficient data for complete analysis")
else:
    separator = "="*50
    srkt_errors = df_ml['SRKT_Error_Bio']

    # Summary statistics
    print("FINAL RESULTS SUMMARY")
    print(separator)
    print(f"Original data points: {len(df)}")
    print(f"Complete cases for analysis: {len(df_ml)}")
    print(f"\nOriginal SRK/T Performance:")
    print(f"  MAE: {srkt_errors.abs().mean():.3f} D")
    print(f"  Mean Error: {srkt_errors.mean():.3f} D")
    print(f"  STD: {srkt_errors.std():.3f} D")

    # Only reported when the nc optimisation cell has been run
    if 'optimal_nc' in locals():
        print(f"\nOptimized nc: {optimal_nc:.4f}")

    # Coefficients follow the feature order used when fitting model_simple
    coef_posterior, coef_cct, coef_al = model_simple.coef_[:3]

    print("\n" + separator)
    print("RECOMMENDED FORMULA FOR FACODMEK:")
    print(separator)
    print("Modified_IOL = Standard_SRK/T + Correction")
    print(f"\nWhere Correction = {model_simple.intercept_:.3f} + "
          f"{coef_posterior:.4f}×Posterior_Km + "
          f"{coef_cct:.4f}×CCT + "
          f"{coef_al:.4f}×AL")

    # Save per-eye results.
    # NOTE(review): the export includes the 'Patient' column - check whether
    # identifying data may be written to this file.
    export_columns = ['ID', 'Patient', 'Eye', 'Bio-AL', 'K_avg_Bio', 'IOL Power',
                      'PostOP Spherical Equivalent', 'True_IOL', 'SRKT_Prediction_Bio',
                      'SRKT_Error_Bio']
    results_df = df_ml[export_columns].copy()

    results_df.to_excel('FacoDMEK_Optimization_Results.xlsx', index=False)
    print("\nResults saved to 'FacoDMEK_Optimization_Results.xlsx'")