In [None]:
!pip install xgboost lightgbm 

In [None]:
import pandas as pd
import numpy as np
# Read the MIMIC-III extract and report its dimensions.
data = pd.read_csv(filepath_or_buffer='../input/mimic3c/mimic3c.csv')
print("With id", data.shape)

In [None]:
# Sanity check: dimensions first, then the leading rows.
for preview in (data.shape, data.head()):
    print(preview)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
data = pd.read_csv('../input/mimic3c/mimic3c.csv')


def _print_section(title):
    """Print a blank line, then `title` framed between two 50-character rules."""
    print("\n" + "="*50)
    print(title)
    print("="*50)


# Initial exploration
print("Dataset Shape:", data.shape)

_print_section("COLUMN INFORMATION:")
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the output.
data.info()

_print_section("FIRST 5 ROWS:")
print(data.head())

_print_section("BASIC STATISTICS:")
print(data.describe())

_print_section("MISSING VALUES:")
missing_values = data.isnull().sum()
missing_percent = (missing_values / len(data)) * 100
missing_info = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Percentage': missing_percent
})
print(missing_info[missing_info['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))

_print_section("UNIQUE VALUES PER COLUMN:")
# Count distinct values once per column and reuse the counts for the target
# heuristic below instead of recomputing them.
unique_counts = {col: data[col].nunique() for col in data.columns}
for col, n_unique in unique_counts.items():
    print(f"{col}: {n_unique} unique values")

_print_section("TARGET VARIABLE DISTRIBUTION (if identifiable):")
# Check for potential target columns: any object/int64/float64 column with at
# most 10 distinct values is listed as a candidate.
potential_targets = [
    col for col in data.columns
    if data[col].dtype in ('object', 'int64', 'float64') and unique_counts[col] <= 10
]

if potential_targets:
    print("Potential target variables:")
    for target in potential_targets:
        print(f"\n{target}:")
        print(data[target].value_counts())
else:
    print("No obvious categorical target variable found. Please specify the target column.")

In [None]:
# Step 2: Comprehensive EDA
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for better plots
plt.style.use('default')
sns.set_palette("husl")

# Define target variable
target = 'ExpiredHospital'
print(f"TARGET VARIABLE: {target}")
print("="*60)

# Target distribution.  value_counts() orders by frequency, so sort_index()
# pins the order to the class labels (0, 1); the pie chart below pairs the
# values positionally with ['Survived', 'Expired'].
print("TARGET DISTRIBUTION:")
target_counts = data[target].value_counts().sort_index()
target_pct = data[target].value_counts(normalize=True).sort_index() * 100
print(f"Survived (0): {target_counts[0]} ({target_pct[0]:.2f}%)")
print(f"Expired (1): {target_counts[1]} ({target_pct[1]:.2f}%)")
print(f"Class imbalance ratio: {target_counts[0]/target_counts[1]:.2f}:1")

# Shared legend labels / colours for the two outcome classes (0 then 1).
class_labels = ['Survived', 'Expired']
class_colors = ['lightblue', 'salmon']
survived_mask = data[target] == 0
expired_mask = data[target] == 1

# Create comprehensive EDA plots
fig, axes = plt.subplots(3, 3, figsize=(18, 15))

# 1. Target distribution
axes[0,0].pie(target_counts.values, labels=class_labels, autopct='%1.1f%%',
              colors=class_colors)
axes[0,0].set_title('Hospital Mortality Distribution')

# 2-3. Age and length of stay by target
for column, ax, title in [
    ('age', axes[0,1], 'Age Distribution by Mortality'),
    ('LOSdays', axes[0,2], 'Length of Stay by Mortality'),
]:
    data.boxplot(column=column, by=target, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Expired Hospital')

# 4-6. Mortality rate (row-normalised percentages) per categorical feature
gender_crosstab = pd.crosstab(data['gender'], data[target], normalize='index') * 100
admit_crosstab = pd.crosstab(data['admit_type'], data[target], normalize='index') * 100
insurance_crosstab = pd.crosstab(data['insurance'], data[target], normalize='index') * 100
for rates, ax, title, rotation in [
    (gender_crosstab, axes[1,0], 'Mortality Rate by Gender', 0),
    (admit_crosstab, axes[1,1], 'Mortality Rate by Admit Type', 45),
    (insurance_crosstab, axes[1,2], 'Mortality Rate by Insurance', 45),
]:
    rates.plot(kind='bar', ax=ax, color=class_colors)
    ax.set_title(title)
    ax.set_ylabel('Percentage')
    ax.legend(class_labels)
    ax.tick_params(axis='x', rotation=rotation)

# 7-9. Count-feature histograms.  The bins are restricted to the displayed
# interval via `range`; otherwise the 30 bins would span the full data range
# (including extreme values) and set_xlim would crop the plot to only a few
# very wide bars.
for column, ax, title, xlabel, x_max in [
    ('NumDiagnosis', axes[2,0], 'Number of Diagnoses Distribution', 'Number of Diagnoses', 20),
    ('TotalNumInteract', axes[2,1], 'Total Interactions Distribution', 'Total Number of Interactions', 2000),
    ('NumChartEvents', axes[2,2], 'Chart Events Distribution', 'Number of Chart Events', 1500),
]:
    ax.hist([data.loc[survived_mask, column], data.loc[expired_mask, column]],
            bins=30, range=(0, x_max), alpha=0.7, label=class_labels, color=class_colors)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.set_xlim(0, x_max)

# Set the figure title last: DataFrame.boxplot(by=...) installs its own
# "Boxplot grouped by ..." suptitle, which would replace one set earlier.
fig.suptitle('MIMIC-III Dataset - Comprehensive EDA', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("KEY INSIGHTS FROM EDA:")
print("="*60)

# Age analysis
age_survived = data.loc[survived_mask, 'age'].mean()
age_expired = data.loc[expired_mask, 'age'].mean()
print(f"Average age - Survived: {age_survived:.1f}, Expired: {age_expired:.1f}")

# LOS analysis
los_survived = data.loc[survived_mask, 'LOSdays'].mean()
los_expired = data.loc[expired_mask, 'LOSdays'].mean()
print(f"Average LOS - Survived: {los_survived:.1f} days, Expired: {los_expired:.1f} days")

# Numerical features correlation with target
print("\nCORRELATION WITH TARGET (ExpiredHospital):")
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols.remove(target)
if 'hadm_id' in numerical_cols:
    numerical_cols.remove('hadm_id')  # Remove ID column

# Drop the target's self-correlation by label rather than by position, then
# rank the remaining features by absolute correlation.
correlations = (
    data[numerical_cols + [target]].corr()[target]
    .drop(target)
    .sort_values(key=abs, ascending=False)
)
print(correlations.head(10))

# Missing values impact
print(f"\nMISSING VALUES ANALYSIS:")
for label, column in [
    ('Marital Status', 'marital_status'),
    ('Religion', 'religion'),
    ('Admit Diagnosis', 'AdmitDiagnosis'),
]:
    n_missing = data[column].isnull().sum()
    print(f"{label} missing: {n_missing} ({n_missing/len(data)*100:.1f}%)")

# High-level summary
print(f"\nDATASET SUMMARY:")
print(f"- Total samples: {len(data):,}")
print(f"- Features: {len(data.columns)-1}")
print(f"- Target: Hospital Mortality (Binary)")
print(f"- Class distribution: {target_pct[0]:.1f}% survived, {target_pct[1]:.1f}% expired")
print(f"- Imbalance ratio: {target_counts[0]/target_counts[1]:.1f}:1 (moderate imbalance)")

# **Default Models**

In [None]:
# Complete MIMIC-III Classification Analysis Pipeline
# This script includes data loading, preprocessing, and the classification models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, 
                           roc_curve, precision_recall_curve, average_precision_score,
                           f1_score, precision_score, recall_score, accuracy_score)
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
print("MIMIC-III COMPLETE CLASSIFICATION ANALYSIS PIPELINE")
print("="*80)

# STEP 1: DATA LOADING AND EXPLORATION
print("STEP 1: DATA LOADING AND EXPLORATION")
print("-" * 40)

# Load the dataset - change this path to match your data location
# Common paths: 'mimic3c.csv', 'data/mimic3c.csv', '../input/mimic3c/mimic3c.csv'
data = pd.read_csv('../input/mimic3c/mimic3c.csv')  # Update this path as needed
# Alternative: data = pd.read_csv('mimic3c.csv')  # If file is in current directory
print(f"Dataset loaded successfully!")
print(f"Shape: {data.shape}")
print(f"Columns: {list(data.columns)}")

# Display first few rows
print("\nFirst 5 rows:")
print(data.head())

# Basic information about the dataset.  DataFrame.info() prints its report
# directly and returns None, so it is called bare; print(data.info()) would
# add a stray "None" line.
print(f"\nDataset Info:")
data.info()

# Check for missing values: per-column count and percentage, listing only the
# columns that actually have gaps, largest first.
print(f"\nMissing Values:")
missing_info = data.isnull().sum()
missing_percent = (missing_info / len(data)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing_info,
    'Missing_Percentage': missing_percent
})
print(missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False))

# STEP 2: FEATURE SELECTION AND TARGET PREPARATION
print(f"\n" + "="*80)
print("STEP 2: FEATURE SELECTION AND TARGET PREPARATION")
print("-" * 40)

# Feature groups.  The full feature list is derived from these groups so the
# two can never drift apart.
demographics = ['age', 'gender_encoded', 'ethnicity_encoded']
admission_details = ['admit_type_encoded', 'admit_location_encoded', 'insurance_encoded',
                    'religion_encoded', 'marital_status_encoded']
clinical_features = ['LOSdays', 'NumDiagnosis', 'NumProcs', 'NumCallouts', 'NumCPTevents',
                    'NumInput', 'NumLabs', 'NumMicroLabs', 'NumNotes', 'NumOutput', 'NumRx',
                    'NumProcEvents', 'NumTransfers', 'NumChartEvents', 'TotalNumInteract']
encoded_features = ['AdmitDiagnosis_encoded', 'AdmitProcedure_encoded']
engineered_features = ['age_group_encoded', 'los_category_encoded', 'lab_intensity',
                      'med_intensity', 'procedure_intensity', 'monitoring_intensity',
                      'clinical_burden', 'interaction_density', 'high_age', 'high_los',
                      'high_diagnoses', 'emergency_admit', 'icu_indicator']

# Select features for modeling (as specified)
feature_columns = (demographics + admission_details + clinical_features
                   + encoded_features + engineered_features)

# Target variable
target_column = 'ExpiredHospital'

print(f"✅ Target variable: {target_column}")
print(f"📊 Total features selected: {len(feature_columns)}")

# Check if all required columns exist in the dataset.
# NOTE(review): the '*_encoded' and engineered columns are not created on
# `data` by any cell that precedes this one (the preprocessing step further
# down builds them on a separate frame, `df`), so with a freshly loaded
# `data` they are reported as missing and dropped by the filter below.
missing_features = [col for col in feature_columns if col not in data.columns]
missing_target = target_column not in data.columns

if missing_features:
    print(f"⚠️  Missing feature columns: {missing_features}")
    print("Available columns in dataset:")
    print(list(data.columns))
    print("\nFiltering to only available features...")
    feature_columns = [col for col in feature_columns if col in data.columns]
    print(f"✅ Using {len(feature_columns)} available features")

if missing_target:
    print(f"❌ Target column '{target_column}' not found in dataset!")
    print("Available columns:", list(data.columns))
    # Fail here with an explicit message instead of continuing into an
    # unexplained KeyError on the target lookup below.
    raise KeyError(f"Target column '{target_column}' not found in dataset")

if not feature_columns:
    raise ValueError("None of the requested feature columns are present in the dataset")

# Create feature matrix and target vector
X = data[feature_columns].copy()
y = data[target_column].copy()

print(f"\n📊 FEATURE CATEGORIES:")
print("-" * 30)

# Report, per category, which of its features survived the availability filter
for category_label, category_features in [
    ("Demographics", demographics),
    ("Admission Details", admission_details),
    ("Clinical Features", clinical_features),
    ("Encoded Features", encoded_features),
    ("Engineered Features", engineered_features),
]:
    present = [f for f in category_features if f in feature_columns]
    print(f"{category_label} ({len(present)}): {present}")

print(f"\n📈 DATA SUMMARY:")
print(f"   Features shape: {X.shape}")
print(f"   Target shape: {y.shape}")
print(f"   Target distribution:\n{y.value_counts()}")

# Check for missing values in selected features
missing_in_features = X.isnull().sum()
if missing_in_features.sum() > 0:
    print(f"\n⚠️  Missing values in features:")
    print(missing_in_features[missing_in_features > 0])

    # Simple imputation for missing values.
    # NOTE(review): the medians are taken over all rows, before the train/test
    # split performed later in this cell.
    print("Applying simple imputation...")
    X = X.fillna(X.median(numeric_only=True))
    print("✅ Missing values filled with median")

print(f"\n✅ Feature matrix and target ready for modeling!")
print(f"   Final X shape: {X.shape}")
print(f"   Final y shape: {y.shape}")
print(f"   Target classes: {sorted(y.unique())}")

# STEP 4: BASIC CLASSIFICATION MODELS WITH DEFAULT PARAMETERS
print("\n" + "="*80)
print("STEP 4: BASIC CLASSIFICATION MODELS WITH DEFAULT PARAMETERS")
print("="*80)

# 1. TRAIN-TEST SPLIT (STRATIFIED)
print("1. SPLITTING DATA...")
print("-" * 40)

# Hold out 20% of the rows; stratifying on y keeps the class proportions of
# the target in both partitions.
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for split_name, split_X in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set: {split_X.shape[0]} samples")
for split_name, split_y in (("Training", y_train), ("Test", y_test)):
    print(f"{split_name} class distribution: {split_y.value_counts().to_dict()}")


print("-" * 40)


# NOTE(review): no scaler is fitted in this cell - the "*_scaled" names are
# plain aliases of the raw splits, kept because the training loop refers to
# them.  Confirm this is intended for the baseline run.
X_train_scaled, X_test_scaled = X_train, X_test



# 3. DEFINE MODELS WITH DEFAULT PARAMETERS
print("\n3. DEFINING MODELS WITH DEFAULT PARAMETERS...")
print("-" * 40)

# (display name, estimator) pairs; the dict keeps this order, which the
# training loop relies on for colour assignment and result ordering.
model_specs = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Neural Network (MLP)', MLPClassifier(random_state=42, max_iter=200)),
    ('Support Vector Machine', SVC(probability=True, random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42)),
]
models = dict(model_specs)
print("Models defined with default parameters only (Neural Network max_iter=200 to ensure convergence)")

# 4. TRAIN MODELS AND COLLECT RESULTS
model_results = {}
trained_models = {}

# Create figure for ROC curves
plt.figure(figsize=(10, 8))
# Curve colours, defined once; the loop indexes this list modulo its length so
# adding a tenth model cannot raise IndexError.
colors = ['blue', 'green', 'red', 'purple', 'orange', 'cyan', 'magenta', 'brown', 'black']

# One splitter shared by every model.  With an integer random_state each call
# to split() yields the same folds, so all models are scored on identical
# partitions.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\n" + "="*80)
print("MODEL TRAINING AND EVALUATION RESULTS")
print("="*80)

for i, (name, model) in enumerate(models.items()):
    print(f"\n🔹 TRAINING: {name}")
    print("-" * 60)

    # Fit on the full training split; this fitted instance is the one
    # evaluated on the test split and stored in trained_models.
    model.fit(X_train_scaled, y_train)

    # Cross-validated ROC-AUC on the training split
    cv_scores = cross_val_score(model, X_train_scaled, y_train,
                               cv=cv_splitter,
                               scoring='roc_auc', n_jobs=-1)

    # Predictions on test set: hard labels plus probability of class 1
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate all metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    # Store results (scalar metrics plus the raw predictions, which the
    # detailed analysis of the best model reads back later)
    model_results[name] = {
        'CV_AUC_Mean': cv_scores.mean(),
        'CV_AUC_Std': cv_scores.std(),
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1_Score': f1,
        'AUC': auc_score,
        'Predictions': y_pred,
        'Probabilities': y_pred_proba
    }

    trained_models[name] = model

    # Print detailed metrics for this model
    print(f"📊 PERFORMANCE METRICS:")
    print(f"   ✅ Accuracy:     {accuracy:.4f}")
    print(f"   ✅ Precision:    {precision:.4f}")
    print(f"   ✅ Recall:       {recall:.4f}")
    print(f"   ✅ F1-Score:     {f1:.4f}")
    print(f"   ✅ AUC-ROC:      {auc_score:.4f}")
    print(f"   ✅ CV AUC:       {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

    # Calculate and plot ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, linewidth=2, color=colors[i % len(colors)],
             label=f'{name} (AUC: {auc_score:.3f})')

# Finalize ROC plot
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves - Default Parameter Models', fontsize=14, fontweight='bold')
plt.legend(loc="lower right", fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 5. MODEL COMPARISON TABLE
print("\n" + "="*80)
print("MODEL COMPARISON - DEFAULT PARAMETERS")
print("="*80)

# Create comparison DataFrame from the scalar metrics only.  model_results
# also holds prediction arrays; building the frame from the whole dict makes
# every column object-dtype, on which round() does nothing and idxmax() can
# raise "reduction operation 'argmax' not allowed for this dtype".
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1_Score', 'AUC', 'CV_AUC_Mean']
comparison_df = pd.DataFrame.from_dict(
    {
        model_name: {metric: results[metric] for metric in metric_columns}
        for model_name, results in model_results.items()
    },
    orient='index',
)
comparison_df = comparison_df[metric_columns].astype(float).round(4)

print(comparison_df.to_string())

# Find best model for each metric
print(f"\n🏆 BEST PERFORMERS BY METRIC:")
print(f"   🥇 Best Accuracy:  {comparison_df['Accuracy'].idxmax()} ({comparison_df['Accuracy'].max():.4f})")
print(f"   🥇 Best Precision: {comparison_df['Precision'].idxmax()} ({comparison_df['Precision'].max():.4f})")
print(f"   🥇 Best Recall:    {comparison_df['Recall'].idxmax()} ({comparison_df['Recall'].max():.4f})")
print(f"   🥇 Best F1-Score:  {comparison_df['F1_Score'].idxmax()} ({comparison_df['F1_Score'].max():.4f})")
print(f"   🥇 Best AUC:       {comparison_df['AUC'].idxmax()} ({comparison_df['AUC'].max():.4f})")

# Overall best model (based on AUC)
best_model_name = comparison_df['AUC'].idxmax()
print(f"\n🎯 OVERALL BEST MODEL: {best_model_name}")
print(f"   📈 AUC Score: {comparison_df.loc[best_model_name, 'AUC']:.4f}")

# 6. DETAILED EVALUATION OF BEST MODEL
print(f"\n" + "="*80)
print(f"DETAILED ANALYSIS - {best_model_name}")
print("="*80)

best_predictions = model_results[best_model_name]['Predictions']
best_probabilities = model_results[best_model_name]['Probabilities']

# The target is treated as binary with labels 0 and 1.  Passing the labels
# explicitly keeps the report and the confusion matrix at two classes even
# when one class never appears in the predictions or the test split.
class_labels = [0, 1]

# Classification Report
print("📋 CLASSIFICATION REPORT:")
target_names = ['Class 0', 'Class 1']  # Adjust based on your target
print(classification_report(y_test, best_predictions, labels=class_labels,
                            target_names=target_names, digits=4))

# Confusion Matrix (rows = true label, columns = predicted label)
print("\n📊 CONFUSION MATRIX:")
cm = confusion_matrix(y_test, best_predictions, labels=class_labels)
print(cm)

# Calculate additional metrics; every ratio is guarded against an empty
# denominator and reported as 0 in that case.
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
npv = tn / (tn + fn) if (tn + fn) > 0 else 0

print(f"\n📈 ADDITIONAL METRICS:")
print(f"   ✅ Sensitivity (True Positive Rate): {sensitivity:.4f}")
print(f"   ✅ Specificity (True Negative Rate): {specificity:.4f}")
print(f"   ✅ Positive Predictive Value (PPV):  {ppv:.4f}")
print(f"   ✅ Negative Predictive Value (NPV):  {npv:.4f}")

# 7. VISUALIZATION - CONFUSION MATRIX HEATMAP
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# 8. FINAL SUMMARY
print("\n" + "="*80)
print("FINAL SUMMARY - DEFAULT PARAMETER MODELS")
print("="*80)

# One row per model, with every metric rendered as a 4-decimal string.
summary_data = [
    {
        'Model': model_name,
        'Accuracy': format(results['Accuracy'], ".4f"),
        'Precision': format(results['Precision'], ".4f"),
        'Recall': format(results['Recall'], ".4f"),
        'F1-Score': format(results['F1_Score'], ".4f"),
        'AUC-ROC': format(results['AUC'], ".4f"),
        'CV AUC': format(results['CV_AUC_Mean'], ".4f"),
    }
    for model_name, results in model_results.items()
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Dump the full parameter set of each fitted estimator.
print("\n📝 MODEL PARAMETERS USED:")
for name, model in trained_models.items():
    print(f"\n{name}:")
    print(f"   Parameters: {model.get_params()}")

# Closing recap of the run.
recap_lines = [
    "\n🎯 SUMMARY:",
    f"   📊 Total models trained: {len(model_results)}",
    f"   🏆 Best model: {best_model_name}",
    f"   📈 Best AUC: {comparison_df['AUC'].max():.4f}",
    "   ⚙️  All models used default parameters",
    "   🚫 No class balancing or optimization applied",
    "   📋 All standard metrics calculated and displayed",
    f"   🎲 Target variable: {target_column}",
    f"   📊 Dataset: {data.shape[0]} samples, {X.shape[1]} features",
]
for recap_line in recap_lines:
    print(recap_line)

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print("="*80)

# **Improvement**

In [None]:
# Step 3: Data Preprocessing and Feature Engineering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

print("STEP 3: DATA PREPROCESSING & FEATURE ENGINEERING")
print("="*60)

# Create a copy for preprocessing so `data` itself is left untouched
df = data.copy()
print(f"Original dataset shape: {df.shape}")

# 1. HANDLE MISSING VALUES
print("\n1. HANDLING MISSING VALUES...")
print("-" * 40)

# The filled columns are assigned back explicitly.  The chained form
# `df[col].fillna(..., inplace=True)` operates on an intermediate object: it
# is deprecated in recent pandas and leaves `df` unchanged under Copy-on-Write.

# Fill missing marital_status with mode
df['marital_status'] = df['marital_status'].fillna(df['marital_status'].mode()[0])

# Fill missing religion with 'UNKNOWN'
df['religion'] = df['religion'].fillna('UNKNOWN')

# Fill missing AdmitDiagnosis with mode
df['AdmitDiagnosis'] = df['AdmitDiagnosis'].fillna(df['AdmitDiagnosis'].mode()[0])

# Total number of NaN cells left across the whole frame (columns not handled
# above may still contribute).
print("Missing values after imputation:")
print(df.isnull().sum().sum())

# 2. FEATURE ENGINEERING
print("\n2. FEATURE ENGINEERING...")
print("-" * 40)

# pd.cut uses right-closed intervals and, by default, excludes the lowest
# edge, so a value of exactly 0 - or anything above the last edge - would
# become NaN.  include_lowest=True admits 0 and an open-ended last bin
# (np.inf) keeps large values in the top category.

# Age groups (clinical relevance)
df['age_group'] = pd.cut(df['age'],
                        bins=[0, 18, 35, 50, 65, 80, np.inf],
                        labels=['pediatric', 'young_adult', 'adult', 'middle_aged', 'elderly', 'very_elderly'],
                        include_lowest=True)

# LOS categories
df['los_category'] = pd.cut(df['LOSdays'],
                           bins=[0, 2, 7, 14, 30, np.inf],
                           labels=['very_short', 'short', 'medium', 'long', 'very_long'],
                           include_lowest=True)

# Clinical activity intensity features (count per day of stay)
los_plus_one = df['LOSdays'] + 1  # +1 to avoid division by zero
df['lab_intensity'] = df['NumLabs'] / los_plus_one
df['med_intensity'] = df['NumRx'] / los_plus_one
df['procedure_intensity'] = df['NumProcs'] / los_plus_one
df['monitoring_intensity'] = df['NumChartEvents'] / los_plus_one

# Total clinical burden score
df['clinical_burden'] = (df['NumDiagnosis'] + df['NumProcs'] + df['NumCallouts'])

# Interaction density
df['interaction_density'] = df['TotalNumInteract'] / los_plus_one

# High-risk indicators.
# NOTE(review): the quantile thresholds below are computed over every row of
# df, i.e. before the train/test split that follows in a later cell.
df['high_age'] = (df['age'] >= 70).astype(int)
df['high_los'] = (df['LOSdays'] >= 14).astype(int)
df['high_diagnoses'] = (df['NumDiagnosis'] >= df['NumDiagnosis'].quantile(0.75)).astype(int)
df['emergency_admit'] = (df['admit_type'] == 'EMERGENCY').astype(int)

# ICU indicators (based on high monitoring: top 10% of NumChartEvents)
df['icu_indicator'] = (df['NumChartEvents'] >= df['NumChartEvents'].quantile(0.90)).astype(int)

# Report every column present in df but not in the original frame
new_features = [col for col in df.columns if col not in data.columns]
print(f"New features created: {len(df.columns) - len(data.columns)}")
print("New feature list:")
for feature in new_features:
    print(f"  - {feature}")

# 3. ENCODE CATEGORICAL VARIABLES
print("\n3. ENCODING CATEGORICAL VARIABLES...")
print("-" * 40)

# Fitted encoder per source column, kept so the integer codes can be mapped
# back to their labels.
label_encoders = {}

# Categorical columns to encode
categorical_cols = ['gender', 'admit_type', 'admit_location', 'insurance',
                   'religion', 'marital_status', 'ethnicity', 'age_group', 'los_category']

# Each column gets its own LabelEncoder; the codes land in '<col>_encoded'.
for col in categorical_cols:
    encoder = LabelEncoder()
    df[f'{col}_encoded'] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder
    print(f"  {col}: {len(encoder.classes_)} categories -> encoded")

# 4. HANDLE HIGH CARDINALITY FEATURES
print("\n4. HANDLING HIGH CARDINALITY FEATURES...")
print("-" * 40)

# AdmitDiagnosis and AdmitProcedure have too many unique values, so only the
# 50 most frequent values of each are kept; everything else becomes 'OTHER'.
diagnosis_freq = df['AdmitDiagnosis'].value_counts()
procedure_freq = df['AdmitProcedure'].value_counts()

top_diagnoses = diagnosis_freq.index[:50]
top_procedures = procedure_freq.index[:50]

df['AdmitDiagnosis_grouped'] = df['AdmitDiagnosis'].where(
    df['AdmitDiagnosis'].isin(top_diagnoses), 'OTHER'
)
df['AdmitProcedure_grouped'] = df['AdmitProcedure'].where(
    df['AdmitProcedure'].isin(top_procedures), 'OTHER'
)

# Encode the grouped versions
le_diag = LabelEncoder()
le_proc = LabelEncoder()
df['AdmitDiagnosis_encoded'] = le_diag.fit_transform(df['AdmitDiagnosis_grouped'])
df['AdmitProcedure_encoded'] = le_proc.fit_transform(df['AdmitProcedure_grouped'])

for source_name, freq_table, fitted in (
    ('AdmitDiagnosis', diagnosis_freq, le_diag),
    ('AdmitProcedure', procedure_freq, le_proc),
):
    print(f"  {source_name}: {len(freq_table)} -> {len(fitted.classes_)} categories")

# 5. FEATURE SELECTION FOR MODELING
print("\n5. PREPARING FEATURE SET...")
print("-" * 40)

# The modelling columns, assembled group by group; the concatenation order
# below is the column order of X.
demographic_cols = ['age', 'gender_encoded', 'ethnicity_encoded']

admission_cols = ['admit_type_encoded', 'admit_location_encoded', 'insurance_encoded',
                  'religion_encoded', 'marital_status_encoded']

clinical_cols = ['LOSdays', 'NumDiagnosis', 'NumProcs', 'NumCallouts',
                 'NumCPTevents', 'NumInput', 'NumLabs', 'NumMicroLabs',
                 'NumNotes', 'NumOutput', 'NumRx', 'NumProcEvents',
                 'NumTransfers', 'NumChartEvents', 'TotalNumInteract']

# Encoded diagnosis and procedure
diagnosis_procedure_cols = ['AdmitDiagnosis_encoded', 'AdmitProcedure_encoded']

engineered_cols = ['age_group_encoded', 'los_category_encoded',
                   'lab_intensity', 'med_intensity', 'procedure_intensity', 'monitoring_intensity',
                   'clinical_burden', 'interaction_density',
                   'high_age', 'high_los', 'high_diagnoses', 'emergency_admit', 'icu_indicator']

feature_columns = [
    *demographic_cols,
    *admission_cols,
    *clinical_cols,
    *diagnosis_procedure_cols,
    *engineered_cols,
]

# Feature matrix and target vector, copied so later edits do not touch df
X = df.loc[:, feature_columns].copy()
y = df['ExpiredHospital'].copy()

n_features = X.shape[1]
print(f"Final feature set: {n_features} features")
print(f"Target distribution: {y.value_counts().to_dict()}")

# 6. HANDLE OUTLIERS (using IQR method for key continuous features)
print("\n6. OUTLIER HANDLING...")
print("-" * 40)

continuous_features = ['LOSdays', 'NumDiagnosis', 'NumInput', 'NumLabs', 'NumChartEvents',
                      'TotalNumInteract', 'lab_intensity', 'med_intensity', 'monitoring_intensity']

# NOTE(review): the quartiles are computed on the whole of X, before the
# train/test split performed in a later cell.
outlier_stats = {}
skipped_features = []
for feature in continuous_features:
    Q1 = X[feature].quantile(0.25)
    Q3 = X[feature].quantile(0.75)
    IQR = Q3 - Q1

    if IQR == 0:
        # Q1 == Q3 makes both fences equal to that value, so clipping would
        # overwrite the entire column with a single constant.  Leave the
        # feature as it is.
        outlier_stats[feature] = 0
        skipped_features.append(feature)
        continue

    # Tukey fences: 1.5 * IQR beyond the quartiles
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers_before = ((X[feature] < lower_bound) | (X[feature] > upper_bound)).sum()

    # Cap outliers instead of removing them
    X[feature] = X[feature].clip(lower=lower_bound, upper=upper_bound)

    outlier_stats[feature] = outliers_before

print("Outliers capped per feature:")
for feature, count in outlier_stats.items():
    if count > 0:
        print(f"  {feature}: {count} outliers capped")
for feature in skipped_features:
    print(f"  {feature}: skipped (IQR is 0)")

# 7. FINAL DATASET INFO
print("\n7. FINAL PREPROCESSED DATASET:")
print("-" * 40)
print(f"Shape: {X.shape}")
print(f"Features: {list(X.columns)}")

# Class balance of the target, as counts and percentages
n_rows = len(y)
n_survived = (y == 0).sum()
n_expired = (y == 1).sum()
print("Target class distribution:")
print(f"  Survived (0): {n_survived} ({n_survived/n_rows*100:.1f}%)")
print(f"  Expired (1): {n_expired} ({n_expired/n_rows*100:.1f}%)")

# Total NaN cells in the feature matrix / NaN entries in the target
nan_cells_in_X = X.isnull().sum().sum()
print(f"Missing values in X: {nan_cells_in_X}")
print(f"Missing values in y: {y.isnull().sum()}")

# Check for any remaining issues
print("\nData quality check:")
print(f"- Infinite values: {np.isinf(X).sum().sum()}")
print(f"- NaN values: {nan_cells_in_X}")
print(f"- Feature dtypes: {X.dtypes.value_counts().to_dict()}")

for closing_line in ("\nPreprocessing completed successfully!", "Ready for model training..."):
    print(closing_line)

In [None]:
##############################

In [None]:
# Step 4: High-Performance Classification Model (Enhanced with Detailed Metrics)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, 
                           roc_curve, precision_recall_curve, average_precision_score,
                           f1_score, precision_score, recall_score, accuracy_score)
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

print("STEP 4: HIGH-PERFORMANCE CLASSIFICATION MODEL")
print("="*60)

# 1. TRAIN-TEST SPLIT (STRATIFIED)
print("1. SPLITTING DATA...")
print("-" * 40)

# 80/20 split that preserves the class proportions of y in both parts
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

for split_name, split_X in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set: {split_X.shape[0]} samples")
for split_name, split_y in (("Training", y_train), ("Test", y_test)):
    print(f"{split_name} class distribution: {split_y.value_counts().to_dict()}")

# 2. FEATURE SCALING
print("\n2. FEATURE SCALING...")
print("-" * 40)

# The scaler learns its statistics from the training split only and then
# applies them to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled using StandardScaler")

# 3. CALCULATE CLASS WEIGHTS FOR IMBALANCE HANDLING
print("\n3. HANDLING CLASS IMBALANCE...")
print("-" * 40)

# 'balanced' weights, one per entry of np.unique(y_train) in that order
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
weight_class0, weight_class1 = class_weights[0], class_weights[1]
class_weight_dict = {0: weight_class0, 1: weight_class1}
# Ratio of the class-1 weight to the class-0 weight
scale_pos_weight = weight_class1 / weight_class0

print(f"Class weights: {class_weight_dict}")
print(f"Scale pos weight: {scale_pos_weight:.3f}")

# Manual oversampling function (alternative to SMOTE)
def manual_oversample(X, y, random_state=42):
    """Balance a binary (0/1) dataset by resampling the smaller class with replacement.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Converted with ``np.asarray``.
    y : array-like of shape (n_samples,)
        Labels using the values 0 and 1. A pandas Series, list or ndarray is
        accepted; only positions matter, not the pandas index.
    random_state : int, default 42
        Seed for a private ``RandomState``. The global NumPy RNG is left
        untouched, so calling this function does not change later random draws
        elsewhere in the notebook.

    Returns
    -------
    X_balanced : np.ndarray
        Rows in the order: larger class, smaller class, resampled copies of
        the smaller class.
    y_balanced : np.ndarray
        Labels aligned with ``X_balanced``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if one class is entirely
        absent while the other is not (nothing to resample from).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )

    # A private generator draws the same indices a freshly seeded global RNG
    # would, without reseeding global state as a side effect.
    rng = np.random.RandomState(random_state)

    # Class 0 is treated as the majority unless class 1 is strictly larger.
    majority_class, minority_class = 0, 1
    if np.sum(y == 1) > np.sum(y == 0):
        majority_class, minority_class = 1, 0

    majority_mask = y == majority_class
    minority_mask = y == minority_class
    X_majority, y_majority = X[majority_mask], y[majority_mask]
    X_minority, y_minority = X[minority_mask], y[minority_mask]

    # Number of extra minority rows needed to reach parity.
    n_to_add = len(X_majority) - len(X_minority)

    if n_to_add > 0:
        if len(X_minority) == 0:
            raise ValueError("Cannot oversample: the minority class has no samples")
        # Randomly sample from minority class with replacement
        indices = rng.choice(len(X_minority), size=n_to_add, replace=True)
    else:
        # Already balanced: add nothing.
        indices = np.array([], dtype=int)

    X_minority_upsampled = X_minority[indices]
    y_minority_upsampled = y_minority[indices]

    # Combine majority and upsampled minority
    X_balanced = np.vstack([X_majority, X_minority, X_minority_upsampled])
    y_balanced = np.hstack([y_majority, y_minority, y_minority_upsampled])

    return X_balanced, y_balanced

# Apply manual oversampling
# Only the (already scaled) training split is oversampled; the test split is
# left at its natural class ratio.
X_train_balanced, y_train_balanced = manual_oversample(X_train_scaled, y_train, random_state=42)

print(f"Before oversampling: {pd.Series(y_train).value_counts().to_dict()}")
print(f"After oversampling: {pd.Series(y_train_balanced).value_counts().to_dict()}")

# 4. MODEL TRAINING WITH MULTIPLE ALGORITHMS
print("\n4. TRAINING MULTIPLE MODELS...")
print("-" * 40)

# Define models with class-aware configurations
# The dict key is also the display name. The training loop below checks for the
# substring 'Oversampled' in the key to decide which training data to use, so
# renaming a key can silently change what a model is trained on.
models = {
    # Linear baseline; imbalance handled through class_weight.
    'Logistic Regression': LogisticRegression(
        random_state=42, 
        max_iter=1000,
        class_weight='balanced',
        C=0.1
    ),
    
    # Forest trained on the original class ratio with balanced class weights.
    'Random Forest (Balanced)': RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    ),
    
    # Same forest settings without class weights; it is fitted on the
    # oversampled training data instead.
    'Random Forest (Oversampled)': RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ),
    
    # NOTE(review): no imbalance handling is configured for this model (no
    # class weights, not routed to the oversampled data) — confirm that is
    # intended.
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42
    )
}

# Train models and collect results
# clone() is needed for the leak-free cross-validation of oversampled models
# below; imported here so this cell stays runnable on its own.
from sklearn.base import clone

model_results = {}
trained_models = {}

# Create figure for ROC curves
plt.figure(figsize=(12, 8))
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown']
roc_data = {}

print("\n" + "="*80)
print("DETAILED MODEL PERFORMANCE RESULTS")
print("="*80)

# One splitter shared by every model so all CV scores use identical folds.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_train_array = np.asarray(y_train)

for i, (name, model) in enumerate(models.items()):
    print(f"\n🔹 TRAINING: {name}")
    print("-" * 60)
    
    # Use different data based on model type
    uses_oversampling = 'Oversampled' in name
    if uses_oversampling:
        X_train_use, y_train_use = X_train_balanced, y_train_balanced
    else:
        X_train_use, y_train_use = X_train_scaled, y_train
    
    # Train model
    model.fit(X_train_use, y_train_use)
    
    # Cross-validation on training set
    if uses_oversampling:
        # Oversampling must happen inside each fold. Splitting data that was
        # oversampled beforehand puts copies of the same minority row in both
        # the fitting and the validation part, which inflates the CV score.
        fold_scores = []
        for fit_idx, val_idx in cv_splitter.split(X_train_scaled, y_train_array):
            X_fold, y_fold = manual_oversample(
                X_train_scaled[fit_idx], y_train.iloc[fit_idx], random_state=42
            )
            fold_model = clone(model).fit(X_fold, y_fold)
            fold_proba = fold_model.predict_proba(X_train_scaled[val_idx])[:, 1]
            fold_scores.append(roc_auc_score(y_train_array[val_idx], fold_proba))
        cv_scores = np.array(fold_scores)
    else:
        cv_scores = cross_val_score(model, X_train_use, y_train_use,
                                    cv=cv_splitter,
                                    scoring='roc_auc', n_jobs=-1)
    
    # Predictions on test set
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    # Calculate all metrics (precision/recall/F1 are for the positive class)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)
    avg_precision = average_precision_score(y_test, y_pred_proba)
    
    # Store results
    model_results[name] = {
        'CV_AUC_Mean': cv_scores.mean(),
        'CV_AUC_Std': cv_scores.std(),
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1_Score': f1,
        'AUC': auc_score,
        'Avg_Precision': avg_precision,
        'Predictions': y_pred,
        'Probabilities': y_pred_proba
    }
    
    trained_models[name] = model
    
    # Print detailed metrics for this model
    print(f"📊 PERFORMANCE METRICS:")
    print(f"   ✅ Accuracy:     {accuracy:.4f}")
    print(f"   ✅ Precision:    {precision:.4f}")
    print(f"   ✅ Recall:       {recall:.4f}")
    print(f"   ✅ F1-Score:     {f1:.4f}")
    print(f"   ✅ AUC-ROC:      {auc_score:.4f}")
    print(f"   ✅ CV AUC:       {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")
    
    # Calculate and store ROC curve data
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    roc_data[name] = {'fpr': fpr, 'tpr': tpr, 'auc': auc_score}
    
    # Plot ROC curve (drawn onto the figure opened above the loop)
    plt.plot(fpr, tpr, linewidth=2, color=colors[i % len(colors)], 
             label=f'{name} (AUC: {auc_score:.3f})')

# 5. CREATE ENSEMBLE MODEL
# Soft-voting blend of already computed test-set probabilities; no new model is
# fitted in this section.
print(f"\n🔹 TRAINING: Ensemble Model")
print("-" * 60)

# Get predictions from top 3 models
# NOTE(review): the ranking key 'AUC' is the score measured on y_test in the
# loop above, so ensemble members are chosen using the test set and the
# ensemble's test metrics are optimistic. Ranking by 'CV_AUC_Mean' would avoid
# that — confirm intent.
ensemble_probabilities = np.zeros(len(y_test))
top_3_models = sorted(model_results.items(), key=lambda x: x[1]['AUC'], reverse=True)[:3]

print(f"📋 Top 3 models for ensemble:")
for rank, (name, results) in enumerate(top_3_models, 1):
    print(f"   {rank}. {name} (AUC: {results['AUC']:.4f})")

# Fixed weights, best-ranked model first. They sum to 1 only when exactly three
# models are available; with fewer, the blended values no longer span [0, 1]
# and the 0.5 threshold below shifts accordingly.
weights = [0.4, 0.35, 0.25]  # Weighted ensemble
for i, (name, results) in enumerate(top_3_models):
    ensemble_probabilities += weights[i] * results['Probabilities']

ensemble_predictions = (ensemble_probabilities > 0.5).astype(int)

# Calculate ensemble metrics
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)
ensemble_precision = precision_score(y_test, ensemble_predictions)
ensemble_recall = recall_score(y_test, ensemble_predictions)
ensemble_f1 = f1_score(y_test, ensemble_predictions)
ensemble_auc = roc_auc_score(y_test, ensemble_probabilities)
ensemble_avg_precision = average_precision_score(y_test, ensemble_probabilities)

# The two CV_* entries are averages of the member models' CV statistics, not a
# cross-validation of the ensemble itself.
model_results['Ensemble (Top 3)'] = {
    'CV_AUC_Mean': np.mean([r['CV_AUC_Mean'] for _, r in top_3_models]),
    'CV_AUC_Std': np.mean([r['CV_AUC_Std'] for _, r in top_3_models]),
    'Accuracy': ensemble_accuracy,
    'Precision': ensemble_precision,
    'Recall': ensemble_recall,
    'F1_Score': ensemble_f1,
    'AUC': ensemble_auc,
    'Avg_Precision': ensemble_avg_precision,
    'Predictions': ensemble_predictions,
    'Probabilities': ensemble_probabilities
}

print(f"📊 ENSEMBLE PERFORMANCE METRICS:")
print(f"   ✅ Accuracy:     {ensemble_accuracy:.4f}")
print(f"   ✅ Precision:    {ensemble_precision:.4f}")
print(f"   ✅ Recall:       {ensemble_recall:.4f}")
print(f"   ✅ F1-Score:     {ensemble_f1:.4f}")
print(f"   ✅ AUC-ROC:      {ensemble_auc:.4f}")

# Add ensemble to ROC plot
# Drawn on the figure opened before the training loop, which is still the
# current pyplot figure at this point.
fpr_ensemble, tpr_ensemble, _ = roc_curve(y_test, ensemble_probabilities)
plt.plot(fpr_ensemble, tpr_ensemble, linewidth=3, color='black', linestyle='--',
         label=f'Ensemble (AUC: {ensemble_auc:.3f})')

# Finalize ROC plot
# The diagonal is the chance-level reference line.
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves - All Models Comparison', fontsize=14, fontweight='bold')
plt.legend(loc="lower right", fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 6. COMPREHENSIVE MODEL COMPARISON TABLE
print("\n" + "="*80)
print("COMPREHENSIVE MODEL COMPARISON")
print("="*80)

# Create detailed comparison DataFrame
# model_results mixes scalars with prediction arrays, so the transposed frame
# is object dtype. The selected metric columns are cast to float explicitly:
# on object columns round() is a no-op and idxmax() can raise TypeError.
comparison_df = pd.DataFrame(model_results).T
comparison_df = (
    comparison_df[['Accuracy', 'Precision', 'Recall', 'F1_Score', 'AUC', 'CV_AUC_Mean']]
    .astype(float)
    .round(4)
)

print(comparison_df.to_string())

# Find best model for each metric
print(f"\n🏆 BEST PERFORMERS BY METRIC:")
print(f"   🥇 Best Accuracy:  {comparison_df['Accuracy'].idxmax()} ({comparison_df['Accuracy'].max():.4f})")
print(f"   🥇 Best Precision: {comparison_df['Precision'].idxmax()} ({comparison_df['Precision'].max():.4f})")
print(f"   🥇 Best Recall:    {comparison_df['Recall'].idxmax()} ({comparison_df['Recall'].max():.4f})")
print(f"   🥇 Best F1-Score:  {comparison_df['F1_Score'].idxmax()} ({comparison_df['F1_Score'].max():.4f})")
print(f"   🥇 Best AUC:       {comparison_df['AUC'].idxmax()} ({comparison_df['AUC'].max():.4f})")

# Overall best model (based on AUC)
# NOTE(review): 'AUC' is the test-set score, so the "best" model is picked on
# the same data it is then reported on — confirm this is acceptable.
best_model_name = comparison_df['AUC'].idxmax()
print(f"\n🎯 OVERALL BEST MODEL: {best_model_name}")
print(f"   📈 AUC Score: {comparison_df.loc[best_model_name, 'AUC']:.4f}")

# 7. DETAILED EVALUATION OF BEST MODEL
print(f"\n" + "="*80)
print(f"DETAILED ANALYSIS - {best_model_name}")
print("="*80)

# Re-use the stored test-set outputs of the selected model; nothing is refitted.
best_predictions = model_results[best_model_name]['Predictions']
best_probabilities = model_results[best_model_name]['Probabilities']

# Classification Report
# target_names maps the sorted labels to display names; assumes label 0 means
# survived and label 1 means expired — TODO confirm against the target column.
print("📋 CLASSIFICATION REPORT:")
print(classification_report(y_test, best_predictions, target_names=['Survived', 'Expired'], digits=4))

# Confusion Matrix
print("\n📊 CONFUSION MATRIX:")
cm = confusion_matrix(y_test, best_predictions)
print(cm)

# Calculate additional metrics
# The four-way unpacking only works for a 2x2 matrix, i.e. when both classes
# occur in y_test or in the predictions.
tn, fp, fn, tp = cm.ravel()
# NOTE(review): unlike ppv/npv below, these two ratios have no guard against a
# zero denominator.
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
npv = tn / (tn + fn) if (tn + fn) > 0 else 0

print(f"\n📈 ADDITIONAL CLINICAL METRICS:")
print(f"   ✅ Sensitivity (True Positive Rate): {sensitivity:.4f}")
print(f"   ✅ Specificity (True Negative Rate): {specificity:.4f}")
print(f"   ✅ Positive Predictive Value (PPV):  {ppv:.4f}")
print(f"   ✅ Negative Predictive Value (NPV):  {npv:.4f}")

# 8. SUMMARY TABLE OF ALL METRICS
print(f"\n" + "="*80)
print("FINAL SUMMARY - ALL MODELS WITH ALL METRICS")
print("="*80)

# Display label -> key in each model's result dict, in output column order.
summary_columns = {
    'Accuracy': 'Accuracy',
    'Precision': 'Precision',
    'Recall': 'Recall',
    'F1-Score': 'F1_Score',
    'AUC-ROC': 'AUC',
    'CV AUC': 'CV_AUC_Mean',
}

# One row per model, every metric pre-formatted to four decimals as text.
summary_data = []
for model_name, results in model_results.items():
    summary_row = {'Model': model_name}
    summary_row.update(
        {label: f"{results[key]:.4f}" for label, key in summary_columns.items()}
    )
    summary_data.append(summary_row)

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

print(f"\n🚀 MODELS READY FOR DEPLOYMENT!")
print(f"   📊 Total models trained: {len(model_results)}")
print(f"   🎯 Best overall model: {best_model_name}")
print(f"   📈 Best AUC achieved: {comparison_df['AUC'].max():.4f}")
print(f"   🔄 All metrics calculated and displayed above")

# Store ROC curve data for potential future use
print(f"\n💾 ROC curve data stored for all models")
print(f"   📈 Use 'roc_data' dictionary to access FPR, TPR, and AUC for each model")

# **Hyperparameter**

In [None]:
# Comprehensive Healthcare ML Pipeline - MIMIC-III Dataset Analysis
# Fixed and Enhanced Version with All Required Metrics and Visualizations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
)

# Import all required models
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, 
    VotingClassifier, AdaBoostClassifier, BaggingClassifier
)
import xgboost as xgb
import lightgbm as lgb

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("=" * 100)
print("COMPREHENSIVE HEALTHCARE ML PIPELINE - MIMIC-III DATASET ANALYSIS")
print("=" * 100)

# ============================================================================
# STEP 1: DATA LOADING AND EXPLORATION
# ============================================================================
print("\nSTEP 1: DATA LOADING AND EXPLORATION")
print("-" * 60)

# Only the read itself is guarded, so the handler cannot hide errors raised by
# the exploration code that follows.
try:
    # Load the dataset - update path as needed
    data = pd.read_csv('../input/mimic3c/mimic3c.csv')  # Update this path as needed
except FileNotFoundError:
    print("❌ Error: Dataset file not found!")
    print("Please update the file path in the code to match your data location.")
    print("Common paths: 'mimic3c.csv', 'data/mimic3c.csv', '../input/mimic3c/mimic3c.csv'")
    # Re-raise rather than call exit(): inside a notebook exit() acts on the
    # kernel instead of simply stopping this cell, and the traceback is lost.
    raise
else:
    print(f"✓ Dataset loaded successfully!")
    print(f"✓ Shape: {data.shape}")
    print(f"✓ Columns: {list(data.columns)}")
    
    # Display first few rows
    print("\nFirst 5 rows preview:")
    print(data.head())
    
    # Basic statistics
    print(f"\nDataset Info:")
    print(f"- Total samples: {len(data)}")
    print(f"- Total features: {len(data.columns)}")
    print(f"- Missing values: {data.isnull().sum().sum()}")
    
    # Target variable distribution (skipped when the column is absent)
    if 'ExpiredHospital' in data.columns:
        target_dist = data['ExpiredHospital'].value_counts()
        print(f"\nTarget Distribution (ExpiredHospital):")
        print(f"- Survived (0): {target_dist.get(0, 0)} ({target_dist.get(0, 0)/len(data)*100:.1f}%)")
        print(f"- Expired (1): {target_dist.get(1, 0)} ({target_dist.get(1, 0)/len(data)*100:.1f}%)")

# ============================================================================
# STEP 2: DATA PREPROCESSING
# ============================================================================
print("\n" + "=" * 100)
print("STEP 2: DATA PREPROCESSING")
print("=" * 100)

# Create a copy for preprocessing so the raw frame `data` stays untouched
df = data.copy()
print(f"Original dataset shape: {df.shape}")

# 1. HANDLE MISSING VALUES
print("\n1. HANDLING MISSING VALUES...")
print("-" * 40)

missing_before = df.isnull().sum()
print("Missing values before treatment:")
for col in missing_before[missing_before > 0].index:
    print(f"  {col}: {missing_before[col]} ({missing_before[col]/len(df)*100:.1f}%)")

# Fill missing values based on column type and nature.
# Results are assigned back to the column instead of calling
# fillna(inplace=True) on df[col]: that chained in-place form is deprecated and
# leaves df unchanged when pandas Copy-on-Write is active.
if 'marital_status' in df.columns:
    marital_mode = df['marital_status'].mode()
    # Most frequent value, or 'UNKNOWN' when the column has no non-null values.
    df['marital_status'] = df['marital_status'].fillna(
        marital_mode.iloc[0] if not marital_mode.empty else 'UNKNOWN'
    )

if 'religion' in df.columns:
    df['religion'] = df['religion'].fillna('UNKNOWN')

if 'AdmitDiagnosis' in df.columns:
    diagnosis_mode = df['AdmitDiagnosis'].mode()
    df['AdmitDiagnosis'] = df['AdmitDiagnosis'].fillna(
        diagnosis_mode.iloc[0] if not diagnosis_mode.empty else 'UNKNOWN'
    )

# Columns other than the three above are not imputed, so this count can be > 0.
print(f"✓ Missing values after imputation: {df.isnull().sum().sum()}")

# 2. FEATURE ENGINEERING
print("\n2. FEATURE ENGINEERING...")
print("-" * 40)

# Age groups (clinical relevance)
# include_lowest=True keeps age 0 in the first bin and the open upper edge
# keeps ages above 100 in the last one. With closed [0, ..., 100] edges those
# rows fell outside every bin and became NaN, which the label encoding further
# down turns into a separate 'nan' category.
if 'age' in df.columns:
    df['age_group'] = pd.cut(df['age'],
                            bins=[0, 18, 35, 50, 65, 80, np.inf],
                            labels=['pediatric', 'young_adult', 'adult', 'middle_aged', 'elderly', 'very_elderly'],
                            include_lowest=True)

# LOS categories (same edge handling: 0 days and stays beyond 300 days are binned)
if 'LOSdays' in df.columns:
    df['los_category'] = pd.cut(df['LOSdays'],
                               bins=[0, 2, 7, 14, 30, np.inf],
                               labels=['very_short', 'short', 'medium', 'long', 'very_long'],
                               include_lowest=True)

# Clinical activity intensity features: event count divided by (LOSdays + 1);
# the +1 avoids division by zero for zero-length stays.
intensity_sources = {
    'lab_intensity': 'NumLabs',
    'med_intensity': 'NumRx',
    'procedure_intensity': 'NumProcs',
    'monitoring_intensity': 'NumChartEvents',
}
if 'LOSdays' in df.columns:
    for intensity_col, count_col in intensity_sources.items():
        if count_col in df.columns:
            df[intensity_col] = df[count_col] / (df['LOSdays'] + 1)

# Total clinical burden score (row-wise sum; only built when all parts exist)
burden_cols = ['NumDiagnosis', 'NumProcs', 'NumCallouts']
if all(col in df.columns for col in burden_cols):
    df['clinical_burden'] = df[burden_cols].sum(axis=1)

# Interaction density
if all(col in df.columns for col in ['TotalNumInteract', 'LOSdays']):
    df['interaction_density'] = df['TotalNumInteract'] / (df['LOSdays'] + 1)

# High-risk indicators (0/1 flags)
if 'age' in df.columns:
    df['high_age'] = (df['age'] >= 70).astype(int)
if 'LOSdays' in df.columns:
    df['high_los'] = (df['LOSdays'] >= 14).astype(int)
# NOTE(review): the quantile thresholds below are computed on the full dataset,
# before the train/test split, so test rows influence them.
if 'NumDiagnosis' in df.columns:
    df['high_diagnoses'] = (df['NumDiagnosis'] >= df['NumDiagnosis'].quantile(0.75)).astype(int)
if 'admit_type' in df.columns:
    df['emergency_admit'] = (df['admit_type'] == 'EMERGENCY').astype(int)

# ICU indicators: flags the top 10% of rows by chart-event count.
if 'NumChartEvents' in df.columns:
    df['icu_indicator'] = (df['NumChartEvents'] >= df['NumChartEvents'].quantile(0.90)).astype(int)

# Everything present in df but not in the raw frame was created above.
new_features = [col for col in df.columns if col not in data.columns]
print(f"✓ New features created: {len(new_features)}")
for feature in new_features[:10]:  # Show first 10
    print(f"  - {feature}")
if len(new_features) > 10:
    print(f"  ... and {len(new_features) - 10} more")

# 3. ENCODE CATEGORICAL VARIABLES
print("\n3. ENCODING CATEGORICAL VARIABLES...")
print("-" * 40)

# Fitted encoders are kept per source column so codes can be mapped back later.
label_encoders = {}

# Candidate categorical columns; only those present in df are encoded.
candidate_categoricals = [
    'gender', 'admit_type', 'admit_location', 'insurance', 'religion',
    'marital_status', 'ethnicity', 'age_group', 'los_category',
]
categorical_cols = [name for name in candidate_categoricals if name in df.columns]

# Each column is cast to text first, so missing values are encoded as their
# string form rather than raising.
for col in categorical_cols:
    le = LabelEncoder()
    column_as_text = df[col].astype(str)
    df[f'{col}_encoded'] = le.fit_transform(column_as_text)
    label_encoders[col] = le
    print(f"  ✓ {col}: {len(le.classes_)} categories -> encoded")

# 4. HANDLE HIGH CARDINALITY FEATURES
print("\n4. HANDLING HIGH CARDINALITY FEATURES...")
print("-" * 40)


def _keep_or_other(value, frequent_values):
    """Return `value` if it is one of `frequent_values`, otherwise 'OTHER'."""
    if value in frequent_values:
        return value
    return 'OTHER'


# AdmitDiagnosis: keep the 50 most frequent values, pool the rest as 'OTHER',
# then label-encode the reduced column.
if 'AdmitDiagnosis' in df.columns:
    diagnosis_freq = df['AdmitDiagnosis'].value_counts()
    top_diagnoses = diagnosis_freq.head(50).index
    df['AdmitDiagnosis_grouped'] = df['AdmitDiagnosis'].map(
        lambda value: _keep_or_other(value, top_diagnoses)
    )
    le_diag = LabelEncoder()
    df['AdmitDiagnosis_encoded'] = le_diag.fit_transform(df['AdmitDiagnosis_grouped'])
    print(f"  ✓ AdmitDiagnosis: {len(diagnosis_freq)} -> {len(le_diag.classes_)} categories")

# AdmitProcedure: same treatment as AdmitDiagnosis.
if 'AdmitProcedure' in df.columns:
    procedure_freq = df['AdmitProcedure'].value_counts()
    top_procedures = procedure_freq.head(50).index
    df['AdmitProcedure_grouped'] = df['AdmitProcedure'].map(
        lambda value: _keep_or_other(value, top_procedures)
    )
    le_proc = LabelEncoder()
    df['AdmitProcedure_encoded'] = le_proc.fit_transform(df['AdmitProcedure_grouped'])
    print(f"  ✓ AdmitProcedure: {len(procedure_freq)} -> {len(le_proc.classes_)} categories")

# 5. FEATURE SELECTION FOR MODELING
print("\n5. PREPARING FEATURE SET...")
print("-" * 40)

# Candidate columns by theme. The final feature order follows the order of the
# groups and of the names inside each group.
demographic_cols = ['age', 'gender_encoded', 'ethnicity_encoded']

admission_cols = ['admit_type_encoded', 'admit_location_encoded', 'insurance_encoded',
                  'religion_encoded', 'marital_status_encoded']

clinical_cols = ['LOSdays', 'NumDiagnosis', 'NumProcs', 'NumCallouts',
                'NumCPTevents', 'NumInput', 'NumLabs', 'NumMicroLabs',
                'NumNotes', 'NumOutput', 'NumRx', 'NumProcEvents',
                'NumTransfers', 'NumChartEvents', 'TotalNumInteract']

diagnosis_procedure_cols = ['AdmitDiagnosis_encoded', 'AdmitProcedure_encoded']

engineered_cols = ['age_group_encoded', 'los_category_encoded',
                  'lab_intensity', 'med_intensity', 'procedure_intensity', 'monitoring_intensity',
                  'clinical_burden', 'interaction_density',
                  'high_age', 'high_los', 'high_diagnoses', 'emergency_admit', 'icu_indicator']

# Build feature list dynamically: keep only the candidates that exist in df.
candidate_groups = [demographic_cols, admission_cols, clinical_cols,
                    diagnosis_procedure_cols, engineered_cols]
feature_columns = [
    col
    for group in candidate_groups
    for col in group
    if col in df.columns
]

# Create feature matrix and target as independent copies of the df columns.
X = df[feature_columns].copy()
y = df['ExpiredHospital'].copy()

print(f"✓ Final feature set: {X.shape[1]} features")
print(f"✓ Target distribution: {y.value_counts().to_dict()}")

# Store feature names for later use
feature_names = X.columns.tolist()

# 6. HANDLE OUTLIERS
print("\n6. OUTLIER HANDLING...")
print("-" * 40)

# Only these continuous columns are capped, and only when present in X.
outlier_candidates = ['LOSdays', 'NumDiagnosis', 'NumInput', 'NumLabs', 'NumChartEvents',
                      'TotalNumInteract', 'lab_intensity', 'med_intensity', 'monitoring_intensity']
continuous_features = [col for col in outlier_candidates if col in X.columns]

# Cap values to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# NOTE(review): the fences are computed on the full dataset, before the
# train/test split, so test rows influence them.
outlier_stats = {}
for feature in continuous_features:
    Q1 = X[feature].quantile(0.25)
    Q3 = X[feature].quantile(0.75)
    IQR = Q3 - Q1

    if IQR == 0:
        # Both fences would equal Q1, so clipping would overwrite every value
        # with a single constant and erase the feature. Leave it untouched.
        outlier_stats[feature] = 0
        continue

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Count before clipping so the report reflects how many values changed.
    outliers_before = ((X[feature] < lower_bound) | (X[feature] > upper_bound)).sum()
    X[feature] = np.clip(X[feature], lower_bound, upper_bound)
    outlier_stats[feature] = outliers_before

print("✓ Outliers capped per feature:")
for feature, count in outlier_stats.items():
    if count > 0:
        print(f"  {feature}: {count} outliers capped")

# 7. FINAL DATASET VALIDATION
# Report-only sanity checks: nothing in this section modifies X or y, and a
# non-zero missing/infinite count is printed but does not stop the run.
print(f"\n7. FINAL PREPROCESSED DATASET:")
print("-" * 40)
print(f"✓ Shape: {X.shape}")
print(f"✓ Target distribution:")
print(f"  - Survived (0): {(y==0).sum()} ({(y==0).sum()/len(y)*100:.1f}%)")
print(f"  - Expired (1): {(y==1).sum()} ({(y==1).sum()/len(y)*100:.1f}%)")
print(f"✓ Missing values in X: {X.isnull().sum().sum()}")
print(f"✓ Missing values in y: {y.isnull().sum()}")
# np.isinf needs every column of X to be numeric — presumably guaranteed by the
# feature list built above; verify if new columns are added.
print(f"✓ Infinite values: {np.isinf(X).sum().sum()}")

# ============================================================================
# STEP 3: TRAIN-TEST SPLIT AND SCALING
# ============================================================================
print("\n" + "=" * 100)
print("STEP 3: TRAIN-TEST SPLIT AND SCALING")
print("=" * 100)

# Train-test split
# 80/20 hold-out; stratify=y asks for the same class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"✓ Training set: {X_train.shape[0]} samples")
print(f"✓ Test set: {X_test.shape[0]} samples")
print(f"✓ Training class distribution: {y_train.value_counts().to_dict()}")
print(f"✓ Test class distribution: {y_test.value_counts().to_dict()}")

# Feature scaling
# Fitted on the training rows only, then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✓ Features scaled using StandardScaler")

# Calculate class weights for imbalance handling
# class_weights is ordered like np.unique(y_train); the dict assumes that order
# is [0, 1] — TODO confirm the target only takes those two values.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
# Ratio of the second class weight to the first; passed to XGBoost's
# scale_pos_weight in the parameter grid further down.
scale_pos_weight = class_weights[1] / class_weights[0]

print(f"✓ Class weights: {class_weight_dict}")
print(f"✓ Scale pos weight: {scale_pos_weight:.3f}")

# ============================================================================
# STEP 4: MODEL EVALUATION FUNCTIONS
# ============================================================================

# Set style for plots
# These calls change global matplotlib/seaborn state for every later figure.
plt.style.use('default')
sns.set_palette("husl")
# Tick labels for confusion matrices, in label order (index 0 then index 1).
CLASS_NAMES = ['Survived', 'Expired']

def evaluate_model_comprehensive(model, X_test, y_test, model_name):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and, optionally,
        ``predict_proba`` or ``decision_function``.
    X_test, y_test : evaluation features and true labels.
    model_name : str
        Stored under the 'Model' key of the returned metrics.

    Returns
    -------
    metrics : dict
        'Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'.
        Precision, recall and F1 are support-weighted averages over all
        classes, not positive-class scores. 'ROC-AUC' is 0.0 when ``y_test``
        holds a single class.
    y_pred : hard predictions from ``model.predict``.
    y_proba : continuous score used for ROC-AUC: column 1 of ``predict_proba``,
        else ``decision_function`` output, else the hard predictions.
    """
    predicted_labels = model.predict(X_test)

    # Prefer probabilities, then decision scores, then the hard labels.
    if hasattr(model, "predict_proba"):
        ranking_scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        ranking_scores = model.decision_function(X_test)
    else:
        ranking_scores = predicted_labels

    # Shared keyword arguments for the three averaged metrics.
    weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

    metrics = {'Model': model_name}
    metrics['Accuracy'] = accuracy_score(y_test, predicted_labels)
    metrics['Precision'] = precision_score(y_test, predicted_labels, **weighted_kwargs)
    metrics['Recall'] = recall_score(y_test, predicted_labels, **weighted_kwargs)
    metrics['F1-Score'] = f1_score(y_test, predicted_labels, **weighted_kwargs)

    # ROC-AUC is undefined with a single class present; report 0.0 instead.
    if len(np.unique(y_test)) > 1:
        metrics['ROC-AUC'] = roc_auc_score(y_test, ranking_scores)
    else:
        metrics['ROC-AUC'] = 0.0

    return metrics, predicted_labels, ranking_scores

def plot_comprehensive_results(results_df):
    """Draw a 2x3 dashboard comparing all models and show it.

    ``results_df`` needs the columns 'Model', 'Accuracy', 'Precision',
    'Recall', 'F1-Score' and 'ROC-AUC'. Top row: one sorted horizontal bar
    chart each for accuracy, F1 and ROC-AUC. Bottom row: precision-vs-recall
    scatter, grouped bars for the five most accurate models, and a heatmap of
    every metric for every model. Returns nothing.
    """
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']

    # Panels 1-3: one sorted bar chart per headline metric (metric, bar colour).
    bar_panels = [
        ('Accuracy', 'skyblue'),
        ('F1-Score', 'lightcoral'),
        ('ROC-AUC', 'lightgreen'),
    ]
    for panel, (metric, bar_color) in zip(axes[0], bar_panels):
        ordered = results_df.sort_values(metric, ascending=True)
        panel.barh(ordered['Model'], ordered[metric], color=bar_color)
        panel.set_xlabel(metric)
        panel.set_title(f'Model {metric} Comparison')
        panel.set_xlim(0, 1)
        # Value label just right of each bar.
        for position, score in enumerate(ordered[metric]):
            panel.text(score + 0.01, position, f'{score:.3f}', va='center', fontsize=8)

    # Panel 4: Precision vs Recall scatter plot, one labelled point per model.
    scatter_ax = axes[1, 0]
    scatter_ax.scatter(results_df['Recall'], results_df['Precision'],
                       s=100, alpha=0.7, c=range(len(results_df)), cmap='viridis')
    scatter_ax.set_xlabel('Recall')
    scatter_ax.set_ylabel('Precision')
    scatter_ax.set_title('Precision vs Recall')
    scatter_ax.grid(True, alpha=0.3)
    point_rows = zip(results_df['Model'], results_df['Recall'], results_df['Precision'])
    for model_label, recall_value, precision_value in point_rows:
        scatter_ax.annotate(model_label[:15], (recall_value, precision_value),
                            xytext=(5, 5), textcoords='offset points', fontsize=7, rotation=15)

    # Panel 5: all metrics for the five most accurate models, as grouped bars.
    grouped_ax = axes[1, 1]
    top_5_models = results_df.nlargest(5, 'Accuracy')
    group_positions = np.arange(len(metric_names))
    bar_width = 0.15
    for slot, (_, model_row) in enumerate(top_5_models.iterrows()):
        heights = [model_row[name] for name in metric_names]
        grouped_ax.bar(group_positions + slot * bar_width, heights, bar_width,
                       label=model_row['Model'][:12], alpha=0.8)

    grouped_ax.set_xlabel('Metrics')
    grouped_ax.set_ylabel('Score')
    grouped_ax.set_title('Top 5 Models - All Metrics Comparison')
    # Tick sits under the third of five bars, i.e. the centre of each group.
    grouped_ax.set_xticks(group_positions + bar_width * 2)
    grouped_ax.set_xticklabels(metric_names, rotation=45)
    grouped_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    grouped_ax.grid(True, alpha=0.3)
    grouped_ax.set_ylim(0, 1.1)

    # Panel 6: heatmap, models as rows and metrics as columns.
    heat_ax = axes[1, 2]
    ranking_data = results_df[['Model'] + metric_names].set_index('Model')
    heat_image = heat_ax.imshow(ranking_data.values, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    heat_ax.set_xticks(range(len(ranking_data.columns)))
    heat_ax.set_xticklabels(ranking_data.columns, rotation=45)
    heat_ax.set_yticks(range(len(ranking_data.index)))
    heat_ax.set_yticklabels([name[:15] for name in ranking_data.index])
    heat_ax.set_title('Model Performance Heatmap')

    # Add colorbar
    colorbar = plt.colorbar(heat_image, ax=heat_ax)
    colorbar.set_label('Score')

    # Write each cell's score on top of its colour.
    for (row_pos, col_pos), cell_score in np.ndenumerate(ranking_data.values):
        heat_ax.text(col_pos, row_pos, f'{cell_score:.2f}',
                     ha="center", va="center", color="black", fontsize=7)

    plt.tight_layout()
    plt.show()

def plot_roc_curves_comprehensive(models_results, X_test, y_test):
    """Overlay the ROC curve of every model on one figure and show it.

    ``models_results`` maps a model name to a 3-tuple whose last element is
    the model's continuous scores for the test rows; the first two elements
    are not used here. ``X_test`` is accepted but not used. Curves are only
    drawn when ``y_test`` contains more than one class. Returns nothing.
    """
    plt.figure(figsize=(12, 8))

    for model_name, (_model, _y_pred, scores) in models_results.items():
        # A ROC curve is undefined for a single-class target; skip drawing.
        if len(np.unique(y_test)) <= 1:
            continue
        fpr, tpr, _ = roc_curve(y_test, scores)
        area = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{model_name} (AUC = {area:.3f})', linewidth=2)

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves Comparison - All Models', fontsize=14)
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.show()

def plot_confusion_matrices_comprehensive(models_results, y_test, max_models=6):
    """Plot confusion-matrix heatmaps for the first ``max_models`` models.

    Models are taken in the iteration order of ``models_results`` (dict
    insertion order). They are NOT ranked by any score, so pass an
    already-ordered mapping if a "top N" figure is wanted.

    Parameters
    ----------
    models_results : dict
        Maps a model name to a ``(model, y_pred, y_proba)`` tuple. Only
        ``y_pred`` is used here.
    y_test : array-like
        True labels compared against each model's ``y_pred``.
    max_models : int, default 6
        Maximum number of heatmaps to draw. The grid is three columns wide
        with as many rows as needed; the default reproduces the 2x3 layout.

    Raises
    ------
    ValueError
        If ``max_models`` is smaller than 1.
    """
    if max_models < 1:
        raise ValueError("max_models must be at least 1")

    models_list = list(models_results.items())[:max_models]
    n_models = len(models_list)

    n_cols = 3
    n_rows = -(-max_models // n_cols)  # ceiling division without importing math
    # squeeze=False guarantees a 2-D axes array even for a single row, so the
    # flattening below works for every grid size.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.ravel()

    for i, (model_name, (_, y_pred, _)) in enumerate(models_list):
        cm = confusion_matrix(y_test, y_pred)

        # CLASS_NAMES is a module-level constant defined outside this function;
        # it must list one label per row/column of the confusion matrix.
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[i],
                    xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES)
        axes[i].set_title(f'Confusion Matrix - {model_name}', fontsize=12)
        axes[i].set_xlabel('Predicted')
        axes[i].set_ylabel('Actual')

    # Hide grid cells that have no model to show.
    for i in range(n_models, len(axes)):
        axes[i].set_visible(False)

    plt.tight_layout()
    plt.show()

# ============================================================================
# STEP 5: MODEL TRAINING AND HYPERPARAMETER TUNING
# ============================================================================
print("\n" + "=" * 100)
print("STEP 5: MODEL TRAINING AND HYPERPARAMETER TUNING")
print("=" * 100)

# Define models with parameter grids
# Each entry maps a display name to:
#   'model'  - the unfitted base estimator with its fixed settings
#   'params' - the grid of hyperparameter values searched for that estimator
# Class weighting differs per estimator: class_weight='balanced' is set for
# Random Forest, LightGBM and the SVM, XGBoost receives scale_pos_weight, and
# Gradient Boosting and the Neural Network are configured without any
# class weighting here.
models_params = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42, class_weight='balanced'),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [5, 10, None],
            'min_samples_split': [2, 5]
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.1, 0.2],
            # Single-value grid: scale_pos_weight is defined outside this
            # section and is held fixed rather than tuned.
            'scale_pos_weight': [scale_pos_weight]
        }
    },
    'LightGBM': {
        'model': lgb.LGBMClassifier(random_state=42, verbose=-1, class_weight='balanced'),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.1, 0.2]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 4],
            'learning_rate': [0.1, 0.2]
        }
    },
    'Support Vector Machine': {
        # probability=True makes predict_proba available on the fitted SVC;
        # the soft-voting ensemble built from these models needs it.
        'model': SVC(random_state=42, probability=True, class_weight='balanced'),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'Neural Network': {
        'model': MLPClassifier(random_state=42, max_iter=1000),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001]
        }
    }
}

# Store results
# tuned_models:      model name -> best estimator found by the grid search
# all_results:       metrics records returned by evaluate_model_comprehensive,
#                    one per successfully evaluated model
# model_predictions: model name -> (fitted model, y_pred, y_proba) as returned
#                    by evaluate_model_comprehensive for X_test_scaled
tuned_models = {}
all_results = []
model_predictions = {}

print("Starting hyperparameter tuning...")

# Tune every configured estimator independently. A failure in one model is
# logged and the loop moves on, so the containers above may end up holding
# fewer entries than models_params.
for model_name, model_info in models_params.items():
    print(f"\n🔧 Tuning {model_name}...")
    
    try:
        # Grid search with cross-validation
        # Candidates are ranked by cross-validated ROC-AUC on the training data.
        grid_search = GridSearchCV(
            model_info['model'], 
            model_info['params'], 
            cv=3,  # Reduced for faster execution
            scoring='roc_auc',
            n_jobs=-1,
            verbose=0
        )
        
        grid_search.fit(X_train_scaled, y_train)
        best_model = grid_search.best_estimator_
        # Registered before evaluation: if the evaluation below raises, the
        # model stays in tuned_models but is absent from all_results and
        # model_predictions.
        tuned_models[model_name] = best_model
        
        # Comprehensive evaluation
        metrics, y_pred, y_proba = evaluate_model_comprehensive(best_model, X_test_scaled, y_test, model_name)
        all_results.append(metrics)
        model_predictions[model_name] = (best_model, y_pred, y_proba)
        
        print(f"  ✓ Best params: {grid_search.best_params_}")
        print(f"  ✓ Best CV score: {grid_search.best_score_:.4f}")
        print(f"  ✓ Test ROC-AUC: {metrics['ROC-AUC']:.4f}")
        
    except Exception as e:
        # Broad on purpose: any error from fitting, evaluating or printing is
        # reported and the remaining models are still tuned.
        print(f"  ❌ Error training {model_name}: {str(e)}")
        continue

# ============================================================================
# STEP 6: ENSEMBLE METHODS (CONTINUED)
# ============================================================================
# Three ensembles are trained below. Each one follows the same sequence:
# fit on the training data, register in ensemble_models, evaluate with
# evaluate_model_comprehensive, then append to all_results and
# model_predictions so it is ranked together with the tuned models.
# Every ensemble has its own try/except, so a failure is logged and the
# following ensembles are still attempted.

# Create ensemble models
ensemble_models = {}

try:
    # 1. Voting Classifier (Soft)
    print("🔧 Training Voting Classifier...")
    # Use every tuned model as a member; names are normalised to
    # lower_snake_case (e.g. 'Random Forest' -> 'random_forest').
    voting_estimators = [(name.replace(' ', '_').lower(), model) for name, model in tuned_models.items()]
    
    voting_soft = VotingClassifier(
        estimators=voting_estimators,
        voting='soft'
    )
    voting_soft.fit(X_train_scaled, y_train)
    ensemble_models['Voting Classifier (Soft)'] = voting_soft
    
    # Evaluate
    metrics, y_pred, y_proba = evaluate_model_comprehensive(voting_soft, X_test_scaled, y_test, 'Voting Classifier (Soft)')
    all_results.append(metrics)
    model_predictions['Voting Classifier (Soft)'] = (voting_soft, y_pred, y_proba)
    print(f"  ✓ Voting Classifier ROC-AUC: {metrics['ROC-AUC']:.4f}")
    
except Exception as e:
    print(f"  ❌ Error training Voting Classifier: {str(e)}")

try:
    # 2. AdaBoost
    # No base estimator is passed, so the library default is used.
    print("🔧 Training AdaBoost...")
    ada_boost = AdaBoostClassifier(
        n_estimators=100,
        learning_rate=1.0,
        random_state=42
    )
    ada_boost.fit(X_train_scaled, y_train)
    ensemble_models['AdaBoost'] = ada_boost
    
    # Evaluate
    metrics, y_pred, y_proba = evaluate_model_comprehensive(ada_boost, X_test_scaled, y_test, 'AdaBoost')
    all_results.append(metrics)
    model_predictions['AdaBoost'] = (ada_boost, y_pred, y_proba)
    print(f"  ✓ AdaBoost ROC-AUC: {metrics['ROC-AUC']:.4f}")
    
except Exception as e:
    print(f"  ❌ Error training AdaBoost: {str(e)}")

try:
    # 3. Bagging Classifier
    # No base estimator is passed, so the library default is used.
    print("🔧 Training Bagging Classifier...")
    bagging = BaggingClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    )
    bagging.fit(X_train_scaled, y_train)
    ensemble_models['Bagging Classifier'] = bagging
    
    # Evaluate
    metrics, y_pred, y_proba = evaluate_model_comprehensive(bagging, X_test_scaled, y_test, 'Bagging Classifier')
    all_results.append(metrics)
    model_predictions['Bagging Classifier'] = (bagging, y_pred, y_proba)
    print(f"  ✓ Bagging Classifier ROC-AUC: {metrics['ROC-AUC']:.4f}")
    
except Exception as e:
    print(f"  ❌ Error training Bagging Classifier: {str(e)}")



# ============================================================================
# STEP 8: RESULTS COMPILATION AND ANALYSIS
# ============================================================================
print("\n" + "=" * 100)
print("STEP 8: COMPREHENSIVE RESULTS ANALYSIS")
print("=" * 100)

# Every model above is trained inside a try/except that only logs failures, so
# all_results can be empty here. Fail with an explicit message instead of the
# opaque KeyError that sorting an empty frame by 'ROC-AUC' would raise.
if not all_results:
    raise RuntimeError(
        "No model was trained and evaluated successfully; "
        "see the training errors logged above."
    )

# Create results DataFrame: one row per evaluated model, best ROC-AUC first,
# with a fresh 0..n-1 index so that row 0 is always the best model.
results_df = (
    pd.DataFrame(all_results)
    .sort_values('ROC-AUC', ascending=False)
    .reset_index(drop=True)
)

print("📊 FINAL MODEL PERFORMANCE SUMMARY:")
print("=" * 80)
print(results_df.to_string(index=False, float_format='%.4f'))

# Top 3 models (fewer are listed when fewer than three were evaluated)
print(f"\n🏆 TOP 3 MODELS:")
print("-" * 40)
for i, (idx, row) in enumerate(results_df.head(3).iterrows(), 1):
    print(f"{i}. {row['Model']}")
    print(f"   ROC-AUC: {row['ROC-AUC']:.4f} | Accuracy: {row['Accuracy']:.4f} | F1-Score: {row['F1-Score']:.4f}")

# ============================================================================
# STEP 9: DETAILED ANALYSIS OF BEST MODEL
# ============================================================================
print("\n" + "=" * 100)
print("STEP 9: DETAILED ANALYSIS OF BEST MODEL")
print("=" * 100)

# results_df is sorted by ROC-AUC in descending order, so row 0 is the best
# model. model_predictions stores (fitted model, y_pred, y_proba) per name.
best_model_name = results_df.iloc[0]['Model']
best_model = model_predictions[best_model_name][0]
best_y_pred = model_predictions[best_model_name][1]

print(f"🎯 BEST MODEL: {best_model_name}")
print(f"🔍 DETAILED PERFORMANCE ANALYSIS:")
print("-" * 60)

# Classification report
print("Classification Report:")
print(classification_report(y_test, best_y_pred, target_names=CLASS_NAMES))

# Confusion matrix details
# Unpacking four values from cm.ravel() only works for a 2x2 matrix, i.e. this
# section assumes a binary target. The messages below treat the positive class
# as "death" and the negative class as "survivor".
cm = confusion_matrix(y_test, best_y_pred)
tn, fp, fn, tp = cm.ravel()

print(f"\nConfusion Matrix Breakdown:")
print(f"True Negatives (Correctly predicted survivors): {tn}")
print(f"False Positives (Incorrectly predicted deaths): {fp}")
print(f"False Negatives (Missed deaths): {fn}")
print(f"True Positives (Correctly predicted deaths): {tp}")

# Additional metrics
# Each ratio falls back to 0 when its denominator is 0 to avoid a division by
# zero. sensitivity and specificity are reused by the final summary section.
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
precision_class_1 = tp / (tp + fp) if (tp + fp) > 0 else 0

print(f"\nAdditional Clinical Metrics:")
print(f"Sensitivity (Recall for deaths): {sensitivity:.4f}")
print(f"Specificity (Recall for survivors): {specificity:.4f}")
print(f"Precision for death prediction: {precision_class_1:.4f}")

# ============================================================================
# STEP 10: FEATURE IMPORTANCE ANALYSIS
# ============================================================================
print("\n" + "=" * 100)
print("STEP 10: FEATURE IMPORTANCE ANALYSIS")
print("=" * 100)

# The analysis depends on what the best model exposes:
#   - feature_importances_ -> ranked table plus bar chart
#   - coef_                -> ranked table by absolute coefficient (no chart)
#   - neither              -> this step prints nothing beyond the banner
# feature_names is defined outside this section; the DataFrame constructions
# below require it to have one entry per importance/coefficient value.

# Feature importance for tree-based models
if hasattr(best_model, 'feature_importances_'):
    print(f"🌟 FEATURE IMPORTANCE ANALYSIS ({best_model_name}):")
    print("-" * 60)
    
    # Get feature importances
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("Top 15 Most Important Features:")
    for i, (idx, row) in enumerate(feature_importance.head(15).iterrows(), 1):
        print(f"{i:2d}. {row['feature']:<25} {row['importance']:.4f}")
    
    # Plot feature importance
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(20)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 20 Feature Importances - {best_model_name}')
    # Flip the y-axis so the most important feature is drawn at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

elif hasattr(best_model, 'coef_'):
    print(f"🌟 FEATURE COEFFICIENTS ANALYSIS ({best_model_name}):")
    print("-" * 60)
    
    # Get feature coefficients
    # A 2-D coef_ is reduced to its first row; a 1-D coef_ is used as is.
    if len(best_model.coef_.shape) > 1:
        coefficients = best_model.coef_[0]
    else:
        coefficients = best_model.coef_
    
    # Rank by magnitude but keep the signed value for display.
    feature_coeff = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients,
        'abs_coefficient': np.abs(coefficients)
    }).sort_values('abs_coefficient', ascending=False)
    
    print("Top 15 Most Important Features (by absolute coefficient):")
    for i, (idx, row) in enumerate(feature_coeff.head(15).iterrows(), 1):
        print(f"{i:2d}. {row['feature']:<25} {row['coefficient']:8.4f}")

# ============================================================================
# STEP 11: COMPREHENSIVE VISUALIZATIONS
# ============================================================================
print("\n" + "=" * 100)
print("STEP 11: COMPREHENSIVE VISUALIZATIONS")
print("=" * 100)

# Figure 1: side-by-side comparison of every metric for every model
print("📈 Generating comprehensive performance comparison...")
plot_comprehensive_results(results_df)

# Figure 2: ROC curves of all models on one set of axes
print("📈 Generating ROC curves comparison...")
plot_roc_curves_comprehensive(model_predictions, X_test_scaled, y_test)

# Figure 3: confusion-matrix heatmaps
print("📈 Generating confusion matrices...")
plot_confusion_matrices_comprehensive(model_predictions, y_test)

# Figure 4: learning curve of the best model (5-fold CV, ROC-AUC, ten
# training-set fractions from 10% to 100%). Only drawn when the best model is
# one of the tuned or ensemble models registered earlier.
print("📈 Generating learning curve for best model...")
if any(best_model_name in registry for registry in (tuned_models, ensemble_models)):
    train_sizes, train_scores, val_scores = learning_curve(
        best_model,
        X_train_scaled,
        y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
    )

    # Collapse the per-fold scores (axis 1) into a mean and a spread per size.
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)

    plt.figure(figsize=(10, 6))
    # Draw each curve with a +/- one standard deviation band around it.
    for curve_label, curve_color, curve_mean, curve_std in (
        ('Training Score', 'blue', train_mean, train_std),
        ('Validation Score', 'red', val_mean, val_std),
    ):
        plt.plot(train_sizes, curve_mean, 'o-', label=curve_label, color=curve_color)
        plt.fill_between(
            train_sizes,
            curve_mean - curve_std,
            curve_mean + curve_std,
            alpha=0.2,
            color=curve_color,
        )
    plt.xlabel('Training Set Size')
    plt.ylabel('ROC-AUC Score')
    plt.title(f'Learning Curve - {best_model_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# ============================================================================
# STEP 12: CROSS-VALIDATION ANALYSIS
# ============================================================================
print("\n" + "=" * 100)
print("STEP 12: CROSS-VALIDATION ANALYSIS")
print("=" * 100)

print("🔄 Performing comprehensive cross-validation analysis...")

# Re-score the five best-ranked models with 5-fold CV on the training data.
# cv_results: model name -> {'mean', 'std', 'scores'} of the fold ROC-AUCs.
cv_results = {}
for model_name in results_df['Model'].head(5):
    if model_name not in model_predictions:
        continue
    model = model_predictions[model_name][0]
    try:
        cv_scores = cross_val_score(
            model,
            X_train_scaled,
            y_train,
            cv=5,
            scoring='roc_auc',
            n_jobs=-1,
        )
        cv_results[model_name] = dict(
            mean=cv_scores.mean(),
            std=cv_scores.std(),
            scores=cv_scores,
        )
        print(f"{model_name:<25} CV ROC-AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    except Exception as e:
        # A model that cannot be cross-validated is reported and left out.
        print(f"❌ CV Error for {model_name}: {str(e)}")

# Bar chart of the mean CV score per model, with the fold spread as error bars
if cv_results:
    plt.figure(figsize=(12, 6))
    models_cv = list(cv_results)
    cv_means = [summary['mean'] for summary in cv_results.values()]
    cv_stds = [summary['std'] for summary in cv_results.values()]

    plt.bar(models_cv, cv_means, yerr=cv_stds, capsize=5, alpha=0.7, color='lightblue')
    plt.xlabel('Models')
    plt.ylabel('Cross-Validation ROC-AUC Score')
    plt.title('Cross-Validation Performance Comparison (Top 5 Models)')
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3)

    # Write each mean just above the top of its error bar.
    for i in range(len(models_cv)):
        mean = cv_means[i]
        std = cv_stds[i]
        plt.text(i, mean + std + 0.01, f'{mean:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# ============================================================================
# STEP 13: FINAL SUMMARY AND RECOMMENDATIONS
# ============================================================================
print("\n" + "=" * 100)
print("FINAL SUMMARY AND CLINICAL RECOMMENDATIONS")
print("=" * 100)

# Headline numbers come from row 0 of results_df, which is the best model
# because the frame is sorted by ROC-AUC in descending order.
# NOTE(review): `df` is not defined in this section - confirm it is the frame
# the models were actually built from, otherwise the patient count is wrong.
print("🎯 EXECUTIVE SUMMARY:")
print("-" * 60)
print(f"✓ Total models evaluated: {len(all_results)}")
print(f"✓ Best performing model: {best_model_name}")
print(f"✓ Best ROC-AUC score: {results_df.iloc[0]['ROC-AUC']:.4f}")
print(f"✓ Best accuracy: {results_df.iloc[0]['Accuracy']:.4f}")
print(f"✓ Dataset size: {len(df)} patients")
print(f"✓ Features used: {len(feature_names)}")

# sensitivity and specificity were computed from the best model's confusion
# matrix in the detailed best-model analysis step.
print(f"\n🏥 CLINICAL IMPLICATIONS:")
print("-" * 60)
print(f"• The model can predict hospital mortality with {results_df.iloc[0]['Accuracy']*100:.1f}% accuracy")
print(f"• Sensitivity (death detection): {sensitivity*100:.1f}%")
print(f"• Specificity (survivor detection): {specificity*100:.1f}%")
print(f"• This tool can assist clinicians in identifying high-risk patients")



print(f"\n✅ ANALYSIS COMPLETE!")
print("=" * 100)

# Save results to file (optional)
# Saving is best-effort: a failure is reported together with its cause but
# must not abort the notebook. `except Exception` (instead of a bare
# `except:`) lets KeyboardInterrupt and SystemExit propagate normally.
try:
    results_df.to_csv('healthcare_ml_results.csv', index=False)
    print("📁 Results saved to 'healthcare_ml_results.csv'")
except Exception as e:
    print(f"📁 Could not save results to file: {e}")

print("\n🎉 Healthcare ML Pipeline Analysis Successfully Completed!")
print("=" * 100)