# AI-Driven Early Prediction of Pulmonary Fibrosis Using Deep Learning


In [None]:
import os
import cv2
import pydicom
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from skimage.feature import graycomatrix, graycoprops
import warnings
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing as mp

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# ==========================================
# 1. FEATURE-EXTRACTION CONFIGURATION
# ==========================================
# Settings read by the segmentation / texture-extraction functions of this cell.
CONFIG = dict(
    # Side length (pixels) every slice is resized to before U-Net inference.
    image_size=256,
    # Use the GPU when one is visible to PyTorch, otherwise the CPU.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Input locations: per-patient DICOM folders, clinical table, U-Net weights.
    root_dir="../input/osic-pulmonary-fibrosis-progression/train",
    csv_path="../input/osic-pulmonary-fibrosis-progression/train.csv",
    model_weights="../input/u-net-classification/pytorch/default/1/epoch_16_f1_0.9021.pth",
    # Number of slices sent through the U-Net per forward pass.
    batch_size=32,
    # Gray-level co-occurrence matrix settings: pixel offsets, directions,
    # and the number of quantization levels.
    glcm_dist=[1],
    glcm_angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
    glcm_levels=32,
    # GLCM features are computed on every Nth accepted slice only.
    glcm_subsample=2,
)

# ==========================================
# 2. U-NET ARCHITECTURE
# ==========================================
class DoubleConv(nn.Module):
    """Two consecutive 3x3 convolution -> batch-norm -> ReLU stages.

    Padding of 1 keeps the spatial size unchanged; only the channel count
    goes from ``in_channels`` to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = []
        # First stage maps in -> out channels, second stage out -> out.
        for source_channels in (in_channels, out_channels):
            stages.extend([
                nn.Conv2d(source_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ])
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply both convolution stages to ``x``."""
        return self.double_conv(x)

class StandardUNet(nn.Module):
    """Four-level U-Net with skip connections and a sigmoid output.

    The encoder doubles the channel count at each level (64 -> 1024) while
    max-pooling halves the resolution; the decoder mirrors it with transposed
    convolutions and concatenates the matching encoder feature map before each
    ``DoubleConv``. Attribute names are kept stable so saved state dicts load.
    """

    @staticmethod
    def _encoder_stage(in_ch, out_ch):
        """2x2 max-pool followed by a DoubleConv."""
        return nn.Sequential(nn.MaxPool2d(2), DoubleConv(in_ch, out_ch))

    @staticmethod
    def _upsampler(in_ch, out_ch):
        """Transposed convolution that doubles the spatial resolution."""
        return nn.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2)

    def __init__(self, in_channels=1, out_channels=1):
        super().__init__()
        # Encoder path
        self.inc = DoubleConv(in_channels, 64)
        self.down1 = self._encoder_stage(64, 128)
        self.down2 = self._encoder_stage(128, 256)
        self.down3 = self._encoder_stage(256, 512)
        self.down4 = self._encoder_stage(512, 1024)
        # Decoder path: each conv_up* sees upsampled + skip channels.
        self.up1 = self._upsampler(1024, 512)
        self.conv_up1 = DoubleConv(1024, 512)
        self.up2 = self._upsampler(512, 256)
        self.conv_up2 = DoubleConv(512, 256)
        self.up3 = self._upsampler(256, 128)
        self.conv_up3 = DoubleConv(256, 128)
        self.up4 = self._upsampler(128, 64)
        self.conv_up4 = DoubleConv(128, 64)
        # 1x1 projection to the requested number of output maps.
        self.outc = nn.Conv2d(64, out_channels, kernel_size=1)

    def forward(self, x):
        """Return per-pixel probabilities in [0, 1] for input batch ``x``."""
        # Run the encoder, remembering every level's output for the skips.
        skips = [self.inc(x)]
        for encoder in (self.down1, self.down2, self.down3, self.down4):
            skips.append(encoder(skips[-1]))

        # The deepest map is the decoder's starting point; remaining entries
        # are consumed deepest-first as skip connections.
        x = skips.pop()
        decoder = (
            (self.up1, self.conv_up1),
            (self.up2, self.conv_up2),
            (self.up3, self.conv_up3),
            (self.up4, self.conv_up4),
        )
        for upsample, fuse in decoder:
            x = fuse(torch.cat([upsample(x), skips.pop()], dim=1))
        return torch.sigmoid(self.outc(x))

# ==========================================
# 3. OPTIMIZED GLCM (60% Faster)
# ==========================================
def compute_glcm_features(image, mask):
    """Compute GLCM texture descriptors for the masked region of one slice.

    The slice is cropped to the mask's bounding box (plus a 2-pixel margin),
    halved in size when the crop exceeds 128 pixels on either side, clipped to
    the [-1000, 400] window and quantized to ``CONFIG['glcm_levels']`` gray
    levels. Gray level 0 is reserved for pixels outside the mask and its
    row/column are removed from the co-occurrence counts, so only
    tissue-to-tissue pixel pairs contribute to the statistics.

    Args:
        image: 2-D array of pixel intensities (the caller passes HU values).
        mask: 2-D array of the same shape; values above 0.5 mark the region.

    Returns:
        ``[contrast, homogeneity, energy, correlation]``, each averaged over
        all configured distances and angles, or ``None`` when the region has
        fewer than 100 pixels or the GLCM computation fails.
    """
    region = np.asarray(mask) > 0.5
    # Too little tissue for a meaningful texture estimate.
    if np.count_nonzero(region) < 100:
        return None

    # Bounding box of the region, widened by a small margin and clamped to
    # the image bounds.
    rows, cols = np.nonzero(region)
    pad = 2
    top = max(0, int(rows.min()) - pad)
    bottom = min(region.shape[0] - 1, int(rows.max()) + pad)
    left = max(0, int(cols.min()) - pad)
    right = min(region.shape[1] - 1, int(cols.max()) + pad)

    # Crop the *unmasked* intensities: masking before interpolation would
    # blend artificial zeros into the border pixels of the region.
    crop_img = np.asarray(image, dtype=np.float32)[top:bottom + 1, left:right + 1]
    crop_region = region[top:bottom + 1, left:right + 1].astype(np.uint8)

    # Halve large crops to bound the cost of the co-occurrence counting.
    if crop_img.shape[0] > 128 or crop_img.shape[1] > 128:
        scale = 0.5
        crop_img = cv2.resize(crop_img, None, fx=scale, fy=scale,
                              interpolation=cv2.INTER_LINEAR)
        crop_region = cv2.resize(crop_region, None, fx=scale, fy=scale,
                                 interpolation=cv2.INTER_NEAREST)

    # Quantize tissue to 1..levels; 0 is the background label.
    levels = CONFIG['glcm_levels']
    quant_dtype = np.uint8 if levels < 256 else np.uint16
    clipped = np.clip(crop_img, -1000, 400)
    img_quant = ((clipped + 1000) / 1400 * (levels - 1)).astype(quant_dtype) + 1
    img_quant[crop_region == 0] = 0

    try:
        counts = graycomatrix(
            img_quant,
            CONFIG['glcm_dist'],
            CONFIG['glcm_angles'],
            levels=levels + 1,
            symmetric=True,
            normed=False,
        )

        # Discard every pair that involves a background pixel, then
        # renormalize each (distance, angle) matrix to sum to 1.
        tissue_pairs = counts[1:, 1:, :, :].astype(np.float64)
        totals = tissue_pairs.sum(axis=(0, 1), keepdims=True)
        totals[totals == 0] = 1
        g_matrix = tissue_pairs / totals

        return [
            float(graycoprops(g_matrix, prop).mean())
            for prop in ('contrast', 'homogeneity', 'energy', 'correlation')
        ]
    except Exception:
        # Best effort: a slice whose texture cannot be computed is skipped by
        # the caller. KeyboardInterrupt/SystemExit still propagate.
        return None

# ==========================================
# 4. OPTIMIZED PATIENT PROCESSING
# ==========================================
def _load_slice_hu(dcm_path):
    """Read one DICOM file and prepare it for segmentation.

    Returns:
        ``(hu, unet_input, (thickness, row_mm, col_mm))`` where ``hu`` is the
        rescaled slice resized to ``CONFIG['image_size']``, ``unet_input`` is
        ``hu`` clipped to [-1000, 400] and scaled to [0, 1], and ``row_mm`` /
        ``col_mm`` are the pixel spacings *of the resized grid*. ``None`` is
        returned when the file cannot be read or decoded.
    """
    size = CONFIG['image_size']
    try:
        dcm = pydicom.dcmread(dcm_path)
        pixels = dcm.pixel_array.astype(np.float32)
        slope = getattr(dcm, 'RescaleSlope', 1)
        intercept = getattr(dcm, 'RescaleIntercept', -1024)
        hu = slope * pixels + intercept

        thickness = float(getattr(dcm, 'SliceThickness', 1.0))
        spacing = getattr(dcm, 'PixelSpacing', [1.0, 1.0])

        # PixelSpacing describes the native grid. After resizing to
        # size x size each pixel covers native_dim / size native pixels, so
        # the spacing must be scaled or areas measured on the resized mask
        # come out wrong (4x too small for a 512x512 scan at size 256).
        native_rows, native_cols = hu.shape[:2]
        row_mm = float(spacing[0]) * native_rows / size
        col_mm = float(spacing[1]) * native_cols / size

        hu = cv2.resize(hu, (size, size))
    except Exception:
        # Best effort: unreadable or malformed slices are skipped.
        return None

    unet_input = (np.clip(hu, -1000, 400) + 1000) / 1400
    return hu, unet_input, (thickness, row_mm, col_mm)


def process_patient_advanced(patient_id, model):
    """Extract lung volume, intensity and texture features for one patient.

    Every ``.dcm`` file in the patient's folder is segmented with ``model`` in
    batches of ``CONFIG['batch_size']``. Slices whose predicted mask has fewer
    than 100 pixels are ignored. For the remaining slices the function
    accumulates the mask volume, collects the intensities inside the mask, and
    computes GLCM features on every ``CONFIG['glcm_subsample']``-th slice.

    Args:
        patient_id: Name of the patient's sub-folder under ``CONFIG['root_dir']``.
        model: Segmentation network returning per-pixel probabilities; it is
            called under ``torch.no_grad()`` and thresholded at 0.5.

    Returns:
        A dict with the same keys as ``create_empty_metrics``. Features that
        could not be computed keep the placeholder values from that helper.
    """
    metrics = create_empty_metrics(patient_id)

    patient_dir = os.path.join(CONFIG['root_dir'], patient_id)
    if not os.path.exists(patient_dir):
        return metrics

    files = sorted(
        os.path.join(patient_dir, name)
        for name in os.listdir(patient_dir)
        if name.endswith('.dcm')
    )
    if not files:
        return metrics

    lung_volume_ml = 0.0
    hu_chunks = []      # one 1-D array of in-mask intensities per slice
    glcm_rows = []      # one feature list per texture-sampled slice
    slice_counter = 0   # counts accepted slices, drives GLCM subsampling

    batch_size = CONFIG['batch_size']
    for start in range(0, len(files), batch_size):
        loaded = [
            item
            for item in map(_load_slice_hu, files[start:start + batch_size])
            if item is not None
        ]
        if not loaded:
            continue
        batch_hu, batch_inputs, batch_meta = zip(*loaded)

        # One forward pass for the whole batch.
        inp = torch.tensor(np.array(batch_inputs)).unsqueeze(1).float().to(CONFIG['device'])
        with torch.no_grad():
            masks = (model(inp) > 0.5).float().cpu().numpy().squeeze(1)

        for raw_hu, mask, (thickness, row_mm, col_mm) in zip(batch_hu, masks, batch_meta):
            mask_area = float(np.sum(mask))
            if mask_area < 100:
                continue

            # pixel count * pixel area * slice thickness, scaled by 1/1000.
            lung_volume_ml += mask_area * row_mm * col_mm * thickness / 1000.0

            hu_chunks.append(raw_hu[mask == 1])

            if slice_counter % CONFIG['glcm_subsample'] == 0:
                feats = compute_glcm_features(raw_hu, mask)
                if feats is not None:
                    glcm_rows.append(feats)
            slice_counter += 1

    metrics['lung_vol_ml'] = float(lung_volume_ml)

    if hu_chunks:
        arr = np.concatenate(hu_chunks)
        # Cap the sample size so the moment statistics stay cheap.
        if len(arr) > 50000:
            arr = np.random.choice(arr, 50000, replace=False)

        metrics['hu_mean'] = float(np.mean(arr))
        metrics['hu_std'] = float(np.std(arr))
        metrics['hu_skew'] = float(skew(arr))
        metrics['hu_kurt'] = float(kurtosis(arr))

    if glcm_rows:
        glcm_avg = np.mean(glcm_rows, axis=0)
        metrics['glcm_contrast'] = float(glcm_avg[0])
        metrics['glcm_homogeneity'] = float(glcm_avg[1])
        metrics['glcm_energy'] = float(glcm_avg[2])
        metrics['glcm_correlation'] = float(glcm_avg[3])

    return metrics

def create_empty_metrics(patient_id):
    """Return the placeholder feature row used when extraction fails.

    Every feature is 0.0 except ``hu_mean``, which is -1000.0.
    """
    feature_names = (
        'lung_vol_ml',
        'hu_mean', 'hu_std', 'hu_skew', 'hu_kurt',
        'glcm_contrast', 'glcm_homogeneity',
        'glcm_energy', 'glcm_correlation',
    )
    placeholders = dict.fromkeys(feature_names, 0.0)
    placeholders['hu_mean'] = -1000.0
    return {'Patient': patient_id, **placeholders}

# ==========================================
# 5. MAIN EXECUTION (SINGLE-THREADED FOR KAGGLE)
# ==========================================
def create_master_dataset():
    """Extract image features for every patient and merge them with the clinical table.

    Loads the U-Net once, runs ``process_patient_advanced`` sequentially over
    every sub-directory of ``CONFIG['root_dir']``, left-joins the resulting
    per-patient features onto ``CONFIG['csv_path']`` and writes the result to
    ``master_dataset.csv`` in the working directory. A patient whose
    extraction raises gets the placeholder row from ``create_empty_metrics``;
    clinical rows without any extracted features are filled with the same
    placeholder values.
    """
    # 1. Load the segmentation model once and reuse it for all patients.
    print("üîß Loading U-Net Model...")
    model = StandardUNet().to(CONFIG['device'])
    model.load_state_dict(torch.load(CONFIG['model_weights'], map_location=CONFIG['device']))
    model.eval()
    print(f"‚úÖ Model loaded on {CONFIG['device']}")

    # 2. Each sub-directory of the root is treated as one patient.
    patients = [p for p in os.listdir(CONFIG['root_dir'])
                if os.path.isdir(os.path.join(CONFIG['root_dir'], p))]

    total_patients = len(patients)
    print(f"üìä Total Patients: {total_patients}\n")

    # 3. Sequential extraction with a progress bar.
    extracted_data = []
    report_every = max(1, total_patients // 10)

    print("üöÄ Starting Feature Extraction...")
    for idx, patient_id in enumerate(tqdm(patients, desc="Processing Patients")):
        try:
            res = process_patient_advanced(patient_id, model)
            extracted_data.append(res)

            # Additional textual progress line roughly every 10%.
            if (idx + 1) % report_every == 0:
                pct = ((idx + 1) / total_patients) * 100
                print(f"   ‚úì {pct:.1f}% Complete ({idx + 1}/{total_patients} patients)")

        except Exception as exc:
            # One failing patient must not abort the whole run.
            print(f'\n‚ùå Error processing {patient_id}: {exc}')
            extracted_data.append(create_empty_metrics(patient_id))

    # 4. Consolidate
    print("\nüì¶ Consolidating Data...")
    # Take column names and fill values from the placeholder row so missing
    # features are encoded the same way as failed extractions
    # (e.g. hu_mean = -1000, not 0).
    defaults = create_empty_metrics(None)
    defaults.pop('Patient')
    feature_cols = list(defaults)

    # Explicit columns keep the merge key present even with zero patients.
    bio_df = pd.DataFrame(extracted_data, columns=['Patient'] + feature_cols)

    train_df = pd.read_csv(CONFIG['csv_path'])
    master_df = pd.merge(train_df, bio_df, on='Patient', how='left')
    master_df[feature_cols] = master_df[feature_cols].fillna(value=defaults)

    # 5. Save
    save_path = "master_dataset.csv"
    master_df.to_csv(save_path, index=False)

    print(f"\n{'='*60}")
    print(f"üèÜ Master Dataset Created: {save_path}")
    print(f"{'='*60}")
    print(f"Total Rows: {len(master_df)}")
    print(f"Total Columns: {len(master_df.columns)}")
    print(f"\nüìã Feature Summary:")
    print(master_df[feature_cols].describe())
    print(f"\nüîç First 3 Rows:")
    print(master_df.head(3))
    print(f"\nColumns: {master_df.columns.tolist()}")

# Run the feature-extraction pipeline only when executed as a script/cell,
# not when this code is imported.
if __name__ == "__main__":
    create_master_dataset()


In [None]:
import os
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# ==========================================
# CONFIGURATION
# ==========================================
CONFIG = dict(
    # --- Neural network ---
    nn_lr=2e-3,
    nn_weight_decay=3e-5,
    nn_batch_size=64,
    nn_epochs=300,
    nn_patience=50,

    # --- LightGBM ---
    # 'alpha' is only a placeholder here: the training code overrides it with
    # each entry of "quantiles" and fits one model per quantile.
    lgb_params=dict(
        objective='quantile',
        metric='quantile',
        alpha=0.5,
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=-1,
        max_depth=6,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=0.1,
    ),
    lgb_rounds=500,
    lgb_early_stopping=50,

    # --- General ---
    n_folds=5,
    quantiles=[0.2, 0.5, 0.8],
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    data_dir="../input/osic-pulmonary-fibrosis-progression",
    biomarker_path="../input/feature-extraction-u-net-segmentation/Features_Extraction_Sheet1.csv",
    seed=42,

    # --- Ensemble ---
    # Initial NN / LightGBM blend weights.
    ensemble_nn_weight=0.5,
    ensemble_lgb_weight=0.5,
)

def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, every CUDA device's generator is seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed all random number generators once, at import/cell-execution time,
# using the seed from the configuration above.
set_seed(CONFIG['seed'])

# ==========================================
# DATA PREPROCESSING
# ==========================================
def preprocess_data_no_leakage(config):
    """Build the modelling table from the clinical CSV and the biomarker CSV.

    Each patient's earliest visit supplies the baseline FVC / Percent / week;
    all time features are expressed relative to that visit and the target is
    ``FVC_Ratio = FVC / Base_FVC``.

    Args:
        config: Mapping with ``'data_dir'`` (folder containing ``train.csv``)
            and ``'biomarker_path'`` (per-patient image features CSV).

    Returns:
        ``(train, feature_cols, scale_cols)``: the engineered DataFrame, the
        full list of model input columns, and the subset of those columns
        that is numeric and should be standardized (everything except the
        binary ``Sex`` / ``Smk_Ex`` / ``Smk_Cur`` flags).
    """
    clinical = pd.read_csv(f"{config['data_dir']}/train.csv")
    biomarkers = pd.read_csv(config['biomarker_path'])

    image_features = [
        'lung_vol_ml', 'hu_mean', 'hu_std', 'hu_skew', 'hu_kurt',
        'glcm_contrast', 'glcm_homogeneity', 'glcm_energy', 'glcm_correlation',
    ]

    # One biomarker row per patient, restricted to the columns actually present.
    wanted = ['Patient'] + [name for name in image_features if name in biomarkers.columns]
    per_patient = biomarkers[wanted].drop_duplicates(subset=['Patient'])

    # Only patients that have both clinical rows and biomarkers survive.
    train = clinical.merge(per_patient, on='Patient', how='inner')
    train['Weeks'] = train['Weeks'].astype(int)
    train = train.sort_values(['Patient', 'Weeks'])

    # Baseline measurements come from each patient's earliest visit.
    first_visit = (
        train.groupby('Patient')
        .agg(Base_FVC=('FVC', 'first'), Base_Percent=('Percent', 'first'))
        .reset_index()
    )
    train = train.merge(first_visit, on='Patient', how='left')

    first_week = train.groupby('Patient').agg(Base_Week=('Weeks', 'min')).reset_index()
    train = train.merge(first_week, on='Patient', how='left')

    # Time since baseline, target ratio, and non-linear time terms.
    train['Relative_Weeks'] = train['Weeks'] - train['Base_Week']
    train['FVC_Ratio'] = train['FVC'] / train['Base_FVC']
    for column, power in (('Weeks_squared', 2), ('Weeks_cubed', 3)):
        train[column] = train['Relative_Weeks'] ** power
    train['Week_Decay'] = 1 - np.exp(-train['Relative_Weeks'] / 52.0)

    # Baseline x time interactions.
    for column, source in (
        ('FVC_Week_Interaction', 'Base_FVC'),
        ('Age_Week_Interaction', 'Age'),
        ('Percent_Week_Interaction', 'Base_Percent'),
    ):
        train[column] = train[source] * train['Relative_Weeks']

    available_img_feats = [name for name in image_features if name in train.columns]

    # Image-derived interactions, only for biomarkers that are present.
    if 'lung_vol_ml' in train.columns:
        train['LungVol_FVC_Ratio'] = train['lung_vol_ml'] / (train['Base_FVC'] + 1e-6)
        train['LungVol_Week_Interaction'] = train['lung_vol_ml'] * train['Relative_Weeks']
    if 'hu_mean' in train.columns:
        train['HU_Week_Interaction'] = train['hu_mean'] * train['Relative_Weeks']

    # Unscaled copy of the baseline, used to convert ratios back to FVC.
    train['Base_FVC_Raw'] = train['Base_FVC']

    # Binary encodings of the categorical columns.
    train['Sex'] = (train['Sex'] == 'Male').astype('int64')
    train['Smk_Ex'] = (train['SmokingStatus'] == 'Ex-smoker').astype('int64')
    train['Smk_Cur'] = (train['SmokingStatus'] == 'Currently smokes').astype('int64')

    num_cols = [
        'Age', 'Base_Percent', 'Relative_Weeks', 'Weeks_squared',
        'Weeks_cubed', 'Week_Decay',
    ] + available_img_feats

    interaction_cols = [
        'FVC_Week_Interaction', 'Age_Week_Interaction', 'Percent_Week_Interaction',
    ]
    if 'LungVol_FVC_Ratio' in train.columns:
        interaction_cols += ['LungVol_FVC_Ratio', 'LungVol_Week_Interaction']
    if 'HU_Week_Interaction' in train.columns:
        interaction_cols += ['HU_Week_Interaction']

    scale_cols = num_cols + interaction_cols
    feature_cols = scale_cols + ['Sex', 'Smk_Ex', 'Smk_Cur']

    print(f"‚úÖ Preprocessing: {train.shape[0]} samples, {len(feature_cols)} features")

    return train, feature_cols, scale_cols

# ==========================================
# NEURAL NETWORK MODEL
# ==========================================
class EnhancedQuantileMLP(nn.Module):
    """Three-hidden-layer MLP that outputs one value per requested quantile.

    Hidden widths are 256 -> 128 -> 64; each hidden layer is followed by
    batch-norm, LeakyReLU(0.1) and dropout. The final linear layer has
    ``len(quantiles)`` outputs.
    """

    def __init__(self, input_dim, quantiles, dropout=0.25):
        super().__init__()
        widths = (input_dim, 256, 128, 64)

        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.LeakyReLU(0.1),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(widths[-1], len(quantiles)))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the quantile predictions for the feature batch ``x``."""
        return self.net(x)

def quantile_loss(preds, target, quantiles):
    """Pinball loss summed over quantiles and averaged over the batch.

    Column ``k`` of ``preds`` is treated as the prediction for
    ``quantiles[k]``; ``target`` is subtracted from it as a column vector.
    """
    per_quantile = []
    for column, tau in enumerate(quantiles):
        residual = target - preds[:, column:column + 1]
        # Under-prediction is weighted by tau, over-prediction by (1 - tau).
        per_quantile.append(torch.maximum((tau - 1) * residual, tau * residual))
    return torch.cat(per_quantile, dim=1).sum(dim=1).mean()

# ==========================================
# METRICS
# ==========================================
def calculate_metrics(y_true_fvc, q_preds_ratio, baseline_fvc):
    """Score quantile predictions expressed as ratios of the baseline FVC.

    Columns 0, 1 and 2 of ``q_preds_ratio`` are taken as the lower, median
    and upper quantile; each is multiplied by ``baseline_fvc`` to get FVC
    values. The spread ``upper - lower`` (floored at 70) serves as the
    confidence term and the absolute median error is capped at 1000 in the
    Laplace log-likelihood.

    Returns:
        Dict with keys ``'lll'`` (mean log-likelihood), ``'mse'``, ``'rmse'``,
        ``'mae'`` and ``'r2'``; the last four are computed on the median.
    """
    lower, median, upper = (q_preds_ratio[:, k] * baseline_fvc for k in range(3))

    spread = np.maximum(upper - lower, 70)
    abs_error = np.minimum(np.abs(y_true_fvc - median), 1000)
    per_sample_lll = - (np.sqrt(2) * abs_error / spread) - np.log(np.sqrt(2) * spread)

    mse = mean_squared_error(y_true_fvc, median)
    return {
        'lll': np.mean(per_sample_lll),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true_fvc, median),
        'r2': r2_score(y_true_fvc, median),
    }

# ==========================================
# NEURAL NETWORK TRAINING
# ==========================================
def train_nn_fold(train_data, val_data, features, scale_cols, fold):
    """Train the quantile MLP on one fold and return its best validation predictions.

    Scalers are fitted on ``train_data`` only and then applied to
    ``val_data``. The network is trained with one full-batch optimizer step
    per epoch (``CONFIG['nn_batch_size']`` is not used here). After every
    epoch the validation Laplace log-likelihood (LLL) is computed; the epoch
    with the highest LLL is checkpointed to ``nn_fold{fold}.pth`` together
    with both scalers, and training stops after ``CONFIG['nn_patience']``
    epochs without improvement.

    Args:
        train_data: Training rows; must contain ``features``, ``scale_cols``
            and ``FVC_Ratio``.
        val_data: Validation rows; additionally needs ``FVC`` and
            ``Base_FVC_Raw`` for scoring.
        features: Ordered model input columns.
        scale_cols: Subset of columns to standardize.
        fold: Fold number used in the checkpoint file name.

    Returns:
        ``(best_preds, best_lll)``: validation quantile predictions (as FVC
        ratios, un-scaled) from the best epoch and the corresponding LLL.
        ``best_preds`` is ``None`` if no epoch produced a comparable LLL.

    Note:
        The returned predictions are chosen using the validation fold itself,
        so the reported LLL is an optimistic estimate.
    """
    feature_scaler = StandardScaler()
    ratio_scaler = StandardScaler()

    train_data_scaled = train_data.copy()
    val_data_scaled = val_data.copy()

    # Fit on the training fold only; the validation fold is just transformed.
    train_data_scaled[scale_cols] = feature_scaler.fit_transform(train_data[scale_cols])
    train_data_scaled['FVC_Ratio_Scaled'] = ratio_scaler.fit_transform(train_data[['FVC_Ratio']])

    val_data_scaled[scale_cols] = feature_scaler.transform(val_data[scale_cols])
    val_data_scaled['FVC_Ratio_Scaled'] = ratio_scaler.transform(val_data[['FVC_Ratio']])

    X_train = torch.tensor(train_data_scaled[features].values, dtype=torch.float32).to(CONFIG['device'])
    y_train = torch.tensor(train_data_scaled['FVC_Ratio_Scaled'].values, dtype=torch.float32).unsqueeze(1).to(CONFIG['device'])

    X_val = torch.tensor(val_data_scaled[features].values, dtype=torch.float32).to(CONFIG['device'])

    model = EnhancedQuantileMLP(len(features), CONFIG['quantiles']).to(CONFIG['device'])
    optimizer = optim.AdamW(model.parameters(), lr=CONFIG['nn_lr'], weight_decay=CONFIG['nn_weight_decay'])
    # `verbose` is intentionally not passed: the keyword is deprecated in
    # PyTorch's LR schedulers and silent operation is the default.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=20)

    best_lll = -float('inf')
    best_preds = None
    patience = 0

    for epoch in range(CONFIG['nn_epochs']):
        # One full-batch gradient step.
        model.train()
        optimizer.zero_grad()
        preds = model(X_train)
        loss = quantile_loss(preds, y_train, CONFIG['quantiles'])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Score the validation fold in original FVC units.
        model.eval()
        with torch.no_grad():
            val_preds_scaled = model(X_val)
            val_preds_ratio = ratio_scaler.inverse_transform(val_preds_scaled.cpu().numpy())
            metrics = calculate_metrics(val_data['FVC'].values, val_preds_ratio, val_data['Base_FVC_Raw'].values)

        # mode='max': the learning rate is halved when LLL stops increasing.
        scheduler.step(metrics['lll'])

        if metrics['lll'] > best_lll:
            best_lll = metrics['lll']
            best_preds = val_preds_ratio.copy()
            patience = 0
            torch.save({'model': model.state_dict(), 'feature_scaler': feature_scaler,
                       'ratio_scaler': ratio_scaler}, f"nn_fold{fold}.pth")
        else:
            patience += 1

        if patience >= CONFIG['nn_patience']:
            break

    return best_preds, best_lll

# ==========================================
# LIGHTGBM TRAINING
# ==========================================
def train_lgb_fold(train_data, val_data, features, fold):
    """Fit one LightGBM quantile model per configured quantile for this fold.

    Each model is trained on ``FVC_Ratio`` with early stopping on the
    validation fold and saved as ``lgb_q{quantile*100}_fold{fold}.txt``.

    Returns:
        ``(lgb_preds, lll)``: validation predictions with one column per
        quantile (in ``CONFIG['quantiles']`` order) and their Laplace
        log-likelihood.
    """
    val_features = val_data[features]
    per_quantile_preds = []

    for tau in CONFIG['quantiles']:
        # Same base parameters for every model; only the target quantile changes.
        fold_params = {**CONFIG['lgb_params'], 'alpha': tau}

        train_set = lgb.Dataset(train_data[features], label=train_data['FVC_Ratio'])
        valid_set = lgb.Dataset(val_features, label=val_data['FVC_Ratio'], reference=train_set)

        stopper = lgb.early_stopping(CONFIG['lgb_early_stopping'], verbose=False)
        booster = lgb.train(
            fold_params,
            train_set,
            num_boost_round=CONFIG['lgb_rounds'],
            valid_sets=[valid_set],
            callbacks=[stopper],
        )

        per_quantile_preds.append(booster.predict(val_features))
        booster.save_model(f"lgb_q{int(tau*100)}_fold{fold}.txt")

    lgb_preds = np.column_stack(per_quantile_preds)
    scores = calculate_metrics(val_data['FVC'].values, lgb_preds, val_data['Base_FVC_Raw'].values)

    return lgb_preds, scores['lll']

# ==========================================
# ENSEMBLE TRAINING
# ==========================================
def _search_blend_weight(nn_preds, lgb_preds, y_true, baselines, report=False):
    """Grid-search the NN weight of a two-model blend, maximizing LLL.

    Candidate NN weights are ``np.arange(0.3, 0.8, 0.05)``; the LightGBM
    weight is the complement. Returns the first weight that achieves the
    highest LLL, or 0.5 if no candidate yields a comparable score. With
    ``report=True`` one line per candidate is printed.
    """
    best_weight = 0.5
    best_lll = -float('inf')

    for w_nn in np.arange(0.3, 0.8, 0.05):
        w_lgb = 1 - w_nn
        blended = w_nn * nn_preds + w_lgb * lgb_preds
        metrics = calculate_metrics(y_true, blended, baselines)

        if report:
            print(f"NN:{w_nn:.2f} LGB:{w_lgb:.2f} ‚Üí LLL: {metrics['lll']:.4f}, RMSE: {metrics['rmse']:.2f}")

        if metrics['lll'] > best_lll:
            best_lll = metrics['lll']
            best_weight = w_nn

    return best_weight


def train_ensemble():
    """Cross-validate the NN + LightGBM ensemble and report out-of-fold metrics.

    Patients (not rows) are split into ``CONFIG['n_folds']`` folds. For each
    fold both models are trained, a per-fold blend weight is searched and the
    blended fold metrics are printed. Afterwards a single blend weight is
    searched on the concatenated out-of-fold predictions.

    Returns:
        The metrics dict from ``calculate_metrics`` for the final blend of
        all out-of-fold predictions.

    Note:
        Blend weights are tuned on the same validation data they are scored
        on, so the printed fold and final metrics are optimistic.
    """
    df, feature_cols, scale_cols = preprocess_data_no_leakage(CONFIG)
    patients = df['Patient'].unique()
    n_folds = CONFIG['n_folds']
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=CONFIG['seed'])

    oof_nn_preds = []
    oof_lgb_preds = []
    oof_trues = []
    oof_baselines = []

    print(f"\n{'='*80}")
    print(f"üöÄ PROFESSIONAL ENSEMBLE: NEURAL NET + LIGHTGBM")
    print(f"{'='*80}\n")

    for fold, (train_idx, val_idx) in enumerate(kf.split(patients)):
        print(f"{'='*80}")
        # Report the configured fold count rather than a hard-coded 5.
        print(f"FOLD {fold+1}/{n_folds}")
        print(f"{'='*80}")

        # Split by patient so no patient appears in both train and validation.
        train_p, val_p = patients[train_idx], patients[val_idx]
        train_data = df[df['Patient'].isin(train_p)].copy()
        val_data = df[df['Patient'].isin(val_p)].copy()

        val_fvc = val_data['FVC'].values
        val_base = val_data['Base_FVC_Raw'].values

        print(f"\nüß† Training Neural Network...")
        nn_preds, nn_lll = train_nn_fold(train_data, val_data, feature_cols, scale_cols, fold+1)
        print(f"   NN  - LLL: {nn_lll:.4f}")

        print(f"\nüå≥ Training LightGBM...")
        lgb_preds, lgb_lll = train_lgb_fold(train_data, val_data, feature_cols, fold+1)
        print(f"   LGB - LLL: {lgb_lll:.4f}")

        # Per-fold blend weight (for reporting only; the final weight is
        # re-tuned on all out-of-fold predictions below).
        best_weight = _search_blend_weight(nn_preds, lgb_preds, val_fvc, val_base)

        final_preds = best_weight * nn_preds + (1 - best_weight) * lgb_preds
        metrics = calculate_metrics(val_fvc, final_preds, val_base)

        print(f"\nüîÄ Ensemble (NN:{best_weight:.2f}, LGB:{1-best_weight:.2f})")
        print(f"   LLL:  {metrics['lll']:.4f}  {'‚úÖ' if metrics['lll'] > -6.64 else '‚ùå'}")
        print(f"   R¬≤:   {metrics['r2']:.4f}")
        print(f"   RMSE: {metrics['rmse']:.2f} mL")
        print(f"   MAE:  {metrics['mae']:.2f} mL\n")

        oof_nn_preds.append(nn_preds)
        oof_lgb_preds.append(lgb_preds)
        oof_trues.extend(val_fvc)
        oof_baselines.extend(val_base)

    # Stack the per-fold pieces into full out-of-fold arrays.
    oof_nn_preds = np.vstack(oof_nn_preds)
    oof_lgb_preds = np.vstack(oof_lgb_preds)
    oof_trues = np.array(oof_trues)
    oof_baselines = np.array(oof_baselines)

    print(f"{'='*80}")
    print(f"üéØ FINAL ENSEMBLE OPTIMIZATION")
    print(f"{'='*80}\n")

    best_final_weight = _search_blend_weight(
        oof_nn_preds, oof_lgb_preds, oof_trues, oof_baselines, report=True
    )

    final_ensemble_preds = best_final_weight * oof_nn_preds + (1 - best_final_weight) * oof_lgb_preds
    final_metrics = calculate_metrics(oof_trues, final_ensemble_preds, oof_baselines)

    print(f"\n{'='*80}")
    print(f"üèÜ FINAL ENSEMBLE RESULTS")
    print(f"{'='*80}")
    print(f"Optimal Weights: NN={best_final_weight:.2f}, LGB={1-best_final_weight:.2f}\n")
    print(f"LLL:  {final_metrics['lll']:.4f}   {'‚úÖ' if final_metrics['lll'] > -6.64 else '‚ùå'} (Target > -6.64)")
    print(f"R¬≤:   {final_metrics['r2']:.4f}   {'‚úÖ' if final_metrics['r2'] > 0.88 else '‚ùå'} (Target > 0.88)")
    print(f"RMSE: {final_metrics['rmse']:.2f} mL")
    print(f"MAE:  {final_metrics['mae']:.2f} mL")
    print(f"MSE:  {final_metrics['mse']:.2f}")

    print(f"\n{'='*80}")
    print(f"üíæ MODELS SAVED")
    print(f"{'='*80}")
    # File names follow the patterns used by train_nn_fold / train_lgb_fold.
    print(f"Neural Networks: nn_fold1.pth ... nn_fold{n_folds}.pth")
    print(f"LightGBM: lgb_q20_fold1.txt ... lgb_q80_fold{n_folds}.txt")
    print(f"{'='*80}\n")

    return final_metrics

# Script entry point: print a banner, run the cross-validated ensemble and
# summarize whether the RMSE / LLL targets were reached.
if __name__ == "__main__":
    print("\n".join([
        "=" * 80,
        "üõ°Ô∏è LEAK-FREE PROFESSIONAL ENSEMBLE",
        "=" * 80,
        "‚úÖ Neural Network: Deep MLP with quantile regression",
        "‚úÖ LightGBM: Gradient boosting for non-linear patterns",
        "‚úÖ Dynamic weight optimization per fold",
        "‚úÖ Zero data leakage guarantee",
        "=" * 80 + "\n",
    ]))

    results = train_ensemble()

    print("\n".join([
        "\n" + "=" * 80,
        "üéØ TARGET STATUS",
        "=" * 80,
    ]))

    # First matching condition wins: both targets met, then RMSE within 180,
    # then the fallback message.
    print(
        "üéâ SUCCESS! Both targets met!"
        if results['rmse'] < 170 and results['lll'] > -6.64
        else "‚ö†Ô∏è Very close! Consider multi-seed ensemble."
        if results['rmse'] < 180
        else "üìä Honest baseline established. Further improvements needed."
    )

    print("=" * 80)