In [None]:
def train_models(df):
    """Train and compare a logistic regression and a random forest classifier.

    Rows without a ``label`` are dropped, the remaining data is split 75/25
    (stratified on the label), missing feature values are imputed with
    per-column means, and both models are fitted and evaluated on the
    held-out split. The random forest is additionally scored with 5-fold
    stratified cross-validation on the training split.

    The imputation means and the scaler are fitted on the training split
    only, so no statistic derived from the held-out rows reaches the models.

    Args:
        df: DataFrame holding the feature columns plus a ``label`` column
            (the target) and a ``plated_capacity`` column, which is excluded
            from the features. Labels are assumed to be binary 0/1 with
            1 meaning "Plating" -- TODO confirm against the caller.

    Returns:
        A tuple ``(results, X_test, y_test, scaler)`` where

        * ``results`` maps ``'lr'`` and ``'rf'`` to dicts with the fitted
          ``model``, its test ``auc``, ``y_pred`` and ``y_proba``; the
          ``'rf'`` entry also carries ``cv_scores`` and a
          ``feature_importance`` DataFrame sorted in descending order,
        * ``X_test`` is the held-out, imputed but unscaled feature frame,
        * ``y_test`` is the matching label array,
        * ``scaler`` is the ``StandardScaler`` fitted on the training split.

    Side effects:
        Prints progress and metrics to stdout and writes the scaler, both
        models, the feature names and the imputation means to
        ``plating_detection_models.pkl`` in the current working directory.
    """
    print("\n" + "="*60)
    print("TRAINING CLASSIFICATION MODELS")
    print("="*60)

    # Rows with an unknown target cannot be used for supervised training.
    df_clean = df.dropna(subset=['label'])
    X = df_clean.drop(columns=['label', 'plated_capacity'])
    y = df_clean['label'].values

    print(f"\nFeature count: {X.shape[1]}")
    print(f"Sample count: {X.shape[0]} (Plating: {y.sum()}, No plating: {len(y)-y.sum()})")

    # Split BEFORE imputing: computing fill values on the full dataset would
    # leak information from the held-out rows into training.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
    )

    # Column means learned from the training split only, applied to both.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scaling is only needed for logistic regression; the forest is trained
    # on the unscaled features below.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}

    # Logistic Regression
    print("\n[1] Logistic Regression")
    print("-" * 40)
    lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    lr.fit(X_train_scaled, y_train)

    y_pred_lr = lr.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of the second class.
    y_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

    auc_lr = roc_auc_score(y_test, y_proba_lr)
    print(f"AUC-ROC: {auc_lr:.4f}")
    print(classification_report(y_test, y_pred_lr, target_names=['No Plating', 'Plating']))

    results['lr'] = {'model': lr, 'auc': auc_lr, 'y_pred': y_pred_lr, 'y_proba': y_proba_lr}

    # Random Forest with CV
    print("\n[2] Random Forest Classifier")
    print("-" * 40)
    rf = RandomForestClassifier(
        n_estimators=N_ESTIMATORS,
        max_depth=MAX_DEPTH,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    # Cross-validate on the training split only so the test split stays an
    # untouched estimate of generalisation performance.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    cv_scores = cross_val_score(rf, X_train, y_train, cv=skf, scoring='roc_auc')
    print(f"Cross-Validation AUC (5-fold): {cv_scores}")
    print(f"Mean CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

    # cross_val_score fits clones, so the estimator itself still needs fitting.
    rf.fit(X_train, y_train)

    y_pred_rf = rf.predict(X_test)
    y_proba_rf = rf.predict_proba(X_test)[:, 1]

    auc_rf = roc_auc_score(y_test, y_proba_rf)
    print(f"\nTest Set AUC-ROC: {auc_rf:.4f}")
    print(classification_report(y_test, y_pred_rf, target_names=['No Plating', 'Plating']))

    feat_imp = pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feat_imp.head(10).to_string(index=False))

    results['rf'] = {
        'model': rf,
        'auc': auc_rf,
        'cv_scores': cv_scores,
        'y_pred': y_pred_rf,
        'y_proba': y_proba_rf,
        'feature_importance': feat_imp
    }

    # Persist everything needed to reproduce the preprocessing at inference
    # time, including the imputation means used for missing values.
    models_dict = {
        'scaler': scaler,
        'logistic': lr,
        'random_forest': rf,
        'feature_names': list(X_train.columns),
        'feature_means': train_means
    }
    joblib.dump(models_dict, 'plating_detection_models.pkl')
    print("\n✓ Models saved to 'plating_detection_models.pkl'")

    return results, X_test, y_test, scaler