# IMPORT LIBRARIES

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier  # NEW: Import CatBoost
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier # NEW: Neural Network
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score

# IMPORTING DATA SET

In [3]:
# Read the three pre-processed CSV files (train, validation, Kaggle test),
# in that order, into their own DataFrames.
df_train, df_val, df_kaggle_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/cleaned/processed_train.csv',
        '../data/cleaned/processed_validation.csv',
        '../data/cleaned/processed_kaggle_test.csv',
    )
)

In [4]:
# Column names used throughout the notebook: the label to predict and the
# row identifier that goes into the submission file.
target_col, id_col = "diagnosed_diabetes", "id"

In [5]:
# Split each frame into a feature matrix (X) and the target column (y).
X_train = df_train.drop(columns=[target_col])
y_train = df_train[target_col]

# Select the validation features by the training column list so both frames
# share the same feature set and column order (the Kaggle test frame is aligned
# the same way). A column missing from the validation file raises a KeyError
# here instead of failing later inside a model's predict call.
X_val = df_val.drop(columns=[target_col])[X_train.columns]
y_val = df_val[target_col]

In [6]:
# Keep the ids aside for the submission file, then build the Kaggle feature
# frame without the id column, restricted and ordered by the training columns.
submission_ids = df_kaggle_test[id_col]
X_kaggle_test = df_kaggle_test.drop(columns=[id_col])[X_train.columns]

# TRAINING

In [7]:
# --- 1. Define the Neural Network Pipeline ---
# The MLP needs scaled inputs, so it is chained behind a StandardScaler;
# the pipeline fits the scaler during fit() and reuses it for predictions.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),  # three hidden layers
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    alpha=0.0001,         # regularization strength
    max_iter=500,
    early_stopping=True,  # guard against overfitting
    random_state=42,
)
nn_model = make_pipeline(StandardScaler(), mlp_classifier)

In [8]:
# --- 2. Define All Base Models ---
# Maps a display name to an unfitted estimator. The insertion order is the
# order in which the models are trained and listed for the stacking ensemble.
models = {
    'CatBoost': CatBoostClassifier(
        iterations=1000,
        depth=6,
        learning_rate=0.05,
        eval_metric='AUC',
        random_state=42,
        verbose=0,
        allow_writing_files=False,
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=1000,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=1000,
        num_leaves=31,
        learning_rate=0.05,
        metric='auc',
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    ),
    'ExtraTrees': ExtraTreesClassifier(
        n_estimators=500,
        max_depth=10,
        min_samples_split=10,
        random_state=42,
        n_jobs=-1,
    ),
    # The scaled MLP pipeline defined in the previous cell.
    'NeuralNet': nn_model,
}

In [9]:
# --- 3. Individual Training & Validation ---
# Fit every base model on the training split, report its validation ROC-AUC,
# and collect the (name, estimator) pairs for the stacking ensemble.
print("--- Cross-Checking Individual Models ---")
estimators_list = []

for name, estimator in models.items():
    print(f"Training {name}...")
    estimator.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score; for the NN pipeline
    # the scaler is applied to X_val automatically before predicting.
    positive_probs = estimator.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_true=y_val, y_score=positive_probs)
    print(f"  -> {name} Validation ROC-AUC: {score:.5f}")

    estimators_list.append((name, estimator))

--- Cross-Checking Individual Models ---
Training CatBoost...
  -> CatBoost Validation ROC-AUC: 0.72476
Training XGBoost...
  -> XGBoost Validation ROC-AUC: 0.72505
Training LightGBM...
  -> LightGBM Validation ROC-AUC: 0.72580
Training ExtraTrees...
  -> ExtraTrees Validation ROC-AUC: 0.69060
Training NeuralNet...
  -> NeuralNet Validation ROC-AUC: 0.69419


In [10]:
# --- 4. Advanced Ensemble: Stacking ---
# Stacking often outperforms simple voting: a Logistic Regression meta learner
# learns how much weight to give each base model's prediction.
print("\n--- Building Stacking Ensemble ---")


--- Building Stacking Ensemble ---


In [11]:
# Meta learner: a logistic regression whose inputs are the base models' predictions.
meta_learner = LogisticRegression(C=1.0)

stacking_clf = StackingClassifier(
    estimators=estimators_list,
    final_estimator=meta_learner,
    passthrough=False,  # meta learner sees only the predictions, not the original features
    cv=5,               # internal 5-fold cross-validation to limit overfitting
    n_jobs=-1,
)

In [12]:
# Fit the stacking ensemble on the training split. Left as the cell's last
# expression so the notebook displays the fitted estimator.
stacking_clf.fit(X=X_train, y=y_train)

0,1,2
,estimators,"[('CatBoost', ...), ('XGBoost', ...), ...]"
,final_estimator,LogisticRegression()
,cv,5
,stack_method,'auto'
,n_jobs,-1
,passthrough,False
,verbose,0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,hidden_layer_sizes,"(128, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,500
,shuffle,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [13]:
# Score the fitted stacking ensemble on the validation split, using
# column 1 of predict_proba as the ranking score for ROC-AUC.
stack_probs = stacking_clf.predict_proba(X_val)[:, 1]
stack_score = roc_auc_score(y_true=y_val, y_score=stack_probs)
print(f"Stacking Ensemble Validation ROC-AUC: {stack_score:.5f}")

Stacking Ensemble Validation ROC-AUC: 0.72654


In [14]:
# --- 5. Generate Submission ---
# Predict with the stacking ensemble on the Kaggle test features and pair
# column 1 of the predicted probabilities with the saved row ids.
test_probs = stacking_clf.predict_proba(X_kaggle_test)[:, 1]
submission = pd.DataFrame({id_col: submission_ids, target_col: test_probs})

In [15]:
# Write the submission CSV. The output folder is created first (no error if it
# already exists) so a missing directory cannot make to_csv fail after the
# long training run.
submission_path = Path('../data/submission/submission_stacking_opt.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)
print("Success! Submission saved.")

Success! Submission saved.
