# Ensemble Learning Approach

In this section, we apply an ensemble learning strategy to improve predictive performance and model robustness. Ensemble methods combine multiple base learners to reduce variance, bias, or both.

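We combine several complementary tree-based classifiers:

Decision Tree Classifier
Serves as a simple, interpretable baseline model; in this notebook a shallow tree (max_depth=3) is the weak learner inside AdaBoost.

Random Forest Classifier
An ensemble of decision trees trained on bootstrapped samples with feature randomness, helping to reduce overfitting and improve generalization.

AdaBoost Classifier
A boosting method that focuses on correcting previous classification errors by iteratively re-weighting misclassified samples.

Gradient-Boosted Trees (XGBoost, LightGBM)
Boosting methods that add trees sequentially, each one fitted to reduce the loss of the current ensemble. XGBoost is a member of the soft-voting ensemble; LightGBM takes Random Forest's place in the weight-tuning section.

By combining these learners through soft voting (weighted averaging of predicted probabilities), we leverage the strengths of both bagging (Random Forest) and boosting (AdaBoost, XGBoost, LightGBM) techniques, aiming for a more robust and accurate classification framework.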

# IMPORT LIBS

In [15]:
import os

import lightgbm as lgb
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

# IMPORTING DATA SET

In [2]:
# Read the three processed splits (train, validation, Kaggle test), one CSV each
df_train, df_val, df_kaggle_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/cleaned/processed_train.csv',
        '../data/cleaned/processed_validation.csv',
        '../data/cleaned/processed_kaggle_test.csv',
    )
)

# PREPROCESSING

In [3]:
# Column names used by the cells below: the label to predict and the id column
# that is copied into the submission files
target_col, id_col = "diagnosed_diabetes", "id"

In [4]:
# Split each frame into a feature matrix (X) and a label vector (y).
y_train = df_train[target_col]
X_train = df_train.drop(columns=[target_col])

y_val = df_val[target_col]
# Select the validation features by the training columns, in the training order.
# This mirrors how the Kaggle test features are built, so every model is scored
# on exactly the feature layout it was fitted on.
X_val = df_val.drop(columns=[target_col])[X_train.columns]

In [5]:
# Keep the ids for the submission file, then build the test feature matrix:
# drop the id column and select the training columns in training order
submission_ids = df_kaggle_test[id_col]
X_kaggle_test = df_kaggle_test.drop(columns=[id_col])[X_train.columns]

In [6]:
# Base learners for the voting ensemble.
# Hyper-parameters are collected in dicts first, then unpacked into each constructor.

# Model A: AdaBoost over depth-3 decision trees
# (parameters noted by the original author as the best ones from tuning)
ada_params = dict(n_estimators=200, learning_rate=1.0, random_state=42)
clf_ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    **ada_params,
)

# Model B: XGBoost (parameters noted as coming from a previous search)
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,
    random_state=42,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)

# Model C: Random Forest, included for diversity (bagged rather than boosted trees)
rf_params = dict(
    n_estimators=500,
    max_depth=10,
    min_samples_split=5,
    n_jobs=-1,
    random_state=42,
)
clf_rf = RandomForestClassifier(**rf_params)

In [7]:
# Soft-voting ensemble: the members' predicted probabilities are averaged using
# the weights below, giving a continuous score that ROC-AUC can rank.
base_learners = [('ada', clf_ada), ('xgb', clf_xgb), ('rf', clf_rf)]
member_weights = [1, 2, 1]  # same order as base_learners; XGBoost counts double
ensemble = VotingClassifier(
    estimators=base_learners,
    voting='soft',
    weights=member_weights,
)

In [8]:
# Fit the voting ensemble on the training split; the fitted estimator is the
# cell's last expression, so its repr is displayed below
print("Training Ensemble Model...")
ensemble.fit(X=X_train, y=y_train)

Training Ensemble Model...


0,1,2
,estimators,"[('ada', ...), ('xgb', ...), ...]"
,voting,'soft'
,weights,"[1, 2, ...]"
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [9]:
# Score the ensemble on the validation split.
# Column 1 of predict_proba is the probability of the second class label.
val_probs = ensemble.predict_proba(X_val)[:, 1]
roc = roc_auc_score(y_true=y_val, y_score=val_probs)
print(f"\nEnsemble ROC-AUC: {roc:.5f}")


Ensemble ROC-AUC: 0.72298


In [10]:
# Build the submission frame: one predicted probability (predict_proba column 1)
# per Kaggle test id
test_probs = ensemble.predict_proba(X_kaggle_test)[:, 1]
submission = pd.DataFrame({id_col: submission_ids, target_col: test_probs})

In [11]:
# Write the voting-ensemble submission to disk.
# DataFrame.to_csv raises OSError when the target directory does not exist
# (e.g. on a fresh checkout), so make sure it is there first.
os.makedirs('../data/submission', exist_ok=True)
submission.to_csv('../data/submission/submission_ensemble.csv', index=False)
print("Saved 'submission_ensemble.csv'")

Saved 'submission_ensemble.csv'


# TUNING

In [16]:
# Members of the weighted blend. Note that `clf_xgb` and `clf_ada` are rebound here
# to fresh, unfitted estimators.

# Model 1: XGBoost (baseline parameters, max_depth=6)
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,
    random_state=42,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)

# Model 2: AdaBoost over depth-3 trees with learning_rate=1.0 (the tuned setting)
ada_params = dict(n_estimators=200, learning_rate=1.0, random_state=42)
clf_ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    **ada_params,
)

# Model 3: LightGBM
lgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    objective='binary',
    metric='auc',
    n_jobs=-1,
    random_state=42,
    verbose=-1,  # keep LightGBM's training log quiet
)
clf_lgb = lgb.LGBMClassifier(**lgb_params)

In [17]:
# Fit each blend member on the training split, announcing each one before it starts.
# The final fit is the cell's last expression, so the LightGBM repr is displayed.

print("\nTraining XGBoost...")
clf_xgb.fit(X=X_train, y=y_train)

print("Training AdaBoost...")
clf_ada.fit(X=X_train, y=y_train)

print("Training LightGBM...")
clf_lgb.fit(X=X_train, y=y_train)


Training XGBoost...
Training AdaBoost...
Training LightGBM...


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [18]:
# Prepare the weight search: collect each member's validation probabilities
print("\nOptimizing Ensemble Weights...")

# predict_proba column 1 for every model, evaluated on the VALIDATION set
p_xgb, p_ada, p_lgb = (
    member.predict_proba(X_val)[:, 1]
    for member in (clf_xgb, clf_ada, clf_lgb)
)

# Running best for the search; equal weights are the fallback
best_score, best_weights = 0, (1, 1, 1)


Optimizing Ensemble Weights...


In [19]:
# Candidate integer weights for each model; the search tries every triple,
# e.g. (1, 2, 1) or (3, 1, 5)
weights_to_test = list(range(1, 6))

In [20]:
# Exhaustive search over weight triples (w1=XGBoost, w2=AdaBoost, w3=LightGBM).
#
# The running best is reset here so the cell is idempotent: re-running it after
# editing `weights_to_test` or refitting the models cannot keep a stale best
# score/weights from an earlier run.
#
# NOTE: the weights are selected on the same validation set that `best_score`
# is measured on, so the reported score is an optimistic estimate.
best_score = 0
best_weights = (1, 1, 1)

for w1 in weights_to_test:
    for w2 in weights_to_test:
        for w3 in weights_to_test:
            # Weighted average of the three models' validation probabilities
            avg_pred = (w1 * p_xgb + w2 * p_ada + w3 * p_lgb) / (w1 + w2 + w3)

            # ROC-AUC of the blended prediction on the validation labels
            score = roc_auc_score(y_val, avg_pred)

            # Strict '>' keeps the first triple that reaches the best score
            if score > best_score:
                best_score = score
                best_weights = (w1, w2, w3)

In [21]:
# Report the winning weight triple and its validation score (one line per item)
print(
    f"\n--- Optimization Results ---",
    f"Best Weights Found: XGB={best_weights[0]}, Ada={best_weights[1]}, LGB={best_weights[2]}",
    f"Validation ROC-AUC with Optimized Weights: {best_score:.5f}",
    sep="\n",
)


--- Optimization Results ---
Best Weights Found: XGB=3, Ada=1, LGB=5
Validation ROC-AUC with Optimized Weights: 0.72623


In [22]:
# Score the Kaggle test set with each fitted member
print("\nGenerating final predictions...")

# predict_proba column 1 for every model, in the order XGBoost, AdaBoost, LightGBM
t_xgb, t_ada, t_lgb = (
    member.predict_proba(X_kaggle_test)[:, 1]
    for member in (clf_xgb, clf_ada, clf_lgb)
)


Generating final predictions...


In [23]:
# Blend the test predictions with the weights chosen by the grid search
w1, w2, w3 = best_weights
weighted_sum = w1 * t_xgb + w2 * t_ada + w3 * t_lgb
final_preds = weighted_sum / (w1 + w2 + w3)

In [24]:
# Assemble the submission frame: test ids alongside the blended predictions
submission_columns = {id_col: submission_ids, target_col: final_preds}
submission = pd.DataFrame(submission_columns)

In [25]:
# Save the weight-optimized submission.
# DataFrame.to_csv raises OSError when the target directory does not exist
# (e.g. on a fresh checkout), so make sure it is there first.
os.makedirs('../data/submission', exist_ok=True)
submission.to_csv('../data/submission/submission_optimized_ensemble.csv', index=False)
print("Success! 'submission_optimized_ensemble.csv' saved.")

Success! 'submission_optimized_ensemble.csv' saved.
