# ADABOOST

# IMPORT LIBS

In [1]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# IMPORTING DATA SET

In [7]:
# Load the Processed Data
df_train = pd.read_csv('../data/cleaned/processed_train_bin.csv')
df_val = pd.read_csv('../data/cleaned/processed_validation_bin.csv')
df_kaggle_test = pd.read_csv('../data/cleaned/processed_kaggle_test_bin.csv')

In [9]:
df_train.shape

(560000, 84)

# PREPROCESSING

In [10]:
# Define Target and ID columns
target_col = "diagnosed_diabetes"
id_col = "id"

In [11]:
# Prepare Training & Validation Sets
X_train = df_train.drop(columns=[target_col])
y_train = df_train[target_col]

In [12]:
# Validation Set
X_val = df_val.drop(columns=[target_col])
y_val = df_val[target_col]

In [13]:
# Prepare Kaggle Submission Set
submission_ids = df_kaggle_test[id_col]
X_kaggle_test = df_kaggle_test.drop(columns=[id_col])

In [14]:
# SAFETY CHECK: Ensure columns match exactly
# The model will crash if the test columns are in a different order than train
X_kaggle_test = X_kaggle_test[X_train.columns]

In [15]:
print(f"Train Shape: {X_train.shape}")
print(f"Val Shape:   {X_val.shape}")
print(f"Test Shape:  {X_kaggle_test.shape}")

Train Shape: (560000, 83)
Val Shape:   (140000, 83)
Test Shape:  (300000, 83)


In [16]:
# Initialize and Train AdaBoost (Default Settings)
# Default AdaBoost uses 50 weak learners (Decision Stumps)
print("\nTraining AdaBoost (Base Model)...")
model = AdaBoostClassifier(
    n_estimators=50,  # Default
    learning_rate=1.0, # Default
    random_state=42
)


Training AdaBoost (Base Model)...


In [17]:
model.fit(X_train, y_train)

0,1,2
,estimator,
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,42


In [18]:
# Evaluate Performance (Accuracy)
val_preds = model.predict(X_val)
acc = accuracy_score(y_val, val_preds)

In [19]:
print("\n--- Validation Results ---")
print(f"Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, val_preds))


--- Validation Results ---
Accuracy: 0.6648

Classification Report:
              precision    recall  f1-score   support

         0.0       0.59      0.37      0.45     52739
         1.0       0.69      0.85      0.76     87261

    accuracy                           0.66    140000
   macro avg       0.64      0.61      0.61    140000
weighted avg       0.65      0.66      0.64    140000



# TUNNING 

In [20]:
# Define the Hyperparameter Grid 
param_dist = {
    'n_estimators': [50, 100, 200, 500],       # More trees often helps but is slower
    'learning_rate': [0.01, 0.1, 0.5, 1.0, 1.5], # Lower LR requires more estimators
    # Optional: Tune the base estimator's depth (Default is depth=1 "Stump")
    # Note: Increasing depth makes it more like Random Forest/Gradient Boosting
    'estimator': [
        DecisionTreeClassifier(max_depth=1), 
        DecisionTreeClassifier(max_depth=2),
        DecisionTreeClassifier(max_depth=3)
    ]
}

In [21]:
# Initialize Base Model
ada_model = AdaBoostClassifier(random_state=42)

In [22]:
# Setup Randomized Search with ROC-AUC
search = RandomizedSearchCV(
    estimator=ada_model,
    param_distributions=param_dist,
    n_iter=15,              # Try 15 combinations
    scoring='roc_auc',      # <--- Optimizing for AUC
    cv=3,
    verbose=3,              # Show progress
    random_state=42,
    n_jobs=-1               # Use all cores
)

In [23]:
print("Starting AdaBoost Hyperparameter Tuning (ROC-AUC)...")
search.fit(X_train, y_train)

Starting AdaBoost Hyperparameter Tuning (ROC-AUC)...
Fitting 3 folds for each of 15 candidates, totalling 45 fits


0,1,2
,estimator,AdaBoostClass...ndom_state=42)
,param_distributions,"{'estimator': [DecisionTreeC...r(max_depth=1), DecisionTreeC...r(max_depth=2), ...], 'learning_rate': [0.01, 0.1, ...], 'n_estimators': [50, 100, ...]}"
,n_iter,15
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,3
,verbose,3
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [24]:
# Get Best Results 
print("\n--- Tuning Complete ---")
print(f"Best Parameters: {search.best_params_}")
print(f"Best CV ROC-AUC: {search.best_score_:.4f}")


--- Tuning Complete ---
Best Parameters: {'n_estimators': 200, 'learning_rate': 1.0, 'estimator': DecisionTreeClassifier(max_depth=3)}
Best CV ROC-AUC: 0.7159


In [25]:
best_model = search.best_estimator_

In [26]:
# --- 7. Validate on Hold-out Set ---
# Get probabilities for AUC calculation
val_probs = best_model.predict_proba(X_val)[:, 1]
roc_score = roc_auc_score(y_val, val_probs)

In [27]:
print(f"\nValidation ROC-AUC (Tuned Model): {roc_score:.4f}")


Validation ROC-AUC (Tuned Model): 0.7169


In [29]:
# Generate Submission 
test_preds = best_model.predict_proba(X_kaggle_test)[:, 1]

In [30]:
# Predict and Submit 
print("Generating predictions...")
final_probs = best_model.predict_proba(X_kaggle_test)[:, 1]

submission = pd.DataFrame({
    id_col: submission_ids,
    target_col: final_probs
})

Generating predictions...


In [31]:
# Submission AdaBoost
submission.to_csv('../data/submission/submission_adaboost_tuned_auc_bin.csv', index=False)
print("\nSuccess! 'submission_adaboost_tuned_auc.csv' saved.")


Success! 'submission_adaboost_tuned_auc.csv' saved.
