## RandomForest

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from skopt import BayesSearchCV

In [2]:
train_df = pd.read_csv("dataset/train.csv")

In [3]:
train_df.shape

(700000, 26)

In [4]:
print(train_df['diagnosed_diabetes'].value_counts())
print(train_df['diagnosed_diabetes'].value_counts(normalize=True))

diagnosed_diabetes
1.0    436307
0.0    263693
Name: count, dtype: int64
diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64


In [5]:
cat_cols = train_df.select_dtypes(include='object').columns
cat_cols

Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'smoking_status', 'employment_status'],
      dtype='object')

In [6]:
for c in ['age','bmi','systolic_bp','diastolic_bp','sleep_hours_per_day','alcohol_consumption_per_week']:
    if c in train_df.columns:
        print(f"\n{c} - min/max/median: ", train_df[c].min(), train_df[c].max(), train_df[c].median())



age - min/max/median:  19 89 50.0

bmi - min/max/median:  15.1 38.4 25.9

systolic_bp - min/max/median:  91 163 116.0

diastolic_bp - min/max/median:  51 104 75.0

sleep_hours_per_day - min/max/median:  3.1 9.9 7.0

alcohol_consumption_per_week - min/max/median:  1 9 2.0


In [7]:
print("\nDuplicate ids in train:", train_df['id'].duplicated().sum())


Duplicate ids in train: 0


In [8]:
# 1. Working copies (safe practice)
train = train_df.copy()

# 2. Separate target variable
y = train["diagnosed_diabetes"]
X = train.drop(columns=["diagnosed_diabetes", "id"])

In [9]:
nominal_cols = ['gender', 'ethnicity', 'employment_status']
ordinal_cols = ['education_level', 'income_level', 'smoking_status']

# Define the order for ordinal features
education_order = ["No formal", "Highschool", "Graduate", "Postgraduate"]  
income_order    = ["Low", "Lower-Middle", "Middle", "Upper-Middle", "High"]
smoking_order   = ["Never", "Current", "Former"]

In [10]:
# Nominal -> OneHot (binary indicators). Creates a new column for each category.
nominal_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Ordinal -> integer encoding with defined order. Assigns integer values based on order.
ordinal_transformer = OrdinalEncoder(
    categories=[education_order, income_order, smoking_order],
    dtype=int
)

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_transformer, nominal_cols),
        ("ordinal", ordinal_transformer, ordinal_cols)
    ],
    remainder="passthrough" # keeps continuous columns as is. Numeric columns (passthrough) → converted to NumPy array
)

In [57]:
X_prepared = preprocessor.fit_transform(X)

In [12]:
# Single split
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y, # preserving target class distribution in train and val sets
    random_state=42
)

In [None]:
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=2, # A node must have at least 2 samples to split.
        min_samples_leaf=1,
        max_features="sqrt", # Standard for classification; reduces correlation between trees.
        bootstrap=True, # Sample with replacement
        class_weight="balanced", # Helps handle class imbalance, improves sensitivity to minority class.
        n_jobs=-1,
        random_state=42
    ))
])

In [51]:
rf_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [52]:
val_preds = rf_pipeline.predict_proba(X_val)[:, 1] 
# returns probability estimates for each class.
# We take column [:, 1] → probability of positive class (diabetes).
# Random Forest probability = average of leaf probabilities from all trees.

In [None]:
val_auc = roc_auc_score(y_val, val_preds)  # A single train/validation split can be unstable
print("Validation ROC-AUC (a baseline performance):", val_auc) 

Validation ROC-AUC (a baseline performance): 0.6976153313173052


A single train/validation split can be unstable

CV reduces randomness

CV helps detect overfitting

CV helps find the best hyperparameters more reliably

In [46]:
# 'blank' RandomForestClassifier, hyperparameters will be supplied by RandomizedSearchCV
rf = RandomForestClassifier(random_state=42, n_jobs=1)

In [47]:
# Each value is a *list of possible values* that the search will sample from.
# RandomizedSearchCV will randomly choose combinations from this space.
parameters = {
    "n_estimators": [400, 600, 800, 1000], # Number of trees in the forest. More trees = Lower variance, but higher computational cost.
    "max_depth": [10, 20, 30, 40, 50], # Maximum depth of each tree. Limits how much the tree can grow, avoids overfitting!
    "min_samples_split": [2, 5, 10, 20], # Minimum number of samples required to split a node. Higher values = more conservative trees -> Less overfitting.
    "min_samples_leaf": [1, 2, 4, 8], # Minimum number of samples needed at a leaf node. Higher values smooth the model and reduce noise fitting.
    "max_features": ['sqrt', 'log2', None], # Number of features considered when deciding a split. "sqrt" → faster, adds randomness → helps prevent overfitting. "log2" → even fewer features.
    "bootstrap": [True, False] # Whether bootstrap sampling is used for trees. Usually bootstrap=True works best; False trains on all rows.
}

In [48]:
# StratifiedKFold ensures each fold has the same class proportion as the original dataset → critical for imbalanced data.
cv = StratifiedKFold(
    n_splits=3,  # 3-fold CV → train on 80%, validate on 20% each fold
    shuffle=True,  # Shuffles data before splitting → reduces variance
    random_state=42
)

In [49]:
# RandomizedSearchCV:
# - Randomly samples N hyperparameter combinations
# - For each combination, performs cross-validation
# - Computes mean validation ROC-AUC for each combination
# - Returns the best parameters and best score

# Why "randomized" instead of full grid search?
# - Much faster
# - Often finds equally good or better solutions
# - Allows larger search spaces

bayes_search = BayesSearchCV(
    estimator=rf, # The model we want to tune
    search_spaces=parameters, # The hyperparameter search space
    n_iter=20, # Number of random combinations to try
    scoring='roc_auc', # Evaluation metric
    n_jobs=1, # Use all CPU cores
    cv=cv, # Cross-validation strategy
    verbose=2, # Print progress during fitting
    random_state=42 # For reproducibility
)

In [62]:
X_train_prepared = preprocessor.fit_transform(X)
y_train = y

sample_size = 100000

X_sample = X_train_prepared[:sample_size]
y_sample = y_train[:sample_size]

In [None]:
# Fit the search object on the FULL TRAINING DATA (X_train, y_train)
# Note: we do NOT include validation data here — CV handles the splitting.
# This step:
# - samples hyperparameters
# - trains the model ~30 times (one for each combination)
# - each of those uses 5-fold CV → ~150 total model fits
# - selects the best combination based on average ROC-AUC
bayes_search.fit(X_sample, y_sample)

In [52]:
import optuna
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_classification

  from .autonotebook import tqdm as notebook_tqdm


In [63]:
def objective(trial):
    """
    Optuna calls this function for each trial (iteration). 
    It suggests hyperparameters, trains the RF model with CV, 
    and returns the score to be maximized/minimized.
    """
    
    # Define the search space using Optuna's suggest methods
    n_estimators = trial.suggest_int('n_estimators', 200, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 8)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Create the Random Forest Classifier with the suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        n_jobs=-1,          # Use all available cores for speed during training
        random_state=42
    )

    # Use cross_val_score to evaluate the model
    # The scoring here is 'roc_auc' as requested in your previous setup
    score = cross_val_score(
        model, 
        X_sample, 
        y_sample, 
        cv=3,               # Using cv=3 for faster iterations
        scoring='roc_auc', 
        n_jobs=-1           # Parallelize CV fits if possible
    ).mean() # Return the average CV score

    return score


In [64]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))

[I 2025-12-05 16:04:04,590] A new study created in memory with name: no-name-0d1401dc-c0eb-4634-a4fb-94b61523fc2d


In [65]:
# n_trials is the total number of iterations you want to run (e.g., 30 intelligent rounds)
print("Starting Optuna optimization...")
study.optimize(objective, n_trials=30, show_progress_bar=True)

Starting Optuna optimization...


Best trial: 0. Best value: 0.69735:   3%|▎         | 1/30 [00:09<04:28,  9.27s/it]

[I 2025-12-05 16:04:16,073] Trial 0 finished with value: 0.697349670163074 and parameters: {'n_estimators': 500, 'max_depth': 48, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.697349670163074.


Best trial: 0. Best value: 0.69735:   7%|▋         | 2/30 [00:19<04:36,  9.89s/it]

[I 2025-12-05 16:04:26,390] Trial 1 finished with value: 0.696396749513437 and parameters: {'n_estimators': 767, 'max_depth': 10, 'min_samples_split': 20, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 0.697349670163074.


Best trial: 0. Best value: 0.69735:  10%|█         | 3/30 [01:15<13:58, 31.06s/it]

[I 2025-12-05 16:05:22,654] Trial 2 finished with value: 0.6957329780852218 and parameters: {'n_estimators': 545, 'max_depth': 21, 'min_samples_split': 13, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.697349670163074.


Best trial: 0. Best value: 0.69735:  13%|█▎        | 4/30 [02:33<21:26, 49.48s/it]

[I 2025-12-05 16:06:40,359] Trial 3 finished with value: 0.6970992071615157 and parameters: {'n_estimators': 611, 'max_depth': 34, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.697349670163074.


Best trial: 4. Best value: 0.697966:  17%|█▋        | 5/30 [02:39<14:08, 33.93s/it]

[I 2025-12-05 16:06:46,733] Trial 4 finished with value: 0.6979657390760318 and parameters: {'n_estimators': 443, 'max_depth': 14, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True}. Best is trial 4 with value: 0.6979657390760318.


Best trial: 5. Best value: 0.697996:  20%|██        | 6/30 [02:52<10:37, 26.54s/it]

[I 2025-12-05 16:06:58,933] Trial 5 finished with value: 0.6979963222445157 and parameters: {'n_estimators': 730, 'max_depth': 22, 'min_samples_split': 11, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}. Best is trial 5 with value: 0.6979963222445157.


Best trial: 5. Best value: 0.697996:  23%|██▎       | 7/30 [04:25<18:29, 48.24s/it]

[I 2025-12-05 16:08:31,836] Trial 6 finished with value: 0.580360923031649 and parameters: {'n_estimators': 678, 'max_depth': 47, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': False}. Best is trial 5 with value: 0.6979963222445157.


Best trial: 5. Best value: 0.697996:  27%|██▋       | 8/30 [05:04<16:41, 45.54s/it]

[I 2025-12-05 16:09:11,584] Trial 7 finished with value: 0.6954982857257281 and parameters: {'n_estimators': 485, 'max_depth': 21, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': True}. Best is trial 5 with value: 0.6979963222445157.


Best trial: 5. Best value: 0.697996:  30%|███       | 9/30 [05:09<11:28, 32.81s/it]

[I 2025-12-05 16:09:16,410] Trial 8 finished with value: 0.6960181194018098 and parameters: {'n_estimators': 204, 'max_depth': 43, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 5 with value: 0.6979963222445157.


Best trial: 5. Best value: 0.697996:  33%|███▎      | 10/30 [05:21<08:47, 26.38s/it]

[I 2025-12-05 16:09:28,408] Trial 9 finished with value: 0.6969152872429946 and parameters: {'n_estimators': 699, 'max_depth': 23, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}. Best is trial 5 with value: 0.6979963222445157.


Best trial: 5. Best value: 0.697996:  37%|███▋      | 11/30 [05:42<07:49, 24.72s/it]

[I 2025-12-05 16:09:49,359] Trial 10 finished with value: 0.6978903393526221 and parameters: {'n_estimators': 971, 'max_depth': 33, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_features': 'log2', 'bootstrap': False}. Best is trial 5 with value: 0.6979963222445157.


Best trial: 5. Best value: 0.697996:  40%|████      | 12/30 [05:45<05:27, 18.18s/it]

[I 2025-12-05 16:09:52,575] Trial 11 finished with value: 0.6964228893598783 and parameters: {'n_estimators': 308, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True}. Best is trial 5 with value: 0.6979963222445157.


Best trial: 12. Best value: 0.698195:  43%|████▎     | 13/30 [05:57<04:33, 16.10s/it]

[I 2025-12-05 16:10:03,895] Trial 12 finished with value: 0.6981948732115896 and parameters: {'n_estimators': 839, 'max_depth': 16, 'min_samples_split': 20, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True}. Best is trial 12 with value: 0.6981948732115896.


Best trial: 13. Best value: 0.698309:  47%|████▋     | 14/30 [06:10<04:05, 15.33s/it]

[I 2025-12-05 16:10:17,454] Trial 13 finished with value: 0.6983092010460933 and parameters: {'n_estimators': 885, 'max_depth': 26, 'min_samples_split': 19, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 13 with value: 0.6983092010460933.


Best trial: 13. Best value: 0.698309:  50%|█████     | 15/30 [06:30<04:11, 16.75s/it]

[I 2025-12-05 16:10:37,491] Trial 14 finished with value: 0.6980250064473403 and parameters: {'n_estimators': 908, 'max_depth': 28, 'min_samples_split': 20, 'min_samples_leaf': 7, 'max_features': 'log2', 'bootstrap': True}. Best is trial 13 with value: 0.6983092010460933.


Best trial: 13. Best value: 0.698309:  53%|█████▎    | 16/30 [06:59<04:45, 20.42s/it]

[I 2025-12-05 16:11:06,442] Trial 15 finished with value: 0.6982810249000225 and parameters: {'n_estimators': 848, 'max_depth': 15, 'min_samples_split': 18, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 13 with value: 0.6983092010460933.


Best trial: 16. Best value: 0.698336:  57%|█████▋    | 17/30 [07:40<05:46, 26.62s/it]

[I 2025-12-05 16:11:47,466] Trial 16 finished with value: 0.6983362181030405 and parameters: {'n_estimators': 990, 'max_depth': 30, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 16 with value: 0.6983362181030405.


Best trial: 16. Best value: 0.698336:  60%|██████    | 18/30 [08:08<05:23, 26.99s/it]

[I 2025-12-05 16:12:15,339] Trial 17 finished with value: 0.6980398547130474 and parameters: {'n_estimators': 986, 'max_depth': 38, 'min_samples_split': 17, 'min_samples_leaf': 8, 'max_features': 'log2', 'bootstrap': True}. Best is trial 16 with value: 0.6983362181030405.


Best trial: 16. Best value: 0.698336:  63%|██████▎   | 19/30 [08:28<04:32, 24.77s/it]

[I 2025-12-05 16:12:34,928] Trial 18 finished with value: 0.6977613766891411 and parameters: {'n_estimators': 882, 'max_depth': 28, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': False}. Best is trial 16 with value: 0.6983362181030405.


Best trial: 16. Best value: 0.698336:  67%|██████▋   | 20/30 [08:50<04:01, 24.13s/it]

[I 2025-12-05 16:12:57,554] Trial 19 finished with value: 0.6981768495785218 and parameters: {'n_estimators': 803, 'max_depth': 38, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 16 with value: 0.6983362181030405.


Best trial: 16. Best value: 0.698336:  70%|███████   | 21/30 [09:34<04:29, 29.98s/it]

[I 2025-12-05 16:13:41,181] Trial 20 finished with value: 0.6968664154548939 and parameters: {'n_estimators': 997, 'max_depth': 26, 'min_samples_split': 18, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True}. Best is trial 16 with value: 0.6983362181030405.


Best trial: 21. Best value: 0.698344:  73%|███████▎  | 22/30 [09:46<03:17, 24.70s/it]

[I 2025-12-05 16:13:53,564] Trial 21 finished with value: 0.6983443084546233 and parameters: {'n_estimators': 902, 'max_depth': 17, 'min_samples_split': 18, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 21 with value: 0.6983443084546233.


Best trial: 22. Best value: 0.698449:  77%|███████▋  | 23/30 [10:06<02:41, 23.11s/it]

[I 2025-12-05 16:14:12,975] Trial 22 finished with value: 0.698448794501331 and parameters: {'n_estimators': 909, 'max_depth': 18, 'min_samples_split': 18, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 22 with value: 0.698448794501331.


Best trial: 22. Best value: 0.698449:  80%|████████  | 24/30 [10:41<02:40, 26.83s/it]

[I 2025-12-05 16:14:48,479] Trial 23 finished with value: 0.6982628576840016 and parameters: {'n_estimators': 942, 'max_depth': 18, 'min_samples_split': 16, 'min_samples_leaf': 7, 'max_features': 'log2', 'bootstrap': True}. Best is trial 22 with value: 0.698448794501331.


Best trial: 22. Best value: 0.698449:  83%|████████▎ | 25/30 [11:15<02:24, 28.96s/it]

[I 2025-12-05 16:15:22,403] Trial 24 finished with value: 0.698083461902057 and parameters: {'n_estimators': 801, 'max_depth': 32, 'min_samples_split': 13, 'min_samples_leaf': 6, 'max_features': 'log2', 'bootstrap': True}. Best is trial 22 with value: 0.698448794501331.


Best trial: 22. Best value: 0.698449:  87%|████████▋ | 26/30 [11:51<02:04, 31.04s/it]

[I 2025-12-05 16:15:58,309] Trial 25 finished with value: 0.6981483098391573 and parameters: {'n_estimators': 929, 'max_depth': 18, 'min_samples_split': 14, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}. Best is trial 22 with value: 0.698448794501331.


Best trial: 22. Best value: 0.698449:  90%|█████████ | 27/30 [12:19<01:30, 30.26s/it]

[I 2025-12-05 16:16:26,733] Trial 26 finished with value: 0.6977444952407561 and parameters: {'n_estimators': 662, 'max_depth': 13, 'min_samples_split': 18, 'min_samples_leaf': 8, 'max_features': 'log2', 'bootstrap': False}. Best is trial 22 with value: 0.698448794501331.


Best trial: 22. Best value: 0.698449:  93%|█████████▎| 28/30 [12:54<01:03, 31.53s/it]

[I 2025-12-05 16:17:01,242] Trial 27 finished with value: 0.6983611531857763 and parameters: {'n_estimators': 843, 'max_depth': 38, 'min_samples_split': 16, 'min_samples_leaf': 7, 'max_features': 'log2', 'bootstrap': True}. Best is trial 22 with value: 0.698448794501331.


Best trial: 22. Best value: 0.698449:  97%|█████████▋| 29/30 [15:28<01:08, 68.34s/it]

[I 2025-12-05 16:19:35,452] Trial 28 finished with value: 0.6981662536623494 and parameters: {'n_estimators': 747, 'max_depth': 36, 'min_samples_split': 10, 'min_samples_leaf': 7, 'max_features': None, 'bootstrap': True}. Best is trial 22 with value: 0.698448794501331.


Best trial: 22. Best value: 0.698449: 100%|██████████| 30/30 [16:03<00:00, 32.12s/it]

[I 2025-12-05 16:20:10,292] Trial 29 finished with value: 0.6980761853386896 and parameters: {'n_estimators': 848, 'max_depth': 44, 'min_samples_split': 16, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 22 with value: 0.698448794501331.



