## RandomForest

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from skopt import BayesSearchCV

In [22]:
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

In [3]:
train_df.shape

(700000, 26)

In [4]:
print(train_df['diagnosed_diabetes'].value_counts())
print(train_df['diagnosed_diabetes'].value_counts(normalize=True))

diagnosed_diabetes
1.0    436307
0.0    263693
Name: count, dtype: int64
diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64


In [3]:
cat_cols = train_df.select_dtypes(include='object').columns
cat_cols

Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'smoking_status', 'employment_status'],
      dtype='object')

In [4]:
for c in ['age','bmi','systolic_bp','diastolic_bp','sleep_hours_per_day','alcohol_consumption_per_week']:
    if c in train_df.columns:
        print(f"\n{c} - min/max/median: ", train_df[c].min(), train_df[c].max(), train_df[c].median())



age - min/max/median:  19 89 50.0

bmi - min/max/median:  15.1 38.4 25.9

systolic_bp - min/max/median:  91 163 116.0

diastolic_bp - min/max/median:  51 104 75.0

sleep_hours_per_day - min/max/median:  3.1 9.9 7.0

alcohol_consumption_per_week - min/max/median:  1 9 2.0


In [7]:
print("\nDuplicate ids in train:", train_df['id'].duplicated().sum())


Duplicate ids in train: 0


In [None]:
# 1. Working copies (safe practice)
train = train_df.copy()

# 2. Separate target variable
y = train["diagnosed_diabetes"]
X = train.drop(columns=["diagnosed_diabetes", "id"])
test = test_df.drop(columns=["id"])

In [6]:
nominal_cols = ['gender', 'ethnicity', 'employment_status']
ordinal_cols = ['education_level', 'income_level', 'smoking_status']

# Define the order for ordinal features
education_order = ["No formal", "Highschool", "Graduate", "Postgraduate"]  
income_order    = ["Low", "Lower-Middle", "Middle", "Upper-Middle", "High"]
smoking_order   = ["Never", "Current", "Former"]

In [20]:
# Nominal -> OneHot (binary indicators). Creates a new column for each category.
nominal_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Ordinal -> integer encoding with defined order. Assigns integer values based on order.
ordinal_transformer = OrdinalEncoder(
    categories=[education_order, income_order, smoking_order],
    dtype=int
)

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_transformer, nominal_cols),
        ("ordinal", ordinal_transformer, ordinal_cols)
    ],
    remainder="passthrough" # keeps continuous columns as is. Numeric columns (passthrough) → converted to NumPy array
)

In [9]:
X_prepared = preprocessor.fit_transform(X)

In [12]:
# Single split
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y, # preserving target class distribution in train and val sets
    random_state=42
)

In [None]:
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=2, # A node must have at least 2 samples to split.
        min_samples_leaf=1,
        max_features="sqrt", # Standard for classification; reduces correlation between trees.
        bootstrap=True, # Sample with replacement
        class_weight="balanced", # Helps handle class imbalance, improves sensitivity to minority class.
        n_jobs=-1,
        random_state=42
    ))
])

In [None]:
rf_pipeline.fit(X_train, y_train)

In [52]:
val_preds = rf_pipeline.predict_proba(X_val)[:, 1] 
# returns probability estimates for each class.
# We take column [:, 1] → probability of positive class (diabetes).
# Random Forest probability = average of leaf probabilities from all trees.

In [None]:
val_auc = roc_auc_score(y_val, val_preds)  # A single train/validation split can be unstable
print("Validation ROC-AUC (a baseline performance):", val_auc) 

Validation ROC-AUC (a baseline performance): 0.6976153313173052


A single train/validation split can be unstable

CV reduces randomness

CV helps detect overfitting

CV helps find the best hyperparameters more reliably

In [10]:
# 'blank' RandomForestClassifier, hyperparameters will be supplied by RandomizedSearchCV
rf = RandomForestClassifier(random_state=42, n_jobs=1)

In [11]:
# Each value is a *list of possible values* that the search will sample from.
# RandomizedSearchCV will randomly choose combinations from this space.
parameters = {
    "n_estimators": [400, 600, 800, 1000], # Number of trees in the forest. More trees = Lower variance, but higher computational cost.
    "max_depth": [10, 20, 30, 40, 50], # Maximum depth of each tree. Limits how much the tree can grow, avoids overfitting!
    "min_samples_split": [2, 5, 10, 20], # Minimum number of samples required to split a node. Higher values = more conservative trees -> Less overfitting.
    "min_samples_leaf": [1, 2, 4, 8], # Minimum number of samples needed at a leaf node. Higher values smooth the model and reduce noise fitting.
    "max_features": ['sqrt', 'log2', None], # Number of features considered when deciding a split. "sqrt" → faster, adds randomness → helps prevent overfitting. "log2" → even fewer features.
    "bootstrap": [True, False] # Whether bootstrap sampling is used for trees. Usually bootstrap=True works best; False trains on all rows.
}

In [14]:
# StratifiedKFold ensures each fold has the same class proportion as the original dataset → critical for imbalanced data.
cv = StratifiedKFold(
    n_splits=3,  # 3-fold CV → train on 80%, validate on 20% each fold
    shuffle=True,  # Shuffles data before splitting → reduces variance
    random_state=42
)

In [None]:
# RandomizedSearchCV:
# - Randomly samples N hyperparameter combinations
# - For each combination, performs cross-validation
# - Computes mean validation ROC-AUC for each combination
# - Returns the best parameters and best score

# Why "randomized" instead of full grid search?
# - Much faster
# - Often finds equally good or better solutions
# - Allows larger search spaces

bayes_search = BayesSearchCV(
    estimator=rf, # The model we want to tune
    search_spaces=parameters, # The hyperparameter search space
    n_iter=30, # Number of random combinations to try
    scoring='roc_auc', # Evaluation metric
    n_jobs=1, # Use all CPU cores
    cv=cv, # Cross-validation strategy
    verbose=2, # Print progress during fitting
    random_state=42 # For reproducibility
)

In [None]:
X_train_prepared = preprocessor.fit_transform(X)
y_train = y

sample_size = 150000

X_sample = X_train_prepared[:sample_size]
y_sample = y_train[:sample_size]

In [None]:
# Fit the search object on the FULL TRAINING DATA (X_train, y_train)
# Note: we do NOT include validation data here — CV handles the splitting.
# This step:
# - samples hyperparameters
# - trains the model ~30 times (one for each combination)
# - each of those uses 5-fold CV → ~150 total model fits
# - selects the best combination based on average ROC-AUC
bayes_search.fit(X_sample, y_sample)

In [None]:
import optuna
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.datasets import make_classification

In [14]:
def objective(trial):
    """
    Optuna calls this function for each trial (iteration). 
    It suggests hyperparameters, trains the RF model with CV, 
    and returns the score to be maximized/minimized.
    """
    
    # Define the search space using Optuna's suggest methods
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int('max_depth', 5, 70)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Create the Random Forest Classifier with the suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        n_jobs=7,          # Use all available cores for speed during training
        random_state=42
    )

    # Use cross_val_score to evaluate the model
    # The scoring here is 'roc_auc' as requested in your previous setup
    score = cross_val_score(
        model, 
        X_sample, 
        y_sample, 
        cv=3,               # Using cv=3 for faster iterations
        scoring='roc_auc', 
        n_jobs=1           # Parallelize CV fits if possible
    ).mean() # Return the average CV score

    return score


In [15]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))

[I 2025-12-06 15:55:54,021] A new study created in memory with name: no-name-3a726ac9-b487-4d49-be40-ccfc87918a58


In [None]:
# n_trials is the total number of iterations you want to run (e.g., 30 intelligent rounds)
print("Starting Optuna optimization...")
study.optimize(objective, n_trials=30, show_progress_bar=True)

In [17]:
print("\nOptimization finished.")
print(f"Best trial value (ROC AUC): {study.best_value:.4f}")
print("Best hyperparameters found:")
print(study.best_params)


Optimization finished.
Best trial value (ROC AUC): 0.7042
Best hyperparameters found:
{'n_estimators': 729, 'max_depth': 63, 'min_samples_split': 4, 'min_samples_leaf': 20, 'max_features': None, 'bootstrap': True}


In [32]:
# 1. Working copies (safe practice)
train = train_df.copy()

# 2. Separate target variable
y = train["diagnosed_diabetes"]
X = train.drop(columns=["diagnosed_diabetes", "id"])
test = test_df.drop(columns=["id"])

In [33]:
X_train_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test)
y_train = y
rf_pipeline_final = Pipeline(steps=[
    #("preprocessor", preprocessor),
    ("rf", RandomForestClassifier(**study.best_params, n_jobs=-1, random_state=42))
])

In [None]:
rf_pipeline_final.fit(X_train_processed, y_train)

In [37]:
# Predict
y_pred = rf_pipeline_final.predict_proba(X_test_processed)[:, 1]

In [38]:
y_pred

array([0.51423813, 0.62525763, 0.72274039, ..., 0.46561524, 0.57963339,
       0.62738994])

In [39]:
# Save submission
rf_submission = pd.DataFrame({
    'id': test_df['id'],
    'diagnosed_diabetes': y_pred
})
rf_submission.to_csv('rf_submission.csv', index=False)