# Baseline Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, fbeta_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
df = pd.read_csv('../data/processed/train_processed.csv')
X = df.drop('outcome', axis=1)
y = df.outcome

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [12]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.73      0.71      0.72       526
         1.0       0.70      0.72      0.71       508

    accuracy                           0.71      1034
   macro avg       0.71      0.71      0.71      1034
weighted avg       0.72      0.71      0.71      1034



# Hyperparameters Optimization

In [13]:
def print_all_scores(y_true, y_pred):
    print('Accuracy_score:', accuracy_score(y_true, y_pred))
    print('Precision_score:', precision_score(y_true, y_pred))
    print('Recall_score:', recall_score(y_true, y_pred))
    print('F1_score:', f1_score(y_true, y_pred))
    print('ROC_score:', roc_auc_score(y_true, y_pred))

In [14]:
import optuna
from optuna.samplers import TPESampler

In [15]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 40),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ['sqrt', "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False])
    }
    
    model = RandomForestClassifier(**params, random_state=123)
    score = cross_val_score(model, X, y, cv=5, scoring='f1').mean()
    return score

In [16]:
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=123))
study.optimize(objective, n_trials = 100)
print("Best params ", study.best_params)
print("Best score ", study.best_value)

[I 2025-11-28 17:33:31,648] A new study created in memory with name: no-name-08a94952-cc11-4aae-bdee-5e1792e9fafe
[I 2025-11-28 17:33:59,379] Trial 0 finished with value: 0.7126225203431117 and parameters: {'n_estimators': 364, 'max_depth': 13, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.7126225203431117.
[I 2025-11-28 17:34:17,519] Trial 1 finished with value: 0.7111880062620166 and parameters: {'n_estimators': 226, 'max_depth': 16, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.7126225203431117.
[I 2025-11-28 17:34:26,388] Trial 2 finished with value: 0.7145675420585265 and parameters: {'n_estimators': 289, 'max_depth': 23, 'min_samples_split': 14, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 2 with value: 0.7145675420585265.
[I 2025-11-28 17:34:40,217] Trial 3 finished with value: 0.7116461364446772 a

Best params  {'n_estimators': 364, 'max_depth': 10, 'min_samples_split': 19, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}
Best score  0.7172762570598474


In [17]:
best_params = {
    "n_estimators": 276,
    "max_depth": 13,
    "min_samples_split": 12,
    "min_samples_leaf": 9,
    "max_features": "sqrt",
    "bootstrap": True
}

In [18]:
model = RandomForestClassifier(**best_params, random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print_all_scores(y_test, y_pred)

Accuracy_score: 0.706963249516441
Precision_score: 0.697495183044316
Recall_score: 0.7125984251968503
F1_score: 0.7049659201557936
ROC_score: 0.7070596688721895
