In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


In [2]:
# Load the raw Titanic passenger table (public CSV; needs network access).
# The raw frame keeps its own name so the cleaned frame can be rebuilt from it.
titanic_raw = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

# Select features: keep the target plus five predictors, drop every row with a
# missing value, and take an explicit copy. Working on an independent frame
# (instead of mutating a column-subset slice in place) avoids pandas'
# SettingWithCopyWarning on the assignments below.
df = (
    titanic_raw[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
    .dropna()
    .copy()
)

# Encode categorical variables. Each column gets its own encoder so that both
# fitted mappings (classes_) remain available afterwards; re-fitting a single
# encoder would replace the 'Sex' mapping with the 'Embarked' one.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
df['Sex'] = sex_encoder.fit_transform(df['Sex'])
df['Embarked'] = embarked_encoder.fit_transform(df['Embarked'])
le = embarked_encoder  # backward-compatible alias: `le` used to end up fitted on 'Embarked'

# Feature matrix and target vector.
X = df.drop('Survived', axis=1)
y = df['Survived']

# Train-test split: 20% held out, fixed seed for a repeatable partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyperparameters (only the seed is
# fixed), trained on the training split and scored on the held-out test split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

pred = model.predict(X_test)
print("Baseline Accuracy:", accuracy_score(y_test, pred))



Baseline Accuracy: 0.7692307692307693


In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter; the exhaustive search tries every
# combination (3 * 3 * 2 = 18 settings), each scored with 5-fold CV accuracy.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
)

grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use all available cores
)
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)


Best Parameters: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 50}
Best CV Accuracy: 0.8136935258500232


In [5]:
# Score the best estimator found by the grid search on the held-out test split.
best_grid_model = grid.best_estimator_
grid_pred = best_grid_model.predict(X_test)
grid_test_accuracy = accuracy_score(y_test, grid_pred)

print("Grid Search Test Accuracy:", grid_test_accuracy)


Grid Search Test Accuracy: 0.8041958041958042


In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search. n_estimators covers every integer
# from 50 up to (but not including) 400; the other parameters use short lists.
param_dist = dict(
    n_estimators=np.arange(50, 400),
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 4, 6],
    min_samples_leaf=[1, 2, 4],
)

# Sample 30 settings from the space (seeded), each scored with 5-fold CV accuracy.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=30,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use all available cores
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best Random Parameters:", random_search.best_params_)
print("Best Random CV Accuracy:", random_search.best_score_)


Best Random Parameters: {'n_estimators': 322, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_depth': 10}
Best Random CV Accuracy: 0.8119546654246236


In [7]:
pip install optuna


Collecting optuna
  Obtaining dependency information for optuna from https://files.pythonhosted.org/packages/75/d1/6c8a4fbb38a9e3565f5c36b871262a85ecab3da48120af036b1e4937a15c/optuna-4.7.0-py3-none-any.whl.metadata
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Obtaining dependency information for alembic>=1.5.0 from https://files.pythonhosted.org/packages/83/36/cd9cb6101e81e39076b2fbe303bfa3c85ca34e55142b0324fcbf22c5c6e2/alembic-1.18.1-py3-none-any.whl.metadata
  Downloading alembic-1.18.1-py3-none-any.whl.metadata (7.2 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Obtaining dependency information for Mako from https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl.metadata
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
   ---------------------------------------- 0.0/413.9 kB ? et

In [8]:
import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 5-fold CV accuracy of a random forest.

    Asks ``trial`` for three integer hyperparameters (``n_estimators`` in
    [50, 300], ``max_depth`` in [3, 20], ``min_samples_split`` in [2, 10]),
    builds a seeded ``RandomForestClassifier`` from them and cross-validates
    it on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The mean of the per-fold accuracy scores.
    """
    # Dict literals evaluate in order, so the suggestions are requested in the
    # sequence n_estimators -> max_depth -> min_samples_split.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    forest = RandomForestClassifier(random_state=42, **params)

    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=5, scoring="accuracy"
    )
    return fold_scores.mean()


In [11]:
# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# the reported best trial, is repeatable on a fresh kernel. TPE is the sampler
# create_study uses by default for a single-objective study, so only the seed
# changes, not the search algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 30 trials; each one calls objective() and reports mean CV accuracy.
study.optimize(objective, n_trials=30)

print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)



[32m[I 2026-01-26 02:07:18,388][0m A new study created in memory with name: no-name-5dee5394-5f51-477e-b3e4-7d855a77de62[0m
[32m[I 2026-01-26 02:07:20,093][0m Trial 0 finished with value: 0.804890544946437 and parameters: {'n_estimators': 83, 'max_depth': 20, 'min_samples_split': 6}. Best is trial 0 with value: 0.804890544946437.[0m
[32m[I 2026-01-26 02:07:21,157][0m Trial 1 finished with value: 0.7925787921130258 and parameters: {'n_estimators': 55, 'max_depth': 18, 'min_samples_split': 5}. Best is trial 0 with value: 0.804890544946437.[0m
[32m[I 2026-01-26 02:07:29,107][0m Trial 2 finished with value: 0.8083993168762614 and parameters: {'n_estimators': 258, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 2 with value: 0.8083993168762614.[0m
[32m[I 2026-01-26 02:07:36,275][0m Trial 3 finished with value: 0.7925943176525384 and parameters: {'n_estimators': 291, 'max_depth': 20, 'min_samples_split': 2}. Best is trial 2 with value: 0.8083993168762614.[0m
[32m[I 20

Best Parameters: {'n_estimators': 178, 'max_depth': 11, 'min_samples_split': 10}
Best Accuracy: 0.8190032603632975


In [12]:
# Rebuild a forest from the study's best hyperparameters, train it on the
# training split, and score it on the held-out test split.
best_model = RandomForestClassifier(random_state=42, **study.best_params)
best_model.fit(X_train, y_train)

opt_pred = best_model.predict(X_test)
opt_test_accuracy = accuracy_score(y_test, opt_pred)
print("Bayesian Test Accuracy:", opt_test_accuracy)


Bayesian Test Accuracy: 0.7902097902097902
