In [61]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [62]:
titanic = sns.load_dataset('titanic')

In [63]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [64]:
def preprocess_titanic(df):
    # Fill missing age values with median
    df['age'].fillna(df['age'].median(), inplace=True)

    # sex binary
    df['sex'] = [1 if 'Male' else 0 for x in df['sex']]

    # Fill missing embarked with 'S' - the most frequent value
    df = df.drop('embarked', axis=1)

    # Select features and target
    X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'alone']]
    y = df['survived']
    return X, y


X, y = preprocess_titanic(titanic)

In [65]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,alone
0,3,1,22.0,1,0,7.25,False
1,1,1,38.0,1,0,71.2833,False
2,3,1,26.0,0,0,7.925,True
3,1,1,35.0,1,0,53.1,False
4,3,1,35.0,0,0,8.05,True


In [66]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41892)

In [67]:
# Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=598)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

In [68]:
# XGBoost model
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_predictions = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)

In [69]:
# Stacking model with Decision Trees and Logistic Regression
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=4325)),
    ('dt', DecisionTreeClassifier(random_state=52353))
]
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking_clf.fit(X_train, y_train)
stacking_predictions = stacking_clf.predict(X_test)
stacking_accuracy = accuracy_score(y_test, stacking_predictions)

In [70]:
print(
    f" Random Forest Accuracy: {rf_accuracy}\n",
    f"XGBoost Accuracy {xgb_accuracy}\n",
    f"Stacking Accuracy {stacking_accuracy}\n")


 Random Forest Accuracy: 0.6828358208955224
 XGBoost Accuracy 0.6567164179104478
 Stacking Accuracy 0.6865671641791045



# Let's Do It Again with Hyperparameters

In [78]:
# Setting up hyperparameter distributions for the randomized search
rf_param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11),
    'bootstrap': [True, False]
}

xgb_param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': np.logspace(-3, 0, 10),
    'subsample': np.linspace(0.7, 1.0, 4),
    'colsample_bytree': np.linspace(0.5, 1.0, 3),
}

# Define a smaller range of hyperparameters for randomized search
rf_param_dist = {
    'n_estimators': randint(10, 50),
    'max_depth': randint(1, 4),
    'min_samples_split': randint(2, 4),
    'min_samples_leaf': randint(1, 4),
    'bootstrap': [True, False]
}

n_iter_search = 30

In [89]:
# Random Forest with Parameters
random_search_rf = RandomizedSearchCV(rf,
                                      param_distributions=rf_param_dist,
                                      n_iter=n_iter_search,
                                      cv=5,
                                      random_state=432)
random_search_rf.fit(X_train, y_train)

# After fitting, access the best score and parameters
best_rf_score = random_search_rf.best_score_
best_rf_params = random_search_rf.best_params_

In [92]:
# XGBoost with Parameters
random_search_xgb = RandomizedSearchCV(xgb,
                                       param_distributions=xgb_param_dist,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       random_state=432,
                                       verbose=1)

# Fit the randomized search to the data
random_search_xgb.fit(X_train, y_train)

# Now you can access the best score and parameters
best_xgb_score = random_search_xgb.best_score_
best_xgb_params = random_search_xgb.best_params_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


### Using Random Forest Parameters for the Stacked Model

In [93]:
# Stacking Model with Parameters

best_rf_score = random_search_rf.best_score_
best_rf_params = random_search_rf.best_params_

# Stacking model with best Random Forest and Decision Tree with default parameters
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(**best_rf_params)),
        ('dt', DecisionTreeClassifier(random_state=98432))
    ],
    final_estimator=LogisticRegression()
)

# Fit the stacking model
stacking_clf.fit(X_train, y_train)

# Predict and calculate accuracy
stacking_predictions = stacking_clf.predict(X_test)
stacking_accuracy = accuracy_score(y_test, stacking_predictions)

In [95]:
print(
    f" Paramaterized Random Forest Accuracy: {best_rf_score}\n",
    f"Paramaterized XGBoost Accuracy {best_xgb_score}\n",
    f"Paramaterized Stacking Accuracy {stacking_accuracy}\n")

 Paramaterized Random Forest Accuracy: 0.7351225806451614
 Paramaterized XGBoost Accuracy 0.7207354838709678
 Paramaterized Stacking Accuracy 0.7014925373134329



### What if we combined both XBG and RandomForst?

In [96]:
# Stacking Model with Parameters from Random Forest and XGBoost

stacking_clf = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(**best_rf_params)),
        ('xgb', XGBClassifier(**best_xgb_params)),
        ('dt', DecisionTreeClassifier(random_state=98432))
    ],
    final_estimator=LogisticRegression()
)

# Fit the stacking model
stacking_clf.fit(X_train, y_train)

# Predict and calculate accuracy
stacking_predictions = stacking_clf.predict(X_test)
stacking_accuracy = accuracy_score(y_test, stacking_predictions)


In [97]:
print(f"Paramaterized Stacking Accuracy {stacking_accuracy}\n")

Paramaterized Stacking Accuracy 0.6977611940298507



In [None]:
# Sometimes a single, simple model will perform best.
# Sometimes if the results fluctuate so much, it's a problem with generalizability!