In [24]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV



In [28]:

df_titanic = pd.read_csv('../data/processed/Cleaned_Titanic_Dataset.csv', index_col= "S/N")
df_titanic.head(10)

Unnamed: 0_level_0,pclass,survived,sex,age,fare,embarked
S/N,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,1,female,29,211,S
1,1,1,male,1,152,S
2,1,0,female,2,152,S
3,1,0,male,30,152,S
4,1,0,female,25,152,S
5,1,1,male,48,27,S
6,1,1,female,63,78,S
7,1,0,male,39,0,S
8,1,1,female,53,51,S
9,1,0,male,71,50,C


In [5]:
df_titanic.isnull().sum()

pclass      0
survived    0
sex         0
age         0
fare        0
embarked    0
dtype: int64

In [29]:

# One-hot encoding for categorical variables
sex = pd.get_dummies(df_titanic['sex'], drop_first=True)
embark = pd.get_dummies(df_titanic['embarked'], drop_first=True)

# Concatenate the dummy variables
df_titanic = pd.concat([df_titanic, sex, embark], axis=1)

# Drop the original categorical columns if they exist
columns_to_drop = ['sex', 'embarked']
df_titanic = df_titanic.drop(columns=[col for col in columns_to_drop if col in df_titanic.columns])


In [30]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df_titanic.drop('survived', axis=1), df_titanic['survived'], test_size=0.30, random_state=101)

# Initialize the RandomForest model
rf_model = RandomForestClassifier(random_state=101)
rf_model.fit(X_train, y_train)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150, 200],       # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],           # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],           # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],             # Minimum number of samples required at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider for splitting
}

# Perform GridSearchCV with cross-validation to find the best hyperparameters
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=101),
             n_jobs=-1,
             param_grid={'max_depth': [None, 10, 20, 30],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [50, 100, 150, 200]},
             scoring='accuracy', verbose=2)

In [31]:
# Print the best parameters and best cross-validation accuracy
print(f"Best Parameters: {grid_search_rf.best_params_}")
print(f"Best Cross-validation Accuracy: {grid_search_rf.best_score_}")

# Get the best model from the grid search
best_rf_model = grid_search_rf.best_estimator_

# Make predictions using the best model
rf_predictions = best_rf_model.predict(X_test)

# Evaluate the best model on the test set
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))

print(f"Accuracy: {accuracy_score(y_test, rf_predictions)}")

print("Classification Report:")
print(classification_report(y_test, rf_predictions))

Best Parameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-validation Accuracy: 0.7918558651530788
Confusion Matrix:
[[149  34]
 [ 52  93]]
Accuracy: 0.7378048780487805
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.81      0.78       183
           1       0.73      0.64      0.68       145

    accuracy                           0.74       328
   macro avg       0.74      0.73      0.73       328
weighted avg       0.74      0.74      0.74       328

