In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [7]:
data = pd.read_csv('Titanic/train.csv')

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

missing_values = data.isnull().sum()

data_cleaned = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

data_cleaned['Age'].fillna(data_cleaned['Age'].median(), inplace=True)
data_cleaned['Embarked'].fillna(data_cleaned['Embarked'].mode()[0], inplace=True)

label_encoder = LabelEncoder()
data_cleaned['Sex'] = label_encoder.fit_transform(data_cleaned['Sex'])
data_cleaned['Embarked'] = label_encoder.fit_transform(data_cleaned['Embarked'])

X = data_cleaned.drop('Survived', axis=1)
y = data_cleaned['Survived']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['Age'].fillna(data_cleaned['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned['Embarked'].fillna(data_cleaned['Embarked'].mode()[0], inplace=True)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
rf = RandomForestClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
nb = GaussianNB()

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}


rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_param_grid, n_iter=10, cv=3, verbose=1, random_state=42)
dt_random = RandomizedSearchCV(estimator=dt, param_distributions=dt_param_grid, n_iter=10, cv=3, verbose=1, random_state=42)

rf_random.fit(X_train, y_train)
dt_random.fit(X_train, y_train)
nb.fit(X_train, y_train)

rf_pred = rf_random.predict(X_test)
dt_pred = dt_random.predict(X_test)
nb_pred = nb.predict(X_test)


rf_acc = accuracy_score(y_test, rf_pred)
dt_acc = accuracy_score(y_test, dt_pred)
nb_acc = accuracy_score(y_test, nb_pred)

print(f'Naive Bayes Accuracy: {nb_acc:.4f}')
print(f'Random Forest Accuracy: {rf_acc:.4f}')
print(f'Decision Tree Accuracy: {dt_acc:.4f}')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Naive Bayes Accuracy: 0.7765
Random Forest Accuracy: 0.8436
Decision Tree Accuracy: 0.8324
