In [70]:
import pandas as pd
import os

PATH = "C:/Users/Ato/Documents/Programming/Python/Titanic/data/"
train_path = PATH + "train.csv"
test_path = os.path.join(PATH,"test.csv")

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)


- We are going to use the classes with the highest pearson coeficients computed in the other notebook. **Pclass**, **Fare**, **Sex** and **Cabin**.

In [72]:
drop_features_train = ['Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Embarked']
drop_features_test = ['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Embarked']

train['Sex'] = train['Sex'].apply(lambda x: 0 if x == 'male' else 1)
train['Cabin'] = train['Cabin'].fillna(0).apply(lambda x: 1 if x != 0 else 0)

test['Sex'] = test['Sex'].apply(lambda x: 0 if x == 'male' else 1)
test['Cabin'] = test['Cabin'].fillna(0).apply(lambda x: 1 if x != 0 else 0)

y = train['Survived'].copy()
X = train.drop(columns=drop_features_train).copy()

X_test = test.drop(columns=drop_features_test).copy()

- Dividing training and validation sets

In [54]:
from sklearn.model_selection import train_test_split

# Splitting the data into train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9, random_state=42)  
X_train.shape, X_val.shape

((801, 5), (90, 5))

- Hyperparameter tunning with GridSearchCV

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

from sklearn.model_selection import GridSearchCV

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="accuracy")

In [56]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 180 candidates, totalling 720 fits
CPU times: total: 891 ms
Wall time: 19.2 s


In [57]:
grid_search.best_estimator_

- Model definition with tunned hyperparameters

In [58]:


model = RandomForestClassifier(random_state=42, n_jobs=1, max_depth = 10, n_estimators=30, min_samples_leaf=5, oob_score=True)

In [59]:
%%time
model.fit(X_train, y_train)

CPU times: total: 31.2 ms
Wall time: 54 ms


- Results

In [60]:
model.oob_score_

0.7940074906367042

In [61]:
y_val_pred = model.predict(X_val)
print(f'Model: {model}')

Model: RandomForestClassifier(max_depth=10, min_samples_leaf=5, n_estimators=30,
                       n_jobs=1, oob_score=True, random_state=42)


In [62]:
from sklearn.metrics import classification_report, confusion_matrix

conf_matrix = confusion_matrix(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred)

print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))

Confusion Matrix:
 [[46  8]
 [ 7 29]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86        54
           1       0.78      0.81      0.79        36

    accuracy                           0.83        90
   macro avg       0.83      0.83      0.83        90
weighted avg       0.83      0.83      0.83        90



- Building dataset for Kaggle

In [75]:
y_pred = model.predict(X_test)

In [77]:
dataset_kaggle_dict = {'PassangerId': test['PassengerId'].copy(),'Survived':y_pred}
dataset_kaggle_df = pd.DataFrame(dataset_kaggle_dict)

In [79]:
dataset_kaggle_df.to_csv('C:/Users/Ato/Documents/Programming/Python/Titanic/predictions/RF.csv', index=False)