In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from ensemble import RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier as sklearn_RandomForestClassifier

from datasets.preprocessing import titanic_preprocessing

In [2]:
df = pd.read_csv('../../../datasets/titanic/train.csv')

In [3]:
df = titanic_preprocessing(df)

In [4]:
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.2500,2
2,1,1,0,38.0,1,0,71.2833,0
3,1,3,0,26.0,0,0,7.9250,2
4,1,1,0,35.0,1,0,53.1000,2
5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,3,0,39.0,0,5,29.1250,1
887,0,2,1,27.0,0,0,13.0000,2
888,1,1,0,19.0,0,0,30.0000,2
890,1,1,1,26.0,0,0,30.0000,0


## RandomForestClassifier test

In [5]:
model = RandomForestClassifier()

In [6]:
X = df.drop('Survived', axis=1)
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model.fit(X_train, y_train)

100%|██████████| 100/100 [00:01<00:00, 69.63it/s]


In [30]:
pred_train = model.predict(X_train)
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       344
           1       0.87      0.81      0.84       225

    accuracy                           0.88       569
   macro avg       0.88      0.87      0.87       569
weighted avg       0.88      0.88      0.88       569


In [31]:
pred_test = model.predict(X_test)
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.77      0.85      0.81        80
           1       0.78      0.68      0.73        63

    accuracy                           0.78       143
   macro avg       0.78      0.77      0.77       143
weighted avg       0.78      0.78      0.77       143


## Hyperparameters tuning

In [32]:
# a simple GridSearch algorithm

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

best_model = None
best_score = 0
best_params = {}
for n_estimators in [10, 50, 70, 100]:
    for max_depth in [None, 3, 5, 6, 7]:
        for min_samples_split in [2, 5, 10, 20]:
            for min_samples_leaf in [1, 2, 5, 10, 20]:
                model = RandomForestClassifier(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    min_samples_split=min_samples_split,
                    min_samples_leaf=min_samples_leaf   
                )
                model.fit(X_train, y_train)
                
                a = accuracy_score(y_valid, model.predict(X_valid))
                if a > best_score:
                    best_score = a 
                    best_model = model
                    best_params = {
                        'n_estimators': n_estimators,
                        'max_depth': max_depth,
                        'min_samples_split': min_samples_split,
                        'min_samples_leaf': min_samples_leaf
                    }

100%|██████████| 10/10 [00:00<00:00, 84.85it/s]
100%|██████████| 10/10 [00:00<00:00, 133.26it/s]
100%|██████████| 10/10 [00:00<00:00, 182.29it/s]
100%|██████████| 10/10 [00:00<00:00, 247.64it/s]
100%|██████████| 10/10 [00:00<00:00, 290.00it/s]
100%|██████████| 10/10 [00:00<00:00, 184.65it/s]
100%|██████████| 10/10 [00:00<00:00, 191.38it/s]
100%|██████████| 10/10 [00:00<00:00, 262.32it/s]
100%|██████████| 10/10 [00:00<00:00, 275.85it/s]
100%|██████████| 10/10 [00:00<00:00, 312.94it/s]
100%|██████████| 10/10 [00:00<00:00, 375.87it/s]
100%|██████████| 10/10 [00:00<00:00, 374.02it/s]
100%|██████████| 10/10 [00:00<00:00, 363.84it/s]
100%|██████████| 10/10 [00:00<00:00, 364.99it/s]
100%|██████████| 10/10 [00:00<00:00, 371.82it/s]
100%|██████████| 10/10 [00:00<00:00, 376.18it/s]
100%|██████████| 10/10 [00:00<00:00, 376.39it/s]
100%|██████████| 10/10 [00:00<00:00, 371.36it/s]
100%|██████████| 10/10 [00:00<00:00, 382.34it/s]
100%|██████████| 10/10 [00:00<00:00, 373.55it/s]
100%|██████████| 10/1

In [36]:
best_params

{'n_estimators': 10,
 'max_depth': 5,
 'min_samples_split': 2,
 'min_samples_leaf': 2}

In [37]:
best_score

0.868421052631579

In [38]:
pred_test = best_model.predict(X_test)
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.72      0.84      0.77        80
           1       0.74      0.59      0.65        63

    accuracy                           0.73       143
   macro avg       0.73      0.71      0.71       143
weighted avg       0.73      0.73      0.72       143


## sklearn RandomForest

In [40]:
model = sklearn_RandomForestClassifier()

In [41]:
model.fit(X_train, y_train)

In [44]:
pred_test = model.predict(X_test)
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81        80
           1       0.79      0.70      0.74        63

    accuracy                           0.78       143
   macro avg       0.78      0.77      0.78       143
weighted avg       0.78      0.78      0.78       143
