# Titanic Project: Multilayer Perceptron


## Step 1: Import packages

In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import joblib
import warnings


In [6]:
# Block unwanted warnings
warnings.filterwarnings('ignore', category = FutureWarning)
warnings.filterwarnings('ignore', category = DeprecationWarning)

## Step 2: Import the dataset

In [7]:
tr_features = pd.read_csv(r"C:\Users\smart\Desktop\GitHub\Titanic\data\X_train.csv")
tr_labels = pd.read_csv(r"C:\Users\smart\Desktop\GitHub\Titanic\data\y_train.csv",header = None)

In [8]:
print("number of rows in training features: ",len(tr_features.index))
tr_features.head()

number of rows in training features:  534


Unnamed: 0,Pclass,Sex,Age,Fare,Family_cnt,Cabin_ind
0,2,0,62.0,10.5,0,0
1,3,0,8.0,29.125,5,0
2,3,0,32.0,56.4958,0,0
3,3,1,20.0,9.825,1,0
4,2,1,28.0,13.0,0,0


In [9]:
print("number of rows in training labels: ",len(tr_labels.index))
tr_labels.head()

number of rows in training labels:  534


Unnamed: 0,0
0,1
1,0
2,1
3,0
4,1


## Step 3: Explore the model & its hyper parameters

In [14]:
print(RandomForestRegressor())
RandomForestClassifier()
# Here we need RandomForestClassifier
# The 2 most important hyperparameters are: max_depth and n_estimators 
# n_estimators: number of individual decision tree in the ensemble
# max_depth: maximum depth allowed for each decision tree in the ensemble 

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Step 4: Create a function for reading hyperparameter evaluation 

In [11]:


def print_results(results):
    print('BEST PARAM: {}\n'.format(results.best_params_))
    
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))
        
     

## Step 5: Run k-fold Cross Validation (CV) on the data; compare results for different hyperparameters

In [15]:
# The 2 most important hyperparameters are: max_depth and n_estimators 
# n_estimators: number of individual decision tree in the ensemble
# max_depth: maximum depth allowed for each decision tree in the ensemble 

rf = RandomForestClassifier()
parameters = {
    'n_estimators' : [5, 50, 250],
    'max_depth' : [2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAM: {'max_depth': 4, 'n_estimators': 50}

0.809 (+/-0.082) for {'max_depth': 2, 'n_estimators': 5}
0.796 (+/-0.122) for {'max_depth': 2, 'n_estimators': 50}
0.803 (+/-0.113) for {'max_depth': 2, 'n_estimators': 250}
0.807 (+/-0.141) for {'max_depth': 4, 'n_estimators': 5}
0.826 (+/-0.128) for {'max_depth': 4, 'n_estimators': 50}
0.82 (+/-0.121) for {'max_depth': 4, 'n_estimators': 250}
0.818 (+/-0.085) for {'max_depth': 8, 'n_estimators': 5}
0.824 (+/-0.06) for {'max_depth': 8, 'n_estimators': 50}
0.813 (+/-0.068) for {'max_depth': 8, 'n_estimators': 250}
0.787 (+/-0.027) for {'max_depth': 16, 'n_estimators': 5}
0.809 (+/-0.034) for {'max_depth': 16, 'n_estimators': 50}
0.811 (+/-0.03) for {'max_depth': 16, 'n_estimators': 250}
0.798 (+/-0.05) for {'max_depth': 32, 'n_estimators': 5}
0.809 (+/-0.02) for {'max_depth': 32, 'n_estimators': 50}
0.809 (+/-0.033) for {'max_depth': 32, 'n_estimators': 250}
0.803 (+/-0.038) for {'max_depth': None, 'n_estimators': 5}
0.813 (+/-0.043) fo

In [None]:
# Note: lower acc in models with higher tee depth could indicate overfitting 
# Note: lower acc in models with lower tree depth could indicate underfitting

## Step 6: Select model with the best results

In [16]:
# Model with best results:
cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Step 7: Write out picked model 

In [17]:
joblib.dump(cv.best_estimator_, r"C:\Users\smart\Desktop\GitHub\Titanic\models\RF_model.pkl")

['C:\\Users\\smart\\Desktop\\GitHub\\Titanic\\models\\RF_model.pkl']