- we have cleaned and explored the data
- now i'll be using cleaned dataset
- First spliting the data and as 60% training, 20% validation and 20% for testing and then
- I'll be using 5 fold cross-validation on 5 algorithms
- Logistic Regression
- Support vector machine(SVM)
- Multilayer perceptron(MLP)
- Random Forest
- Boosting

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('cleaned.csv')

In [3]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Fam_cnt,cabin_ind
0,0,3,1,22.0,7.25,1,0
1,1,1,0,38.0,71.2833,1,1
2,1,3,0,26.0,7.925,0,0
3,1,1,0,35.0,53.1,1,1
4,0,3,1,35.0,8.05,0,0


# Spliting the data for train, test and validation

In [4]:
features = df.drop('Survived',axis=1)
label = df['Survived']

In [5]:
features.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Fam_cnt,cabin_ind
0,3,1,22.0,7.25,1,0
1,1,0,38.0,71.2833,1,1
2,3,0,26.0,7.925,0,0
3,1,0,35.0,53.1,1,1
4,3,1,35.0,8.05,0,0


In [6]:
label.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [7]:
x_train,x_test,y_train,y_test = train_test_split(features,label,test_size=0.4,random_state=42)
x_val,x_test,y_val,y_test = train_test_split(x_test,y_test,test_size=0.5,random_state=42)

In [8]:
x_train.to_csv('train_features.csv',index=False)
y_train.to_csv('train_label.csv',index=False)
x_test.to_csv('test_features.csv',index=False)
y_test.to_csv('test_label.csv',index=False)
x_val.to_csv('val_features.csv',index=False)
y_val.to_csv('val_label.csv',index=False)

In [9]:
for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset) / len(label),1))

0.6
0.2
0.2


In [10]:
for dataset in [y_train, y_val, y_test]:
    print(len(dataset))

534
178
179


# 1) Train the model in Logistic Regression

In [11]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

LR_model = LogisticRegression()

In [12]:
tr_feature = pd.read_csv('train_features.csv')
tr_label = pd.read_csv('train_label.csv')

In [13]:
def print_result(results):
    print(f'BEST PARAMS : {results.best_params_}')
    
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    
    for mean,std,param in zip(means,stds,params):
        print(f'{round(mean,3)} (+/-{round(std *2,3)} for {param}')

In [14]:
parameters = {'C' : [0.001,0.01,0.1,1,10,100,1000]}
cv = GridSearchCV(LR_model,parameters,cv=5)
cv.fit(tr_feature,tr_label.values.ravel())
print_result(cv)

BEST PARAMS : {'C': 1}
0.67 (+/-0.077 for {'C': 0.001}
0.708 (+/-0.098 for {'C': 0.01}
0.777 (+/-0.134 for {'C': 0.1}
0.8 (+/-0.118 for {'C': 1}
0.794 (+/-0.116 for {'C': 10}
0.794 (+/-0.116 for {'C': 100}
0.794 (+/-0.116 for {'C': 1000}


In [15]:
cv.best_estimator_

LogisticRegression(C=1)

In [16]:
joblib.dump(cv.best_estimator_,'LR_model.pkl')

['LR_model.pkl']

# 2) Train the model in Support Vector Machine(SVM)

In [17]:
from sklearn.svm import SVC
svm_model = SVC()

In [18]:
parameters = {
    'C':[0.1,1,10],
    'kernel':['linear','rbf']}
cv = GridSearchCV(svm_model,parameters,cv=5)
cv.fit(tr_feature,tr_label.values.ravel())
print_result(cv)

BEST PARAMS : {'C': 0.1, 'kernel': 'linear'}
0.796 (+/-0.115 for {'C': 0.1, 'kernel': 'linear'}
0.654 (+/-0.06 for {'C': 0.1, 'kernel': 'rbf'}
0.796 (+/-0.115 for {'C': 1, 'kernel': 'linear'}
0.661 (+/-0.048 for {'C': 1, 'kernel': 'rbf'}
0.796 (+/-0.115 for {'C': 10, 'kernel': 'linear'}
0.684 (+/-0.07 for {'C': 10, 'kernel': 'rbf'}


In [19]:
joblib.dump(cv.best_estimator_,'svm_model.pkl')

['svm_model.pkl']

# 3) Train the model in Multilayer Perceptron(MLP)

In [20]:
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier()

In [33]:
parameters = {'hidden_layer_sizes' : [(10,),(50,),(100,)],
             'learning_rate': ['constant', 'invscaling', 'adaptive'],
             'activation': ['relu', 'tanh', 'logistic']}
cv = GridSearchCV(mlp_model,parameters,cv=5)
cv.fit(tr_feature,tr_label.values.ravel())
print_result(cv)









BEST PARAMS : {'activation': 'tanh', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.723 (+/-0.096 for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
0.725 (+/-0.085 for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
0.663 (+/-0.315 for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
0.774 (+/-0.125 for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.774 (+/-0.127 for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.781 (+/-0.126 for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.79 (+/-0.082 for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.768 (+/-0.079 for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}
0.792 (+/-0.105 for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate':



In [40]:
joblib.dump(cv.best_estimator_,'MLP_model.pkl')

['MLP_model.pkl']

# 4) Train the model in RandomForest

In [23]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()

In [24]:
parameters = {'n_estimators':[5,10,25,30,100,150,200],
             'max_depth': [2,4,8,16,32,64,None]}
cv = GridSearchCV(rf_model,parameters,cv=5)
cv.fit(tr_feature,tr_label.values.ravel())
print_result(cv)

BEST PARAMS : {'max_depth': 4, 'n_estimators': 150}
0.77 (+/-0.134 for {'max_depth': 2, 'n_estimators': 5}
0.785 (+/-0.137 for {'max_depth': 2, 'n_estimators': 10}
0.783 (+/-0.117 for {'max_depth': 2, 'n_estimators': 25}
0.794 (+/-0.135 for {'max_depth': 2, 'n_estimators': 30}
0.796 (+/-0.116 for {'max_depth': 2, 'n_estimators': 100}
0.802 (+/-0.116 for {'max_depth': 2, 'n_estimators': 150}
0.796 (+/-0.12 for {'max_depth': 2, 'n_estimators': 200}
0.815 (+/-0.107 for {'max_depth': 4, 'n_estimators': 5}
0.805 (+/-0.092 for {'max_depth': 4, 'n_estimators': 10}
0.815 (+/-0.132 for {'max_depth': 4, 'n_estimators': 25}
0.826 (+/-0.119 for {'max_depth': 4, 'n_estimators': 30}
0.82 (+/-0.113 for {'max_depth': 4, 'n_estimators': 100}
0.828 (+/-0.113 for {'max_depth': 4, 'n_estimators': 150}
0.818 (+/-0.122 for {'max_depth': 4, 'n_estimators': 200}
0.796 (+/-0.046 for {'max_depth': 8, 'n_estimators': 5}
0.817 (+/-0.092 for {'max_depth': 8, 'n_estimators': 10}
0.813 (+/-0.084 for {'max_depth': 8,

In [25]:
joblib.dump(cv.best_estimator_,'rf_model.pkl')

['rf_model.pkl']

# 5) Train the model in Boosting

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

bst_model = GradientBoostingClassifier()

In [27]:
parameters = {'learning_rate':[0.001,0.01,0.1,1,5,10,50,100],
              'n_estimators':[5,10,50,100,150,200],
              'max_depth':[1,3,5,6,7,9]}
cv = GridSearchCV(bst_model,parameters,cv=5)
cv.fit(tr_feature,tr_label.values.ravel())
print_result(cv)

BEST PARAMS : {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 5}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 10}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 50}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 100}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 150}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 200}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 5}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100}
0.624 (+/-0.007 for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 150}
0.624 (+/-0.007 for {'learning_ra

In [28]:
joblib.dump(cv.best_estimator_,'bst_model.pkl')

['bst_model.pkl']

## Summary: Compare model results and final model selection
1. Evaluate all of our saved models on the validation set
2. Select the best model based on performance on the validation set
3. Evaluate that model on the holdout test set

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

val_features = pd.read_csv('val_features.csv')
val_labels = pd.read_csv('val_label.csv')

te_features = pd.read_csv('test_features.csv')
te_labels = pd.read_csv('test_label.csv')

In [41]:
model = {}
for mdl in ['bst','LR','MLP','rf','svm']:
    model[mdl] = joblib.load(f'{mdl}_model.pkl')

In [42]:
model

{'bst': GradientBoostingClassifier(n_estimators=50),
 'LR': LogisticRegression(C=1),
 'MLP': MLPClassifier(activation='tanh'),
 'rf': RandomForestClassifier(max_depth=4, n_estimators=150),
 'svm': SVC(C=0.1, kernel='linear')}

In [57]:
def evaluuate_model(name,model,features,labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels,pred),3)
    precision = round(precision_score(labels,pred),3)
    recall = round(recall_score(labels,pred),3)
    print(f'{name}--Accuracy: {accuracy}/ Precision: {precision} / Recall: {recall} / Latency: {round((end-start)*1000,1)}ms')

In [67]:
for name,mdl in model.items():
    evaluuate_model(name,mdl,val_features,val_labels)

bst--Accuracy: 0.809/ Precision: 0.804 / Recall: 0.631 / Latency: 10.9ms
LR--Accuracy: 0.775/ Precision: 0.712 / Recall: 0.646 / Latency: 3.5ms
MLP--Accuracy: 0.781/ Precision: 0.732 / Recall: 0.631 / Latency: 4.7ms
rf--Accuracy: 0.815/ Precision: 0.82 / Recall: 0.631 / Latency: 36.4ms
svm--Accuracy: 0.747/ Precision: 0.672 / Recall: 0.6 / Latency: 6.7ms


### Evaluate best model on test set

In [68]:
evaluuate_model('RandomForest',model['rf'],te_features,te_labels)

RandomForest--Accuracy: 0.804/ Precision: 0.86 / Recall: 0.645 / Latency: 56.9ms
