In [1]:
#Import libraries

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier, StackingCVClassifier

import cleantitanic as clean

In [2]:
#k-fold cross validation accuracy score function

def accuracy_score(estimator, cv=10, scoring='accuracy', data=None, target=None):
    """Print the mean k-fold cross-validation score of ``estimator``.

    Parameters
    ----------
    estimator : scikit-learn compatible estimator
        Forwarded to ``cross_val_score``.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_val_score``.
    scoring : str, default 'accuracy'
        Forwarded to ``cross_val_score``. The printed label always reads
        "Model accuracy", whatever metric is chosen.
    data, target : optional
        Feature matrix and labels to evaluate on. When omitted, the
        notebook-level ``X`` and ``y`` are used, so existing calls such as
        ``accuracy_score(model)`` behave exactly as before.

    Returns
    -------
    None
        The mean score is printed as a percentage with two decimals.

    Notes
    -----
    The name is the same as ``sklearn.metrics.accuracy_score``; importing
    that function after this cell would shadow this helper.
    """
    # Fall back to the notebook globals only when no data is passed in, so the
    # helper can also score other matrices (e.g. a feature subset).
    if data is None:
        data = X
    if target is None:
        target = y

    accuracies = cross_val_score(estimator=estimator, X=data, y=target, cv=cv, scoring=scoring)

    print('Model accuracy:\t {:.2f}%'.format(accuracies.mean()*100))

In [3]:
#Hyper-parameter tuning function

def tuning(estimator, parameters, scoring='accuracy', cv=10, data=None, target=None):
    """Run an exhaustive grid search and print the best score and parameters.

    Parameters
    ----------
    estimator : scikit-learn compatible estimator
        Base estimator handed to ``GridSearchCV``.
    parameters : dict or list of dict
        Used as ``param_grid``.
    scoring : str, default 'accuracy'
        Metric optimised by the search. The printed label always reads
        "Best Accuracy", whatever metric is chosen.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``GridSearchCV``.
    data, target : optional
        Feature matrix and labels to search on. When omitted, the
        notebook-level ``X`` and ``y`` are used, so existing calls such as
        ``tuning(model, grid)`` behave exactly as before.

    Returns
    -------
    None
        The best score (as a percentage) and best parameters are printed.
    """
    # Fall back to the notebook globals only when no data is passed in.
    if data is None:
        data = X
    if target is None:
        target = y

    # n_jobs=-1 asks for all available cores to evaluate candidates in parallel.
    grid_search = GridSearchCV(estimator=estimator,
                               param_grid=parameters,
                               scoring=scoring,
                               cv=cv,
                               n_jobs=-1)
    grid_search.fit(data, target)

    print("Best Accuracy: {:.2f} %".format(grid_search.best_score_*100))
    print("Best Parameters:", grid_search.best_params_)

In [4]:
#Import data

# clean.clean() comes from the local `cleantitanic` module; its result is
# unpacked into the train frame, the test frame and a third value, `types`.
# NOTE(review): `types` is not referenced anywhere else in the visible notebook.
train, test, types = clean.clean()

In [5]:
#Correlation matrix heatmap

# matplotlib (plt) and seaborn (sns) are imported in the notebook's import cell.

# Correlate only numeric/boolean columns. Older pandas silently dropped other
# columns inside DataFrame.corr(); pandas 2.x raises on them instead, so the
# selection is made explicit here and works on both.
numeric_train = train.select_dtypes(include=['number', 'bool'])

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data=numeric_train.corr(), cbar=False, cmap='coolwarm', annot=True, ax=ax)
ax.set_title('Correlation matrix')
plt.show()

<Figure size 1000x600 with 1 Axes>

In [6]:
#Create vector of features(X) and target variable(y)

features = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
            'Fare', 'Cabin', 'Embarked', 'Title', 'Deck', 'Age*Class']

# Train rows are stacked on top of test rows; the encoding step is fitted on
# this combined frame and the two parts are separated again afterwards.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test = test[features]
X = pd.concat([train[features], test])
y = train['Survived']

In [7]:
#Encoding categorical data

# Columns to one-hot encode, named explicitly and then translated into their
# positions in `features` (0, 1, 2, 6, 8, 9, 10, 11), so the selection stays
# correct if the feature list is reordered.
categorical = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Deck']
categorical_idx = [features.index(column) for column in categorical]

# Every column not listed above is passed through unchanged.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_idx)],
                       remainder='passthrough')

# NOTE: X is overwritten with its encoded form, so this cell is meant to be run
# once after the feature cell.
X = ct.fit_transform(X)

In [8]:
# Sanity check on the encoded matrix: rows cover train and test combined,
# columns are the encoder output plus the passthrough columns.
X.shape

(1309, 2449)

In [9]:
#Split train and test data

# Train rows were stacked first, and y holds exactly one label per train row,
# so len(y) is the boundary between the two parts (no hard-coded row count).
n_train = len(y)

# Both slices are taken from the combined matrix before either name is rebound.
# NOTE: X is overwritten, so this cell is meant to be run once.
X, test = X[:n_train, :], X[n_train:, :]

In [10]:
#Feature scaling

# with_mean=False scales without centering (presumably because X is sparse
# after one-hot encoding - TODO confirm).
sc = StandardScaler(with_mean=False)

# The one-hot columns come first and the passthrough (numeric) columns last,
# so the numeric block starts right after the encoder's output. Its width is
# the total number of categories the fitted encoder learned.
numeric_start = sum(len(levels) for levels in ct.named_transformers_['encoder'].categories_)

# Fit on the training rows only, then apply the same scaling to the test rows.
# NOTE: both matrices are modified in place, so this cell is meant to be run once.
X[:, numeric_start:] = sc.fit_transform(X[:, numeric_start:])
test[:, numeric_start:] = sc.transform(test[:, numeric_start:])

In [11]:
#Logistic Regression

# Linear baseline with library-default hyper-parameters, fitted on the
# training rows. Despite the variable name, this model is a classifier.
logistic_regressor = LogisticRegression()
logistic_regressor.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
#Logistic Regression score

# Mean cross-validated accuracy using the helper's defaults (cv=10,
# scoring='accuracy') on the notebook-level X and y.
accuracy_score(logistic_regressor)



Model accuracy:	 83.62%


In [13]:
#Decision Tree Classifier

# Single tree capped at depth 10; random_state fixes the seed so reruns match.
tree = DecisionTreeClassifier(max_depth = 10, random_state=0)
tree.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [14]:
#Decision Tree score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(tree)

Model accuracy:	 81.38%


In [15]:
#Random Forest Classifier

# Ensemble of 100 trees with a fixed seed.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [16]:
#Random Forest score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(forest)

Model accuracy:	 83.40%


In [17]:
#Support Vector Machine(SVM)

# Linear-kernel SVM; all other settings are left at the library defaults.
svm = SVC(kernel='linear')
svm.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [18]:
#SVM score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(svm)

Model accuracy:	 85.52%


In [19]:
#Kernel SVM

# RBF-kernel SVM. gamma=0.01 and C=50 coincide with the best combination
# printed by the SVM parameter tuning cell further down.
kernel_svm = SVC(kernel='rbf', random_state=0, gamma=0.01, C=50)
kernel_svm.fit(X, y)

SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [20]:
#Kernel SVM score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(kernel_svm)

Model accuracy:	 86.31%


In [21]:
#SVM parameter tuning

# Grids shared by several kernels. The C grid used to list 0.1 twice
# (`0.10, 0.1`), so those candidates were fitted twice; the duplicate is
# replaced by 0.01, the value missing from the log-spaced sequence
# (presumed intent - TODO confirm).
c_grid = [0.001, 0.01, 0.1, 10, 25, 50, 100, 1000]
gamma_grid = [1e-2, 1e-3, 1e-4, 1e-5]

# One dict per kernel so that only the parameters relevant to it are searched.
# Parameters not listed in a dict keep the values set on `kernel_svm`.
svm_parameters = [{'kernel': ['rbf'], 'gamma': gamma_grid, 'C': c_grid},
                  {'kernel': ['sigmoid'], 'gamma': gamma_grid, 'C': c_grid},
                  {'kernel': ['linear'], 'C': c_grid},
                  {'kernel': ['poly'], 'degree': [0, 1, 2, 3, 4, 5, 6]}]

tuning(kernel_svm, svm_parameters)

Best Accuracy: 86.31 %
Best Parameters: {'C': 50, 'gamma': 0.01, 'kernel': 'rbf'}


In [22]:
#XGBoost classifier

# Gradient-boosted trees with learning_rate=0.01; every other setting is left
# at the XGBoost default.
xg = XGBClassifier(learning_rate=0.01)
xg.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [23]:
#XGBoost classifier score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(xg)

Model accuracy:	 82.28%


In [24]:
#Nearest neighbour classifier

# k-nearest-neighbours with library-default settings.
knn = KNeighborsClassifier()
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [25]:
#Nearest neighbour score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(knn)

Model accuracy:	 83.06%


In [27]:
#ExtraTrees classifier

# Ensemble of 100 extremely randomised trees with a fixed seed. The same
# object is reused later as the base learner of the AdaBoost model.
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=0)
extra_trees.fit(X,y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                     warm_start=False)

In [28]:
#ExtraTrees score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(extra_trees)

Model accuracy:	 83.84%


In [29]:
#AdaBoost classifier

# The base learner is passed positionally instead of via `base_estimator=`:
# scikit-learn 1.2 renamed that keyword to `estimator` and 1.4 removed the old
# name, while the first positional slot holds the base learner in both old and
# new releases.
# NOTE(review): each boosting round trains a full 100-tree ExtraTrees ensemble,
# so this model is expensive to fit.
ada = AdaBoostClassifier(extra_trees, n_estimators=100, random_state=0)
ada.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=ExtraTreesClassifier(bootstrap=False,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                               

In [30]:
#AdaBoost classifier score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(ada)

Model accuracy:	 84.29%


In [31]:
#GradientBoost classifier

# Gradient boosting with library-default settings and a fixed seed.
grad_boost = GradientBoostingClassifier(random_state=0)
grad_boost.fit(X,y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [32]:
#GradientBoost score

# Mean 10-fold cross-validated accuracy (helper defaults).
accuracy_score(grad_boost)

Model accuracy:	 83.28%


In [33]:
#Stacking classifier
# First-level models whose predictions feed the meta-classifier. The logistic
# regression, XGBoost and gradient boosting models are not part of the stack.
classifiers = [tree, forest, svm, kernel_svm, knn, extra_trees]

# The AdaBoost model defined earlier acts as the meta-classifier.
# NOTE(review): StackingCVClassifier is imported at the top of the notebook but
# never used; consider whether the cross-validated variant was intended here.
stack = StackingClassifier(classifiers=classifiers, 
                           meta_classifier=ada)

stack.fit(X, y)

StackingClassifier(average_probas=False,
                   classifiers=[DecisionTreeClassifier(class_weight=None,
                                                       criterion='gini',
                                                       max_depth=10,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort=False,
                                                       random_state=0,
                                                      

In [34]:
#Stack score
# Mean 10-fold cross-validated accuracy of the stacked model (helper defaults).
accuracy_score(stack)

Model accuracy:	 86.19%
