### Titanic Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift,KMeans
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from StackingClassifier import StackingClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

![title](Stack_Classifier.png)
<i> Schematic of a stacking classifier framework. Here, three classifiers are used in the stack and are individually trained. Then, their predictions get stacked and are used to train the meta-classifier. </i> 

<b>Source:</b> https://towardsdatascience.com/stacking-classifiers-for-higher-predictive-performance-566f963e4840

In [2]:
# load the passenger table from the Excel workbook
df = pd.read_excel('titanic.xls')

# target vector: the 'survived' column as a plain numpy array
y = np.array(df['survived'])

# stratified, shuffled and seeded train/test split (test size left at the library default);
# the complete frame, 'survived' included, is handed over as X -- the preprocessing
# pipelines defined further down pick the feature columns they need
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# peek at the first rows of the training part
X_train.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1216,3,1,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,13,,
819,3,1,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q,13,,"Co Clare, Ireland Washington, DC"
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0,0,0,2688,7.2292,,C,C,,
1280,3,0,"Vovk, Mr. Janko",male,22.0,0,0,349252,7.8958,,S,,,
761,3,0,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5,,S,,,


In [3]:
# create useful transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    ''' transformer that keeps only the requested columns of a dataframe '''

    def __init__(self, attribute_names):
        # stored unchanged, under the same name as the constructor argument
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        ''' nothing is learned from the data; the transformer itself is returned '''
        return self

    def transform(self, X):
        ''' index X with the stored column names and return the result '''
        selected = X[self.attribute_names]
        return selected

# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    ''' replace missing values in every column by that column's most frequent value '''

    @staticmethod
    def _most_frequent(column):
        ''' return the most frequent non-missing value of column, or NaN if there is none '''
        # value_counts() skips NaN by default and sorts by descending frequency,
        # so the first index entry is the most frequent value
        counts = column.value_counts()
        # a column without any non-missing value has no "most frequent" entry;
        # NaN as fill value leaves such a column untouched instead of raising IndexError
        return counts.index[0] if len(counts) else np.nan

    def fit(self, X, y=None):
        ''' learn the most frequent value of each column of the DataFrame X

        y is ignored. The learned values are stored in most_frequent_, a Series
        indexed by the column names of X. Returns self.
        '''
        self.most_frequent_ = pd.Series([self._most_frequent(X[c]) for c in X],
                                        index=X.columns)
        return self

    def transform(self, X, y=None):
        ''' return X with missing values filled, column by column, with the values learned in fit '''
        return X.fillna(self.most_frequent_)

In [4]:
# create simple preprocessing data pipelines without any special feature engineering
num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']
cat_cols = ['sex', 'embarked', 'home.dest']

# numeric branch: select columns -> fill gaps with the column median -> standardize
num_pipeline = Pipeline([
    ('selecting_numeric', DataFrameSelector(num_cols) ),
    ('missing_data', SimpleImputer(strategy='median') ),
    ('normalize', StandardScaler() )
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; try the new keyword first and fall back to the
# old one so the cell runs on both older and current releases
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# categorical branch: select columns -> fill gaps with the most frequent value -> dense one-hot
# (the step name 'one_hot_ecoding' is kept as is: it is the identifier used to address the step)
cat_pipeline = Pipeline([
    ('selecting_categorical', DataFrameSelector(cat_cols) ),
    ('missing_data', MostFrequentImputer() ),
    ('one_hot_ecoding', one_hot_encoder )
])

# concatenate the outputs of both branches into one feature matrix
prep_pipeline = FeatureUnion(transformer_list = [
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [5]:
# one seed for every stochastic component, so that a fresh "Restart & Run All"
# reproduces the reported accuracy
RANDOM_STATE = 42

# split training data to 5 folds
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# initialize stacking classifier: several simple models as base learners and
# LogisticRegression as meta model; the randomized learners are seeded explicitly
stack_clf = StackingClassifier(base_models=[
                                         LogisticRegression(),
                                         # probability=True fits an internal, randomized cross-validation
                                         SVC(max_iter=1000, probability=True, random_state=RANDOM_STATE),
                                         KNeighborsClassifier(),
                                         CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE),
                                         XGBClassifier(random_state=RANDOM_STATE),
                                         RandomForestClassifier(random_state=RANDOM_STATE)
                                        ],
                            meta_model=LogisticRegression(),
                            kfold=kfold)

# create final pipeline including classification
pipe = Pipeline(
                     [
                         ('prep', prep_pipeline),
                         ('classifier', stack_clf )
                     ]
)

# fit pipeline
pipe.fit(X_train,y_train)

# show accuracy on test set
y_pred = pipe.predict(X_test)
print(f'Accuracy on test set: {round(metrics.accuracy_score(y_test,y_pred),2)}')

Accuracy on test set: 0.83
