### Titanic Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift,KMeans
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import MeanShift, KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from ClusteringClassifier import ClusteringClassifier
from sklearn import metrics

In [2]:
# Load the Titanic passenger table from the Excel workbook.
df = pd.read_excel('titanic.xls')

# Target vector: the `survived` column as a NumPy array.
y = df['survived'].to_numpy(copy=True)

# Shuffled, stratified train/test split with a fixed seed. The whole frame
# (still containing `survived`) is passed as the feature table.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Preview the first rows of the training part.
X_train.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1216,3,1,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,13,,
819,3,1,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q,13,,"Co Clare, Ireland Washington, DC"
1286,3,1,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0,0,0,2688,7.2292,,C,C,,
1280,3,0,"Vovk, Mr. Janko",male,22.0,0,0,349252,7.8958,,S,,,
761,3,0,"de Pelsmaeker, Mr. Alfons",male,16.0,0,0,345778,9.5,,S,,,


In [3]:
# create useful transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    ''' transformer that keeps only the requested columns of a dataframe

    attribute_names: column label(s) used to index the incoming frame
    '''

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        ''' nothing is learned; the selector itself is returned '''
        return self

    def transform(self, X):
        ''' return X indexed by the stored column label(s) '''
        selected = X[self.attribute_names]
        return selected

# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    ''' replace missing values by the most frequent value of each column '''
    def fit(self, X, y=None):
        ''' learn the most frequent non-missing value of every column of X

        A column without any non-missing value has no most frequent value;
        NaN is stored for it, so transform leaves that column unchanged
        instead of fit failing with IndexError.
        '''
        def most_frequent(column):
            # value_counts drops NaN and sorts by count, most common first
            counts = column.value_counts()
            return counts.index[0] if len(counts) else np.nan

        self.most_frequent_ = pd.Series([most_frequent(X[c]) for c in X],
                                        index=X.columns)
        return self
    
    def transform(self, X, y=None):
        ''' fill missing values of X column-wise with the values learned in fit '''
        return X.fillna(self.most_frequent_)

In [4]:
# Baseline preprocessing pipelines: imputation plus scaling/encoding only,
# without any special feature engineering.
num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']
cat_cols = ['sex', 'embarked', 'home.dest']

# Numeric branch: select columns -> median imputation -> standardization.
num_pipeline = Pipeline(steps=[
    ('selecting_numeric', DataFrameSelector(num_cols)),
    ('missing_data', SimpleImputer(strategy='median')),
    ('normalize', StandardScaler()),
])

# Categorical branch: select columns -> most-frequent imputation -> dense
# one-hot encoding; categories unseen during fit are ignored at transform time.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 - switch the keyword when running on a newer scikit-learn.
cat_pipeline = Pipeline(steps=[
    ('selecting_categorical', DataFrameSelector(cat_cols)),
    ('missing_data', MostFrequentImputer()),
    ('one_hot_ecoding', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Concatenate the outputs of both branches side by side.
prep_pipeline = FeatureUnion([
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [16]:
# Initialize the clustering classifier with two simple candidate models and a
# small hyperparameter grid for each of them; KMeans with 4 clusters is used
# as the clustering algorithm.
# NOTE(review): neither KMeans nor DecisionTreeClassifier is seeded here, so
# results may differ between runs - consider passing random_state (confirm
# that ClusteringClassifier forwards clt_params to the clusterer).
cluster_classifier = ClusteringClassifier(
    clf_grid_params=[
        {
            'classifier': [LogisticRegression(max_iter=1000)],
            'classifier__C': [0.1, 1, 10],
        },
        {
            'classifier': [DecisionTreeClassifier()],
            'classifier__max_depth': [3, 5, 10],
            'classifier__min_samples_split': [2, 4, 6],
        },
    ],
    clt=KMeans,
    clt_params={'n_clusters': 4},
)

# Create the final pipeline including classification.
pipe = Pipeline([
    ('prep', prep_pipeline),
    ('cluster_classifier', cluster_classifier),
])

# Fit the pipeline.
pipe.fit(X_train, y_train)

# Predict once on the test set and reuse the result: element 0 holds the
# predicted labels, element 1 the predicted clusters.
test_prediction = pipe.predict(X_test)
y_pred = test_prediction[0]
clusters_pred = test_prediction[1]

# Show accuracy on the test set.
print(f'Accuracy on test set: {round(metrics.accuracy_score(y_test,y_pred),2)}')

# Show predicted clusters on the test set.
print(f'Predicted clusters on test set:\n{clusters_pred}')

Accuracy on test set: 0.82
Predicted clusters on test set:
[0 1 1 2 2 0 1 1 1 1 2 2 2 1 1 2 2 2 0 2 1 1 3 2 1 2 1 2 2 2 2 1 2 2 2 1 0
 3 1 2 2 2 2 2 1 1 2 0 0 2 0 2 3 2 1 1 2 1 3 2 0 2 1 2 0 2 2 2 1 2 1 0 1 2
 1 3 2 2 2 2 2 1 2 1 1 3 2 2 2 2 1 2 1 0 2 2 0 2 1 2 2 0 1 2 2 2 2 2 2 0 2
 2 0 3 2 2 1 1 2 2 2 2 2 0 2 3 2 2 2 2 0 2 2 2 2 1 1 2 2 2 2 1 1 2 1 2 0 2
 1 1 2 2 2 2 2 2 3 2 2 2 0 2 1 2 2 3 2 2 3 2 0 1 2 1 2 0 1 0 2 2 1 0 2 2 2
 1 1 0 2 2 1 2 2 2 1 3 1 2 2 2 1 1 1 2 1 0 1 2 0 1 2 2 0 2 1 1 0 1 1 3 0 2
 2 3 2 2 1 2 2 2 0 2 2 2 0 2 2 1 1 2 1 1 3 1 2 1 1 0 0 0 1 2 2 2 3 2 1 2 2
 0 1 2 2 3 3 3 2 1 2 2 2 2 2 2 1 2 2 2 2 1 1 2 0 1 1 2 2 1 2 1 1 2 2 0 2 2
 2 2 2 1 2 2 1 1 2 1 2 2 2 1 2 2 1 2 0 2 2 1 2 2 2 2 0 1 1 1 0 0]


In [11]:
# Pull the fitted clustering classifier out of the pipeline by its step name.
model = pipe.named_steps['cluster_classifier']

# Print one line per entry of `clf_models`, numbered from 1, followed by a rule.
separator = '-' * 85
for cluster_no, cluster_clf in enumerate(model.clf_models, start=1):
    try:
        # Pipeline-like entry: describe the estimator of its first step.
        description = f'{cluster_clf.steps[0][1]}'
    except AttributeError:  # if classifier is OneClassClassifier
        description = cluster_clf.__class__.__name__
    print(f'Cluster{cluster_no}: {description}')
    print(separator)

Cluster1: LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
-------------------------------------------------------------------------------------
Cluster2: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
-------------------------------------------------------------------------------------
Cluster3: DecisionTreeClassifier(ccp_alpha=0.0, class_we