In [None]:
import pandas as pd
import numpy as np
import os

# Default directory of the Titanic CSV files, relative to the working directory.
TITANIC_PATH = os.path.join("datasets", "titanic")

def load_data(path = TITANIC_PATH):
    """Read the Titanic train and test CSV files found in ``path``.

    Parameters
    ----------
    path : str
        Directory that holds ``train.csv`` and ``test.csv``.

    Returns
    -------
    tuple
        ``(train_frame, test_frame)`` as pandas DataFrames, in that order.
    """
    train_frame, test_frame = (
        pd.read_csv(os.path.join(path, csv_name))
        for csv_name in ("train.csv", "test.csv")
    )
    return train_frame, test_frame

In [None]:
# Load the train and test frames from the default TITANIC_PATH directory.
train_set, test_set = load_data()

In [None]:
# Preview the first rows of the training frame.
train_set.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_set.info()

In [None]:
# Labels come straight from the raw frame; the working frame is a separate
# object without the identifier column, so train_set itself stays untouched.
data_labels = train_set["Survived"]

data = train_set.drop(columns=["PassengerId"])

In [None]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram overview of the working frame on a 20x15 inch canvas.
data.hist(figsize=(20,15))
plt.show()

In [None]:
# Correlate numeric columns only. The frame still holds text columns (e.g. Name,
# Ticket), and DataFrame.corr() raises on them in pandas >= 2.0, where
# non-numeric columns are no longer dropped silently. select_dtypes() works on
# every pandas version.
corr_matrix = data.select_dtypes(include="number").corr()
# Correlation of every numeric column with the target, strongest positive first.
print(corr_matrix["Survived"].sort_values(ascending=False))

In [None]:
# Compare survival rate agains Age, SibSp, Parch and Fare
pd.pivot_table(data, index="Survived", values=["Age", "SibSp", "Parch", "Fare"])

In [None]:
# Comparing survival and each of these categorical variables 
print(pd.pivot_table(data, index = 'Survived', columns = 'Pclass', values = 'Ticket' ,aggfunc ='count'))
print()
print(pd.pivot_table(data, index = 'Survived', columns = 'Sex', values = 'Ticket' ,aggfunc ='count'))
print()
print(pd.pivot_table(data, index = 'Survived', columns = 'Embarked', values = 'Ticket' ,aggfunc ='count'))

In [None]:
data['cabin_multiple'] = data.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
data['cabin_multiple'].value_counts()

In [None]:
 print(pd.pivot_table(data, index = 'Survived', columns = 'cabin_multiple', values = 'Ticket' ,aggfunc ='count'))

In [None]:
# Create a category from the first character of the cabin code.
# Missing cabins are NaN; str(NaN) is 'nan', so they end up in category 'n'.
data['cabin_adv'] = data.Cabin.apply(lambda x: str(x)[0])

In [None]:
# Size of each cabin-letter category, then survival counts per category.
print(data.cabin_adv.value_counts())
print(pd.pivot_table(data, index = 'Survived', columns = 'cabin_adv', values = 'Ticket' ,aggfunc ='count')) 

In [None]:
# Numeric ticket vs non numeric
data['numeric_ticket'] = data.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)

In [None]:
data['numeric_ticket'].value_counts()

In [None]:
print(pd.pivot_table(data, index = 'Survived', columns = 'numeric_ticket', values = 'Ticket' ,aggfunc ='count')) 

In [None]:
# Feature engineering with people's titles.
# Takes the segment after the first comma, cuts it at the first period and strips
# whitespace -- assumes names are formatted like "Surname, Title. Given names"
# (TODO confirm against the data).
data['name_title'] = data.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())

In [None]:
# How often each extracted title occurs.
data['name_title'].value_counts()

In [None]:
# Correlations again, now including the engineered numeric columns
# (cabin_multiple, numeric_ticket). Only numeric columns are correlated: the
# frame still holds text columns, and DataFrame.corr() raises on them in
# pandas >= 2.0.
corr_matrix = data.select_dtypes(include="number").corr()
print(corr_matrix["Survived"].sort_values(ascending=False))

In [None]:
data.info()

In [None]:
data_labels = data["Survived"]
data.drop(["Survived"], axis=1, inplace=True)

In [None]:
from sklearn.impute import SimpleImputer
# Stand-alone imputer that replaces missing values with the column mean.
# NOTE(review): not referenced again in the visible cells -- num_pipeline builds
# its own SimpleImputer.
imputer = SimpleImputer(strategy="mean")

In [None]:
data_num = data[["Age", "SibSp", "Parch", "Fare"]]
data_cat = data.drop(["Age", "SibSp", "Parch", "Fare"], axis=1)

In [None]:
data_cat.drop(["Name", "Cabin"],axis=1, inplace=True)

In [None]:
data_cat.info()

In [None]:
data_cat["Embarked"].value_counts()

In [None]:
# Fill missing embarkation ports with "S". The result is assigned back instead
# of calling fillna(inplace=True) on the selected column: under pandas
# Copy-on-Write that chained in-place form works on a temporary object and
# leaves data_cat unchanged.
data_cat["Embarked"] = data_cat["Embarked"].fillna("S")

In [None]:
data_cat["name_title_privilege_group"] = data.name_title.apply(lambda x: 0 if x in ('Mr', 'Miss', 'Mrs') else 1)

In [None]:
data_cat.drop(["name_title"], axis=1, inplace=True)

In [None]:
data_cat.info()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class AttributeAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives the engineered Titanic features.

    From a raw passenger frame it adds ``cabin_multiple``, ``cabin_adv``,
    ``numeric_ticket`` and ``name_title_privilege_group``, fills missing
    ``Embarked`` values with ``"S"`` and drops the raw ``Name``, ``Cabin``,
    ``PassengerId`` and ``Ticket`` columns.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``Cabin``, ``Ticket``, ``Name``,
            ``PassengerId`` and ``Embarked``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the engineered columns added and the raw
            ``Name``, ``Cabin``, ``PassengerId`` and ``Ticket`` columns removed.
        """
        X = X.copy()
        # 0 cabins when the value is missing, else the number of space-separated entries.
        X['cabin_multiple'] = X.Cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
        # First character of the cabin code; str(NaN) is 'nan', so missing -> 'n'.
        X['cabin_adv'] = X.Cabin.apply(lambda x: str(x)[0])
        X['numeric_ticket'] = X.Ticket.apply(lambda x: 1 if x.isnumeric() else 0)
        # Titles must be read from X itself. Taking them from the notebook-level
        # `data` frame would give every other input (e.g. the test set) the
        # training passengers' titles, matched up by row index.
        name_title = X.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
        X["name_title_privilege_group"] = name_title.apply(lambda x: 0 if x in ('Mr', 'Miss', 'Mrs') else 1)
        # Assign the filled column back: a chained fillna(inplace=True) on the
        # selection is a no-op under pandas Copy-on-Write.
        X["Embarked"] = X["Embarked"].fillna("S")
        return X.drop(columns=["Name", "Cabin", "PassengerId", "Ticket"])
attr_adder = AttributeAdder()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Stand-alone one-hot encoder instance.
# NOTE(review): not referenced again in the visible cells -- full_pipeline
# creates its own OneHotEncoder.
cat_encoder = OneHotEncoder()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)


In [None]:
from sklearn.compose import ColumnTransformer

# Columns routed to the numeric branch and to the one-hot encoder respectively.
num_col = ["Age", "SibSp", "Parch", "Fare"]
cat_col = ["Pclass", "Sex", "Embarked", "cabin_multiple", "cabin_adv", "numeric_ticket", "name_title_privilege_group"]

# Build the feature frame from a copy without the target. train_set itself is
# left untouched: an in-place drop makes this cell fail with KeyError on a
# second run and changes the frame that earlier cells displayed.
X_train_attr_added = attr_adder.transform(train_set.drop(columns=["Survived"]))

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_col),
    # handle_unknown="ignore": a category that occurs only at transform time
    # (e.g. in the test set) is encoded as all zeros instead of raising.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_col)
])

# Fit the preprocessing on the training features only; the labels were split off earlier.
X_train = full_pipeline.fit_transform(X_train_attr_added)
y_train = data_labels

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
#I usually use Naive Bayes as a baseline for my classification tasks 
gnb = GaussianNB()
cv = cross_val_score(gnb,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
lr = LogisticRegression(max_iter = 2000)
cv = cross_val_score(lr,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
dt = tree.DecisionTreeClassifier(random_state = 1)
cv = cross_val_score(dt,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
knn = KNeighborsClassifier()
cv = cross_val_score(knn,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
rf = RandomForestClassifier(random_state = 1)
cv = cross_val_score(rf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
svc = SVC(probability = True)
cv = cross_val_score(svc,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state =1)
cv = cross_val_score(xgb,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
#Voting classifier takes all of the inputs and averages the results. For a "hard" voting classifier each classifier gets 1 vote "yes" or "no" and the result is just a popular vote. For this, you generally want odd numbers
#A "soft" classifier averages the confidence of each of the models. If a the average confidence is > 50% that it is a 1 it will be counted as such
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators = [('lr',lr),('knn',knn),('rf',rf),('gnb',gnb),('svc',svc),('xgb',xgb)], voting = 'soft') 

In [None]:
cv = cross_val_score(voting_clf,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

In [None]:
#simple performance reporting function
def clf_performance(classifier, model_name):
    """Print a three-line report for a fitted hyper-parameter search.

    Parameters
    ----------
    classifier : object
        Anything exposing ``best_score_`` and ``best_params_``.
    model_name : str
        Label printed as the first line of the report.
    """
    print(model_name)
    print(f'Best Score: {classifier.best_score_!s}')
    print(f'Best Parameters: {classifier.best_params_!s}')

In [None]:
# Logistic regression: search the penalty type and 20 log-spaced values of C.
lr = LogisticRegression()
param_grid = dict(
    max_iter=[2000],
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 4, 20),
    solver=['liblinear'],
)

clf_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_lr = clf_lr.fit(X_train, y_train)
clf_performance(best_clf_lr, 'Logistic Regression')

In [None]:
# k-nearest neighbours: neighbourhood size, vote weighting, index structure
# and Minkowski exponent p.
knn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree'],
    p=[1, 2],
)
clf_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_knn = clf_knn.fit(X_train, y_train)
clf_performance(best_clf_knn, 'KNN')

In [None]:
# Support vector classifier: one sub-grid per kernel, all sharing the same C values.
svc = SVC(probability=True)
c_values = [.1, 1, 10, 100, 1000]
rbf_grid = {'kernel': ['rbf'], 'gamma': [.1, .5, 1, 2, 5, 10], 'C': list(c_values)}
linear_grid = {'kernel': ['linear'], 'C': list(c_values)}
poly_grid = {'kernel': ['poly'], 'degree': [2, 3, 4, 5], 'C': list(c_values)}
param_grid = tuned_parameters = [rbf_grid, linear_grid, poly_grid]
clf_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_svc = clf_svc.fit(X_train, y_train)
clf_performance(best_clf_svc, 'SVC')

In [None]:
# Random forest: exhaustive grid around the larger-forest region.
rf = RandomForestClassifier(random_state = 1)
param_grid =  {'n_estimators': [400,450,500,550],
               'criterion':['gini','entropy'],
               'bootstrap': [True],
               'max_depth': [15, 20, 25],
               # 'auto' is not listed: for classifiers it was an alias of 'sqrt'
               # (so it only duplicated work) and scikit-learn 1.3 removed it,
               # making every fit with it fail.
               'max_features': ['sqrt', 10],
               'min_samples_leaf': [2,3],
               'min_samples_split': [2,3]}

clf_rf = GridSearchCV(rf, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_rf = clf_rf.fit(X_train,y_train)
clf_performance(best_clf_rf,'Random Forest')

In [None]:
# Random forest: randomized search (100 sampled candidates) over a wider space.
rf = RandomForestClassifier(random_state = 1)
param_grid =  {'n_estimators': [100,500,1000],
               'bootstrap': [True,False],
               'max_depth': [3,5,10,20,50,75,100,None],
               # 'auto' is not listed: for classifiers it was an alias of 'sqrt'
               # and scikit-learn 1.3 removed it.
               'max_features': ['sqrt'],
               'min_samples_leaf': [1,2,4,10],
               'min_samples_split': [2,5,10]}

# random_state fixes which candidates are sampled, so the search is reproducible
# across kernel restarts.
clf_rf_rnd = RandomizedSearchCV(rf, param_distributions = param_grid, n_iter = 100, cv = 5, verbose = True, n_jobs = -1, random_state = 1)
best_clf_rf_rnd = clf_rf_rnd.fit(X_train,y_train)
clf_performance(best_clf_rf_rnd,'Random Forest')

In [None]:
# XGBoost: grid over tree count, column/row subsampling, regularisation and gamma;
# the remaining entries are pinned to a single value.
xgb = XGBClassifier(random_state=1)

param_grid = dict(
    n_estimators=[450, 500, 550],
    colsample_bytree=[0.75, 0.8, 0.85],
    max_depth=[None],
    reg_alpha=[1],
    reg_lambda=[2, 5, 10],
    subsample=[0.55, 0.6, .65],
    learning_rate=[0.5],
    gamma=[.5, 1, 2],
    min_child_weight=[0.01],
    sampling_method=['uniform'],
)

clf_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf_xgb = clf_xgb.fit(X_train, y_train)
clf_performance(best_clf_xgb, 'XGB')

In [None]:
# Tuned estimators taken from the finished searches.
best_lr = best_clf_lr.best_estimator_
best_knn = best_clf_knn.best_estimator_
best_svc = best_clf_svc.best_estimator_
best_rf = best_clf_rf.best_estimator_
best_xgb = best_clf_xgb.best_estimator_

# Four ensembles: hard and soft voting over KNN/RF/SVC, then soft voting with
# logistic regression added, and with both XGBoost and logistic regression added.
voting_clf_hard = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc)], voting = 'hard')
voting_clf_soft = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc)], voting = 'soft')
voting_clf_all = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc), ('lr', best_lr)], voting = 'soft')
voting_clf_xgb = VotingClassifier(estimators = [('knn',best_knn),('rf',best_rf),('svc',best_svc), ('xgb', best_xgb),('lr', best_lr)], voting = 'soft')

# Cross-validate every ensemble exactly once and reuse the fold scores for the
# mean, so each 5-fold run is paid for a single time and the printed mean always
# belongs to the printed scores.
for label, ensemble in (
    ('voting_clf_hard', voting_clf_hard),
    ('voting_clf_soft', voting_clf_soft),
    ('voting_clf_all', voting_clf_all),
    ('voting_clf_xgb', voting_clf_xgb),
):
    scores = cross_val_score(ensemble, X_train, y_train, cv=5)
    print(f'{label} :', scores)
    print(f'{label} mean :', scores.mean())

In [None]:
# In a soft voting classifier some models can be weighted more than others.
# A grid search explores a handful of weightings for the three estimators.
# No new results here.
weight_options = [[1,1,1],[1,2,1],[1,1,2],[2,1,1],[2,2,1],[1,2,2],[2,1,2]]
params = {'weights': weight_options}

vote_weight = GridSearchCV(estimator=voting_clf_soft, param_grid=params, cv=5, verbose=True, n_jobs=-1)
best_clf_weight = vote_weight.fit(X_train, y_train)
clf_performance(best_clf_weight, 'VC Weights')

In [None]:
# The weight search was already fitted on X_train / data_labels, and GridSearchCV
# (refit=True by default) has refitted the winning weighting on the full training
# set. Reuse that fitted search object instead of calling fit() again, which
# would repeat the entire grid search.
best_model = best_clf_weight

In [None]:
# Transformation of the test set: same feature engineering as for training,
# then the already-fitted preprocessing pipeline (transform only, no refit).
X_test_attr_added = attr_adder.transform(test_set)
X_test = full_pipeline.transform(X_test_attr_added)

In [None]:
# Display the engineered test frame.
X_test_attr_added

In [None]:
# Make predictions: fit every ensemble and the tuned random forest on the full
# training set, then predict integer labels for the test set.
for ensemble in (voting_clf_hard, voting_clf_soft, voting_clf_all, voting_clf_xgb):
    ensemble.fit(X_train, y_train)

best_rf.fit(X_train, y_train)

def _predict_int(model):
    """Predict on X_test with ``model`` and cast the labels to int."""
    return model.predict(X_test).astype(int)

y_hat_vc_hard = _predict_int(voting_clf_hard)
y_hat_rf = _predict_int(best_rf)
y_hat_vc_soft = _predict_int(voting_clf_soft)
y_hat_vc_all = _predict_int(voting_clf_all)
y_hat_vc_xgb = _predict_int(voting_clf_xgb)