# Kaggle: Titanic Challenge
## Coded by Daniel Wilcox

This notebook walks through the process of predicting which passengers survived the sinking of the Titanic.

In [None]:
import os
import pickle
import glob
import re

import pandas as pd
import numpy as np
import seaborn as sns

from scipy.stats import randint, reciprocal, expon, uniform

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone

#Fills in values to empty data locations
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Creating custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer




from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


#Classifier Models:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import NuSVC #-----

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier #-----
from sklearn.linear_model import Perceptron #-----
from sklearn.linear_model import RidgeClassifier #-----

from sklearn.gaussian_process import GaussianProcessClassifier #-----

from sklearn.naive_bayes import BernoulliNB #-----
from sklearn.naive_bayes import GaussianNB #-----

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier #-----

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis #-----
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis #-----

from sklearn.neural_network import MLPClassifier #-----

from sklearn.ensemble import VotingClassifier


In [None]:
#Directory (relative to the working directory) that holds the Titanic CSV files
TITANIC_PATH = "datasets/titanic"
#GitHub folder of the dataset. NOTE(review): in the code shown here this is only a
#default argument of get_Titanic_data and nothing downloads from it -- confirm it is needed
TITANIC_URL = "https://github.com/Daniel-Wilcox/ADA-874-2019/blob/master/datasets/titanic/"
#File names of the training and test sets inside TITANIC_PATH
train_name = "train.csv" 
test_name = "test.csv" 


#Directory (relative to the working directory) where pickled models and score files go
PICKLE_PATH = "PickleModels/Titanic"

In [None]:
#Pickle functions

#Saving and storing the model
def save_pickle(model_name, model, pic_path=PICKLE_PATH):
    """Pickle ``model`` to the file ``<pic_path>/<model_name>``.

    Args:
        model_name: File name of the pickle inside ``pic_path``.
        model: Any picklable object (here: a fitted estimator).
        pic_path: Directory that holds the pickles; created when missing.

    Returns:
        None
    """
    print("Saving model...")

    # Build the target path instead of os.chdir()-ing into the directory: a
    # failure while writing then cannot leave the process in another working
    # directory, and absolute ``pic_path`` values work as well.
    os.makedirs(pic_path, exist_ok=True)
    with open(os.path.join(pic_path, model_name), "wb") as f:
        pickle.dump(model, f)

    print("Saved "+model_name+" successfully!\n")
    return None
    
    
#Retrieving and loading the model
def load_pickle(model_name, pic_path=PICKLE_PATH):
    """Load and return the object pickled in ``<pic_path>/<model_name>``.

    Args:
        model_name: File name of the pickle inside ``pic_path``.
        pic_path: Directory that holds the pickles.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    print("Loading "+model_name+" from Pickle file...")

    # Open by path (no os.chdir round-trip) and let ``with`` close the file
    # even when unpickling fails.
    # NOTE: pickle.load executes arbitrary code; only load files you created.
    with open(os.path.join(pic_path, model_name), "rb") as f:
        p = pickle.load(f)

    print(model_name+" successfully loaded!\n")
    return p

#Check whether the pickle exists
def pickle_exist(model_name, pic_path=PICKLE_PATH):
    """Report whether ``<pic_path>/<model_name>`` is an existing file.

    Side effect: ``pic_path`` is created when it is not a directory yet.

    Returns:
        True when the pickle file exists, otherwise False.
    """
    print("Checking if pickle directory exists...")
    if os.path.isdir(pic_path):
        print("Directory exists")
    else:
        os.makedirs(pic_path)
        print("Directory does NOT exists")
        print("Creating directory")

    found = os.path.isfile(pic_path+"/"+model_name)
    if found:
        print("Pickle file does exists...")
    else:
        print("Pickle file does NOT exists...")
    return found

In [None]:
def load_Titanic_data(file_name, titanic_path=TITANIC_PATH):
    """Read the CSV ``file_name`` located in ``titanic_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(titanic_path, file_name))
        
    
def get_Titanic_data(file_name, titanic_url=TITANIC_URL, titanic_path=TITANIC_PATH):
    """Return the Titanic CSV ``file_name`` as a DataFrame, or None if it is absent.

    Args:
        file_name: Name of the CSV file inside ``titanic_path``.
        titanic_url: Unused here; kept so existing calls keep working.
        titanic_path: Directory expected to contain the CSV; created when missing.

    Returns:
        The loaded DataFrame, or None when the file is not on disk (the CSV has
        to be downloaded from Kaggle manually).
    """
    csv_path = os.path.join(titanic_path, file_name)

    print("Checking if directory exists...")
    if not os.path.isdir(titanic_path):
        os.makedirs(titanic_path)
        print("Creating directory")
    else:
        print("Directory exists")

    # Check for the file in both cases, so a freshly created (empty) directory
    # produces the same "download it" hint instead of a silent None.
    if not os.path.isfile(csv_path):
        print(file_name + " file doesn't exists...")
        print("Download .csv from Kaggle!")
        return None

    print(file_name + " file does exists...")
    print("extracting " + file_name)

    # Forward titanic_path so a non-default directory is actually read from.
    titanic = load_Titanic_data(file_name, titanic_path)
    print("\nSuccess!")
    return titanic
                   

In [None]:
def notify(title, text):
    """Show a macOS desktop notification and play one beep.

    Args:
        title: Notification title.
        text: Notification body.

    Returns:
        None. Does nothing when the ``osascript`` tool is not available
        (i.e. on anything that is not macOS).
    """
    # Imported locally so the function is self-contained in the notebook.
    import shutil
    import subprocess

    if shutil.which("osascript") is None:
        return None

    def as_applescript_string(value):
        # AppleScript string literal: escape backslashes and double quotes.
        return '"' + str(value).replace('\\', '\\\\').replace('"', '\\"') + '"'

    script = "display notification {} with title {}".format(
        as_applescript_string(text), as_applescript_string(title))

    # Argument lists (no shell) so quotes in title/text cannot break the command.
    subprocess.run(["osascript", "-e", script], check=False)
    subprocess.run(["osascript", "-e", "beep 1"], check=False)
    return None
    

In [None]:
#Load both Kaggle files (get_Titanic_data returns None when a file is missing)
Train = get_Titanic_data(train_name)
Test = get_Titanic_data(test_name)

#Stack the train rows on top of the test rows and renumber the index 0..n-1
titanic = pd.concat(objs=[Train, Test], axis=0, sort=False).reset_index(drop=True)

#Fill "survived" of test data in titanic as 0:
#(a placeholder to remove the NaNs -- these zeros are not real labels)
titanic["Survived"] = titanic["Survived"].fillna(0)
titanic.head()

In [None]:
#Column dtypes and non-null counts of the training set
Train.info()

In [None]:
#Column dtypes and non-null counts of the test set
Test.info()

In [None]:
#Column dtypes and non-null counts of the combined train+test frame
titanic.info()

In [None]:
#Number of missing values per column in the combined frame
titanic.isnull().sum()


### Variable - Definition                              - Key
1. survival - Survival                                - 0/1 = No/Yes
2. pclass   - Ticket class                            - 1,2,3 = 1st, 2nd, 3rd class
3. sex      - Sex                                     - male, female
4. Age      - Age in years                            - ...
5. sibsp    - # of siblings/spouses on the Titanic    - ...
6. parch    - # of parents/children on the Titanic    - ...
7. ticket   - Ticket number                           - ...
8. fare     - Passenger fare                          - ...
9. cabin    - Cabin number                            - ...
10. embarked - Port of Embarkation                     - C = Cherbourg, Q = Queenstown, S = Southampton


In [None]:
#First rows of the combined frame
titanic.head()

In [None]:
#Fraction of training rows per Survived value (index 1 = survived, 0 = died)
sur = Train["Survived"].value_counts() / len(Train)
print("From the Train.csv dataset provided:")
print("{:.2f}% Survived\n{:.2f}% Died".format(100*sur[1],100*sur[0]))

In [None]:
#Correlation matrix (numeric features)
#Select the numeric columns explicitly: DataFrame.corr() raises on text columns
#(Name, Sex, Ticket, ...) in pandas >= 2.0, and this form also works on older pandas.
corr_matrix = Train.select_dtypes(include="number").corr()
#Correlation of every numeric feature with the label, strongest positive first
corr_matrix["Survived"].sort_values(ascending=False)

In [None]:
#Exploratory feature engineering on the combined frame.
#NOTE: medians / most-frequent values below are computed over train+test together.

#Age: fill NaN's
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

#Cabin: keep only the first letter, 'U' (unknown) when missing
titanic["Cabin"] = titanic["Cabin"].fillna('U')
titanic["Cabin"] = titanic["Cabin"].map(lambda x: x[0])

#Embarked: fill with the most frequent port
most_embarked = titanic["Embarked"].value_counts().index[0]
titanic["Embarked"] = titanic["Embarked"].fillna(most_embarked)

#Fare: fill 
titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())

#Name: the honorific sits between the first ',' and the following '.'
form_name = lambda x: x.split(',')[1].split('.')[0].strip()
titanic["Title"] = titanic["Name"].map(form_name)

#Collapse the honorifics into four groups: Rare, Girls/Women, Boys, Men
titanic["Title"] = titanic["Title"].replace(['Don', 
        'Rev', 'Dr', 'Mme', 'Major', 'Lady', 
        'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
        'Jonkheer', 'Dona'], 'Rare')

titanic["Title"] = titanic["Title"].replace(['Mrs',
        'Miss', 'Ms'], 'Girls/Women')

titanic["Title"] = titanic["Title"].replace(['Master'],
        'Boys')

titanic["Title"] = titanic["Title"].replace(['Mr'],
        'Men')

#Alone: 1 when the passenger has no siblings/spouse and no parents/children aboard.
#Assign through titanic.loc[mask, column]; the chained form
#titanic[column].loc[mask] = value may write to a temporary copy.
titanic["Alone"] = 0
titanic.loc[(titanic["SibSp"]==0) & (titanic["Parch"]==0), "Alone"] = 1

#Family size (including the passenger) and fare share per family member
titanic["Fam_size"] = titanic.loc[:,"SibSp"] + titanic.loc[:,"Parch"] + 1
titanic["fare_per_fam"] = titanic["Fare"]/titanic["Fam_size"]


#Surname: everything before the first ','
form_sname = lambda x: x.split(',')[0].strip()
titanic["surname"] = titanic["Name"].map(form_sname)

#Ticket without its last two characters
titanic["short_Ticket"] = titanic["Ticket"].str[:-2]

#Family id: surname-Pclass-Embarked-Cabin-short_Ticket, or 'Alone' for solo travellers
f_Id = lambda x: '-'.join(x.map(str))
titanic["Fam_ID"] = titanic[["surname", "Pclass", "Embarked", "Cabin", "short_Ticket"]].apply(f_Id, axis=1)
titanic.loc[titanic["Alone"] == 1, "Fam_ID"] = 'Alone'

#age_group: quartile number (0-3) of Age
titanic["age_group"] = pd.qcut(titanic.Age, q=4, labels=False)        

#fare_group: sextile number (0-5) of Fare
titanic["Fare_group"] = pd.qcut(titanic.Fare, q=6, labels=False) 


titanic


In [None]:
#Pclass
#Bar plot of Survived against ticket class for the training set

fig = sns.barplot(x="Pclass",y="Survived",data=Train)
fig = fig.set(xlabel="Pclass", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger ticket class")


In [None]:
#Pclass w/ Sex
#Same plot as for Pclass, with one bar per sex inside every ticket class

fig = sns.barplot(x="Pclass",y="Survived", hue="Sex", data=Train)
fig = fig.set(xlabel="Pclass", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger ticket class (w/ Sex)")


In [None]:
#Sex
#Bar plot of Survived against sex for the training set
fig = sns.barplot(x="Sex",y="Survived",data=Train)
fig = fig.set(xlabel="Sex", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger's Sex")


In [None]:
#Age
#One panel per Survived value (red = 0, green = 1) showing the Age distribution

d = {'color': ['r', 'g']}   
fig = sns.FacetGrid(Train, col='Survived',  hue_kws=d, hue='Survived')
fig = fig.map(sns.distplot, "Age")   

#distplot draws a normalised histogram with a density estimate, so the y axis
#is a density, not a survival probability
fig = fig.set(xlabel="Age", ylabel="Density")

In [None]:
#SibSp
#Bar plot of Survived against the number of siblings/spouses aboard (training set)

fig = sns.barplot(x="SibSp",y="Survived",data=Train)
fig = fig.set(xlabel="SibSp", ylabel="Survival Probability")
fig = plt.title("Survival probability for number of Siblings/Spouses of passenger")


In [None]:
#Parch
#Bar plot of Survived against the number of parents/children aboard (training set)

fig = sns.barplot(x="Parch",y="Survived",data=Train)
fig = fig.set(xlabel="Parch", ylabel="Survival Probability")
fig = plt.title("Survival probability for number of Parents/Children of passenger")


In [None]:
#Fare
#One panel per Survived value (red = 0, green = 1) showing the Fare distribution

d = {'color': ['r', 'g']}   
fig = sns.FacetGrid(Train, col='Survived',  hue_kws=d, hue='Survived')
fig = fig.map(sns.distplot, "Fare")   

#distplot draws a normalised histogram with a density estimate, so the y axis
#is a density, not a survival probability
fig = fig.set(xlabel="Fare", ylabel="Density")

In [None]:
#Cabin
#Passenger count per Cabin value in the combined train+test frame
fig = sns.countplot(x="Cabin",data=titanic)
#Rotate the tick labels so they do not overlap
fig = plt.setp(fig.get_xticklabels(), rotation=80) 
fig = plt.title("Count of cabin prefix (whole titanic)")

In [None]:
order_cabin = ['A','B','C','D','E','F','G','T','U']
#`titanic` is Train stacked on Test with Survived filled with 0 for the test rows,
#so only its first len(Train) rows carry real labels; plotting all rows would pull
#every bar towards 0.
#catplot is the current name of the factorplot function.
fig = sns.catplot(x="Cabin", y="Survived", data=titanic.iloc[:len(Train)],
                  kind="bar", order=order_cabin)
fig = plt.title("Survival Probability of cabin prefix")


In [None]:
#Embarked
#Bar plot of Survived against the port of embarkation (training set)

fig = sns.barplot(x="Embarked",y="Survived",data=Train)
fig = fig.set(xlabel="Embarked", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger port of embarkation ")


In [None]:
#feature eng:
#Second combined frame built from the raw Train/Test (Survived stays NaN for test rows)
Full_set = pd.concat(objs=[Train, Test], axis=0).reset_index(drop=True)

#Name: f_name, honorifics. sur_name
#The honorific is the text between the first ',' and the following '.'
honorifics = [i.split(",")[1].split(".")[0].strip() for i in Full_set["Name"]]
Full_set["Title"] = pd.Series(honorifics)
#Distinct raw honorifics found in the data
Full_set.Title.unique()

In [None]:
#Passenger count per raw honorific in the combined frame
fig = sns.countplot(x="Title",data=Full_set)
#Rotate the tick labels so they do not overlap
fig = plt.setp(fig.get_xticklabels(), rotation=80) 
fig = plt.title("Count of honorifics (whole titanic)")


In [None]:
#`titanic` is Train stacked on Test with Survived filled with 0 for the test rows;
#restrict the plot to the first len(Train) rows so the placeholder zeros do not
#drag the survival rates down.
fig = sns.barplot(x="Title",y="Survived",data=titanic.iloc[:len(Train)])
fig = fig.set(xlabel="Honorific Title", ylabel="Survival Probability")
fig = plt.title("Survival probability for honorific titles")

In [None]:
#Family size
#Siblings/spouses + parents/children + the passenger

Full_set["Fam_size"] = Full_set["SibSp"] + Full_set["Parch"] + 1

fig = sns.barplot(x="Fam_size",y="Survived", data=Full_set)
fig = fig.set(xlabel="Fam_size", ylabel="Survival Probability")
fig = plt.title("Survival probability for family size")

In [None]:
#Alone
#1 by default, 0 for passengers whose family size is larger than one.
#Assign through Full_set.loc[mask, column]; the chained form
#Full_set[column].loc[mask] = value may write to a temporary copy.
Full_set["Alone"] = 1
Full_set.loc[Full_set['Fam_size'] > 1, "Alone"] = 0

fig = sns.barplot(x="Alone",y="Survived", data=Full_set)
fig = fig.set(xlabel="Alone", ylabel="Survival Probability")
fig = plt.title("Survival probability for alone passangers")

In [None]:
#(rows, columns) of the training set
Train.shape

In [None]:
#Separate labels from features
#y_train: copy of the Survived column; X_tr: all remaining columns
y_train = Train["Survived"].copy()
X_tr = Train.drop("Survived", axis=1)
X_tr.info()

In [None]:
#Features to add: 
fix_cabin=True #'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U' -num
has_cabin=True #has_cabin -num
add_fam=True #Fam_size -num
add_alone=True #Alone -num
fare_per_fam=True #fare_per_fam -num
add_title=True #Title -cat
add_famid=True #Fam_ID -num
add_ageG=True #age_group -num      
add_fareG=True #Fare_group -num


#Remove Features (survived is already removed)
exclude_col = ['Name', 'Ticket', 'Cabin']

#Keep the original column order. Going through set() made the order depend on
#string hashing, i.e. it could differ between sessions, which silently changes
#the column layout of X_train underneath previously pickled models.
Dropped = X_tr[[col for col in X_tr.columns if col not in exclude_col]]

#Numeric Features
list_num = Dropped.select_dtypes(include = ["number"]).columns

#Catagorical Features (to be transformed into OHE)
list_cat = Dropped.select_dtypes(include = ["object"]).columns

#Raw columns handed to the feature-engineering step (everything except the label)
list_add = X_tr.columns


#Names of the engineered numeric columns, in the order they are switched on.
#They are appended in one go: Index.insert(len(index)+1, ...) points past the end
#of the index and is rejected by current pandas.
new_cabin = 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U'
extra_num = []

if fix_cabin:
    extra_num.extend(new_cabin)

if has_cabin:
    extra_num.append('has_cabin')

if add_fam:
    extra_num.append('Fam_size')

if add_alone:
    extra_num.append('Alone')

if fare_per_fam:
    extra_num.append('fare_per_fam')

#Fam_ID (add_famid) is a text id and is deliberately not selected as a model input

if add_ageG:
    extra_num.append('age_group')

if add_fareG:
    extra_num.append('Fare_group')

if extra_num:
    list_num = list_num.append(pd.Index(extra_num))

if add_title:
    list_cat = list_cat.append(pd.Index(['Title']))


print('list_num: {}\n'.format(list(list_num)))
print('list_cat: {}\n'.format(list(list_cat)))
print('list_add: {}\n'.format(list(list_add)))
list_add

In [None]:
#dtype of the Name column
X_tr["Name"].dtypes

In [None]:
class Selector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns from a DataFrame and returns their values."""

    def __init__(self, feature_names):
        # Column labels to pull out of the frame in transform()
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[feature_names]`` through its ``.values`` attribute."""
        selected = X[self.feature_names]
        return selected.values
    
    
#-------------------------------------------------------------------------------    
class add_features(BaseEstimator, TransformerMixin):
    """Feature-engineering step for the Titanic frame.

    ``transform`` selects the columns listed in ``added_feat`` and, depending
    on the switches, derives the following columns:

    * ``fix_cabin``: reduce ``Cabin`` to its first letter ('U' when missing)
      and add one 0/1 column per letter in A-G, T, U.
    * ``has_cabin``: ``has_cabin`` = 1 when a cabin was recorded, else 0.
    * ``add_fam``: ``Fam_size`` = SibSp + Parch + 1.
    * ``add_alone``: ``Alone`` = 1 when SibSp and Parch are both 0.
    * ``add_title``: ``Title`` = honorific grouped into Rare / Girls/Women /
      Boys / Men (other honorifics are kept as they are).
    * ``fare_per_fam``: ``fare_per_fam`` = Fare / family size.
    * ``add_famid``: ``Fam_ID`` = surname-Pclass-Embarked-Cabin, or 'Alone'.
    * ``add_ageG`` / ``add_fareG``: ``age_group`` (4 quantile bins of Age) and
      ``Fare_group`` (6 quantile bins of Fare).

    Independently of the switches, Age and Fare values that are 0 or missing
    are replaced by the median of the passenger's (title, Sex, Pclass) group,
    and ``Cabin``, ``Name`` and ``Ticket`` are dropped from the result.

    Every switch works on its own: helper values such as the family size or
    the title are computed internally whenever another feature needs them.

    NOTE(review): the medians and quantile bins are computed from whatever
    frame is passed to ``transform``; nothing is learned in ``fit``.
    """

    _CABIN_DECKS = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'U')
    _RARE_TITLES = ('Don', 'Rev', 'Dr', 'Mme', 'Major',
                    'Lady', 'Sir', 'Mlle', 'Col', 'Capt',
                    'the Countess', 'Jonkheer', 'Dona')

    def __init__(self, added_feat, fix_cabin=True, has_cabin=True,
                 add_fam=True, add_alone=True, fare_per_fam=True,
                 add_title=True, add_famid=True, add_ageG=True,
                 add_fareG=True):

        self.added_feat = added_feat

        self.fix_cabin = fix_cabin
        self.has_cabin = has_cabin
        self.add_fam = add_fam
        self.add_alone = add_alone
        self.fare_per_fam = fare_per_fam
        self.add_title = add_title
        self.add_famid = add_famid
        self.add_ageG = add_ageG
        self.add_fareG = add_fareG

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the engineered columns added."""
        # Work on a copy: the caller's frame must not be modified, and writing
        # into a slice of it is unreliable.
        df = X[self.added_feat].copy()

        # Record which rows had a cabin before the column is overwritten below.
        cabin_known = df["Cabin"].notna()

        if self.fix_cabin:
            df["Cabin"] = df["Cabin"].fillna('U').map(lambda x: x[0])
            for deck in self._CABIN_DECKS:
                df[deck] = (df["Cabin"] == deck).astype(int)

        if self.has_cabin:
            df["has_cabin"] = cabin_known.astype(int)

        # Helper values used by several features below.
        fam_size = df.loc[:, "SibSp"] + df.loc[:, "Parch"] + 1
        alone = (df["SibSp"] == 0) & (df["Parch"] == 0)

        if self.add_fam:
            df["Fam_size"] = fam_size

        if self.add_alone:
            df["Alone"] = alone.astype(int)

        # Honorific: the letters/spaces between ", " and the following "."
        # (the dot is escaped so that it only matches a literal period).
        title = df["Name"].str.extract(
            r'((?<=, )[A-Za-z ]+(?=\.))', expand=True)[0]
        title = title.replace(list(self._RARE_TITLES), 'Rare')
        title = title.replace(['Mrs', 'Miss', 'Ms'], 'Girls/Women')
        title = title.replace(['Master'], 'Boys')
        title = title.replace(['Mr'], 'Men')

        if self.add_title:
            df["Title"] = title

        # Fix NaN values of Fare and Age by Title, Sex and Class:
        # a value of 0 is treated as missing, then every gap is filled with the
        # median of the passenger's group. transform("median") returns a result
        # aligned with df's index, so the fill lines up row by row.
        group_keys = [title, df["Sex"], df["Pclass"]]
        for col in ("Age", "Fare"):
            values = df[col].where(df[col] != 0)
            group_median = values.groupby(group_keys).transform("median")
            df[col] = values.fillna(group_median)

        if self.fare_per_fam:
            df["fare_per_fam"] = df["Fare"] / fam_size

        if self.add_famid:
            surname = df["Name"].str.split(",", n=1, expand=True)[0]
            id_parts = pd.concat(
                [surname.rename("surname"),
                 df[["Pclass", "Embarked", "Cabin"]]], axis=1)
            df["Fam_ID"] = id_parts.apply(
                lambda row: '-'.join(row.map(str)), axis=1)
            df.loc[alone, "Fam_ID"] = 'Alone'

        # Use the instance switches (not same-named notebook globals).
        if self.add_ageG:
            df["age_group"] = pd.qcut(df["Age"], q=4, labels=False)

        if self.add_fareG:
            df["Fare_group"] = pd.qcut(df["Fare"], q=6, labels=False)

        # Raw text columns are not model inputs.
        return df.drop(columns=["Cabin", "Name", "Ticket"], errors="ignore")

In [None]:
#Add Features
add_pipeline = Pipeline([
    ('add_feat', add_features(list_add))
])

#Numeric Transformations: pick the numeric columns, fill gaps with the median, standardise
num_pipeline = Pipeline([
    ('selector', Selector(list_num)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

#Catagorical Transformations: pick the text columns, fill gaps with the most
#frequent value, one-hot encode into a dense array.
#The encoder's `sparse` keyword was renamed `sparse_output` (scikit-learn 1.2) and
#later removed, so try the new name first and fall back for older releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

cat_pipeline = Pipeline([
    ('selector', Selector(list_cat)),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('cat_encoder', cat_encoder),
])

In [None]:
#Run only the feature-engineering step on the combined frame to inspect its output
a = add_pipeline.fit_transform(titanic)

#a = add_pipeline.fit_transform(X_tr)
#b = add_pipeline.fit_transform(Test)

In [None]:
#Column names produced by the feature-engineering step
print('Transformed Dataframe: {}'.format(list(a.columns)))

In [None]:
#First rows of the engineered frame
a.head()


In [None]:
#Column dtypes and non-null counts of the engineered frame
a.info()

In [None]:
#Remaining missing values per column after feature engineering
a.isnull().sum()

In [None]:
#Run the numeric and the categorical branch side by side and join their outputs
perpare_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

#Full preprocessing: feature engineering first, then the numeric/categorical union
full_pipe = Pipeline([
    ("add_pipeline", add_pipeline),
    ("prep_pipeline", perpare_pipeline)
])

#Fit every step on the training features and build the model input matrix
X_train = full_pipe.fit_transform(X_tr)
X_train.shape

In [None]:
#Scaled Test Set for predictions:
#Only transform here. fit_transform would re-fit the imputers, the scaler and the
#one-hot encoder on the test data, so X_test would be scaled/encoded differently
#from the X_train the models were trained on.
X_test = full_pipe.transform(Test)
X_test.shape

In [None]:
#Intended switch: set load_pkl = False to train new models even if a pickle exists.
#NOTE(review): verify that the training helpers actually read this flag
load_pkl = True

#Stratified 10-fold splitter used as the cv argument further down
cv_split=StratifiedKFold(n_splits=10)
cv_split

In [None]:
def model_cross_val(clf, name_clf, lp=None):
    """Return the mean cross-validated accuracy of ``clf`` on the training data.

    A fitted copy of the model is cached as a pickle named ``name_clf``: it is
    loaded when it exists and loading is enabled, otherwise ``clf`` is fitted
    on ``X_train``/``y_train`` and saved.

    Args:
        clf: Estimator to evaluate.
        name_clf: Name of the pickle file for this estimator.
        lp: Whether an existing pickle may be loaded. ``None`` (default) means
            "use the notebook-level ``load_pkl`` flag".

    Returns:
        Mean accuracy over the folds of ``cv_split``.
    """
    # The old default was the load_pickle *function*, so ``lp == True`` was never
    # true and pickles were never reused. Resolve the flag at call time instead.
    use_pickle = load_pkl if lp is None else bool(lp)

    # pickle_exist is called first on purpose: it also creates the pickle directory.
    if pickle_exist(str(name_clf)) and use_pickle:
        ml_clf = load_pickle(str(name_clf))
    else:
        ml_clf = clf
        ml_clf.fit(X_train, y_train)
        save_pickle(str(name_clf), ml_clf)

    cvs = cross_val_score(ml_clf, X_train, y_train, cv=cv_split, scoring="accuracy")
    return np.mean(cvs)


In [None]:
#Candidate classifiers, all with default settings unless stated.
#The parameter lists further down are paired with this list by position.
clf_options = [          
    LinearSVC(), #no proba
    SVC(probability=True),
    NuSVC(probability=True),
    
    LogisticRegression(),
    SGDClassifier(),
    PassiveAggressiveClassifier(), #no proba
    Perceptron(), #no proba
    RidgeClassifier(), #no proba
    
    GaussianProcessClassifier(),
    
    BernoulliNB(),
    GaussianNB(),
    
    DecisionTreeClassifier(),
    
    KNeighborsClassifier(),
    
    
    RandomForestClassifier(),
    AdaBoostClassifier(DecisionTreeClassifier(),
                       learning_rate=0.1),
    
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    BaggingClassifier(),
    
    LinearDiscriminantAnalysis(),

    MLPClassifier()

] 
 

   
#Result table: one row per classifier
compare_col = ['Clf Name', 'Clf Parameters', 'Clf Mean Accuracy'] 
clf_compare = pd.DataFrame(columns = compare_col)
                 
row=0
          
for clf in clf_options: 
    #The class name doubles as the pickle file name inside model_cross_val
    clf_name = clf.__class__.__name__
                    
    clf_compare.loc[row, 'Clf Name'] = clf_name
    clf_compare.loc[row, 'Clf Parameters'] = str(clf.get_params()) 
    #Mean cross-validated accuracy on the training data
    cvs = model_cross_val(clf, clf_name)
    
    clf_compare.loc[row, 'Clf Mean Accuracy'] = cvs
    
    row +=1

               

In [None]:
#Order the result table by mean accuracy, highest first
clf_compare.sort_values(by = 'Clf Mean Accuracy', ascending = False, inplace = True)
clf_compare

In [None]:
#Parameters of classifiers (Random Search):

#Linear SVC
#reciprocal(a, b) is scipy's log-uniform distribution between a and b
lin_svc_rs = {
    'loss' : ['hinge','squared_hinge'],
    'C' : reciprocal(0.01, 1000)
}


#SVC
#Only C is searched; the kernel/gamma entries are disabled
svc_rs = {
    'C' : reciprocal(0.01, 1000),
    #'kernel' : ['linear', 'rbf'],
    #'gamma' : reciprocal(0.01, 10000)
}

#NuSVC
nuSVC_rs = {
    'kernel' : ['linear', 'rbf'],
    'gamma' : reciprocal(0.01, 10000),
}



#Logistic Regression
#The 'l1' penalty is only supported by some solvers. Pin 'liblinear', which handles
#both 'l1' and 'l2', so the l1 candidates do not fail when the estimator's default
#solver is 'lbfgs' (the default in newer scikit-learn releases).
log_reg_rs = {
    'penalty' : ['l1', 'l2'],
    'C' : reciprocal(0.01, 1000),
    'solver' : ['liblinear']
}

#SGDClassifier
SGDC_rs = {
    'loss' : ['hinge','squared_hinge', 'perceptron'],
    'penalty' : ['l1', 'l2'],
    'alpha': reciprocal(0.1, 10000), 
    'warm_start' : [True, False]
}




#PassiveAggressiveClassifier
pass_rs = {
    'C' : reciprocal(0.01, 1000),
    'fit_intercept' : [True, False]
}


#Perceptron
perc_rs = {
    'alpha': reciprocal(0.1, 10000), 
    'fit_intercept' : [True, False],
    'shuffle' : [True, False],
    'warm_start' : [True, False]
}


#RidgeClassifier
ridge_rs = {
    'alpha': reciprocal(0.1, 10000)
}

#GaussianProcessClassifier
gaus_rs = {
    'warm_start' : [True, False]
}

    
#BernoulliNB
bernNB_rs = {
    'alpha': reciprocal(0.1, 10000),
    'binarize': reciprocal(0.1, 10000),
    'fit_prior' : [True, False]
    
}


#GaussianNB
#NOTE(review): var_smoothing is sampled between 1 and 1e6, far above the estimator's
#very small default -- confirm this range is intended
guasNB_rs = {
    'var_smoothing': reciprocal(1, 1000000),
}

#Decision Tree Classifier
#scipy's randint(low, high) excludes `high`, e.g. max_depth is drawn from 1..4
tree_rs = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : randint(1, 5),
    'min_samples_split' : randint(2, 10),
    'min_samples_leaf' : randint(1, 10),
    'max_features' : randint(1, 10)
}

  
#K-Neighbors Classifier
#scipy's randint(low, high) draws from low..high-1, so randint(1, 2) could only
#ever produce p=1 (Manhattan). Use randint(1, 3) to sample both p=1 and p=2.
k_neigh_rs = {
    'n_neighbors': randint(3, 15),
    'weights' : ['uniform','distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : randint(2, 100),
    'p' : randint(1, 3)
}

#Random Forest Classifier
forest_rs = {
    'n_estimators' : randint(10, 500),
    'criterion' : ['gini', 'entropy'],
    'max_depth' : randint(1, 5),
    'min_samples_split' : randint(2, 15),
    'min_samples_leaf' : randint(1, 15),
    'max_features' : randint(1, 15)
}
    
#AdaBoost Classifier
#The base_estimator__* keys address the wrapped DecisionTreeClassifier.
#NOTE(review): newer scikit-learn names this parameter `estimator` -- confirm
#against the installed version
ada_rs = {
    'base_estimator__criterion' : ['gini', 'entropy'],
    'base_estimator__splitter' : ['best', 'random'],
    'n_estimators' : randint(1, 50),
    'learning_rate' : reciprocal(0.6, 10000),
    'algorithm' : ['SAMME', 'SAMME.R']
    
}


#Extra Trees Classifier
extra_tree_rs = {
    'n_estimators' : randint(10, 500),
    'criterion' : ['gini', 'entropy'],
    'min_samples_split' : randint(2, 10),
    'min_samples_leaf' : randint(1, 10),
    'max_features' : randint(1, 10)    
    
}


#Gradient Boosting Classifier
#NOTE(review): the 'deviance' loss is called 'log_loss' in newer scikit-learn --
#confirm against the installed version
grad_boost_rs = {
    'loss' : ['deviance', 'exponential'],
    'learning_rate' : reciprocal(0.6, 10000),
    'n_estimators' : randint(10, 500),
    'min_samples_split' : randint(2, 15),
    'min_samples_leaf' : randint(1, 15),
    'max_depth': randint(3, 15),
    'max_features' : randint(1, 15)
    
}  


#Bagging Classifier
bag_rs = {
    'n_estimators' : randint(10, 50)
}

#LinearDiscriminantAnalysis
linDes_rs = {
    'solver' : ['svd']
}


#MLPClassifier
MLPC_rs = {
    'hidden_layer_sizes' : randint(10, 1000),
    'alpha': reciprocal(0.1, 10000),
    'warm_start' : [True, False]
}



#One search space per classifier, in the same order as clf_options
#(the two lists are zipped together by position)
param_option_rs = [
    lin_svc_rs,
    svc_rs,
    nuSVC_rs,
    log_reg_rs,
    SGDC_rs,
    pass_rs,
    perc_rs,
    ridge_rs,
    gaus_rs,
    bernNB_rs,
    guasNB_rs,
    tree_rs,
    k_neigh_rs,
    forest_rs,
    ada_rs,
    extra_tree_rs,
    grad_boost_rs, 
    bag_rs,
    linDes_rs,
    MLPC_rs
    
    
]


In [None]:
#-------------------------------------------------------------------------------
def model_rand_gs(clf, name_clf, clf_param, lp=None):
    """Run a randomized hyper-parameter search and keep the best result on disk.

    The best estimator is pickled as ``best_rs_<name_clf>`` and its score is
    written to ``best_score_rs_<name_clf>.txt`` in ``PICKLE_PATH``. When a
    previous result exists and loading is enabled, the stored result is only
    replaced if the new score is strictly better.

    Args:
        clf: Estimator to tune.
        name_clf: Name used in the pickle / score file names.
        clf_param: Parameter distributions for ``RandomizedSearchCV``.
        lp: Whether a stored result may be kept. ``None`` (default) means
            "use the notebook-level ``load_pkl`` flag".

    Returns:
        ``(estimator, score)`` of the result that is stored on disk afterwards.
    """
    # The old default was the load_pickle *function*, so ``lp == True`` was never
    # true and stored results were always overwritten. Resolve the flag at call time.
    use_pickle = load_pkl if lp is None else bool(lp)

    rand_clf = RandomizedSearchCV(clf, param_distributions=clf_param, cv=cv_split,
                          verbose=2, n_jobs=-1, n_iter=250, scoring='accuracy')

    rand_clf.fit(X_train, y_train)

    best_est = rand_clf.best_estimator_
    best_sco = rand_clf.best_score_

    pickle_name = "best_rs_"+str(name_clf)
    score_file = PICKLE_PATH+"/best_score_rs_"+str(name_clf)+".txt"
    text_check = os.path.isfile(score_file)

    # pickle_exist is evaluated first on purpose: it also creates PICKLE_PATH.
    if pickle_exist(pickle_name) and text_check and use_pickle:
        #load current best score
        prev_best_score = max(np.loadtxt(score_file, dtype=float))

        if not best_sco > prev_best_score:
            #keep the better (stored) parameters
            return load_pickle(pickle_name), prev_best_score

    # The score is written twice so that np.loadtxt returns an array that max()
    # can iterate over.
    np.savetxt(score_file, [best_sco, best_sco], fmt='%f')
    save_pickle(pickle_name, best_est)
    return best_est, best_sco


In [None]:
#Result table of the randomized search: one row per classifier
best_col = ['Clf Name', 'Best Clf Parameters', 'Best Clf Accuracy Score'] 
best_compare_rs = pd.DataFrame(columns = best_col)


row=0

#clf_options and param_option_rs are paired by position
for clf, param in zip(clf_options, param_option_rs): 

    clf_name = clf.__class__.__name__
    best_compare_rs.loc[row, 'Clf Name'] = clf_name

    print("{}: {}".format(row, clf_name))

    model_rand_gs(clf, clf_name, param)

    #Read back what is stored on disk as the best result for this classifier
    best_score = max(np.loadtxt((PICKLE_PATH+"/best_score_rs_"+str(clf_name)+".txt"), dtype=float))
    best_clf = load_pickle("best_rs_"+str(clf_name))

    #get_params has to be called; the bare attribute only prints a bound-method repr
    print("{}: {}\n".format(clf_name, best_clf.get_params()))

    best_compare_rs.loc[row, 'Best Clf Parameters'] = str(best_clf.get_params())
    #Store the score as a number so that sort_values orders by value, not by text
    best_compare_rs.loc[row, 'Best Clf Accuracy Score'] = float(best_score)

    row +=1
                    

In [None]:
#Random Grid-search
#Order the result table by the score column, descending
best_compare_rs.sort_values(by = 'Best Clf Accuracy Score', ascending = False, inplace = True)

#Desktop notification that the long-running search has finished
notify("Python: Kaggle", "Random Search is complete")
best_compare_rs


In [None]:
#Parameters of classifiers (Grid Search):

#Linear SVC
lin_svc_param = {
    'loss' : ['hinge','squared_hinge'],
    'C' : [1, 2, 5, 10]
}

#SVC
svc_param = {
    'C' : [1, 2, 5, 10],
    'kernel' : ['linear', 'rbf'],
    'gamma' : [ 0.001, 0.01, 0.1, 1]
}

#NuSVC
nuSVC_param = {
    'kernel' : ['linear', 'rbf'],
    'gamma' : [ 0.001, 0.01, 0.1, 1]
}

#Logistic Regression
#The 'l1' penalty is only supported by some solvers. Pin 'liblinear', which handles
#both 'l1' and 'l2', so the l1 candidates do not fail when the estimator's default
#solver is 'lbfgs' (the default in newer scikit-learn releases).
log_reg_param = {
    'penalty' : ['l1', 'l2'],
    'C': [1, 2, 5, 10],
    'solver' : ['liblinear']
}

#SGDClassifier
SGDC_param = {
    'loss' : ['hinge','squared_hinge', 'perceptron'],
    'penalty' : ['l1', 'l2'],
    'alpha' : [0.001, 0.01, 0.1, 1],
    'warm_start' : [True, False]
}


#PassiveAggressiveClassifier
pass_param = {
    'C' : [1, 2, 5, 10],
    'fit_intercept' : [True, False]
}


#Perceptron
perc_param = {
    'alpha': [0.001, 0.01, 0.1, 1], 
    'fit_intercept' : [True, False],
    'shuffle' : [True, False],
    'warm_start' : [True, False]
}


#RidgeClassifier
ridge_param = {
    'alpha': [0.001, 0.01, 0.1, 1]
}

#GaussianProcessClassifier
gaus_param = {
    'warm_start' : [True, False]
}

    
#BernoulliNB
bernNB_param = {
    'alpha': [0.001, 0.01, 0.1, 1],
    'binarize': [0.001, 0.01, 0.1, 1],
    'fit_prior' : [True, False]
    
}


#GaussianNB
guasNB_param = {
    'var_smoothing': [0.001, 0.01, 0.1, 1],
}


#Decision Tree Classifier
tree_param = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : [1, 2, 3, 5],
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 3, 10],
    'max_features' : [1, 5, 10]
}

#K-Neighbors Classifier
k_neigh_param = {
    'n_neighbors': [3, 4, 5, 10],
    'weights' : ['uniform','distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [2, 10, 50, 100],
    'p' : [1, 2]
}


#Random Forest Classifier
forest_param = {
    'n_estimators' : [10, 50, 100],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [1, 2, 5],
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 3, 10],
    'max_features' : [1, 5, 10]
}

#AdaBoost Classifier
#The base_estimator__* keys address the wrapped DecisionTreeClassifier.
#NOTE(review): newer scikit-learn names this parameter `estimator` -- confirm
#against the installed version
ada_param = {
    'base_estimator__criterion' : ['gini', 'entropy'],
    'base_estimator__splitter' : ['best', 'random'],
    'n_estimators' : [1, 2, 5, 10, 50],
    'learning_rate' : [0.001, 0.01, 0.1, 0.5, 1],
    'algorithm' : ['SAMME', 'SAMME.R']
    
}

#Extra Trees Classifier
extra_tree_param = {
    'n_estimators' : [100, 200, 300, 500],
    'criterion' : ['gini', 'entropy'],
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 5,10],
    'max_features' : [1, 5, 10]    
    
}

#Gradient Boosting Classifier
#NOTE(review): the 'deviance' loss is called 'log_loss' in newer scikit-learn --
#confirm against the installed version
grad_boost_param = {
    'loss' : ['deviance', 'exponential'],
    'learning_rate' : [0.01, 0.1, 1],
    'n_estimators' : [100, 200],
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 10, 100],
    'max_depth': [3, 5, 10],
    'max_features': [0.1, 0.3]
    
}   

#Bagging Classifier
bag_param = {
    'n_estimators' : [10, 20, 50, 100, 200, 500],
}

#LinearDiscriminantAnalysis
linDes_param = {
    'solver' : ['svd']
}


#MLPClassifier
MLPC_param = {
    'hidden_layer_sizes' : [10, 50, 100, 500, 1000],
    'alpha': [0.001, 0.01, 0.1, 1],
    'warm_start' : [True, False]
}
               
                
#One grid per classifier, in the same order as clf_options
#(the two lists are zipped together by position)
param_option_gs = [
    lin_svc_param, #-
    svc_param, #-
    nuSVC_param, #-
    log_reg_param, #-
    SGDC_param, #-
    pass_param, #-
    perc_param, #-
    ridge_param, #-
    gaus_param,
    bernNB_param,
    guasNB_param,
    tree_param,
    k_neigh_param,
    forest_param,
    ada_param,
    extra_tree_param,
    grad_boost_param,
    bag_param,
    linDes_param,
    MLPC_param
]



In [None]:

def model_grid(clf, name_clf, clf_param, lp=None):
    """Run an exhaustive grid search and keep the best result on disk.

    The best estimator is pickled as ``best_gs_<name_clf>`` and its score is
    written to ``best_score_gs_<name_clf>.txt`` in ``PICKLE_PATH``. When a
    previous result exists and loading is enabled, the stored result is only
    replaced if the new score is strictly better.

    Args:
        clf: Estimator to tune.
        name_clf: Name used in the pickle / score file names.
        clf_param: Parameter grid for ``GridSearchCV``.
        lp: Whether a stored result may be kept. ``None`` (default) means
            "use the notebook-level ``load_pkl`` flag".

    Returns:
        ``(estimator, score)`` of the result that is stored on disk afterwards.
    """
    # The old default was the load_pickle *function*, so ``lp == True`` was never
    # true and stored results were always overwritten. Resolve the flag at call time.
    use_pickle = load_pkl if lp is None else bool(lp)

    gs_clf = GridSearchCV(clf, param_grid=clf_param, cv=cv_split,
                          verbose=2, n_jobs=-1, scoring='accuracy')

    gs_clf.fit(X_train, y_train)

    best_est = gs_clf.best_estimator_
    best_sco = gs_clf.best_score_

    pickle_name = "best_gs_"+str(name_clf)
    score_file = PICKLE_PATH+"/best_score_gs_"+str(name_clf)+".txt"
    text_check = os.path.isfile(score_file)

    # pickle_exist is evaluated first on purpose: it also creates PICKLE_PATH.
    if pickle_exist(pickle_name) and text_check and use_pickle:
        #load current best score
        prev_best_score = max(np.loadtxt(score_file, dtype=float))

        if not best_sco > prev_best_score:
            #keep the better (stored) parameters
            return load_pickle(pickle_name), prev_best_score

    # The score is written twice so that np.loadtxt returns an array that max()
    # can iterate over.
    np.savetxt(score_file, [best_sco, best_sco], fmt='%f')
    save_pickle(pickle_name, best_est)
    return best_est, best_sco

    
    

In [None]:
#Result table of the grid search: one row per classifier
best_col = ['Clf Name', 'Best Clf Parameters', 'Best Clf Accuracy Score'] 
best_compare_gs = pd.DataFrame(columns = best_col)


row=0

#clf_options and param_option_gs are paired by position
for clf, param in zip(clf_options, param_option_gs): 

    clf_name = clf.__class__.__name__
    best_compare_gs.loc[row, 'Clf Name'] = clf_name

    print("{}: {}".format(row, clf_name))

    model_grid(clf, clf_name, param)

    #Read back what is stored on disk as the best result for this classifier
    best_score = max(np.loadtxt((PICKLE_PATH+"/best_score_gs_"+str(clf_name)+".txt"), dtype=float))
    best_clf = load_pickle("best_gs_"+str(clf_name))

    best_compare_gs.loc[row, 'Best Clf Parameters'] = str(best_clf.get_params())
    #Store the score as a number so that sort_values orders by value, not by text
    best_compare_gs.loc[row, 'Best Clf Accuracy Score'] = float(best_score)

    row +=1

In [None]:
#After grid search: order the comparison table in place, highest accuracy score first
best_compare_gs.sort_values(by = 'Best Clf Accuracy Score', ascending = False, inplace = True)

#notify() is defined elsewhere in the notebook; presumably it signals that the long search has finished
notify("Python: Kaggle", "Grid Search is complete")
#Bare expression: the notebook renders the sorted table as the cell output
best_compare_gs

In [None]:
def load_best_clf(clf):
    """Return the pickled best estimator for the class of `clf`.

    The stored grid-search ("gs") and randomized-search ("rs") scores for
    the classifier's class name are compared, and the estimator pickled by
    the search with the strictly higher score is loaded. On a tie the
    randomized-search estimator is returned.
    """
    label = str(clf.__class__.__name__)

    grid_file = PICKLE_PATH+"/best_score_gs_"+label+".txt"
    rand_file = PICKLE_PATH+"/best_score_rs_"+label+".txt"

    grid_best = max(np.loadtxt(grid_file, dtype=float))
    rand_best = max(np.loadtxt(rand_file, dtype=float))

    # Grid search only wins when it is strictly better.
    if not grid_best > rand_best:
        return load_pickle("best_rs_"+label)
    return load_pickle("best_gs_"+label)

    
    
def load_best_score(clf):
    """Return the higher of the stored grid-search and randomized-search scores.

    Both score files are looked up under PICKLE_PATH by the class name of
    `clf`; each file's maximum value is taken as that search's score.
    """
    label = str(clf.__class__.__name__)

    grid_file = PICKLE_PATH+"/best_score_gs_"+label+".txt"
    rand_file = PICKLE_PATH+"/best_score_rs_"+label+".txt"

    grid_best = max(np.loadtxt(grid_file, dtype=float))
    rand_best = max(np.loadtxt(rand_file, dtype=float))

    # Same strict comparison as load_best_clf, so score and estimator agree.
    return grid_best if grid_best > rand_best else rand_best

In [None]:
# Summary table: for every classifier, the better of its grid-search and
# randomized-search results.
best_col = ['Clf Name', 'Best Clf Parameters', 'Best Clf Accuracy Score']
best_rs_or_gs = pd.DataFrame(columns=best_col)

# The row label is the classifier's position in clf_options; later cells use
# the sorted index to look classifiers up in clf_options again.
for row, clf in enumerate(clf_options):
    clf_name = clf.__class__.__name__
    best_rs_or_gs.loc[row, 'Clf Name'] = clf_name

    best_clf = load_best_clf(clf)
    best_score = load_best_score(clf)

    best_rs_or_gs.loc[row, 'Best Clf Parameters'] = str(best_clf.get_params())
    # Keep the score numeric: stored as text it would be sorted
    # lexicographically instead of by value.
    best_rs_or_gs.loc[row, 'Best Clf Accuracy Score'] = float(best_score)

# Highest accuracy first; the original row labels are kept by the sort.
best_rs_or_gs.sort_values(by='Best Clf Accuracy Score', ascending=False, inplace=True)
best_rs_or_gs

In [None]:
#Feature importance:

#NOTE: add val set and retrain


In [None]:
# Hold out 20% of the labelled data as a verification set, keeping the
# proportion of survivors equal in both parts. Fixed seed for repeatability.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, ver_index in split.split(Train, Train["Survived"]):
    # split() yields positional row numbers, so select with .iloc; label-based
    # .loc only gives the same rows when Train has a default 0..n-1 index.
    Train_strat = Train.iloc[train_index]
    Ver_strat = Train.iloc[ver_index]

print("Training set: {} entries, Verificaiton set: {} entries".format(len(Train_strat),len(Ver_strat)))

In [None]:
# Training split: separate the label and fit the preprocessing pipeline on it.
y_training = Train_strat["Survived"].copy()
X_tr = Train_strat.drop("Survived", axis=1)
X_training = full_pipe.fit_transform(X_tr)

# Verification split: reuse the pipeline fitted on the training split.
# Calling fit_transform here would refit the preprocessing on the held-out
# rows, so the two matrices would no longer be prepared the same way.
y_verification = Ver_strat["Survived"].copy()
X_ver = Ver_strat.drop("Survived", axis=1)
X_verification = full_pipe.transform(X_ver)

In [None]:
# Voting mode for the ensemble: 'hard' takes a majority vote over predicted
# labels, 'soft' averages the estimators' predicted probabilities.
hardVote = True
voter = 'hard' if hardVote else 'soft'


# Estimators used in both modes (the original note marks these as the ones
# that provide probabilities, i.e. usable for soft voting).
# NOTE(review): the numbers are positions in clf_options - confirm they still
# match if that list is reordered.
svc = load_best_clf(clf_options[1])
nu_svc = load_best_clf(clf_options[2])
log_reg = load_best_clf(clf_options[3])
gaus= load_best_clf(clf_options[8])
bernNB = load_best_clf(clf_options[9])
guasNB = load_best_clf(clf_options[10])
tree = load_best_clf(clf_options[11])
knn = load_best_clf(clf_options[12])
forest = load_best_clf(clf_options[13])
ada = load_best_clf(clf_options[14])
etree = load_best_clf(clf_options[15])
gradb = load_best_clf(clf_options[16])
bag = load_best_clf(clf_options[17])
linDes = load_best_clf(clf_options[18])
MLPC = load_best_clf(clf_options[19])


est = [('svc', svc), ('nu_svc', nu_svc), ('log_reg', log_reg),
       ('gaus', gaus), ('bernNB', bernNB), ('guasNB', guasNB),
       ('tree', tree), ('knn', knn), ('forest', forest),
       ('ada', ada), ('etree', etree), ('gradb', gradb),
       ('bag', bag), ('linDes', linDes), ('MLPC', MLPC)]

# Estimators that only join the ensemble for hard voting. They are loaded in
# either mode because later cells reference lin_svc and SGDC unconditionally;
# binding them only under hardVote raised NameError for soft voting.
lin_svc = load_best_clf(clf_options[0])
SGDC = load_best_clf(clf_options[4])
passC = load_best_clf(clf_options[5])
perc = load_best_clf(clf_options[6])
ridge = load_best_clf(clf_options[7])

est_add = [('lin_svc', lin_svc), ('SGDC', SGDC),
           ('passC', passC), ('perc', perc), ('ridge', ridge)]

if hardVote:
    est.extend(est_add)


print(est)
vote_clf = VotingClassifier(estimators=est, voting=voter)

vote_clf.fit(X_training, y_training)


In [None]:
# Fit a selection of the tuned models (and the voting ensemble) on the
# training split and print each one's accuracy on the verification split.
eval_models = (lin_svc, svc, log_reg, forest,
               SGDC, tree, ada, etree, knn,
               gradb, vote_clf)

for clf in eval_models:
    clf.fit(X_training, y_training)

    y_pred = clf.predict(X_verification)
    score = accuracy_score(y_verification, y_pred)
    name = clf.__class__.__name__

    print("{}: {:.2f}%".format(name, 100*score))

In [None]:
#Refit the voting ensemble on the whole dataset (X_train, y_train), replacing the fit made on the training split alone
vote_clf.fit(X_train, y_train)

In [None]:
# Pick the individual models to score. The branches were previously swapped:
# lin_svc, SGDC, passC, perc and ridge belong to the hard-voting ensemble,
# so they are listed here only when hardVote is set.
if hardVote:
    vote_list = (svc, nu_svc, log_reg, SGDC, gaus,
                 bernNB, guasNB, tree, knn, forest,
                 ada, etree, gradb, bag, linDes, MLPC,
                 lin_svc, passC, perc, ridge)

else:
    vote_list = (svc, nu_svc, log_reg, gaus, bernNB,
                 guasNB, tree, knn, forest, ada,
                 etree, gradb, bag, linDes, MLPC)


# Each model is fitted on the full training data and scored on that same
# data, so the printed figures are training accuracies.
for clf in vote_list:

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_train)

    name = clf.__class__.__name__
    score = accuracy_score(y_train, y_pred)

    print("{}: {:.2f}%".format(name, 100*score))

In [None]:
def make_csv(csv_name, save_loc=TITANIC_PATH):
    """Return the path for the next numbered CSV file in `save_loc`.

    Existing files named ``<csv_name><N>.csv`` are scanned and the returned
    path uses the highest ``N`` found plus one (``1`` when there are none).
    No file is created; the caller writes to the returned path.

    Parameters
    ----------
    csv_name : str
        File name prefix that precedes the version number.
    save_loc : str
        Directory to scan, resolved against the current working directory
        when it is relative.

    Returns
    -------
    str
        Path of the next version, ``<save_loc>/<csv_name><N+1>.csv``.

    Raises
    ------
    FileNotFoundError
        If the resolved directory does not exist.
    """
    save_path = os.path.join(os.getcwd(), save_loc)

    # Fail early on a missing directory, as changing into it used to do.
    if not os.path.isdir(save_path):
        raise FileNotFoundError(save_path)

    max_i = 0
    suffix = '.csv'

    # Glob with the full path rather than changing directory, so the working
    # directory cannot be left modified if something fails part-way.
    pattern = os.path.join(glob.escape(save_path), csv_name + '*' + suffix)

    for file in glob.glob(pattern):
        file_name = os.path.basename(file)[:-len(suffix)]
        file_ver = file_name[len(csv_name):]

        # Files sharing the prefix but without a numeric version
        # (e.g. "<csv_name>.csv" or "<csv_name>_old.csv") are skipped.
        try:
            version = int(file_ver)
        except ValueError:
            continue

        max_i = max(max_i, version)

    new_ver = csv_name + str(max_i + 1) + suffix

    return os.path.join(save_path, new_ver)
    

In [None]:
# Predict survival for the test passengers with the full voting ensemble
# and write the result to the next numbered "Submission" file.
PassengerId = Test['PassengerId']
Survived_pred = vote_clf.predict(X_test)

submission_columns = {'PassengerId': PassengerId,
                      'Survived': Survived_pred}
Submission = pd.DataFrame(submission_columns)

name = "Submission"
file_name = make_csv(name)

# index=False: the output file holds only the two named columns.
Submission.to_csv(file_name, index=False)


In [None]:
# Top 5 classifiers: the table is sorted best-first, so the first five index
# labels are looked up in clf_options to reload the best-scoring models.
best_idx = best_rs_or_gs["Clf Name"][0:5].index

clf1, clf2, clf3, clf4, clf5 = [load_best_clf(clf_options[best_idx[k]])
                                for k in range(5)]

In [None]:
# Voting ensemble built from the five best models only; each estimator is
# registered under its class name and the voting mode chosen earlier is reused.
top_five = (clf1, clf2, clf3, clf4, clf5)
est_best = [(model.__class__.__name__, model) for model in top_five]

Best_vote = VotingClassifier(estimators=est_best, voting=voter)

Best_vote.fit(X_train, y_train)

In [None]:
# Best predictions: predict the test passengers with the top-five ensemble
# and write the result to the next numbered "B_Submission" file.
PassengerId = Test['PassengerId']
Survived_pred = Best_vote.predict(X_test)

submission_columns = {'PassengerId': PassengerId,
                      'Survived': Survived_pred}
Submission = pd.DataFrame(submission_columns)

name = "B_Submission"
file_name = make_csv(name)

# index=False: the output file holds only the two named columns.
Submission.to_csv(file_name, index=False)
