# Kaggle: Titanic Challenge
## Coded by Daniel Wilcox

This notebook walks through the process of predicting which passengers survived the sinking of the Titanic.

In [None]:
import os
import pickle
import glob

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone

#Feature scaling (missing values are filled in by SimpleImputer, imported below)
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Creating custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer




from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV



from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import VotingClassifier


In [None]:
# Local folder holding the Titanic CSV files, and the remote folder URL
TITANIC_PATH = "datasets/titanic"
TITANIC_URL = (
    "https://github.com/Daniel-Wilcox/ADA-874-2019/"
    "blob/master/datasets/titanic/"
)

# File names of the training and test sets inside TITANIC_PATH
train_name, test_name = "train.csv", "test.csv"

# Folder in which fitted models are pickled
PICKLE_PATH = "PickleModels/Titanic"

In [None]:
#Pickle functions

#Saving and storing the model
def save_pickle(model_name, model, pic_path=PICKLE_PATH):
    """Pickle ``model`` to the file ``model_name`` inside ``pic_path``.

    Parameters
    ----------
    model_name : str
        File name of the pickle (no directory part).
    model : object
        Any picklable object, here a fitted estimator.
    pic_path : str
        Directory to write into; created if it does not exist yet.

    Returns
    -------
    None
    """
    print("Saving model...")

    # Write to an explicit path instead of changing the working directory:
    # a failed open/dump can then no longer leave the notebook stranded in
    # another directory, and the `with` block always closes the file.
    os.makedirs(pic_path, exist_ok=True)
    with open(os.path.join(pic_path, model_name), "wb") as f:
        pickle.dump(model, f)

    print("Saved "+model_name+" successfully!\n")
    return None
    
    
#Retrieving and loading the model
def load_pickle(model_name, pic_path=PICKLE_PATH):
    """Load and return the object pickled as ``model_name`` in ``pic_path``.

    Parameters
    ----------
    model_name : str
        File name of the pickle (no directory part).
    pic_path : str
        Directory that contains the pickle.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    print("Loading "+model_name+" from Pickle file...")

    # NOTE: unpickling can execute arbitrary code - only load files that
    # were written by this notebook, never files from an untrusted source.
    # Opening by explicit path (no os.chdir) keeps the working directory
    # intact even when the file is missing or corrupt.
    with open(os.path.join(pic_path, model_name), "rb") as f:
        p = pickle.load(f)

    print(model_name+" successfully loaded!\n")
    return p

#Check whether the pickle exists
def pickle_exist(model_name, pic_path=PICKLE_PATH):
    """Report whether ``pic_path/model_name`` exists as a file.

    The directory ``pic_path`` is created as a side effect when it is
    missing. Returns True if the pickle file is present, False otherwise.
    """
    print("Checking if pickle directory exists...")
    if os.path.isdir(pic_path):
        print("Directory exists")
    else:
        os.makedirs(pic_path)
        print("Directory does NOT exists")
        print("Creating directory")

    found = os.path.isfile(pic_path+"/"+model_name)
    if found:
        print("Pickle file does exists...")
    else:
        print("Pickle file does NOT exists...")
    return found

In [None]:
def load_Titanic_data(file_name, titanic_path=TITANIC_PATH):
    """Read ``titanic_path/file_name`` as CSV and return it as a DataFrame."""
    return pd.read_csv(os.path.join(titanic_path, file_name))
        
    
def get_Titanic_data(file_name, titanic_url=TITANIC_URL, titanic_path=TITANIC_PATH):
    """Load one Titanic CSV from disk, creating the data directory if needed.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``titanic_path``.
    titanic_url : str
        Kept for interface compatibility; nothing is downloaded here.
    titanic_path : str
        Directory expected to contain the CSV.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or None when the file is not present (the user is
        told to download it from Kaggle).
    """
    csv_path = os.path.join(titanic_path, file_name)

    print("Checking if directory exists...")
    if os.path.isdir(titanic_path):
        print("Directory exists")
    else:
        os.makedirs(titanic_path)
        print("Creating directory")

    # Checked in both cases, so a freshly created (empty) directory also
    # produces the "download it" hint instead of a silent None.
    if not os.path.isfile(csv_path):
        print(file_name + " file doesn't exists...")
        print("Download .csv from Kaggle!")
        return None

    print(file_name + " file does exists...")
    print("extracting " + file_name)

    # Forward titanic_path: the file was located there, so it must be read
    # from there and not from the default directory.
    titanic = load_Titanic_data(file_name, titanic_path)
    print("\nSuccess!")
    return titanic
                   

In [None]:
# Load the training set first, then the test set (None if a file is missing)
Train, Test = [get_Titanic_data(csv_name) for csv_name in (train_name, test_name)]


In [None]:
Train.info()

In [None]:
Train.isnull().sum()


### Variable - Definition                              - Key
1. survival - Survival                                - 0/1 = No/Yes
2. pclass   - Ticket class                            - 1,2,3 = 1st, 2nd, 3rd class
3. sex      - Sex                                     - male, female
4. Age      - Age in years                            - ...
5. sibsp    - # of siblings/spouses on the Titanic    - ...
6. parch    - # of parents/children on the Titanic    - ...
7. ticket   - Ticket number                           - ...
8. fare     - Passenger fare                          - ...
9. cabin    - Cabin number                            - ...
10. embarked - Port of Embarkation                     - C = Cherbourg, Q = Queenstown, S = Southampton


In [None]:
#It can be seen that there are some features that have NaN values.

In [None]:
Train.head(6)

In [None]:
# Fraction of training passengers per outcome, indexed by label (1 / 0)
sur = Train["Survived"].value_counts().div(len(Train))
pct_survived, pct_died = 100*sur[1], 100*sur[0]
print("{:.2f}% Survived\n{:.2f}% Died".format(pct_survived, pct_died))

In [None]:
#Correlation matrix (numeric features)
# Select the numeric columns explicitly: recent pandas versions raise when
# corr() is called on a frame that still contains text columns.
corr_matrix = Train.select_dtypes(include="number").corr()
corr_matrix["Survived"].sort_values(ascending=False)

In [None]:
#check the different features:

In [None]:
#Pclass: mean of "Survived" per ticket class

axes = sns.barplot(data=Train, x="Pclass", y="Survived")
axes.set(xlabel="Pclass", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger ticket class")


In [None]:
#Pclass w/ Sex: mean of "Survived" per ticket class, split by sex

axes = sns.barplot(data=Train, x="Pclass", y="Survived", hue="Sex")
axes.set(xlabel="Pclass", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger ticket class (w/ Sex)")


In [None]:
#Sex: mean of "Survived" for male and female passengers

axes = sns.barplot(data=Train, x="Sex", y="Survived")
axes.set(xlabel="Sex", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger's Sex")


In [None]:
#Age: age distribution, one panel per outcome (red = died, green = survived)

d = {'color': ['r', 'g']}
fig = sns.FacetGrid(Train, col='Survived',  hue_kws=d, hue='Survived')
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm
# the installed version before switching to histplot/kdeplot.
fig = fig.map(sns.distplot, "Age")

# The panels show the distribution of Age, so the y-axis is a density,
# not a survival probability.
fig = fig.set(xlabel="Age", ylabel="Density")

In [None]:
#SibSp: mean of "Survived" per number of siblings/spouses aboard

axes = sns.barplot(data=Train, x="SibSp", y="Survived")
axes.set(xlabel="SibSp", ylabel="Survival Probability")
fig = plt.title("Survival probability for number of Siblings/Spouses of passenger")


In [None]:
#Parch: mean of "Survived" per number of parents/children aboard

axes = sns.barplot(data=Train, x="Parch", y="Survived")
axes.set(xlabel="Parch", ylabel="Survival Probability")
fig = plt.title("Survival probability for number of Parents/Children of passenger")


In [None]:
#Fare: fare distribution, one panel per outcome (red = died, green = survived)

d = {'color': ['r', 'g']}
fig = sns.FacetGrid(Train, col='Survived',  hue_kws=d, hue='Survived')
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm
# the installed version before switching to histplot/kdeplot.
fig = fig.map(sns.distplot, "Fare")

# The panels show the distribution of Fare, so the y-axis is a density,
# not a survival probability.
fig = fig.set(xlabel="Fare", ylabel="Density")

In [None]:
#Embarked: mean of "Survived" per port of embarkation

axes = sns.barplot(data=Train, x="Embarked", y="Survived")
axes.set(xlabel="Embarked", ylabel="Survival Probability")
fig = plt.title("Survival probability of passanger port of embarkation ")


In [None]:
#Feature engineering on the combined data

# Stack training and test rows (test rows have no "Survived" value) and
# renumber them 0..n-1
Full_set = pd.concat([Train, Test], axis=0).reset_index(drop=True)

# Name looks like "<surname>, <honorific>. <first names>": keep the text
# between the first comma and the following period
honorifics = [
    full_name.split(",")[1].split(".")[0].strip()
    for full_name in Full_set["Name"]
]
Full_set["Title"] = pd.Series(honorifics)
Full_set["Title"].unique()

In [None]:
# Number of passengers per raw honorific; labels rotated to stay readable
axes = sns.countplot(data=Full_set, x="Title")
plt.setp(axes.get_xticklabels(), rotation=80)
fig = plt.title("Count of honorifics (Trainin + Testing)")


In [None]:
# Collapse the raw honorifics into four groups
title_groups = {
    'Rare': ['Don', 'Rev', 'Dr', 'Mme', 'Major', 'Lady',
             'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
             'Jonkheer', 'Dona'],
    'Girls/Women': ['Mrs', 'Miss', 'Ms'],
    'Boys': ['Master'],
    'Men': ['Mr'],
}
# Invert to a flat {raw title: group} lookup and apply it in one pass
title_lookup = {raw: group
                for group, raws in title_groups.items()
                for raw in raws}
Full_set["Title"] = Full_set["Title"].replace(title_lookup)

axes = sns.barplot(data=Full_set, x="Title", y="Survived")
axes.set(xlabel="Honorific Title", ylabel="Survival Probability")
fig = plt.title("Survival probability for honorific titles")

In [None]:
#Family size: siblings/spouses + parents/children + the passenger

Full_set["Fam_size"] = Full_set["SibSp"].add(Full_set["Parch"]).add(1)

axes = sns.barplot(data=Full_set, x="Fam_size", y="Survived")
axes.set(xlabel="Fam_size", ylabel="Survival Probability")
fig = plt.title("Survival probability for family size")

In [None]:
#Alone: 1 when the passenger has no relatives aboard, otherwise 0
Full_set["Alone"] = 1
# Single .loc call on the frame: the chained form `df[col].loc[mask] = v`
# may write to a temporary copy and leave the frame unchanged.
Full_set.loc[Full_set['Fam_size'] > 1, "Alone"] = 0

fig = sns.barplot(x="Alone",y="Survived", data=Full_set)
fig = fig.set(xlabel="Alone", ylabel="Survival Probability")
fig = plt.title("Survival probability for alone passangers")

In [None]:
Train.shape

In [None]:
#Separate the label column from the features
X_tr = Train.drop(columns="Survived")
y_train = Train["Survived"].copy()
X_tr.info()

In [None]:
#Remove Features (survived is already removed)
list_drop = ['Name', 'Ticket', 'Cabin']   # never used as model inputs
list_excl = ['Ticket', 'Cabin']           # Name is kept: Title is derived from it

add_fam_feat = True
add_alone = True
add_title = True

# Drop by name rather than through a set difference: set iteration order is
# arbitrary, which made the feature order differ from run to run.
Dropped = X_tr.drop(columns=list_drop)

#Numeric Features
# NOTE(review): PassengerId is presumably numeric and therefore ends up in
# list_num - confirm that it is meant to be a model input.
list_num = Dropped.select_dtypes(include = ["number"]).columns

#Categorical Features (to be transformed into OHE)
list_cat = Dropped.select_dtypes(include = ["object"]).columns

#Columns handed to add_features
list_add = X_tr.drop(columns=list_excl).columns

# Append the engineered column names. Index.append always adds at the end;
# the former insert(len(...)+1, ...) pointed past the end of the index (and
# for list_cat used the length of list_num).
if add_fam_feat:
    list_num = list_num.append(pd.Index(['Fam_size']))

if add_alone:
    list_num = list_num.append(pd.Index(['Alone']))

if add_title:
    list_cat = list_cat.append(pd.Index(['Title']))


print('list_num: {}'.format(list(list_num)))
print('list_cat: {}'.format(list(list_cat)))
print('list_add: {}'.format(list(list_add)))

In [None]:
class Selector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns from a DataFrame.

    ``feature_names`` holds the column labels to keep; ``transform``
    returns those columns as a NumPy array.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        # Nothing is learned from the data
        return self

    def transform(self, X):
        chosen = X[self.feature_names]
        return chosen.values
    
    
    
class add_features(BaseEstimator, TransformerMixin):
    """Pipeline step that derives the engineered passenger features.

    Parameters
    ----------
    added_feat : list-like of str
        Columns of the input frame to keep; needs "SibSp" and "Parch" for
        the family features and "Name" for the title feature.
    add_fam : bool
        Add "Fam_size" = SibSp + Parch + 1.
    add_alone : bool
        Add "Alone" = 1 when SibSp and Parch are both 0, else 0.
    add_title : bool
        Add "Title" (grouped honorific taken from "Name") and drop "Name".
    """

    # Raw honorific -> group stored in the "Title" column
    _TITLE_GROUPS = dict.fromkeys(
        ['Don', 'Rev', 'Dr', 'Mme', 'Major',
         'Lady', 'Sir', 'Mlle', 'Col', 'Capt',
         'the Countess', 'Jonkheer', 'Dona'],
        'Rare')
    _TITLE_GROUPS.update(dict.fromkeys(['Mrs', 'Miss', 'Ms'], 'Girls/Women'))
    _TITLE_GROUPS.update({'Master': 'Boys', 'Mr': 'Men'})

    def __init__(self, added_feat, add_fam=True,
                 add_alone=True, add_title=True):

        self.added_feat = added_feat
        self.add_fam = add_fam
        self.add_alone = add_alone
        self.add_title = add_title

    def fit(self, X, y=None):
        # Nothing is learned from the data
        return self

    def transform(self, X):
        """Return a new DataFrame with the selected and engineered columns."""
        # Explicit copy: the columns below are added to our own frame,
        # never to (a view of) the caller's data.
        df = X[self.added_feat].copy()

        if self.add_fam:
            df["Fam_size"] = df["SibSp"] + df["Parch"] + 1

        if self.add_alone:
            no_relatives = (df["SibSp"] == 0) & (df["Parch"] == 0)
            df["Alone"] = no_relatives.astype(int)

        if self.add_title:
            # Name looks like "<surname>, <honorific>. <first names>"
            honorifics = [i.split(",")[1].split(".")[0].
                          strip() for i in df["Name"]]

            # Reuse df's own index: a Series with a fresh 0..n-1 index is
            # aligned by label on assignment, which scrambles or blanks
            # the titles whenever X is a shuffled subset of rows.
            titles = pd.Series(honorifics, index=df.index)
            df["Title"] = titles.replace(self._TITLE_GROUPS)

            df = df.drop(columns=["Name"])

        return df

In [None]:
#Add Features
add_pipeline = Pipeline([
    ('add_feat', add_features(list_add))
])

#Numeric Transformations: select -> fill gaps with the median -> standardise
num_pipeline = Pipeline([
    ('selector', Selector(list_num)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# Dense one-hot encoder. Newer scikit-learn releases renamed `sparse` to
# `sparse_output`; try the new name first and fall back for old versions.
# handle_unknown="ignore" encodes a category that was not seen during fit
# as all zeros instead of raising at transform time.
try:
    cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

#Categorical Transformations: select -> fill gaps with the mode -> one-hot
cat_pipeline = Pipeline([
    ('selector', Selector(list_cat)),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('cat_encoder', cat_encoder),
])

In [None]:
a = add_pipeline.fit_transform(X_tr)

In [None]:
print('Transformed Dataframe: {}'.format(list(a.columns)))

In [None]:
a.head(5)

In [None]:
# Numeric and categorical branches run side by side; their outputs are
# concatenated column-wise
perpare_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Complete preprocessing: add engineered columns, then prepare them
full_pipe = Pipeline(steps=[
    ("add_pipeline", add_pipeline),
    ("prep_pipeline", perpare_pipeline),
])

X_train = full_pipe.fit_transform(X_tr)
X_train.shape

In [None]:
#Scaled Test Set for predictions:
# transform only - the medians, scaling statistics and one-hot categories
# must be the ones learned from the training set. Refitting on the test set
# would put the two sets on different scales.
X_test = full_pipe.transform(Test)
X_test.shape

In [None]:
# Set load_pkl = False to build new models even when a pickle already exists
load_pkl = True

# Stratified 4-fold splitter without shuffling (shuffle defaults to False)
cv_split = StratifiedKFold(n_splits=4)

In [None]:
def model_cross_val(clf, name_clf, lp=None, cv=cv_split):
    """Return the mean cross-validated accuracy of ``clf`` on the training set.

    Parameters
    ----------
    clf : estimator
        Classifier to evaluate; fitted and pickled when no pickle is reused.
    name_clf : str
        Pickle file name for this classifier.
    lp : bool or None
        Reuse an existing pickle when truthy. None (default) reads the
        notebook-level ``load_pkl`` flag at call time. The former default
        was the ``load_pickle`` function itself, which is always truthy, so
        the flag had no effect.
    cv : int or cross-validation splitter
        Passed to ``cross_val_score`` (previously ignored in favour of a
        hard-coded 3).

    Returns
    -------
    float
        Mean accuracy over the folds.
    """
    if lp is None:
        lp = load_pkl

    if pickle_exist(str(name_clf)) and lp:
        ml_clf = load_pickle(str(name_clf))
    else:
        ml_clf = clf
        ml_clf.fit(X_train, y_train)
        save_pickle(str(name_clf), ml_clf)

    cvs = cross_val_score(ml_clf, X_train, y_train, cv=cv, scoring="accuracy")
    return np.mean(cvs)


In [None]:
# Baseline classifiers, all with their default hyper-parameters
clf_options = [
    LinearSVC(),
    SVC(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SGDClassifier(),
    KNeighborsClassifier(),
]

compare_col = ['Clf Name', 'Clf Parameters', 'Clf Mean Accuracy']
clf_compare = pd.DataFrame(columns=compare_col)

# One table row per classifier: name, parameters, mean CV accuracy
for row, clf in enumerate(clf_options):
    clf_name = type(clf).__name__

    clf_compare.loc[row, 'Clf Name'] = clf_name
    clf_compare.loc[row, 'Clf Parameters'] = str(clf.get_params())

    cvs = model_cross_val(clf, clf_name)
    clf_compare.loc[row, 'Clf Mean Accuracy'] = cvs

               

In [None]:
# Best baseline first
clf_compare = clf_compare.sort_values(by='Clf Mean Accuracy', ascending=False)
clf_compare

In [None]:
#Parameters of classifiers (Grid Search):
# One grid per classifier, listed in the same order as clf_options.

#Linear SVC
lin_svc_param = dict(
    loss=['hinge', 'squared_hinge'],
    C=[1, 2, 5, 10],
)

#SVC
svc_param = dict(
    C=[1, 2, 5, 10],
    kernel=['linear', 'rbf'],
    gamma=[0.001, 0.01, 0.1, 1],
)

#Logistic Regression
log_reg_param = dict(
    C=[1, 2, 5, 10],
    penalty=['l1', 'l2'],
)

#Decision Tree Classifier
tree_param = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[1, 2, 3, 5],
)

#Random Forest Classifier
forest_param = dict(
    n_estimators=[100, 200, 500],
    criterion=['gini'],
    max_features=[1, 3, 5, 10],
    min_samples_split=[2, 3, 5, 10],
    min_samples_leaf=[1, 2, 3, 10],
)

#SGDClassifier
SGDC_param = dict(
    alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)

#K-Neighbors Classifier
k_neigh_param = dict(
    n_neighbors=[3, 4, 5],
    weights=['uniform', 'distance'],
)

param_option_gs = [
    lin_svc_param,
    svc_param,
    log_reg_param,
    tree_param,
    forest_param,
    SGDC_param,
    k_neigh_param,
]

In [None]:
#Parameters of classifiers (Random Search):
# One dict per classifier, listed in the same order as clf_options.
# NOTE(review): values given as a list/tuple are presumably treated by
# RandomizedSearchCV as discrete choices, so e.g. C = [1, 100] means "1 or
# 100", not the range 1..100 - confirm this is what is wanted.

#Linear SVC
lin_svc_rs = dict(
    loss=('hinge', 'squared_hinge'),
    C=[1, 100],
)

#SVC
svc_rs = dict(
    C=[1, 100],
    kernel=['linear', 'rbf'],
    gamma=[0.001, 10],
)

#Logistic Regression
log_reg_rs = dict(
    C=[1, 100],
    penalty=['l1', 'l2'],
)

#Decision Tree Classifier
tree_rs = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=[1, 15],
)

#Random Forest Classifier
forest_rs = dict(
    n_estimators=[100, 1000],
    criterion=['gini'],
    min_samples_split=[2, 15],
    min_samples_leaf=[1, 15],
)

#SGDClassifier
SGDC_rs = dict(
    alpha=[0.0001, 10],
)

#K-Neighbors Classifier
k_neigh_rs = dict(
    n_neighbors=[2, 10],
    weights=['uniform', 'distance'],
)

param_option_rs = [
    lin_svc_rs,
    svc_rs,
    log_reg_rs,
    tree_rs,
    forest_rs,
    SGDC_rs,
    k_neigh_rs,
]

In [None]:
#-------------------------------------------------------------------------------
def model_rand_gs(clf, name_clf, clf_param, lp=None, cv=cv_split, n_iter=15000):
    """Randomised hyper-parameter search; keep the best model seen so far on disk.

    The search is run on ``X_train`` / ``y_train``. The winning estimator is
    pickled as ``best_<name_clf>`` and its score is written to
    ``best_score_<name_clf>.txt`` in ``PICKLE_PATH``, unless a stored model
    with an equal or better score is kept (see ``lp``).

    Parameters
    ----------
    clf : estimator
        Classifier to tune.
    name_clf : str
        Name used in the pickle and score file names.
    clf_param : dict
        Parameter distributions / candidate lists for the search.
    lp : bool or None
        When truthy, compare against the stored best score and keep the
        stored model if it is at least as good. None (default) reads the
        notebook-level ``load_pkl`` flag at call time; the former default
        was the ``load_pickle`` function, which is always truthy.
    cv : int or cross-validation splitter
        Cross-validation strategy (previously ignored in favour of the
        global ``cv_split``).
    n_iter : int
        Upper bound on the number of sampled parameter settings.

    Returns
    -------
    tuple
        ``(best_estimator, best_score)`` of the model that is on disk after
        the call.
    """
    # Local import: ParameterGrid is not among the notebook-level imports
    from sklearn.model_selection import ParameterGrid

    if lp is None:
        lp = load_pkl

    # When every entry is a plain list of candidates the search space is
    # finite; asking for more iterations than there are combinations only
    # triggers a warning (or an error in older scikit-learn releases).
    if isinstance(clf_param, dict) and not any(
            hasattr(values, "rvs") for values in clf_param.values()):
        n_iter = min(n_iter, len(ParameterGrid(clf_param)))

    rand_clf = RandomizedSearchCV(clf, param_distributions=clf_param, cv=cv,
                                  verbose=2, n_jobs=-1, n_iter=n_iter,
                                  scoring='accuracy')

    rand_clf.fit(X_train, y_train)

    best_est = rand_clf.best_estimator_
    best_sco = rand_clf.best_score_

    pickle_name = "best_"+str(name_clf)
    score_path = PICKLE_PATH+"/best_score_"+str(name_clf)+".txt"

    # pickle_exist also creates PICKLE_PATH when it is missing
    has_previous = pickle_exist(pickle_name) and os.path.isfile(score_path)

    if has_previous and lp:
        prev_best_score = float(np.max(np.loadtxt(score_path, dtype=float)))
        if best_sco <= prev_best_score:
            # The stored model is at least as good: keep it
            print("****loaded****")
            return load_pickle(pickle_name), prev_best_score

    # New best (or nothing stored yet). The score is written twice so that
    # the file always loads as a 1-d array, matching the existing format.
    np.savetxt(score_path, [best_sco, best_sco], fmt='%f')
    save_pickle(pickle_name, best_est)
    return best_est, best_sco


In [None]:

def model_grid(clf, name_clf, clf_param, lp=None, cv=cv_split):
    """Exhaustive grid search; keep the best model seen so far on disk.

    The search is run on ``X_train`` / ``y_train``. The winning estimator is
    pickled as ``best_<name_clf>`` and its score is written to
    ``best_score_<name_clf>.txt`` in ``PICKLE_PATH``, unless a stored model
    with an equal or better score is kept (see ``lp``).

    Parameters
    ----------
    clf : estimator
        Classifier to tune.
    name_clf : str
        Name used in the pickle and score file names.
    clf_param : dict
        Parameter grid for the search.
    lp : bool or None
        When truthy, compare against the stored best score and keep the
        stored model if it is at least as good. None (default) reads the
        notebook-level ``load_pkl`` flag at call time; the former default
        was the ``load_pickle`` function, which is always truthy.
    cv : int or cross-validation splitter
        Cross-validation strategy (previously ignored in favour of the
        global ``cv_split``).

    Returns
    -------
    tuple
        ``(best_estimator, best_score)`` of the model that is on disk after
        the call.
    """
    if lp is None:
        lp = load_pkl

    gs_clf = GridSearchCV(clf, param_grid=clf_param, cv=cv,
                          verbose=2, n_jobs=-1, scoring='accuracy')

    gs_clf.fit(X_train, y_train)

    best_est = gs_clf.best_estimator_
    best_sco = gs_clf.best_score_

    pickle_name = "best_"+str(name_clf)
    score_path = PICKLE_PATH+"/best_score_"+str(name_clf)+".txt"

    # pickle_exist also creates PICKLE_PATH when it is missing
    has_previous = pickle_exist(pickle_name) and os.path.isfile(score_path)

    if has_previous and lp:
        prev_best_score = float(np.max(np.loadtxt(score_path, dtype=float)))
        if best_sco <= prev_best_score:
            # The stored model is at least as good: keep it
            return load_pickle(pickle_name), prev_best_score

    # New best (or nothing stored yet). The score is written twice so that
    # the file always loads as a 1-d array, matching the existing format.
    np.savetxt(score_path, [best_sco, best_sco], fmt='%f')
    save_pickle(pickle_name, best_est)
    return best_est, best_sco

    
    

In [None]:
best_col = ['Clf Name', 'Best Clf Parameters', 'Best Clf Accuracy Score']
best_compare = pd.DataFrame(columns = best_col)

# Tune every classifier with the randomised search. To run the exhaustive
# grid search instead, zip with param_option_gs and call model_grid.
for row, (clf, param) in enumerate(zip(clf_options, param_option_rs)):

    clf_name = clf.__class__.__name__
    best_compare.loc[row, 'Clf Name'] = clf_name

    print("{}: {}".format(row, clf_name))

    model_rand_gs(clf, clf_name, param)

    # Read back what is on disk: this is the best model over all runs, not
    # necessarily the one found just now.
    best_score = max(np.loadtxt((PICKLE_PATH+"/best_score_"+str(clf_name)+".txt"), dtype=float))
    best_clf = load_pickle("best_"+str(clf_name))

    best_compare.loc[row, 'Best Clf Parameters'] = str(best_clf.get_params())
    # Stored as a number (not str) so that sorting the table by score is
    # numeric rather than alphabetical.
    best_compare.loc[row, 'Best Clf Accuracy Score'] = float(best_score)
                    

In [None]:
#Before Grid-search
clf_compare

In [None]:
#After the hyper-parameter search: best score first
best_compare = best_compare.sort_values(by='Best Clf Accuracy Score', ascending=False)
best_compare

In [None]:
def load_best(clf):
    """Load the pickle named ``best_<class name of clf>`` and return it."""
    pickle_name = "best_" + str(type(clf).__name__)
    return load_pickle(pickle_name)
    

In [None]:
# One stratified 80/20 split of the training data (fixed seed)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields arrays of row *positions*, so select with .iloc; .loc only
# gives the same rows while the index happens to be 0..n-1.
train_index, ver_index = next(split.split(Train, Train["Survived"]))
Train_strat = Train.iloc[train_index]
Ver_strat = Train.iloc[ver_index]

print("Training set: {} entries, Verificaiton set: {} entries".format(len(Train_strat),len(Ver_strat)))

In [None]:
# Preprocess the split with a fresh copy of the *full* pipeline:
#  - perpare_pipeline alone lacks the feature-adding step, so the engineered
#    columns (Fam_size, Alone, Title) it selects would be missing;
#  - clone() leaves the pipeline fitted on the whole training set untouched.
split_pipe = clone(full_pipe)

# The index is reset because the split rows keep their original, shuffled
# row labels; the estimators only see the resulting arrays.
y_training = Train_strat["Survived"].copy()
X_tr = Train_strat.drop("Survived", axis=1).reset_index(drop=True)
X_training = split_pipe.fit_transform(X_tr)

# Verification data is only transformed, using what was fitted on the
# training part of the split.
y_verification = Ver_strat["Survived"].copy()
X_ver = Ver_strat.drop("Survived", axis=1).reset_index(drop=True)
X_verification = split_pipe.transform(X_ver)

In [None]:
# Tuned models, loaded in the order of clf_options
(lin_svc, svc, log_reg, tree,
 forest, SGDC, knn) = [load_best(base_clf) for base_clf in clf_options[:7]]

# (name, estimator) pairs for the ensemble
est_names = ['lin_svc', 'svc', 'log_reg', 'tree', 'forest', 'SGDC', 'knn']
est = list(zip(est_names, [lin_svc, svc, log_reg, tree, forest, SGDC, knn]))

# Majority vote over the predicted labels
vote_clf = VotingClassifier(estimators=est, voting='hard')

vote_clf.fit(X_training, y_training)


In [None]:
# Refit every model on the training part of the split and report its
# accuracy on the verification part
candidates = (lin_svc, svc, log_reg, tree,
              forest, SGDC, knn, vote_clf)

for clf in candidates:
    clf.fit(X_training, y_training)

    name = type(clf).__name__
    y_pred = clf.predict(X_verification)
    score = accuracy_score(y_verification, y_pred)

    print("{}: {:.2f}%".format(name, 100*score))

In [None]:
#fit to whole dataset
vote_clf.fit(X_train, y_train)

In [None]:
def make_csv(csv_name, save_loc=TITANIC_PATH):
    """Return the path of the next free numbered CSV file in ``save_loc``.

    Existing files named ``<csv_name><number>.csv`` are scanned and the
    returned path uses the highest number found plus one (1 when there is
    none). Nothing is written.

    Parameters
    ----------
    csv_name : str
        File name prefix, e.g. "Submission".
    save_loc : str
        Directory, relative to the current working directory.

    Returns
    -------
    str
        Absolute path ``<cwd>/<save_loc>/<csv_name><n>.csv``.
    """
    save_path = os.path.join(os.getcwd(), save_loc)

    # Glob with a full path rather than chdir-ing into the folder, so an
    # error can never leave the working directory changed. The directory
    # part is escaped so that it is not read as a pattern.
    pattern = os.path.join(glob.escape(save_path), csv_name+'*.csv')

    max_i = 0
    for file in glob.glob(pattern):
        # Text between the prefix and the ".csv" extension
        file_ver = os.path.basename(file)[len(csv_name):-len('.csv')]
        try:
            version = int(file_ver)
        except ValueError:
            # e.g. "Submission.csv" or "Submission_final.csv": matches the
            # pattern but is not a numbered version
            continue
        max_i = max(max_i, version)

    new_ver = csv_name+str(max_i+1)+'.csv'
    return os.path.join(save_path, new_ver)
    

In [None]:
# Predict the test set with the ensemble
PassengerId = Test['PassengerId']
Survived_pred = vote_clf.predict(X_test)

# Next free "Submission<n>.csv" path
name = "Submission"
file_name = make_csv(name)

# Two columns, PassengerId then Survived, written without the row index
Submission = pd.DataFrame({'PassengerId': PassengerId}).assign(
    Survived=Survived_pred)
Submission.to_csv(file_name, index=False)


In [None]:
Submission.head(5)