# Kaggle: Titanic Challenge
## Coded by Daniel Wilcox

This is a notebook showing the process in predicting the survivors of the Titanic.

In [172]:
import os
import pickle
import glob



import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.base import clone

#Fills in values to empty data locations
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Creating custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer




from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV


from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import VotingClassifier


In [2]:
#The Location to save the dataset
TITANIC_PATH = "datasets/titanic"
TITANIC_URL = "https://github.com/Daniel-Wilcox/ADA-874-2019/blob/master/datasets/titanic/"
train_name = "train.csv" 
test_name = "test.csv" 


#The Location to save the models
PICKLE_PATH = "PickleModels/Titanic"

In [3]:
#Pickle functions

#Saving and storing the model
def save_pickle(model_name, model, pic_path=PICKLE_PATH):
    print("Saving model...")
    
    cwd = os.getcwd()
    os.chdir(cwd+"/"+pic_path)
        
    f = open(model_name, "wb")
    pickle.dump(model, f)
    f.close()
    
    os.chdir(cwd)
    print("Saved "+model_name+" successfully!\n")
    return None
    
    
#Retrieving and loading the model
def load_pickle(model_name, pic_path=PICKLE_PATH):
    print("Loading "+model_name+" from Pickle file...")
    
    cwd = os.getcwd()
    os.chdir(cwd+"/"+pic_path)
    
    f = open(model_name, "rb")
    p = pickle.load(f)
    f.close()
    
    os.chdir(cwd)
    print(model_name+" successfully loaded!\n")
    return p

#Check whether the pickel exists
def pickle_exist(model_name, pic_path=PICKLE_PATH):
    #check if pickle file exists
    print("Checking if pickle directory exists...")
    if not os.path.isdir(pic_path):
        os.makedirs(pic_path)
        print("Directory does NOT exists")
        print("Creating directory")
    
    else: 
        print("Directory exists")
        
    if os.path.isfile(pic_path+"/"+model_name):
        print("Pickle file does exists...")
        return True
    else:
        print("Pickle file does NOT exists...")
        return False

In [4]:
def load_Titanic_data(file_name, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, file_name)
    return pd.read_csv(csv_path)
        
    
def get_Titanic_data(file_name, titanic_url=TITANIC_URL, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, file_name)
    
    print("Checking if directory exists...")
    if not os.path.isdir(titanic_path):
        os.makedirs(titanic_path)
        print("Creating directory")
    
    else: 
        print("Directory exists") 
            
        if os.path.isfile(csv_path):
            print(file_name + " file does exists...")
            print("extracting " + file_name)
            
            titanic = load_Titanic_data(file_name)
            print("\nSuccess!")
            return titanic
        
        else:
            print(file_name + " file doesn't exists...")
            print("Download .csv from Kaggle!")

            return None
        
    
    
            

In [5]:
Train = get_Titanic_data(train_name)
Test = get_Titanic_data(test_name)


Checking if directory exists...
Directory exists
train.csv file does exists...
extracting train.csv

Success!
Checking if directory exists...
Directory exists
test.csv file does exists...
extracting test.csv

Success!


In [6]:
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
Test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB



### Variable - Definition                              - Key
1. survival - Survival                                - 0/1 = No/Yes
2. pclass   - Ticket class                            - 1,2,3 = 1st, 2nd, 3rd class
3. sex      - Sex                                     - male, female
4. Age      - Age in years                            - ...
5. sibsp    - # of siblings/spouses on the Titanic    - ...
6. parch    - # of parents/children on the Titanic    - ...
7. ticket   - Ticket number                           - ...
8. fare     - Passenger fare                          - ...
9. cabin    - Cabin number                            - ...
10. embarked - Port of Embarkation                     - C = Cherbourg, Q = Queenstown, S = Southampton


In [8]:
Train.head(3)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [9]:
sur = Train["Survived"].value_counts() / len(Train)
print("{:.2f}% Survived\n{:.2f}% Died".format(100*sur[1],100*sur[0]))

38.38% Survived
61.62% Died


In [10]:
#Correlation matrix
corr_matrix = Train.corr()
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

In [12]:
#Separtate labels from features
y_train = Train["Survived"].copy()
X_tr = Train.drop("Survived", axis=1)


In [13]:
#Remove Features (survived is already removed)
list_drop = ['Name', 'Ticket', 'Cabin']

Dropped = X_tr[list(set(X_tr.columns) - set(list_drop))]

#Numeric Features
list_num = Dropped.select_dtypes(include = ["number"]).columns

#Catagorical Features (to be transformed into OHE)
list_cat = Dropped.select_dtypes(include = ["object"]).columns


In [14]:
class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names
        
    def fit(self, X, y=None):
        return(self)
    
    def transform(self, X):
        return X[self.feature_names].values    

In [15]:
#Numeric Transformations
num_pipeline = Pipeline([
    ('selector', Selector(list_num)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

#Catagorical Transformations
cat_pipeline = Pipeline([
    ('selector', Selector(list_cat)),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

In [16]:
perpare_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

X_train = perpare_pipeline.fit_transform(X_tr)
X_train.shape

(891, 11)

In [17]:
#Scaled Test Set for predictions:
X_test = perpare_pipeline.fit_transform(Test)
X_test.shape

(418, 11)

In [18]:
#To create new model if pickle already even if pickle exists; load_pkl = False
load_pkl = True

    
#Linear Support Vector Classifier
if pickle_exist("lin_clf") and load_pkl:
    lin_clf = load_pickle("lin_clf")
else:
    lin_clf = LinearSVC()
    lin_clf.fit(X_train, y_train)
    save_pickle("lin_clf", lin_clf)
    
    
cvs = cross_val_score(lin_clf, X_train, y_train, cv=3, scoring="accuracy")    

print("lin_clf 1: {:.2f}%\nlin_clf 2: {:.2f}%\nlin_clf 3: {:.2f}%\n"
      .format(cvs[0]*100, cvs[1]*100, cvs[2]*100))


Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading lin_clf from Pickle file...
lin_clf successfully loaded!

lin_clf 1: 80.13%
lin_clf 2: 79.80%
lin_clf 3: 78.11%





In [20]:
#Support Vector Machine Classifier    
if pickle_exist("svc_clf") and load_pickle:
    svc_clf = load_pickle("svc_clf")
else:
    svc_clf = SVC()
    svc_clf.fit(X_train, y_train)
    save_pickle("svc_clf", svc_clf)
    
    
cvs = cross_val_score(svc_clf, X_train, y_train, cv=3, scoring="accuracy")    

print("svc_clf 1: {:.2f}%\nsvc_clf 2: {:.2f}%\nsvc_clf 3: {:.2f}%\n"
      .format(cvs[0]*100, cvs[1]*100, cvs[2]*100))

Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading svc_clf from Pickle file...
svc_clf successfully loaded!

svc_clf 1: 81.48%
svc_clf 2: 82.83%
svc_clf 3: 83.16%





In [21]:
#Logistic Regression 
if pickle_exist("log_reg_clf") and load_pickle:
    log_reg_clf = load_pickle("log_reg_clf")
else:
    log_reg_clf = LogisticRegression()
    log_reg_clf.fit(X_train, y_train)
    save_pickle("log_reg_clf", log_reg_clf)
    
    
cvs = cross_val_score(log_reg_clf, X_train, y_train, cv=3, scoring="accuracy")    

print("log_reg_clf 1: {:.2f}%\nlog_reg_clf 2: {:.2f}%\nlog_reg_clf 3: {:.2f}%\n"
      .format(cvs[0]*100, cvs[1]*100, cvs[2]*100))

Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved log_reg_clf successfully!

log_reg_clf 1: 78.79%
log_reg_clf 2: 77.78%
log_reg_clf 3: 79.80%





In [22]:
#Decision Tree Classifier
if pickle_exist("tree_clf") and load_pickle:
    tree_clf = load_pickle("tree_clf")
else:
    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(X_train, y_train)
    save_pickle("tree_clf", tree_clf)
    
    
cvs = cross_val_score(tree_clf, X_train, y_train, cv=3, scoring="accuracy")    

print("tree_clf 1: {:.2f}%\ntree_clf 2: {:.2f}%\ntree_clf 3: {:.2f}%\n"
      .format(cvs[0]*100, cvs[1]*100, cvs[2]*100))

Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved tree_clf successfully!

tree_clf 1: 59.60%
tree_clf 2: 61.62%
tree_clf 3: 71.72%



In [23]:
#RandomForestClassifier
if pickle_exist("randf_clf") and load_pickle:
    randf_clf = load_pickle("randf_clf")
else:
    randf_clf = RandomForestClassifier()
    randf_clf.fit(X_train, y_train)
    save_pickle("randf_clf", randf_clf)
    
    
cvs = cross_val_score(randf_clf, X_train, y_train, cv=3, scoring="accuracy")    

print("randf_clf 1: {:.2f}%\nrandf_clf 2: {:.2f}%\nrandf_clf 3: {:.2f}%\n"
      .format(cvs[0]*100, cvs[1]*100, cvs[2]*100))

Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved randf_clf successfully!

randf_clf 1: 81.14%
randf_clf 2: 80.81%
randf_clf 3: 83.16%





In [24]:
#Stochastic Gradient Descent Classifier  
if pickle_exist("sgd_clf") and load_pickle:
    sgd_clf = load_pickle("sgd_clf")
else:
    sgd_clf = SGDClassifier()
    sgd_clf.fit(X_train, y_train)
    save_pickle("sgd_clf", sgd_clf)
    
    
cvs = cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")    

print("sgd_clf 1: {:.2f}%\nsgd_clf 2: {:.2f}%\nsgd_clf 3: {:.2f}%\n"
      .format(cvs[0]*100, cvs[1]*100, cvs[2]*100))

Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved sgd_clf successfully!

sgd_clf 1: 72.39%
sgd_clf 2: 76.43%
sgd_clf 3: 79.46%





In [25]:
#K-nearest neighbors Classifier
if pickle_exist("knn_clf") and load_pickle:
    knn_clf = load_pickle("knn_clf")
else:
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X_train, y_train)
    save_pickle("knn_clf", knn_clf)
    
    
cvs = cross_val_score(knn_clf, X_train, y_train, cv=3, scoring="accuracy")    

print("knn_clf 1: {:.2f}%\nknn_clf 2: {:.2f}%\nknn_clf 3: {:.2f}%\n"
      .format(cvs[0]*100, cvs[1]*100, cvs[2]*100))

Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved knn_clf successfully!

knn_clf 1: 77.44%
knn_clf 2: 81.14%
knn_clf 3: 80.47%



In [61]:
def model_cross_val(clf, name_clf, lp=load_pickle, cv=cv_split):
    if pickle_exist(str(name_clf)) and lp:
        ml_clf = load_pickle(str(name_clf))
    else:
        ml_clf = clf
        ml_clf.fit(X_train, y_train)
        save_pickle(str(name_clf), ml_clf)
    cvs =  cross_val_score(ml_clf, X_train, y_train, cv=3, scoring="accuracy") 
    return  np.mean(cvs)


In [89]:
clf_options = [          
    LinearSVC(),
    SVC(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SGDClassifier(),
    KNeighborsClassifier()
    ]          
 
compare_col = ['Clf Name', 'Clf Parameters', 'Clf Mean Accuracy'] 
clf_compare = pd.DataFrame(columns = compare_col)
          
          
row=0
          
for clf in clf_options: 
    clf_name = clf.__class__.__name__
                    
    clf_compare.loc[row, 'Clf Name'] = clf_name
    clf_compare.loc[row, 'Clf Parameters'] = str(clf.get_params()) 
    cvs = model_cross_val(clf, clf_name)
    
    clf_compare.loc[row, 'Clf Mean Accuracy'] = cvs
    
    row +=1

               

Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading LinearSVC from Pickle file...
LinearSVC successfully loaded!

Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading SVC from Pickle file...
SVC successfully loaded!

Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading LogisticRegression from Pickle file...
LogisticRegression successfully loaded!

Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading DecisionTreeClassifier from Pickle file...
DecisionTreeClassifier successfully loaded!

Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading RandomForestClassifier from Pickle file...
RandomForestClassifier successfully loaded!





Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading SGDClassifier from Pickle file...
SGDClassifier successfully loaded!

Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading KNeighborsClassifier from Pickle file...
KNeighborsClassifier successfully loaded!



In [90]:
clf_compare.sort_values(by = 'Clf Mean Accuracy', ascending = False, inplace = True)
clf_compare

Unnamed: 0,Clf Name,Clf Parameters,Clf Mean Accuracy
1,SVC,"{'C': 1.0, 'cache_size': 200, 'class_weight': ...",0.824916
6,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.796857
0,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...",0.79349
4,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",0.792368
2,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.787879
3,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.65881
5,SGDClassifier,"{'alpha': 0.0001, 'average': False, 'class_wei...",0.643098


In [112]:
#Parameters of classifiers:

#Linear SVC
lin_svc_param = {
    'loss':('hinge','squared_hinge'),
    'C':[1, 2, 5, 10]
}

#SVC
svc_param = {
    'C' : [1, 2, 5, 10],
    'kernel' : ['linear', 'rbf'],
    'gamma': [ 0.001, 0.01, 0.1, 1],
}

#Logistic Regression
log_reg_param = {
    'C': [1, 2, 5, 10],
    'penalty' : ['l1', 'l2']
}

#Decision Tree Classifier
tree_param = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth' : [1, 2, 3, 5]
}

#Random Forest Classifier
forest_param = {
    'n_estimators' : [100, 200, 500],
    'criterion' : ['gini'],
    'max_features' : [1, 3, 5, 10],
    'min_samples_split' : [2, 3, 5, 10],
    'min_samples_leaf' : [1, 2, 3, 10]
}

#SGDClassifier
SGDC_param = {
    'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10]
}

#K-Neighbors Classifier
k_neigh_param = {
    'n_neighbors': [3, 4, 5],
    'weights': ['uniform','distance']
}

param_option = [
    lin_svc_param,
    svc_param,
    log_reg_param,
    tree_param,
    forest_param,
    SGDC_param,
    k_neigh_param 
]

In [113]:

def model_grid(clf, name_clf, clf_param, lp=load_pickle, cv=cv_split): 

    gs_clf = GridSearchCV(clf, param_grid=clf_param, cv=cv_split, 
                          verbose=2, n_jobs=-1, scoring='accuracy')
    
    gs_clf.fit(X_train, y_train)
    
    best_est = gs_clf.best_estimator_
    best_sco = gs_clf.best_score_ 
    
    text_check = os.path.isfile(PICKLE_PATH+"/best_score_"+str(name_clf)+".txt")
    
    if pickle_exist("best_"+str(name_clf)) and text_check and lp:
        #load current best score
        prev_best_score = max(np.loadtxt((PICKLE_PATH+"/best_score_"+str(name_clf)+".txt"), dtype=float))

        if best_sco > prev_best_score:
            temp = [best_sco, best_sco]
            np.savetxt(PICKLE_PATH+"/best_score_"+str(name_clf)+".txt", temp, fmt='%f')
            save_pickle("best_"+str(name_clf), best_est)
        else:
            #load in better parameters
            best_sco = max(np.loadtxt(PICKLE_PATH+"/best_score_"+str(name_clf)+".txt", dtype=float))
            best_est = load_pickle("best_"+str(name_clf))
    else:
        #make pickles if dont exist
        temp = [best_sco, best_sco]
        np.savetxt(PICKLE_PATH+"/best_score_"+str(name_clf)+".txt", temp, fmt='%f')
        save_pickle("best_"+str(name_clf), best_est)

    
    

In [114]:
best_col = ['Clf Name', 'Best Clf Parameters', 'Best Clf Accuracy Score'] 
best_compare = pd.DataFrame(columns = best_col)
          
          
row=0

for clf, param in zip(clf_options, param_option): 
    
    clf_name = clf.__class__.__name__
    best_compare.loc[row, 'Clf Name'] = clf_name
    
    print("{}: {} - {}".format(row, clf_name, param))
    
    model_grid(clf, clf_name, param)
    
    best_score = max(np.loadtxt((PICKLE_PATH+"/best_score_"+str(clf_name)+".txt"), dtype=float))
    best_clf = load_pickle("best_"+str(clf_name))
    
    best_compare.loc[row, 'Best Clf Parameters'] = str(best_clf.get_params())
    best_compare.loc[row, 'Best Clf Accuracy Score'] = str(best_score)
    
    row +=1
                    

0: LinearSVC - {'loss': ('hinge', 'squared_hinge'), 'C': [1, 2, 5, 10]}
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  22 out of  24 | elapsed:    2.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading best_LinearSVC from Pickle file...
best_LinearSVC successfully loaded!

Loading best_LinearSVC from Pickle file...
best_LinearSVC successfully loaded!

1: SVC - {'C': [1, 2, 5, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.001, 0.01, 0.1, 1]}
Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Done  32 out of  96 | elapsed:    0.5s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of  24 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading best_SVC from Pickle file...
best_SVC successfully loaded!

Loading best_SVC from Pickle file...
best_SVC successfully loaded!

2: LogisticRegression - {'C': [1, 2, 5, 10], 'penalty': ['l1', 'l2']}
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Saving model...
Saved best_LogisticRegression successfully!

Loading best_LogisticRegression from Pickle file...
best_LogisticRegression successfully loaded!

3: DecisionTreeClassifier - {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': [1, 2, 3, 5]}
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done   8 out of  48 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Checking if pickle directory exists...
Directory exists
Pickle file does exists...
Loading best_DecisionTreeClassifier from Pickle file...
best_DecisionTreeClassifier successfully loaded!

Loading best_DecisionTreeClassifier from Pickle file...
best_DecisionTreeClassifier successfully loaded!

4: RandomForestClassifier - {'n_estimators': [100, 200, 500], 'criterion': ['gini'], 'max_features': [1, 3, 5, 10], 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [1, 2, 3, 10]}
Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   28.3s
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed:   50.8s finished


Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved best_RandomForestClassifier successfully!

Loading best_RandomForestClassifier from Pickle file...
best_RandomForestClassifier successfully loaded!

5: SGDClassifier - {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]}
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved best_SGDClassifier successfully!

Loading best_SGDClassifier from Pickle file...
best_SGDClassifier successfully loaded!

6: KNeighborsClassifier - {'n_neighbors': [3, 4, 5], 'weights': ['uniform', 'distance']}
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Checking if pickle directory exists...
Directory exists
Pickle file does NOT exists...
Saving model...
Saved best_KNeighborsClassifier successfully!

Loading best_KNeighborsClassifier from Pickle file...
best_KNeighborsClassifier successf

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  18 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  18 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.1s finished


In [115]:
best_compare.sort_values(by = 'Best Clf Accuracy Score', ascending = False, inplace = True)
best_compare

Unnamed: 0,Clf Name,Best Clf Parameters,Best Clf Accuracy Score
4,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",0.806331
1,SVC,"{'C': 1, 'cache_size': 200, 'class_weight': No...",0.798883
3,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'entropy',...",0.793296
0,LinearSVC,"{'C': 10, 'class_weight': None, 'dual': True, ...",0.787709
2,LogisticRegression,"{'C': 2, 'class_weight': None, 'dual': False, ...",0.785847
5,SGDClassifier,"{'alpha': 0.01, 'average': False, 'class_weigh...",0.780261
6,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.769088


In [155]:
def load_best(clf):
    clf_name = clf.__class__.__name__
    return load_pickle("best_"+str(clf_name))
    

In [156]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, ver_index in split.split(Train, Train["Survived"]):
    Train_strat = Train.loc[train_index]
    Ver_strat = Train.loc[ver_index]

print("Training set: {} entries, Verificaiton set: {} entries".format(len(Train_strat),len(Ver_strat)))

Training set: 712 entries, Verificaiton set: 179 entries


In [160]:
y_training = Train_strat["Survived"].copy()
X_tr = Train_strat.drop("Survived", axis=1)
X_training = perpare_pipeline.fit_transform(X_tr)

y_verification = Ver_strat["Survived"].copy()
X_ver = Ver_strat.drop("Survived", axis=1)
X_verification = perpare_pipeline.fit_transform(X_ver)

In [161]:
lin_svc = load_best(clf_options[0])
svc = load_best(clf_options[1])
log_reg = load_best(clf_options[2])
tree = load_best(clf_options[3])
forest = load_best(clf_options[4])
SGDC = load_best(clf_options[5])
knn = load_best(clf_options[6])


est = [('lin_svc', lin_svc), ('svc', svc), ('log_reg', log_reg),
       ('tree', tree), ('forest', forest), ('SGDC', SGDC),
       ('knn', knn)]

vote_clf = VotingClassifier(estimators=est, voting='hard')

vote_clf.fit(X_training, y_training)


Loading best_LinearSVC from Pickle file...
best_LinearSVC successfully loaded!

Loading best_SVC from Pickle file...
best_SVC successfully loaded!

Loading best_LogisticRegression from Pickle file...
best_LogisticRegression successfully loaded!

Loading best_DecisionTreeClassifier from Pickle file...
best_DecisionTreeClassifier successfully loaded!

Loading best_RandomForestClassifier from Pickle file...
best_RandomForestClassifier successfully loaded!

Loading best_SGDClassifier from Pickle file...
best_SGDClassifier successfully loaded!

Loading best_KNeighborsClassifier from Pickle file...
best_KNeighborsClassifier successfully loaded!





VotingClassifier(estimators=[('lin_svc', LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0....ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [162]:
for clf in (lin_svc, svc, log_reg, tree, 
            forest, SGDC, knn, vote_clf):
    
    clf.fit(X_training, y_training)
    y_pred = clf.predict(X_verification)
    
    name = clf.__class__.__name__
    score = accuracy_score(y_verification, y_pred)
    
    print("{}: {:.2f}%".format(name, 100*score))



LinearSVC: 79.33%
SVC: 81.56%
LogisticRegression: 79.33%
DecisionTreeClassifier: 79.89%
RandomForestClassifier: 79.89%
SGDClassifier: 79.33%
KNeighborsClassifier: 77.09%




VotingClassifier: 81.56%




In [163]:
#fit to whole dataset
vote_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lin_svc', LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0....ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [282]:
#FIX THIS PROPER

def make_csv(csv_name, save_loc=TITANIC_PATH):
    curr_path = os.getcwd()
    save_path = os.path.join(curr_path, save_loc)
    os.chdir(save_path)
    
    max_i = 0
    
    len_name = len(csv_name)
           
    for file in glob.glob(csv_name+'*.csv'):
        
        file_name = file[:len(file)-4]
        file_ver = file_name[len_name:]
        
        if int(file_ver) > max_i:
            max_i = int(file_ver)
        
    new_ver = csv_name+str(max_i+1)+'.csv'
        
        
        
    os.chdir(curr_path)
    
    return os.path.join(save_path, new_ver)
    

In [285]:
PassengerId = Test['PassengerId']

Survived_pred = vote_clf.predict(X_test) 


Submission = pd.DataFrame({ 'PassengerId': PassengerId,
                            'Survived': Survived_pred })

name = "Submission"
file_name = make_csv(name)

Submission.to_csv(file_name, index=False)


In [287]:
Submission.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [269]:
os.chdir('/Users/Daniel/Desktop/ADA/WeekExercises/GitHubSaves/ADA-874-2019') 