# Titanic Classification #

In [None]:
import pandas as pd
import numpy as np
import pickle

# preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, SelectFromModel, chi2, f_classif

# models
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
# import xgboost

# gridsearch and pipelining
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, ParameterGrid, KFold

# sklearn utils
from sklearn.externals import joblib

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

### Data Loading and Preprocessing ##

In [None]:
## Name constants for file locations. Assuming files in same folder as notebook
# NOTE(review): get_sparse_test_data() reads SPARSE_TEST_FILE, which is not
# defined anywhere in this notebook -- confirm where it is supposed to be set.
TRAIN_FILE = 'Lab5_train.csv'
TEST_FILE = 'Lab5_test.csv'

In [None]:
def get_original_df(PATH):
    '''Load the CSV file located at PATH and return its contents as a DataFrame.'''
    return pd.read_csv(PATH)

# Preview the first rows of the raw training file
get_original_df(TRAIN_FILE).head()


### Feature Extraction ###

In [None]:
def get_title(name):
    '''
        Map the honorific found in a passenger name onto a coarse title group.

        The raw title is taken from extract_from_name(name, part='Title') and
        translated to one of 'Mr', 'Mrs', 'Miss', 'Child', 'Titled' or 'Officer'.
        A raw title that is not listed below raises KeyError.
    '''
    # Each group lists the raw titles that collapse into it
    raw_titles_by_group = {
        'Mr': ('Mr',),
        'Mrs': ('Mrs', 'Ms', 'Mme'),
        'Miss': ('Miss', 'Mlle'),
        'Child': ('Master',),
        'Titled': ('Don', 'Rev', 'Dr', 'Lady', 'the Countess', 'Jonkheer', 'Dona', 'Sir'),
        'Officer': ('Col', 'Major', 'Capt'),
    }
    title_lookup = {
        raw_title: group
        for group, raw_titles in raw_titles_by_group.items()
        for raw_title in raw_titles
    }

    raw_title = extract_from_name(name, part='Title')
    return title_lookup[raw_title]


def extract_from_name(name, part='LastName'):
    '''
        Pull the last name or the title out of a "Last, Title. First ..." name.

        name: full name string; the text before the first comma is the last
              name, the text between that comma and the next period is the title.
        part: 'LastName' (default) or 'Title'.

        Raises IndexError when the name has no comma and KeyError for an
        unknown part.
    '''
    pieces = name.split(",")
    surname = pieces[0]
    # Title is whatever precedes the first period after the comma, trimmed
    title = pieces[1].split(".")[0].strip()

    position = {'LastName': 0, 'Title': 1}[part]
    return (surname, title)[position]


def fill_in_median_val(passenger, column_name, measure_by_class):
    '''
        Impute a missing value in passenger[column_name] from a grouped table.

        passenger: one row (mapping/Series) with 'Sex', 'Pclass', 'Title' and
                   column_name entries. It is updated in place and returned.
        measure_by_class: frame with 'Sex', 'Pclass', 'Title' and column_name
                   columns holding one measure per group.

        The value is the mean of the rows matching the passenger's sex, class
        and title; when that is NaN, the mean over the rows matching sex and
        class only is used. Rows that already have a value are left untouched.
    '''
    if not np.isnan(passenger[column_name]):
        return passenger

    same_sex = measure_by_class["Sex"] == passenger['Sex']
    same_class = measure_by_class["Pclass"] == passenger['Pclass']
    same_title = measure_by_class["Title"] == passenger["Title"]

    estimate = np.mean(measure_by_class[same_sex & same_class & same_title][column_name])
    if np.isnan(estimate):
        # No usable measure for this title: fall back to sex + class
        estimate = np.mean(measure_by_class[same_sex & same_class][column_name])

    passenger[column_name] = estimate
    return passenger


def fill_in_fare(passenger, fare_by_class):
    '''
        Replace a NaN 'Fare' with fare_by_class[passenger['Pclass']].

        The passenger row is updated in place and returned; rows that already
        carry a fare are returned unchanged.
    '''
    if not np.isnan(passenger['Fare']):
        return passenger

    passenger['Fare'] = fare_by_class[passenger['Pclass']]
    return passenger


def get_family_bin(passenger):
    '''
        Set passenger['FamilySize'] to a large-family flag and return the row.

        The flag is 1 when SibSp + ParCh + 1 (the passenger included) is
        greater than 3, otherwise 0.
    '''
    relatives_aboard = passenger['SibSp'] + passenger['ParCh']
    passenger['FamilySize'] = 1 if relatives_aboard + 1 > 3 else 0
    return passenger


def get_is_mother(passenger):
    '''
        Set passenger['IsMother'] and return the row.

        The flag is 1 when ParCh is above 0 and the title group is 'Mrs',
        otherwise 0.
    '''
    looks_like_mother = passenger['ParCh'] > 0 and passenger['Title'] == 'Mrs'
    passenger['IsMother'] = 1 if looks_like_mother else 0
    return passenger


def get_is_orphan(passenger):
    '''
        Set passenger['isOrphan'] and return the row.

        The flag is 1 when ParCh equals 0 and Age is at most 8, otherwise 0.
    '''
    unaccompanied_child = passenger['ParCh'] == 0 and passenger['Age'] <= 8
    passenger['isOrphan'] = 1 if unaccompanied_child else 0
    return passenger

def ticket_has_known_survivor(passenger):
    '''
        Set passenger['KnownSurvivors'] to the survival share of the ticket.

        The share is the number of survivors on the passenger's ticket in the
        training file divided by the number of training rows with that ticket;
        tickets that do not occur in the training file get 0. The row is
        updated in place and returned.

        NOTE(review): the training CSV is re-read on every call, which is slow
        when this is used per row with DataFrame.apply.
    '''
    ticket = passenger["Ticket"]

    survived_by_ticket = get_original_df(TRAIN_FILE).groupby("Ticket")["Survived"]
    survivor_tickets = survived_by_ticket.sum()
    total_tickets = survived_by_ticket.size()

    if ticket not in survivor_tickets.keys():
        passenger["KnownSurvivors"] = 0
        return passenger

    passenger["KnownSurvivors"] = survivor_tickets[ticket]/float(total_tickets[ticket])
    return passenger
    
    
    
    

In [None]:
def featurize_data(df_train, df_test=None, mode='train'):
    '''
        Build the model feature frame from raw passenger records.

        With mode == 'test', df_test is appended to df_train so that the group
        medians used for imputation and the dummy columns are computed over
        both sets; only the test rows are returned. Any other mode featurises
        df_train on its own.

        Parameters:
            df_train: raw training frame. Not modified.
            df_test:  raw test frame, required when mode == 'test'. Not modified.
            mode:     'test' to featurise df_test, anything else for df_train.

        The input needs the columns Name, Sex, Pclass, Age, Fare, SibSp, ParCh,
        Cabin, Embarked, Ticket and PassengerID.

        Returns:
            A new DataFrame without the raw text columns and without 'Survived'.

        Raises:
            ValueError: if mode == 'test' and df_test is None.
    '''
    # Value written into 'Survived' so test rows can be picked out after the concat
    test_marker = 2

    if mode == 'test':
        if df_test is None:
            raise ValueError("df_test is required when mode == 'test'")
        # Tag a copy so the caller's frame does not gain a 'Survived' column
        df_test = df_test.copy()
        df_test["Survived"] = test_marker
        df = pd.concat([df_train, df_test])
    else:
        df = df_train.copy()

    ## Extract features from name
    df["LastName"] = df.Name.map(extract_from_name)
    df["Title"] = df.Name.map(get_title)

    # Fill in missing age and fare using the median of the sex/class/title group
    age_by_class = df.groupby(['Sex', 'Pclass', 'Title'], as_index=False).Age.median()
    fare_by_class = df.groupby(['Sex', 'Pclass', 'Title'], as_index=False).Fare.median()

    df = df.apply(lambda passenger: fill_in_median_val(passenger, 'Age', age_by_class), axis=1)
    df = df.apply(lambda passenger: fill_in_median_val(passenger, 'Fare', fare_by_class), axis=1)

    # Large-family flag: 1 if SibSp + ParCh + 1 is above 3, else 0
    df["FamilySize"] = 0
    df = df.apply(get_family_bin, axis=1)

    # Check if person is mother
    df["IsMother"] = 0
    df = df.apply(get_is_mother, axis=1)

    # Create dummies for Sex
    sex = pd.get_dummies(df['Sex'])
    df = pd.concat([df, sex], axis=1)

    # Cabin only indicates whether one is assigned or not. pd.isnull recognises
    # every missing-value marker, unlike an identity test against np.nan.
    # (Dummies per Ticket, per cabin letter and per LastName were tried as well
    # and didn't work out too well.)
    df["Cabin"] = df.Cabin.map(lambda x: "NoCabin" if pd.isnull(x) else "HasCabin")
    cabin = pd.get_dummies(df['Cabin'])
    df = pd.concat([df, cabin], axis=1)

    # Create dummies for embarkation
    embarkation = pd.get_dummies(df['Embarked'])
    df = pd.concat([df, embarkation], axis=1)

    # Create dummies based on title
    title = pd.get_dummies(df['Title'])
    df = pd.concat([df, title], axis=1)

    # Drop the columns that have been onehot encoded or are not used as features
    df.drop(['Sex', 'LastName',
             'Embarked', 'Cabin', 'Title', 'PassengerID', 'Name', 'Ticket'], axis=1, inplace=True)

    # Keep only the test rows when featurising a test set
    if mode == 'test':
        df = df[df.Survived == test_marker]

    # The label (or the test marker) is never part of the feature frame
    if 'Survived' in df.columns:
        df = df.drop('Survived', axis=1)

    return df

## Functions for building the training and test sets and for writing submission files.

In [None]:
def clean_extra_features(X_test):
    '''
        Align the columns of X_test with the featurised training set.

        Columns the training features have but X_test lacks are added and
        filled with 0, columns the training features do not have are removed,
        and the result follows the training column order so the features line
        up with the frame the estimators are fitted on.

        Returns a new DataFrame; X_test itself is left as it was.
    '''
    X, _ = get_training_data()
    return X_test.reindex(columns=X.columns, fill_value=0)

def get_training_data():
    '''
        Read TRAIN_FILE and return (features, labels).

        features is the featurised frame with remaining NaNs replaced by 0,
        labels is a copy of the 'Survived' column of the raw file.
    '''
    raw_train = get_original_df(TRAIN_FILE)

    features = featurize_data(raw_train)
    labels = raw_train['Survived'].copy()

    features.fillna(0, inplace=True)
    return features, labels

def get_test_data():
    '''
        Read TEST_FILE and return its featurised rows.

        The test rows are featurised together with the training rows
        (mode='test'), aligned to the training columns, and any remaining NaNs
        are replaced by 0.
    '''
    df_test_original = get_original_df(TEST_FILE)
    df_train_original = get_original_df(TRAIN_FILE)

    X_test = featurize_data(df_train_original, df_test_original, mode='test')
    X_test = clean_extra_features(X_test)

    # Fill the frame that is returned (not the notebook-level X)
    return X_test.fillna(0)

def get_sparse_test_data():
    '''
        Read SPARSE_TEST_FILE and return (features, labels) for it.

        The file is featurised on its own, aligned to the training columns and
        NaN-filled with 0; labels is a copy of its 'Survived' column.

        NOTE(review): SPARSE_TEST_FILE is not defined in this notebook, so this
        raises NameError unless it is set elsewhere -- confirm.
    '''
    df_test_original = get_original_df(SPARSE_TEST_FILE)

    X = featurize_data(df_test_original)
    X = clean_extra_features(X_test=X)
    y = df_test_original['Survived'].copy()
    X.fillna(0, inplace=True)
    return X, y

In [None]:
## Create training data and split it
# X holds the featurised training frame, y the 'Survived' labels
X, y = get_training_data()
# X_sparse, y_sparse = get_sparse_test_data()

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=424242)

In [None]:
# List the feature columns produced by featurize_data, one per line
for col in X.columns:
    print(col)

### Baseline Classifiers ###

In [None]:
def create_submission_file(pipeline, sub_msg, retrain=True):
    '''
        Predict the test set with pipeline and write a submission CSV.

        pipeline: estimator with fit/predict.
        sub_msg:  tag inserted into the model and submission file names.
        retrain:  when True the pipeline is first fitted on the full training
                  data; when False it is used as it is.

        The pipeline is also dumped with joblib under ../models/. Returns
        whatever DataFrame.to_csv returns for the submission file.
    '''
    X, y = get_training_data()
    X_test = get_test_data()
    df_test = get_original_df(TEST_FILE)

    if retrain:
        pipeline.fit(X, y)
    predictions = pd.DataFrame(pipeline.predict(X_test))

    # Pair every test PassengerID with its predicted label
    submission_df = pd.concat([df_test.PassengerID, predictions], axis=1)
    submission_df.columns = ['PassengerID','Survived']

    model_path = '../models/model_{}.pkl'.format(sub_msg)
    joblib.dump(pipeline, model_path)

    submission_path = '../data/output/submission_{}.csv'.format(sub_msg)
    return submission_df.to_csv(submission_path, index=False)

def get_sparse_accuracy(pipeline, sub_msg="", retrain=True, repeat=1):
    '''
        Score pipeline on the sparse test set and return the best accuracy.

        pipeline: estimator with fit/predict.
        sub_msg:  unused.
        retrain:  when True the pipeline is fitted on the full training data
                  before every scoring run.
        repeat:   number of scoring runs; the maximum accuracy is returned.
    '''
    X, y = get_training_data()
    X_sparse_test, y_sparse_test = get_sparse_test_data()

    def _single_run():
        # One (optional) refit followed by one scoring pass
        if retrain:
            pipeline.fit(X, y)
        return accuracy_score(pipeline.predict(X_sparse_test), y_sparse_test)

    return max([_single_run() for _ in range(repeat)])
    

In [None]:
def baseline_models(clfs, X_train, y_train, X_test, y_test):
    '''Print a separator, the estimator and its accuracies for every entry of clfs.'''
    separator = "="*80
    for model in clfs:
        print(separator)
        print(model)
        get_accuracy_clf(model, X_train, y_train, X_test, y_test)
        print()
        

def get_accuracy_clf(clf, X_train, y_train, X_test, y_test):
    '''
        Fit clf on the training split and print two accuracies in percent.

        The first number is the accuracy on (X_test, y_test); the second is
        get_sparse_accuracy(clf), which refits clf on the full training data.
    '''
    clf.fit(X_train, y_train)
    validation_accuracy = accuracy_score(y_test, clf.predict(X_test))
    sparse_accuracy = get_sparse_accuracy(clf, sub_msg="")
    print('accuracy: {:.3f}%, {:.3f}%'.format(validation_accuracy*100, sparse_accuracy*100))

In [None]:
## Create a pipeline and test
# NOTE(review): the same scaler and feature_select objects are passed to both
# pipeline1 and pipeline2, and eclf1/eclf2 are listed in clfs on their own as
# well as inside the pipelines -- confirm that sharing these instances is intended.
scaler = StandardScaler()
feature_select = SelectKBest(f_classif, k=20)

# Soft-voting ensemble of gradient boosting and random forest, weighted 3:2
eclf1 = VotingClassifier(estimators=[
                                    ('gbc', GradientBoostingClassifier()), 
                                    ('rr', RandomForestClassifier()), 
#                                     ('lr', LogisticRegression()),
#                                     ('nn', MLPClassifier()),
#                                     ('ab', AdaBoostClassifier()),
#                                     ('bnb', BernoulliNB()),
#                                     ('svc', SVC(kernel='linear',probability=True))
                                    ], voting='soft', weights=[3,2])

# Soft-voting ensemble with depth-limited trees plus logistic regression, equal weights
eclf2 = VotingClassifier(estimators=[
                                    ('gbc', GradientBoostingClassifier(max_depth=4)), 
                                    ('rr', RandomForestClassifier(max_depth=5)), 
                                    ('lr', LogisticRegression()),
#                                     ('nn', MLPClassifier())
                                    ], voting='soft')

# Scale -> keep the 20 best features by f_classif -> voting ensemble
pipeline1 = Pipeline([('scale', scaler), ('select_feat', feature_select), ('clf', eclf1)])
pipeline2 = Pipeline([('scale', scaler), ('select_feat', feature_select), ('clf', eclf2)])

# Baseline run: every model is fitted on the training split and scored on the validation split
clfs = [LogisticRegression(), GradientBoostingClassifier(), RandomForestClassifier(),eclf1, eclf2, pipeline1, pipeline2, AdaBoostClassifier(), MLPClassifier(), SVC(kernel='rbf',probability=True)]
baseline_models(clfs, X_train, y_train, X_val, y_val)

### Hyperparameter Tuning for individual models

In [None]:
def get_best_estimator(grid_search, X, y, is_pipeline=False):
    '''
        Fit grid_search on (X, y), print a summary and return the best estimator.

        The best score is always printed. With is_pipeline=True the top-level
        parameters of the best estimator are printed too (names containing
        '__' are skipped); for the 'clf' entry the top-level parameters of
        that estimator are printed instead of the estimator itself.
    '''
    grid_search.fit(X, y)

    print("Best score: %0.3f" % grid_search.best_score_)
    print()
    print("Best parameters set:")

    winner = grid_search.best_estimator_
    if not is_pipeline:
        return winner

    for name, value in winner.get_params().items():
        if name == 'clf':
            # Expand the classifier step into its own top-level parameters
            for sub_name, sub_value in value.get_params().items():
                if '__' in sub_name:
                    continue
                print(sub_name + ":")
                print(sub_value)
                print()
        elif '__' not in name:
            print(name)
            print(value)
            print()

    return winner
    
    

In [None]:
## Get best classifier models for the algorithms in the list.
## The commented out blocks below are pasted estimator printouts, kept only as a
## reference for the parameters available. They are not executed.

# rr (the estimator class name was missing from the paste):
#     bootstrap=True, class_weight=None, criterion='gini',
#     max_depth=None, max_features='auto', max_leaf_nodes=None,
#     min_impurity_split=1e-07, min_samples_leaf=1,
#     min_samples_split=2, min_weight_fraction_leaf=0.0,
#     n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
#     verbose=0, warm_start=False
#
# gbm = GradientBoostingClassifier(criterion='friedman_mse', init=None,
#               learning_rate=0.1, loss='deviance', max_depth=4,
#               max_features=None, max_leaf_nodes=None,
#               min_impurity_split=1e-07, min_samples_leaf=1,
#               min_samples_split=2, min_weight_fraction_leaf=0.0,
#               n_estimators=100, presort='auto', random_state=None,
#               subsample=1.0, verbose=0, warm_start=False)
#
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
#           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
#           verbose=0, warm_start=False)
#
# MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
#        beta_2=0.999, early_stopping=False, epsilon=1e-08,
#        hidden_layer_sizes=(100,), learning_rate='constant',
#        learning_rate_init=0.001, max_iter=200, momentum=0.9,
#        nesterovs_momentum=True, power_t=0.5, random_state=None,
#        shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
#        verbose=False, warm_start=False)
# accuracy: 79.235%, 71.066%
#
# ================================================================================
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#   decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
#   max_iter=-1, probability=True, random_state=None, shrinking=True,
#   tol=0.001, verbose=False)

clf_rr = RandomForestClassifier()
clf_lr = LogisticRegression()
clf_gb = GradientBoostingClassifier()

param_grid_rr = [{
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced'],
    'max_features': [None, 'auto'],
    'n_estimators': [10, 20, 100, 200, 1000],
}]

param_grid_gb = [{
    'max_depth': [None, 2, 3, 4, 5],
    'max_features': [None, 'auto'],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.01, 0.001],
}]

param_grid_lr = [{
    'class_weight': [None, 'balanced'],
    'C': [2**-5, 2**-3, 2**-1, 2**0, 2**2, 2**4, 2**6],
    'penalty': ['l1', 'l2'],
}]

# Grids for SVC and MLPClassifier were only sketched as commented-out
# placeholders and are not part of the search.

# Maps every estimator to the grid that is searched for it
clfs = {clf_rr: param_grid_rr, clf_lr: param_grid_lr, clf_gb: param_grid_gb}

for clf, grid in clfs.items():
    # Sparse-test accuracy with the default parameters ...
    print(get_sparse_accuracy(clf, repeat=10))
    grid_search = GridSearchCV(clf, grid, n_jobs=-1, verbose=1, cv=10)
    clf = get_best_estimator(grid_search, X, y)
    # ... and with the best parameters found by the grid search
    print(get_sparse_accuracy(clf, repeat=10))
    print()
    print()


## Bagging - Voting Classifier

Warning: run the grid search at your own risk. We usually run it over many parameter combinations because our machine can handle the load.

In [None]:
## This is to set up the parameter grid for the pipeline to enter cross validation and grid search

# Keys follow <pipeline step>__<ensemble member>__<parameter>. The members
# gbc, rr and lr are the estimators of eclf2, i.e. the 'clf' step of pipeline2.
param_grid = [{
    'clf__lr__C': [2**0, 2**0.5, 2**1, 2**5],
    'clf__lr__class_weight': ['balanced', None],
#     'clf__lr__penalty': ['l1', 'l2'],
#     'clf__rr__criterion': ['gini', 'entropy'],
    'clf__rr__class_weight': [None, 'balanced'],
    'clf__rr__max_depth': [3, 4, None],
#     'clf__rr__max_features': [15, 'auto'],
    'clf__rr__n_estimators': [10, 20, 50, 100, 200],
#     'clf__nn__activation': ['relu'],
#     'clf__nn__alpha': [0.001, 0.1],
#     'clf__nn__solver': ['adam'],
#     'clf__nn__max_iter': [500],
#     'clf__gbc__learning_rate': [0.1, 0.001],
    'clf__gbc__n_estimators': [100, 200, 500],
#     'clf__gbc__max_features': ['auto', None],
#     'clf__gbc__warm_start': [True, False],
    'clf__voting': ['soft', 'hard'],
}]

# Search over pipeline2: it is the only pipeline defined above whose ensemble
# has all three members the grid refers to. (`pipeline` itself does not exist
# until the next statement assigns it.)
grid_search = GridSearchCV(pipeline2, param_grid, n_jobs=-1, verbose=1, cv=5)

pipeline = get_best_estimator(grid_search, X, y)

As soon as the best estimator was found, a submission file was created without retraining the model.

In [None]:
# Write the submission with the estimator returned by the grid search; retrain=False skips the extra fit
create_submission_file(pipeline, sub_msg="best_estimator", retrain=False)

### Feature Importance

In [None]:
# Fit a gradient boosting model on the full training data and tabulate its feature importances
clf = GradientBoostingClassifier()
clf.fit(X=X, y=y)
features = pd.DataFrame()
features['feature'] = X.columns
features['importance'] = clf.feature_importances_

# Reuse the already fitted model (prefit=True) as a feature selector
model = SelectFromModel(clf, prefit=True)
train_new = model.transform(X)
# Compare the shape after selection with the original shape
train_new.shape, X.shape

In [None]:
# Most important features first (DataFrame.sort was removed from pandas; sort_values replaces it)
features.sort_values(['importance'], ascending=False)

## Blending and Stacking

In [None]:
def get_blended_prediction_model(clfs, X, y):
    '''
        Train a list of classifiers and fit a meta classifier on their outputs.

        10% of (X, y) is held out for validation and the rest is split into two
        halves, a and b. Every classifier is fitted on one half and predicts
        the other, so each training row gets out-of-sample meta-features (the
        class-0 probability and the hard label, named after the classifier's
        class). A LogisticRegression is fitted on the meta-features of both
        halves together and its accuracy on the held-out rows is printed.

        Parameters:
            clfs: iterable of estimators with fit / predict / predict_proba.
                  Each one is left fitted on the 90% training part.
            X, y: features and labels.

        Returns:
            The fitted meta model.
    '''
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=424242)
    X_a_train, X_b_train, y_a_train, y_b_train = train_test_split(
        X_train, y_train, test_size=0.50, random_state=424242)

    preds_on_b = pd.DataFrame()    # from models fitted on half a
    preds_on_a = pd.DataFrame()    # from models fitted on half b
    preds_on_val = pd.DataFrame()  # from models fitted on a + b

    def _record(frame, clf, X_pred):
        # Store class-0 probability and hard label under the estimator's class name
        name = str(clf).split("(")[0]
        frame[name + "_proba"] = pd.Series(clf.predict_proba(X_pred)[:, 0])
        frame[name + "_prediction"] = clf.predict(X_pred)

    for clf in clfs:
        clf.fit(X_a_train, y_a_train)
        _record(preds_on_b, clf, X_b_train)

        clf.fit(X_b_train, y_b_train)
        _record(preds_on_a, clf, X_a_train)

        clf.fit(X_train, y_train)
        _record(preds_on_val, clf, X_val)

    # Fit once on both halves; two consecutive fit() calls would keep only the second
    meta_X = pd.concat([preds_on_b, preds_on_a], ignore_index=True)
    meta_y = np.concatenate([np.asarray(y_b_train), np.asarray(y_a_train)])

    meta_model = LogisticRegression()
    meta_model.fit(meta_X, meta_y)
    y_pred = meta_model.predict(preds_on_val)

    print("Meta Accuracy: {:.3f}%".format(accuracy_score(y_pred, y_val) * 100))

    return meta_model

def create_meta_pred_set(clfs, X, y, X_test):
    '''
        Build the meta-feature frame a meta model predicts from.

        Every classifier in clfs is fitted on (X, y) and applied to X_test.
        Two columns are added per classifier, named after the text before the
        first "(" of str(clf): "<name>_proba" with the first column of
        predict_proba and "<name>_prediction" with the predicted labels.
    '''
    meta_features = pd.DataFrame()
    for model in clfs:
        model.fit(X, y)
        first_class_proba = model.predict_proba(X_test)[:,0]
        hard_labels = model.predict(X_test)

        label = str(model).split("(")[0]
        meta_features[label + "_proba"] = pd.Series(first_class_proba)
        meta_features[label + "_prediction"] = hard_labels
    return meta_features
    


In [None]:
# Training data for the blend, labelled sparse test data for scoring it
X, y = get_training_data()
X_test, y_test = get_sparse_test_data()

# clfs is whatever the most recently run cell assigned to that name
bl_clf = get_blended_prediction_model(clfs, X, y)

# Refit the base models on all training rows, then score the blend on the sparse test set
X_meta_test = create_meta_pred_set(clfs, X, y, X_test)
y_pred = bl_clf.predict(X_meta_test)
print(accuracy_score(y_pred, y_test))

In [None]:
def get_stacked_prediction_model(clfs, X, y, k=10):
    '''
        This trains a list of classifiers in cross validated
        manner and then trains another meta classifier based on
        the predictions and probabilities of the original classifiers

        Parameters:
            clfs: iterable of estimators with fit / predict / predict_proba.
                  Each one is left fitted on the training part of the last fold.
            X, y: features and labels (DataFrame/Series or arrays).
            k:    number of unshuffled folds.

        For every fold each classifier is fitted on the other folds and
        predicts the held-out rows; the class-0 probability and the hard label
        become the meta-features. A LogisticRegression is fitted on the
        meta-features of all folds and its accuracy on those same rows is
        printed.

        Returns:
            The fitted meta model.
    '''
    kf = KFold(n_splits=k, shuffle=False)

    # Plain arrays give positional fold indexing whatever the pandas index is
    # (DataFrame.as_matrix no longer exists in pandas)
    X_values = np.asarray(X)
    y_values = np.asarray(y)

    fold_frames = []
    fold_targets = []

    for train_index, test_index in kf.split(X_values):
        X_train, X_test = X_values[train_index], X_values[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        fold_df = pd.DataFrame()
        for clf in clfs:
            clf.fit(X_train, y_train)
            y_predict_prob = clf.predict_proba(X_test)
            y_predict = clf.predict(X_test)
            name = str(clf).split("(")[0]
            fold_df[name + "_proba"] = pd.Series(y_predict_prob[:, 0])
            fold_df[name + "_prediction"] = y_predict

        fold_targets.append(y_test)
        fold_frames.append(fold_df)

    meta_X = pd.concat(fold_frames).values
    meta_y = np.concatenate(fold_targets)

    stacked_model = LogisticRegression()
    stacked_model.fit(meta_X, meta_y)

    # Accuracy on the rows the meta model was fitted on
    y_predict = stacked_model.predict(meta_X)
    print(accuracy_score(y_predict, meta_y))

    return stacked_model
    

In [None]:
# Base models fed into the stacked meta model
clfs = [
#         LogisticRegression(), 
        GradientBoostingClassifier(), 
        RandomForestClassifier(),
        eclf1, 
#         eclf2, 
        pipeline1, 
#         pipeline2, 
#         AdaBoostClassifier(), 
#         MLPClassifier(), 
#         SVC(kernel='rbf',probability=True)
       ]

X, y = get_training_data()
X_test, y_test = get_sparse_test_data()

acc_scores = []

## Single stacking run; its accuracy is collected in acc_scores
stacked_clf = get_stacked_prediction_model(clfs, X, y)

# Refit the base models on all training rows and predict the sparse test set
X_meta_test = create_meta_pred_set(clfs, X, y, X_test)
y_pred = stacked_clf.predict(X_meta_test)

score = accuracy_score(y_pred, y_test)
acc_scores.append(accuracy_score(y_pred, y_test))


# NOTE(review): PassengerID comes from TEST_FILE while y_pred was predicted for the
# rows of the sparse test set -- confirm both files list the same passengers in the same order.
df_test = get_original_df(TEST_FILE)
submission_df = pd.concat([df_test.PassengerID, pd.DataFrame(y_pred)], axis=1)
submission_df.columns = ['PassengerID','Survived']

# The accuracy becomes part of the model and submission file names
sub_msg = "trial" + str(score)

joblib.dump((stacked_clf, clfs) , '../models/model_{}.pkl'.format(sub_msg)) 
submission_df.to_csv('../data/output/submission_{}.csv'.format(sub_msg), index=False)



In [None]:
# Highest accuracy collected in acc_scores
print(max(acc_scores))

In [None]:
# Reload a previously saved model.
# NOTE(review): joblib.load unpickles the file, so only load model files you trust.
clf = joblib.load('../models/model_best.pkl')

In [None]:
# Show the third entry of the estimators list held by the loaded pipeline's 'clf' step
clf.named_steps['clf'].estimators[2]