<a href="https://colab.research.google.com/github/webstrum/Titanic-ml/blob/master/titanic_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
pip install xgboost

In [0]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import os

# Plot figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Control random number generator
np.random.seed(42)

# Ignore useless warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## Get the Data

In [0]:
# List files in input direction

CURRENT_PATH = os.getcwd()
os.listdir(CURRENT_PATH)

['.config', 'sample_data']

In [0]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving test.csv to test.csv
User uploaded file "test.csv" with length 28629 bytes


In [0]:
os.listdir(CURRENT_PATH)

['.config', 'train.csv', 'test.csv', 'sample_data']

In [0]:
def load_titanic_data(file, data_path=CURRENT_PATH):
    return pd.read_csv(os.path.join(data_path, file))

load_train_df = load_titanic_data("train.csv")
load_test_df = load_titanic_data("test.csv")
copy_train_df = load_train_df.copy()
copy_test_df = load_test_df.copy()

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, 
                               random_state=42)

for train_id, val_id in split.split(copy_train_df, copy_train_df["Pclass"]):
    strat_train_df = copy_train_df.loc[train_id]
    strat_val_df = copy_train_df.loc[val_id]

## Data Preparation

In [0]:
copy_strat_train = strat_train_df.copy()
X_strat_train = copy_strat_train.drop("Survived", axis=1)
y_strat_train = copy_strat_train["Survived"].copy()

In [0]:
all_attribs = list(X_strat_train)
num_X_train = X_strat_train.select_dtypes(include=[np.number])
num_X_attribs = list(num_X_train)
cat_X_attribs = [attrib for attrib in all_attribs if (
    attrib not in num_X_attribs)]

In [0]:
drop_columns = ["PassengerId", "Cabin", "Ticket"]
X_strat_train.drop(drop_columns, axis=1, inplace=True)

In [0]:
all_attribs = list(X_strat_train)
correct_num_train = X_strat_train.select_dtypes(include=[np.number])
correct_num_attribs = list(correct_num_train)
correct_cat_attribs = [attrib for attrib in all_attribs if (
    attrib not in correct_num_attribs)]

In [0]:
X_num_attribs = correct_num_attribs.copy()
X_cat_attribs = correct_cat_attribs.copy()

In [0]:
from sklearn.impute import SimpleImputer

correct_num_train = X_strat_train.loc[:, correct_num_attribs]
num_imputer = SimpleImputer(strategy="median")
num_imputer.fit(correct_num_train)
tr_num_train = num_imputer.transform(correct_num_train)
df_num_train = pd.DataFrame(tr_num_train, columns=correct_num_train.columns, 
                     index=list(correct_num_train.index.values))

In [0]:
correct_cat_train = X_strat_train.loc[:, correct_cat_attribs]
cat_imputer = SimpleImputer(strategy="constant", 
                             fill_value="missing")
cat_imputer.fit(correct_cat_train)
tr_cat_train = cat_imputer.transform(correct_cat_train)
df_cat_train = pd.DataFrame(tr_cat_train, columns=correct_cat_train.columns,
                     index=list(correct_cat_train.index.values))

In [0]:
X_strat_train.update(df_num_train)
X_strat_train.update(df_cat_train)

In [0]:
from sklearn.preprocessing import FunctionTransformer

# get the right column indices
Age_ix, Pclass_ix, Name_ix = [
    list(X_strat_train.columns).index(col)
    for col in ("Age", "Pclass", "Name")]

def add_num_attribs(X):
    Age_Pclass = X[:, Age_ix] * X[:, Pclass_ix]
    return np.c_[X, Age_Pclass]
  
add_num = FunctionTransformer(add_num_attribs, validate=False)
add_num_train = add_num.fit_transform(X_strat_train.values)

extra_num_train = pd.DataFrame(add_num_train, 
                                 columns=list(X_strat_train.columns)+
                                ['Age_Pclass'])

correct_num_attribs.append('Age_Pclass')

In [0]:
def add_cat_attribs(X):
    series_name_value = pd.Series(X[:, Name_ix])
    filter_series_name = series_name_value.str.extract(r' ([A-Za-z]+)\.', 
                                              expand=False)
    transform_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    transform_title_series = filter_series_name.apply(
        lambda x: 'Rare' if x not in transform_titles else x)
    Title = np.array(transform_title_series)
    
    return np.c_[X, Title]
  
add_cat = FunctionTransformer(add_cat_attribs, validate=False)
add_cat_train = add_cat.fit_transform(X_strat_train.values)

extra_cat_train = pd.DataFrame(add_cat_train, 
                               columns=list(X_strat_train.columns)+
                                ["Title"])

In [0]:
extra_cat_train.drop('Name', axis=1, inplace=True)
correct_cat_attribs.remove("Name")
correct_cat_attribs.append('Title')

In [0]:
from sklearn.preprocessing import OneHotEncoder

cat_strat_train = extra_cat_train.loc[:, correct_cat_attribs]
encoder_cat = OneHotEncoder(sparse=False)
onehot_cat_train = encoder_cat.fit_transform(cat_strat_train)

In [0]:
from sklearn.preprocessing import StandardScaler

num_strat_train = extra_num_train.loc[:, correct_num_attribs]
scaler_num = StandardScaler()
scaler_num_train = scaler_num.fit_transform(num_strat_train)

## preprocessing pipelines

In [0]:
X_num_attribs

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [0]:
X_cat_attribs

['Name', 'Sex', 'Embarked']

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# We create the preprocessing pipelines.

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('add_num_features', FunctionTransformer(add_num_attribs, 
                                          validate=False)),
    ('scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', 
                              fill_value='missing')),
    ('add_cat_features', FunctionTransformer(add_cat_attribs, 
                                          validate=False)),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ])

pre_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, X_num_attribs),
        ('cat', cat_pipeline, X_cat_attribs),
    ])

In [0]:
copy_strat_train = strat_train_df.copy()
X_strat_train = copy_strat_train.drop("Survived", axis=1)
y_strat_train = copy_strat_train["Survived"].copy()

X_train_prepared = pre_pipeline.fit_transform(X_strat_train)
y_strat_train.shape

(712,)

In [0]:
copy_strat_val = strat_val_df.copy()
X_strat_val = copy_strat_val.drop("Survived", axis=1)
y_strat_val = copy_strat_val["Survived"].copy()

X_val_prepared = pre_pipeline.fit_transform(X_strat_val)
y_strat_val.shape

(179,)

## Select and Train a Model

In [0]:
# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from xgboost.sklearn import XGBClassifier  

# cross validation
from sklearn.model_selection import cross_val_score

# create table to compare MLA predictions

MLA = [RandomForestClassifier(random_state=42),
       SVC(random_state=42),
       KNeighborsClassifier(),
       DecisionTreeClassifier(random_state=42),
       SGDClassifier(random_state=42),
       XGBClassifier(random_state=42),
    ]

MLA_columns = ['MLA Name', 'MLA Parameters', "MLA Train Accuracy"]
MLA_compare = pd.DataFrame(columns=MLA_columns)
MLA_predict = copy_strat_train["Survived"]

row_index = 0

for alg in MLA:
    
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, "MLA Name"] = MLA_name
    MLA_compare.loc[row_index, "MLA Parameters"] = str(alg.get_params())

    # train
    train_scores = cross_val_score(alg, X_train_prepared, y_strat_train, 
                             scoring="accuracy", cv=10) 
    train_score_mean = round(train_scores.mean()*100, 2)
    MLA_compare.loc[row_index, "MLA Train Accuracy"] = train_score_mean
    
    
    # save MLA predictions
    alg.fit(X_train_prepared, y_strat_train)
    MLA_predict[MLA_name] = alg.predict(X_train_prepared)
    
    row_index += 1

MLA_compare.sort_values(by = ['MLA Train Accuracy'], ascending=False, inplace=True)
MLA_compare

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy
3,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",83.15
0,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",82.73
5,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",82.03
2,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",80.76
4,SGDClassifier,"{'alpha': 0.0001, 'average': False, 'class_wei...",78.81
1,SVC,"{'C': 1.0, 'cache_size': 200, 'class_weight': ...",71.63


## Fine-Tune the system

In [0]:
MLA_compare["MLA Parameters"][3]

"{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 42, 'splitter': 'best'}"

In [0]:
DTC_param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, None],
        'random_state': [42]
    }

In [0]:
MLA_compare["MLA Parameters"][0]

"{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 'warn', 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}"

In [0]:
RFC_param_grid = {
        'n_estimators': [10, 50, 100, 300],
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, None],
        'oob_score': [True],
        'random_state': [42],
    }

In [0]:
MLA_compare["MLA Parameters"][5]

"{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': 1, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 1}"

In [0]:
XGBC_param_grid = {
    'learning_rate': [.01, .03, .05, .1, .25], 
    'max_depth': [0, 2, 3, 4, 6, 8, 10], 
    'n_estimators': [10, 50, 100, 300],
    'seed': [0, None],
}

In [0]:
MLA_compare["MLA Parameters"][2]

"{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}"

In [0]:
KNC_param_grid = {
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

In [0]:
param_grid = [DTC_param_grid, RFC_param_grid, XGBC_param_grid, 
              KNC_param_grid,]

In [0]:
param_grid

[{'criterion': ['gini', 'entropy'],
  'max_depth': [2, 4, 6, 8, 10, None],
  'random_state': [42]},
 {'criterion': ['gini', 'entropy'],
  'max_depth': [2, 4, 6, 8, 10, None],
  'n_estimators': [10, 50, 100, 300],
  'oob_score': [True],
  'random_state': [42]},
 {'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.25],
  'max_depth': [0, 2, 3, 4, 6, 8, 10],
  'n_estimators': [10, 50, 100, 300],
  'seed': [0, None]},
 {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
  'n_neighbors': [1, 2, 3, 4, 5, 6, 7],
  'weights': ['uniform', 'distance']}]

In [0]:
from sklearn.model_selection import GridSearchCV

# get the best parameters for Algorithms

best_MLA = [DecisionTreeClassifier(random_state=42),
            RandomForestClassifier(random_state=42),
            XGBClassifier(random_state=42),
            KNeighborsClassifier(),
    ]
best_MLA_columns = ['MLA Name', 'MLA Best Parameters', 
                    'MLA Train Accuracy', 'MLA Val Accuracy']
best_MLA_compare = pd.DataFrame(columns=best_MLA_columns)

final_models = {}

for best_idx, best_alg in enumerate(best_MLA):
    
    MLA_name = best_alg.__class__.__name__
    best_MLA_compare.loc[best_idx, "MLA Name"] = MLA_name
    
    grid_search = GridSearchCV(best_alg, param_grid[best_idx], 
                               cv=10, scoring='accuracy')
    grid_search.fit(X_train_prepared, y_strat_train)
    best_MLA_compare.loc[best_idx, "MLA Best Parameters"] = \
        str(grid_search.best_params_)
    best_MLA_compare.loc[best_idx, "MLA Train Accuracy"] = \
        grid_search.best_estimator_.score(X_train_prepared, 
                                          y_strat_train)

    final_models['fm_'+MLA_name] = grid_search.best_estimator_
    
    grid_search.fit(X_val_prepared, y_strat_val)
    best_MLA_compare.loc[best_idx, "MLA Val Accuracy"] = \
        grid_search.best_estimator_.score(X_val_prepared, 
                                          y_strat_val)
    

    print(MLA_name, '\t', best_idx, grid_search.best_estimator_, '\n'*3)

best_MLA_compare.sort_values(by=['MLA Val Accuracy'], 
                             ascending=False, inplace=True)
best_MLA_compare



DecisionTreeClassifier 	 0 DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best') 







RandomForestClassifier 	 1 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=True, random_state=42, verbose=0, warm_start=False) 



