In [83]:
# Required packages
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [56]:
# Load the Titanic passenger table from the project-relative CSV file
titanic = pd.read_csv(filepath_or_buffer='data/titanic.csv')
# Preview the first five rows as the cell's displayed result
titanic.head(n=5)

Unnamed: 0,pclass,name,sex,age,fare,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,211.3375,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,151.55,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,151.55,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,151.55,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,151.55,S,0


In [57]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage.
# The non-null counts are what reveal which columns need imputation later on.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 7 columns):
pclass      1309 non-null int64
name        1309 non-null object
sex         1309 non-null object
age         1046 non-null float64
fare        1308 non-null float64
embarked    1307 non-null object
survived    1309 non-null int64
dtypes: float64(2), int64(2), object(3)
memory usage: 71.7+ KB


In [85]:
# Target vector: the 'survived' column
y = titanic['survived']
# Predictor matrix: every remaining column except 'name' (and the target itself)
X = titanic.drop(columns=['survived', 'name'])

In [86]:
# Hold out 25% of the rows as a test set, stratified on the target so both
# splits keep the same class proportions of `survived`.
# random_state pins the shuffle: without it every kernel restart produces a
# different split, so the scores reported further down could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [87]:
# --- Numeric branch of the preprocessing ---
# Columns treated as continuous values; titanic.info() shows both have gaps.
numeric_features = ['age', 'fare']
# Two stages applied in order: fill missing entries with the column median,
# then standardise. The step names ('imputer', 'scaler') are what nested
# parameter paths such as 'num__imputer__strategy' refer to.
numeric_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [88]:
# --- Categorical branch of the preprocessing ---
# Unordered string-valued columns.
categorical_features = ['embarked', 'sex']
# Stage 1 replaces missing entries with the literal label 'missing';
# stage 2 one-hot encodes, with handle_unknown='ignore' set for categories
# that were not present when the encoder was fitted.
categorical_features_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [89]:
# --- Ordinal branch of the preprocessing ---
# Columns whose values have a natural order.
ordinal_features = ['pclass']
# Single-stage pipeline: encode pclass using the explicitly listed category
# order 1, 2, 3 (one inner list per ordinal column).
ordinal_features_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder(categories=[[1, 2, 3]])),
])

In [90]:
# Route each group of columns through its own sub-pipeline.
# Every entry is (name, transformer, columns); the names 'num', 'cat' and 'ord'
# become part of nested parameter paths when tuning.
# Columns not listed in any entry are dropped (remainder='drop' in the fitted repr).
preprocessor = ColumnTransformer([
    ('num', numeric_features_pipeline, numeric_features),
    ('cat', categorical_features_pipeline, categorical_features),
    ('ord', ordinal_features_pipeline, ordinal_features),
])

In [91]:
# End-to-end model: column preprocessing followed by a logistic regression.
# Keeping both in one Pipeline means the preprocessing is fitted only on
# whatever data is passed to fit().
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs')),
])

In [92]:
# Fit the whole pipeline (preprocessing + logistic regression) on the training split.
# As the last expression in the cell, the fitted Pipeline's repr is displayed below.
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [93]:
# Report the fitted pipeline's score on the held-out test split, to three decimals
print("model score: %.3f" % (clf.score(X=X_test, y=y_test),))

model score: 0.787


In [94]:
# Hyper-parameters to search, addressed through the nested step names:
# '<pipeline step>__<transformer name>__<sub-step>__<parameter>'.
param_grid = {
    # Imputation strategy of the numeric branch
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # C values to try for the logistic regression
    'classifier__C': [0.1, 1.0, 10, 100],
}

# Search over every combination with 10-fold cross-validation, using only the training split
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search.fit(X=X_train, y=y_train)

# Score the grid search on the untouched test split, then show the winning combination
print("best logistic regression from grid search: %.3f"
      % grid_search.score(X=X_test, y=y_test))
print(grid_search.best_params_)

best logistic regression from grid search: 0.784
{'classifier__C': 1.0, 'preprocessor__num__imputer__strategy': 'mean'}
