In [None]:
import os

def scale_input_data(scale_factor, file_bases=('./input/train',)):
  """Write a row-scaled copy of each input CSV next to the original.

  For every base path ``<base>`` in ``file_bases`` this reads ``<base>.csv``
  and writes ``<base>.scaled.csv``:

  * ``scale_factor == 1.0``: the file is copied unchanged.
  * ``scale_factor < 1.0``: only the first ``int(scale_factor * n)`` rows are kept.
  * ``scale_factor > 1.0``: the rows are repeated from the top until
    ``int(scale_factor * n)`` rows exist.

  Args:
    scale_factor: Finite, non-negative multiplier applied to the row count.
    file_bases: Iterable of paths (without the ``.csv`` extension) to scale.
      Defaults to the single training file used by this notebook.

  Raises:
    ValueError: If ``scale_factor`` is negative, NaN or infinite.  A negative
      factor would otherwise turn into a negative slice bound and silently
      write a file of the wrong size.
  """
  # Function-local imports keep this helper self-contained; they are done once
  # here rather than on every loop iteration.
  import math
  import shutil
  import pandas as pd

  if not (math.isfinite(scale_factor) and scale_factor >= 0):
    raise ValueError(
        'scale_factor must be a finite, non-negative number, got %r' % (scale_factor,))

  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'
    if scale_factor == 1.0:
      # Nothing to scale: a plain file copy avoids a parse/serialise round trip.
      shutil.copyfile(source_path, scaled_path)
      continue
    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Append a prefix of the current frame until the target is reached; the
      # frame roughly doubles per pass, so only a few concats are needed.
      while len(df_to_scale) < new_num_rows:
        rows_missing = new_num_rows - len(df_to_scale)
        df_to_scale = pd.concat([df_to_scale, df_to_scale.iloc[:rows_missing]])
    df_to_scale.to_csv(scaled_path, index=False)

# Optional hook: when INPUT_SCALE_FACTOR is set in the environment, build the
# scaled copy of the input data before any other cell reads it.
_requested_scale = os.environ.get('INPUT_SCALE_FACTOR')
if _requested_scale is not None:
  scale_input_data(float(_requested_scale))

## Modeling Titanic Survivors Using Random Forest and Logistic Regression
with visualization and hyperparameter tuning

In [1]:
# import pandas as pd
# NOTE(review): the pandas import is supplied at run time by executing the code
# string stored in the IREWR_IMPORTS environment variable (presumably something
# like "import pandas as pd" that binds `pd` -- confirm with the harness).
# exec() runs that string unchecked, and os.environ[...] raises KeyError when
# the variable is unset, so only run this in a trusted, configured environment.
exec(os.environ['IREWR_IMPORTS'])
import numpy as np
import re

# FIRST-AUTHOR: remove plotting, ML code
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set(style="whitegrid", color_codes=True)
# %matplotlib inline

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import ExtraTreesClassifier   # used to get feature importance
# from sklearn.feature_selection import SelectFromModel
# from sklearn import preprocessing
# from sklearn.linear_model import LogisticRegression

# from sklearn import svm
#from sklearn.modelselection import KFold, cross_val_score

## 2. Load and Examine Data

In [2]:
# Load the training set and the test set.  Both are read from the same scaled
# training file.
scaled_csv_path = "./input/train.scaled.csv"
train_set = pd.read_csv(scaled_csv_path)
test_set = pd.read_csv(scaled_csv_path)

# Examine the data: schema summary of each frame, each followed by a separator,
# then summary statistics of the training set as the cell output.
for frame in (train_set, test_set):
    frame.info()
    print("-------------------------------------")
train_set.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
-------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId 

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### 2.1 Combine both data sets so we can handle feature engineering in one data frame

In [3]:
# Add an empty Survived column to the test set so both frames share the same
# columns.  Later cells split the combined frame apart again by checking
# whether Survived is null.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
test_set['Survived'] = np.nan
alldata = pd.concat([train_set, test_set], ignore_index=True)

In [4]:
# Survived is float64 in the combined frame (the test-set rows hold NaN), so it
# has to be converted back to int before model fitting.
alldata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1782 entries, 0 to 1781
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1782 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1782 non-null   int64  
 3   Name         1782 non-null   object 
 4   Sex          1782 non-null   object 
 5   Age          1428 non-null   float64
 6   SibSp        1782 non-null   int64  
 7   Parch        1782 non-null   int64  
 8   Ticket       1782 non-null   object 
 9   Fare         1782 non-null   float64
 10  Cabin        408 non-null    object 
 11  Embarked     1778 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 167.2+ KB


### 2.2. Handle missing data/feature imputation
#### 2.2.1. Cabin column
We added Cabin after a few iterations of model fitting; however, it did not help to improve the model, so we dropped this feature before creating the final model.

In [5]:
# replacing missing cabins with N (for No)
# alldata['Cabin'].fillna('N',inplace=True)
    
# # only keep cabin letter as it might indicate the levels and positions of the cabins
# alldata['Cabin'] = alldata['Cabin'].map(lambda x : x[0])
# cabin_dummies = pd.get_dummies(alldata['Cabin'],prefix='Cabin')
    
# alldata = alldata.join(cabin_dummies)

# Let's start keeping track of the columns to be dropped.
# We keep the original columns for now so we can compare them with the values of the transformed columns
# and make sure that there is no bug. For dummy variables this is most unlikely, as they are created by a method call,
# but some of the transformations performed manually later might introduce bugs.
# drop_list is extended by later cells and applied in a single alldata.drop(...) call.
drop_list = ['Cabin']

#### 2.2.2. A lot of missing data for Age column
There are 177 missing values out of 891 in the training set alone. Instead of randomly assigning values, we will try to intelligently guess some of the values using the titles of the passengers in the Name column.

In [6]:
# Derive a Title column from Name by deleting everything up to and including
# the first ", " and everything from the first "." onwards.
_title_noise = re.compile('(.*, )|(\\..*)')
alldata['Title'] = alldata['Name'].map(lambda full_name: _title_noise.sub('', full_name))

In [7]:
# print(alldata[alldata['Age'].isnull()].groupby('Title').size())
# print('--------------------------------------------------------')
# alldata[alldata['Age'].notnull()].groupby('Title')['Age'].agg({'Count': np.size, 
#                               'Min': np.min, 'Max': np.max, 'Avg': np.mean, 'Std': np.std})          

In [8]:
# Build Age2: a copy of Age in which every missing value is imputed from the
# passenger's title.  Age itself stays untouched until the final column drop so
# the two columns can be compared.
alldata['Age2'] = alldata['Age']

# Seeded, local generator so that "Restart & Run All" reproduces the same
# random ages without touching NumPy's global random state.
age_rng = np.random.RandomState(2017)

# Passengers titled "Master" are treated as children: draw a random integer age
# in [0, 13] (randint's upper bound is exclusive) for each one without an age.
kids_no_age = (alldata['Title'] == 'Master') & alldata.Age.isnull()
# FIRST-AUTHOR: make notebook run
# alldata.ix[kids_no_age, 'Age2'] = np.random.randint(0, 14, 8)
alldata.loc[kids_no_age, 'Age2'] = age_rng.randint(0, 14, int(kids_no_age.sum()))

# Dr: use the mean age of the doctors whose age is known.
dr_no_age = (alldata['Title'] == 'Dr') & alldata.Age.isnull()
dr_with_age = (alldata['Title'] == 'Dr') & alldata.Age.notnull()
alldata.loc[dr_no_age, 'Age2'] = alldata.loc[dr_with_age, 'Age'].mean()

# Ms: use the truncated mean age of the Ms passengers whose age is known.
# The mean is NaN when the (possibly scaled-down) data holds no such row, and
# int(NaN) raises, so the conversion only happens when it is needed and valid.
ms_no_age = (alldata['Title'] == 'Ms') & alldata.Age.isnull()
ms_with_age = (alldata['Title'] == 'Ms') & alldata.Age.notnull()
ms_mean_age = alldata.loc[ms_with_age, 'Age'].mean()
if ms_no_age.any() and not np.isnan(ms_mean_age):
    alldata.loc[ms_no_age, 'Age2'] = int(ms_mean_age)

# Mr / Miss / Mrs: fill with the median known age of the same title.
# We could also combine the SibSp and Parch columns to make a more educated
# guess, but let's see how we do without that.
mr_with_age = (alldata['Title'] == 'Mr') & alldata.Age.notnull()
mr_no_age = (alldata['Title'] == 'Mr') & alldata.Age.isnull()
miss_with_age = (alldata['Title'] == 'Miss') & alldata.Age.notnull()
miss_no_age = (alldata['Title'] == 'Miss') & alldata.Age.isnull()
mrs_with_age = (alldata['Title'] == 'Mrs') & alldata.Age.notnull()
mrs_no_age = (alldata['Title'] == 'Mrs') & alldata.Age.isnull()

# Age ranges per title.  They are only needed by the abandoned alternative of
# drawing np.random.randint(min_age, max_age, ...) per title; Series.min()/max()
# return NaN for an empty group instead of raising like the built-ins do.
min_mr_age = alldata.loc[mr_with_age, 'Age'].min()
max_mr_age = alldata.loc[mr_with_age, 'Age'].max()
min_miss_age = alldata.loc[miss_with_age, 'Age'].min()
max_miss_age = alldata.loc[miss_with_age, 'Age'].max()
min_mrs_age = alldata.loc[mrs_with_age, 'Age'].min()
max_mrs_age = alldata.loc[mrs_with_age, 'Age'].max()

for title_no_age, title_with_age in (
        (mr_no_age, mr_with_age),
        (miss_no_age, miss_with_age),
        (mrs_no_age, mrs_with_age)):
    alldata.loc[title_no_age, 'Age2'] = alldata.loc[title_with_age, 'Age'].median()

# Safety net: any age still missing (a title not handled above, or a title with
# no known ages) gets the overall median so the integer cast cannot fail on NaN.
age_still_missing = alldata['Age2'].isnull()
if age_still_missing.any():
    alldata.loc[age_still_missing, 'Age2'] = alldata['Age'].median()

alldata['Age2'] = alldata.Age2.astype(int)

# Age is replaced by Age2; drop it later together with the other raw columns.
drop_list.append('Age')

#### 2.2.3. Missing fare (only one missing record)

In [9]:
# Fill missing fares with the median fare.  The result is assigned back rather
# than calling fillna(inplace=True) on the `alldata["Fare"]` selection: that
# chained in-place form emits a FutureWarning on pandas 2.x and leaves alldata
# unchanged once Copy-on-Write is enabled.
alldata["Fare"] = alldata["Fare"].fillna(alldata["Fare"].median())

## 3. Exploratory Analysis + Feature Engineering/Transformation

### 3.1. Transform Sex column
Transform the Sex column into three dummy variables (Child, Adult_Female, Adult_Male)

In [10]:
# Children (age below roughly 16) on board seem to have had a high chance of
# survival, so passengers are classified as adult male, adult female or child.
def get_who(who):
    """Return 'Child' when the row's Age2 is below 16, otherwise the row's Sex."""
    if who.Age2 < 16:
        return 'Child'
    return who.Sex
    
# Classify every passenger as 'Child', 'female' or 'male' (see get_who).
alldata['Who'] = alldata[['Age2','Sex']].apply(get_who,axis=1)

# adding a text Class column
class_text = {1: 'First', 2: 'Second', 3: 'Third'}
alldata['Class'] = alldata['Pclass'].map(class_text)

# FIRST-AUTHOR: remove plotting
# g = sns.factorplot(x="Who", y="Survived", col="Class", 
#     data=alldata[alldata['Survived'].notnull()], saturation=.5,
#     kind="bar", ci=None, aspect=.6)
# (g.set_axis_labels("", "Survival Rate")
# .set_xticklabels(["Men", "Women", "Children"])
# .set_titles("{col_name} {col_var}")
# .despine(left=True))  
_ = alldata[alldata['Survived'].notnull()]

# Add Sex column to the drop list since we created Who column
drop_list.append('Sex')

# Create dummy variables for the Who column.  Columns are renamed by label, not
# by position, so a category that is absent from the (possibly scaled-down)
# data cannot shift or break the naming.  reindex() then keeps exactly Child and
# Adult_Female, adding an all-zero column if one is missing.  Adult_Male is
# left out as the baseline because it has the lowest average survival.
who_dummies = (
    pd.get_dummies(alldata['Who'])
    .rename(columns={'female': 'Adult_Female', 'male': 'Adult_Male'})
    .reindex(columns=['Child', 'Adult_Female'], fill_value=0)
)

alldata = alldata.join(who_dummies.astype(int))

# Class and Who were only intermediate helpers; drop them with the rest later.
drop_list.append('Class')
drop_list.append('Who')

### 3.2. Fare column

In [11]:
# Truncate fares to whole numbers.
alldata['Fare'] = alldata['Fare'].astype(int)

# Fares split by outcome: passengers who did not survive vs. those who did.
survived_flag = alldata['Survived']
fare_not_survived = alldata.loc[survived_flag == 0, 'Fare']
fare_survived = alldata.loc[survived_flag == 1, 'Fare']

# get average and std for fare of survived/not survived passengers
#avgerage_fare = pd.DataFrame([fare_not_survived.mean(), fare_survived.mean()])
#std_fare      = pd.DataFrame([fare_not_survived.std(), fare_survived.std()])

# plot
# fig, (axis1,axis2) = plt.subplots(ncols=2, figsize= (10,5))
# FIRST-AUTHOR: remove plotting
# figure1 = plt.figure(figsize=(12,5))
# plt.hist([fare_survived, fare_not_survived], stacked=True, color = ['g','black'], 
#          label = ['Survived','Not Survived'], bins = 25)
# plt.xlabel('Fare')
# plt.ylabel('Number of Passengers')
# plt.legend()


### 3.3. Pclass Column

In [12]:
# FIRST-AUTHOR: remove plotting
# sns.factorplot('Pclass','Survived', order=[1,2,3], kind='bar', ci=False, data=alldata[alldata['Survived'].notnull()])
_ = alldata[alldata['Survived'].notnull()]

# Create dummy variables for the Pclass column.  Columns are renamed by class
# value, not by position, so a class that is absent from the (possibly
# scaled-down) data cannot shift or break the naming.  reindex() then keeps
# exactly Class_1 and Class_2, adding an all-zero column if one is missing.
# Class_3 is left out as the baseline because it has the lowest average survival.
pclass_dummies = (
    pd.get_dummies(alldata['Pclass'])
    .rename(columns={1: 'Class_1', 2: 'Class_2', 3: 'Class_3'})
    .reindex(columns=['Class_1', 'Class_2'], fill_value=0)
)

drop_list.append('Pclass')

alldata = alldata.join(pclass_dummies.astype(int))


In [13]:
### 3.4. Family Size

In [14]:
# Instead of keeping the two columns Parch and SibSp separately, derive
# features from their sum:
#   Family     - 1 if the passenger had any family member aboard, else 0
#   FamilySize - number of family members aboard plus the passenger
# The question is whether having family aboard changes the chance of survival.
relatives_aboard = alldata["Parch"] + alldata["SibSp"]
alldata['Family'] = (relatives_aboard > 0).astype('int64')
alldata['FamilySize'] = relatives_aboard + 1

# plot the survivor count by family size and survivor rate by family size side-by-side
# FIRST-AUTHOR: remove plotting
# fig, (axis1,axis2) = plt.subplots(ncols=2, figsize= (10,5))

# sns.countplot(x="FamilySize", hue="Survived", ax=axis1, data=alldata[alldata['Survived'].notnull()])
# sns.factorplot(x="FamilySize", y="Survived",kind='bar', ax=axis2, ci=False, 
#                 data=alldata[alldata['Survived'].notnull()])
_ = alldata[alldata['Survived'].notnull()]
_ = alldata[alldata['Survived'].notnull()]

# Use Family size (test score = 0.789) vs Family (test score = 0.756)
drop_list.append('Family')

### 3.5. Encode Title column into numbers
It seems that adding this feature does not improve the model

In [15]:
# FIRST-AUTHOR: remove ML code
# le = preprocessing.LabelEncoder()

# alldata['Title2'] = le.fit_transform(alldata['Title'])
# With the label encoder removed, Title2 is a plain copy of the Title strings
# rather than a numeric encoding.
alldata['Title2'] = alldata['Title']


### 3.7. Drop variables that will not be used 

In [16]:
# Add the remaining raw columns to the list collected by the earlier cells,
# then remove all listed columns from alldata in one go.
drop_list.extend(['Title', 'Name','Ticket','Embarked', 'SibSp','Parch'])
alldata.drop(columns=drop_list, inplace=True)

### 3.8. More Data Analysis

In [17]:
# Pairwise correlations between the numeric features on the labelled rows.
# numeric_only=True is required because Title2 holds strings: without it
# pandas 1.5 only warns, but pandas >= 2.0 raises on the non-numeric column.
corr = (
    alldata[alldata['Survived'].notnull()]
    .drop(['PassengerId'], axis=1)
    .corr(numeric_only=True)
)
corr

  corr = alldata[alldata['Survived'].notnull()].drop(['PassengerId'], axis = 1).corr()


Unnamed: 0,Survived,Fare,Age2,Child,Adult_Female,Class_1,Class_2,FamilySize
Survived,1.0,0.257482,-0.078578,0.136884,0.506562,0.285904,0.093349,0.016639
Fare,0.257482,1.0,0.099002,0.003551,0.191044,0.591693,-0.116346,0.217052
Age2,-0.078578,0.099002,1.0,-0.572913,0.066849,0.337534,0.014698,-0.271249
Child,0.136884,0.003551,-0.572913,1.0,-0.217481,-0.133146,0.009655,0.441518
Adult_Female,0.506562,0.191044,0.066849,-0.217481,1.0,0.144043,0.060483,0.107192
Class_1,0.285904,0.591693,0.337534,-0.133146,0.144043,1.0,-0.288585,-0.046114
Class_2,0.093349,-0.116346,0.014698,0.009655,0.060483,-0.288585,1.0,-0.038594
FamilySize,0.016639,0.217052,-0.271249,0.441518,0.107192,-0.046114,-0.038594,1.0


In [18]:
# FIRST-AUTHOR: remove plotting
# sns.heatmap(corr, 
#             xticklabels=corr.columns.values,
#             yticklabels=corr.columns.values)
# The two assignments below evaluate the same expressions the removed heatmap
# call took as xticklabels and yticklabels; the results are discarded.
_ = corr.columns.values
_ = corr.columns.values

### 3.9. Prepare the training and testing sets

In [19]:
# Split the combined frame back apart: rows with a Survived value form the
# training set, rows without one form the test set.
has_label = alldata['Survived'].notnull()
train = alldata[has_label].copy()
test = alldata[~has_label].copy()

# PassengerId is an identifier, not a feature; `test` keeps it for the
# submission file.
train.drop(columns='PassengerId', inplace=True)

X_train = train.drop(columns='Survived')
# Survived became float when the train and test sets were concatenated.
y_train = train['Survived'].astype('int')

X_test = test.drop(columns=['Survived', 'PassengerId']).copy()

#### 3.9.1. Feature Scaling
For RF and LR classifiers, scaling does not make any difference. We will skip below step.

In [20]:
# Lets  scale all features to [0, 1] range
# min_max_scaler = preprocessing.MinMaxScaler()
# X_train = X_train / X_train.max()
# X_test = X_test / X_test.max()

## 4. Modeling and Hyperparameter Tuning

### 4.1. Random Forests with OOB

In [21]:
# Random Forests with OOB
# FIRST-AUTHOR: remove ML code, plotting
# min_estimators = 30
# max_estimators = 180

# RANDOM_STATE = 2017

# rf_clf = RandomForestClassifier(oob_score=True, warm_start=True, random_state=RANDOM_STATE)
# error_rate = []
# for i in range(min_estimators, max_estimators + 1):
#     rf_clf.set_params(n_estimators=i)
#     rf_clf.fit(X_train, y_train)
    
#     # Record the OOB error for each `n_estimators=i` setting.
#     oob_error = 1 - rf_clf.oob_score_
#     error_rate.append(oob_error)

# label = "RandomForestClassifier, max_features=None"
# # Generate the "OOB error rate" vs. "n_estimators" plot.
# plt.plot(range(min_estimators, max_estimators + 1), error_rate, label=label)

# plt.xlim(min_estimators, max_estimators)
# plt.xlabel("n_estimators")
# plt.ylabel("OOB error rate")
# plt.legend(loc="upper right")
# plt.show()
# rf_clf.score(X_train, y_train)
# We got a test score of 0.75 for random forest which is less that we got from LR (0.79)

#### 4.2.1. Calculate Variable Importance

In [22]:
# Variable importance is used to validate our intuition, to decide on further
# feature engineering, and to check the code for potential bugs.
# FIRST-AUTHOR: remove ML code
# et_clf = ExtraTreesClassifier(n_estimators=120)
# et_clf = et_clf.fit(X_train, y_train)
# features['Importance'] = et_clf.feature_importances_
# With the classifier removed, the Importance column is filled with the feature
# names themselves, so the table ends up sorted by name in descending order.
features = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': X_train.columns})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)
features

Unnamed: 0,Feature,Importance
0,Title2,Title2
1,Fare,Fare
2,FamilySize,FamilySize
3,Class_2,Class_2
4,Class_1,Class_1
5,Child,Child
6,Age2,Age2
7,Adult_Female,Adult_Female


### 4.3. Logistic Regression
We got the highest score with this learner. We will use this one for the prediction.

In [23]:
# FIRST-AUTHOR: remove ML code
# log_reg = LogisticRegression()
# log_reg.fit(X_train, y_train)
# log_reg.score(X_train, y_train)
#log_reg.get_params()

#### 4.3.1. Get correlation coefficients

In [24]:
# One row per training feature.  The coefficient column is omitted because the
# logistic-regression fit was removed.
# FIRST-AUTHOR: remove ML code
# coeff['Coefficient Estimate'] = pd.Series(log_reg.coef_[0])
coeff = pd.DataFrame({'Feature': X_train.columns})
coeff

Unnamed: 0,Feature
0,Fare
1,Age2
2,Child
3,Adult_Female
4,Class_1
5,Class_2
6,FamilySize
7,Title2


## 5. Making Prediction
We will use the model from logistic regression as it has highest test score.

In [25]:
# rf_clf1 = RandomForestClassifier()
# rf_clf1.set_params(n_estimators=120)
# rf_clf1.fit(X_train, y_train)
    
# y_test = rf_clf1.predict(X_test)

# rf_clf1.score(X_train, y_train)
# FIRST-AUTHOR: remove ML code
# y_test = log_reg.predict(X_test)

In [26]:
# FIRST-AUTHOR: remove ML code
# submission = pd.DataFrame({
#         "PassengerId": test["PassengerId"],
#         "Survived": y_test
#     })
# y_train stands in for the removed predictions.  It is passed as a plain array
# so the values line up with the test rows by position: test and y_train carry
# disjoint row labels, and building the frame from the two Series would align
# on the union of those labels, giving twice as many rows with NaN in both
# columns.  Both sets are read from the same file, so the lengths match; a
# mismatch raises ValueError instead of silently writing a malformed file.
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_train.values
    })
submission.to_csv('titanic.csv', index=False)

In [27]:
# we will try 3 or 5-fold cross validation with SVM at some later time
# svc_clf = svm.SVC()

# kfold = KFold(n_splits=5)

# [svc_clf.fit(X_train[train], y_train[train]).score(X_train[test], y_train[test])
#         for train, test in k_fold.split(X_train)]
# y_test = svc.predict(X_test)

# svc_clf.score(X_train, y_train)