# Rapid prototyping - Titanic

## Package loading

In [43]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [44]:
%matplotlib inline

In [45]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
gender_sub = pd.read_csv("gender_submission.csv")

## Basic Exploratory Data Analysis

In [123]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked'],
      dtype='object')

In [46]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [47]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [48]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [49]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Rapid Prototyping

The goal with rapid prototyping is to prove that a specific project or concept is possible in the fastest, most efficient way possible. We want to answer the question:


<p style="text-align: center;"><strong>Can we get a working prototype?</strong></p>


If we can't, then we know we don't have to waste effort on an unsolvable problem. If we can solve the problem then we can work on a much deeper analysis.



For the purpose of rapid prototyping, lets impute data in the most simplest way or drop it if we need to.

In [50]:
# Fill the age with the median value

median_age_train = train.Age.median()

# Fill missing Age, forward fill embarked, drop what we may not need for rapid prototyping
train.Age.fillna(median_age_train, inplace=True)
train.Embarked.fillna(method='ffill', inplace=True)
train.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

median_age_test = test.Age.median() # set median value

# fill NAN data
test.Age.fillna(median_age_test, inplace=True)
test.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)

### Deep Feature Synthesis

Part of the goal of a working prototype would be to create features that can help out prototype do good work without too much work or understanding of the domain initially.

While the Titanic problem is simple enough to understand, when confronted with more difficult problems where features aren't well understood. This can be very valuable.

In [51]:
import featuretools as ft

In [60]:
full = train.append(test)
passenger_id=test['PassengerId']

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [61]:
# replace missing Fare
full.Fare.fillna(full.Fare.mean(), inplace=True)

# Encode Gender
full['Sex'] = full.Sex.apply(lambda x: 0 if x == "female" else 1)

# Encode Embarked
full['Embarked'] = full['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

# replace all other missing with 0
full.fillna(0, inplace=True)



In [63]:
# We create an entity set
es = ft.EntitySet(id = 'titanic')

In [64]:
es = es.entity_from_dataframe(entity_id = 'full', dataframe = full.drop(['Survived'], axis=1), 
                              variable_types = 
                              {
                                  'Embarked': ft.variable_types.Categorical,
                                  'Sex': ft.variable_types.Boolean
                              },
                              index = 'PassengerId')

es

Entityset: titanic
  Entities:
    full [Rows: 1309, Columns: 8]
  Relationships:
    No relationships

In [65]:
es = es.normalize_entity(base_entity_id='full', new_entity_id='Embarked', index='Embarked')
es = es.normalize_entity(base_entity_id='full', new_entity_id='Sex', index='Sex')
es = es.normalize_entity(base_entity_id='full', new_entity_id='Pclass', index='Pclass')
es = es.normalize_entity(base_entity_id='full', new_entity_id='Parch', index='Parch')
es = es.normalize_entity(base_entity_id='full', new_entity_id='SibSp', index='SibSp')
es

Entityset: titanic
  Entities:
    full [Rows: 1309, Columns: 8]
    Embarked [Rows: 3, Columns: 1]
    Sex [Rows: 2, Columns: 1]
    Pclass [Rows: 3, Columns: 1]
    Parch [Rows: 8, Columns: 1]
    SibSp [Rows: 7, Columns: 1]
  Relationships:
    full.Embarked -> Embarked.Embarked
    full.Sex -> Sex.Sex
    full.Pclass -> Pclass.Pclass
    full.Parch -> Parch.Parch
    full.SibSp -> SibSp.SibSp

In [66]:
primitives = ft.list_primitives()
pd.options.display.max_colwidth = 100
primitives[primitives['type'] == 'aggregation'].head(primitives[primitives['type'] == 'aggregation'].shape[0])

Unnamed: 0,name,type,description
0,num_true,aggregation,Counts the number of `True` values.
1,std,aggregation,"Computes the dispersion relative to the mean value, ignoring `NaN`."
2,sum,aggregation,"Calculates the total addition, ignoring `NaN`."
3,count,aggregation,"Determines the total number of values, excluding `NaN`."
4,num_unique,aggregation,"Determines the number of distinct values, ignoring `NaN` values."
5,skew,aggregation,Computes the extent to which a distribution differs from a normal distribution.
6,time_since_last,aggregation,Calculates the time elapsed since the last datetime (in seconds).
7,time_since_first,aggregation,Calculates the time elapsed since the first datetime (in seconds).
8,max,aggregation,"Calculates the highest value, ignoring `NaN` values."
9,median,aggregation,Determines the middlemost number in a list of values.


In [67]:
primitives[primitives['type'] == 'transform'].head(primitives[primitives['type'] == 'transform'].shape[0])

Unnamed: 0,name,type,description
20,haversine,transform,Calculates the approximate haversine distance between two LatLong
21,multiply_numeric_scalar,transform,Multiply each element in the list by a scalar.
22,less_than_equal_to_scalar,transform,Determines if values are less than or equal to a given scalar.
23,modulo_by_feature,transform,Return the modulo of a scalar by each element in the list.
24,num_characters,transform,Calculates the number of characters in a string.
25,time_since_previous,transform,Compute the time in seconds since the previous instance of an entry.
26,is_null,transform,Determines if a value is null.
27,or,transform,Element-wise logical OR of two lists.
28,latitude,transform,Returns the first tuple value in a list of LatLong tuples.
29,scalar_subtract_numeric_feature,transform,Subtract each value in the list from a given scalar.


In [68]:
features, feature_names = ft.dfs(entityset = es, 
                                 target_entity = 'full', 
                                 max_depth = 2)

In [69]:
feature_names

[<Feature: Age>,
 <Feature: Fare>,
 <Feature: Parch>,
 <Feature: Pclass>,
 <Feature: SibSp>,
 <Feature: Embarked>,
 <Feature: Sex>,
 <Feature: Embarked.SUM(full.Age)>,
 <Feature: Embarked.SUM(full.Fare)>,
 <Feature: Embarked.STD(full.Age)>,
 <Feature: Embarked.STD(full.Fare)>,
 <Feature: Embarked.MAX(full.Age)>,
 <Feature: Embarked.MAX(full.Fare)>,
 <Feature: Embarked.SKEW(full.Age)>,
 <Feature: Embarked.SKEW(full.Fare)>,
 <Feature: Embarked.MIN(full.Age)>,
 <Feature: Embarked.MIN(full.Fare)>,
 <Feature: Embarked.MEAN(full.Age)>,
 <Feature: Embarked.MEAN(full.Fare)>,
 <Feature: Embarked.COUNT(full)>,
 <Feature: Embarked.NUM_UNIQUE(full.Parch)>,
 <Feature: Embarked.NUM_UNIQUE(full.Pclass)>,
 <Feature: Embarked.NUM_UNIQUE(full.SibSp)>,
 <Feature: Embarked.NUM_UNIQUE(full.Sex)>,
 <Feature: Embarked.MODE(full.Parch)>,
 <Feature: Embarked.MODE(full.Pclass)>,
 <Feature: Embarked.MODE(full.SibSp)>,
 <Feature: Embarked.MODE(full.Sex)>,
 <Feature: Sex.SUM(full.Age)>,
 <Feature: Sex.SUM(full.Far

In [70]:
len(feature_names)

112

In a few minutes we've generated a bunch of features that we can use for prototyping our problem

In [75]:
# Threshold for removing correlated variables
threshold = 0.95

# Absolute value correlation matrix
corr_matrix = features.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper.head(50)

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Embarked,Sex,Embarked.SUM(full.Age),Embarked.SUM(full.Fare),Embarked.STD(full.Age),...,SibSp.MEAN(full.Fare),SibSp.COUNT(full),SibSp.NUM_UNIQUE(full.Parch),SibSp.NUM_UNIQUE(full.Pclass),SibSp.NUM_UNIQUE(full.Embarked),SibSp.NUM_UNIQUE(full.Sex),SibSp.MODE(full.Parch),SibSp.MODE(full.Pclass),SibSp.MODE(full.Embarked),SibSp.MODE(full.Sex)
Age,,0.180519,0.125677,0.380274,0.18892,0.022174,0.052928,0.040441,0.008514,0.045555,...,0.06079957,0.1308523,0.1976589,0.2133987,0.2012332,,0.2379074,,,0.02282128
Fare,,,0.221522,0.558477,0.160224,0.064135,0.185484,0.136867,0.010706,0.193481,...,0.2256391,0.2089606,0.04979847,0.03105973,0.09761043,,0.06134832,,,0.1914642
Parch,,,,0.018322,0.373587,0.096857,0.213125,0.083092,0.102642,0.091228,...,0.3302803,0.3625643,0.05262633,0.265065,0.2781161,,0.2938461,,,0.2488658
Pclass,,,,,0.060832,0.033373,0.124617,0.051522,0.091441,0.280068,...,0.09321064,0.05610448,0.2076503,0.1435907,0.1240303,,0.1488672,,,0.162338
SibSp,,,,,,0.074966,0.109609,0.076507,0.070912,0.032782,...,0.7100906,0.8101948,0.4109176,0.7593949,0.7792276,,0.8217369,,,0.3515147
Embarked,,,,,,,0.124849,0.966496,0.983744,0.604985,...,0.07474154,0.05931944,0.02727147,0.03287548,0.0896155,,0.05740721,,,0.04370091
Sex,,,,,,,,0.123637,0.12074,0.066315,...,0.1925157,0.1773133,0.08506071,0.001654746,0.04743528,,0.0206212,,,0.1868998
Embarked.SUM(full.Age),,,,,,,,,0.904692,0.380337,...,0.05080938,0.04266879,0.06158505,0.05262555,0.1046465,,0.07803594,,,0.01143051
Embarked.SUM(full.Fare),,,,,,,,,,0.738135,...,0.08851772,0.06861361,0.002182977,0.01775324,0.07554246,,0.04069646,,,0.06454272
Embarked.STD(full.Age),,,,,,,,,,,...,0.1116884,0.08137341,0.09277791,0.04479328,0.001724522,,0.03522716,,,0.1220009


In [76]:
collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d features to remove.' % (len(collinear_features)))

There are 48 features to remove.


In [77]:
features_filtered = features.drop(columns = collinear_features)

print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])

The number of features that passed the collinearity threshold:  64


## Rapid XGBoost

In [95]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report


In [78]:
features_positive = features_filtered.loc[:, features_filtered.ge(0).all()]

In [79]:
features_positive

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,SibSp,Embarked,Sex,Embarked.STD(full.Age),Embarked.STD(full.Fare),Embarked.NUM_UNIQUE(full.Pclass),...,SibSp.MEAN(full.Age),SibSp.MEAN(full.Fare),SibSp.NUM_UNIQUE(full.Parch),SibSp.NUM_UNIQUE(full.Pclass),SibSp.NUM_UNIQUE(full.Embarked),SibSp.NUM_UNIQUE(full.Sex),SibSp.MODE(full.Parch),SibSp.MODE(full.Pclass),SibSp.MODE(full.Embarked),SibSp.MODE(full.Sex)
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,7.2500,0,3,1,0,1,13.005236,37.076590,3,...,30.643448,48.711300,8,3,3,2,0,3,0,0
2,38.0,71.2833,0,1,1,1,0,13.632262,84.036802,3,...,30.643448,48.711300,8,3,3,2,0,3,0,0
3,26.0,7.9250,0,3,0,0,0,13.005236,37.076590,3,...,30.168810,25.793835,6,3,3,2,0,3,0,1
4,35.0,53.1000,0,1,1,0,0,13.005236,37.076590,3,...,30.643448,48.711300,8,3,3,2,0,3,0,0
5,35.0,8.0500,0,3,0,0,1,13.005236,37.076590,3,...,30.168810,25.793835,6,3,3,2,0,3,0,1
6,28.0,8.4583,0,3,0,2,1,9.991200,14.857148,3,...,30.168810,25.793835,6,3,3,2,0,3,0,1
7,54.0,51.8625,0,1,0,0,1,13.005236,37.076590,3,...,30.168810,25.793835,6,3,3,2,0,3,0,1
8,2.0,21.0750,1,3,3,0,1,13.005236,37.076590,3,...,18.650000,71.332090,3,3,1,2,1,3,0,0
9,27.0,11.1333,2,3,0,0,0,13.005236,37.076590,3,...,30.168810,25.793835,6,3,3,2,0,3,0,1
10,14.0,30.0708,0,2,1,1,0,13.632262,84.036802,3,...,30.643448,48.711300,8,3,3,2,0,3,0,0


In [117]:
train_X = features_positive[:train.shape[0]]
train_y = train['Survived']

test_X = features_positive[train.shape[0]:]

In [118]:
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

In [125]:
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=300, learning_rate=0.05, random_state=42)
gbm.fit(train_X, train_y)
cross_val_score(gbm,train_X, train_y, scoring='accuracy', cv=10).mean()

0.8294841675178753

In [126]:
gbm_pred = gbm.predict(X_test)

In [128]:
print(classification_report(y_test, gbm_pred))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       105
           1       0.89      0.85      0.87        74

   micro avg       0.89      0.89      0.89       179
   macro avg       0.89      0.89      0.89       179
weighted avg       0.89      0.89      0.89       179



In [97]:
gbm_pred_final = gbm.predict(test_X)

## Submission

In [112]:
sub = test_X.reset_index()

In [113]:
sub['Survived'] = pd.DataFrame(gbm_pred_final)

In [114]:
sub = sub[['PassengerId', 'Survived']]

In [116]:
sub.to_csv('gbm_submission.csv', index=False)