# Titanic Dataset
Exercise 4 of Chapter 2 is to build a classifier for the
[Titanic Kaggle problem](https://www.kaggle.com/c/titanic).

The first step is to download the data:

In [0]:
# Installing the Kaggle API

!pip install -U -q kaggle
!mkdir -p /root/.kaggle

In [0]:
# Upload kaggle.json (the Kaggle API key file) from my machine using
# the google.colab module.

from google.colab import files

# The trailing semicolon suppresses the cell output: upload() returns the
# uploaded files' contents, and leaving it as the cell's last expression
# would echo the API key into the saved notebook.
files.upload();

In [0]:
# Copying the API key.

!cp kaggle.json ~/.kaggle/

In [0]:
# Restrict the key file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [14]:
# List Kaggle competitions (presumably a quick check that the API key works).
!kaggle competitions list

ref                                            deadline             category            reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
digit-recognizer                               2030-01-01 00:00:00  Getting Started  Knowledge       2462           False  
titanic                                        2030-01-01 00:00:00  Getting Started  Knowledge      10016            True  
house-prices-advanced-regression-techniques    2030-01-01 00:00:00  Getting Started  Knowledge       4066           False  
imagenet-object-localization-challenge         2029-12-31 07:00:00  Research         Knowledge         35           False  
competitive-data-science-predict-future-sales  2019-12-31 23:59:00  Playground           Kudos       2450           False  
two-sigma-financial-news                       2019-07-15 23:59:00  Featured          $100,000       2927           False  
LANL-Ear

In [15]:
# List the files that belong to the titanic competition.
!kaggle competitions files -c titanic

name                   size  creationDate         
---------------------  ----  -------------------  
train.csv              60KB  2013-06-28 13:40:25  
test.csv               28KB  2013-06-28 13:40:24  
gender_submission.csv   3KB  2017-02-01 01:49:18  


In [16]:
# Download the titanic competition files into /content/kaggle on the Colab VM.

!kaggle competitions download  -c titanic -p /content/kaggle

Downloading train.csv to /content/kaggle
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 24.3MB/s]
Downloading test.csv to /content/kaggle
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 25.9MB/s]
Downloading gender_submission.csv to /content/kaggle
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 2.51MB/s]


In [137]:
import pandas as pd

# Read from the same absolute directory the download cell wrote to
# (/content/kaggle), so loading does not depend on the kernel's current
# working directory.
df = pd.read_csv('/content/kaggle/train.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
# Class for selecting features from the DataFrame.

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a fixed set of columns out of a DataFrame.

    ``transform`` returns the ``.values`` of the selected columns, so the
    following pipeline steps receive that array rather than a DataFrame.
    """

    def __init__(self, attribute_names):
        # Column labels to keep, stored under the constructor argument's name.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Index ``X`` by ``attribute_names`` and return the result's ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

In [0]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in a DataFrame with each column's most frequent value.

    After ``fit``, ``most_frequent_`` is a Series indexed by the column names
    of the fitted frame. A column with no observed values maps to NaN there
    and is left unfilled by ``transform``.
    """

    def fit(self, X, y=None):
        """Record the most frequent non-missing value of every column of ``X``."""
        def top_value(column):
            # value_counts() sorts by frequency and leaves NaN out.
            counts = column.value_counts()
            # An all-missing column yields no counts; indexing [0] on it
            # would raise IndexError, so fall back to NaN instead.
            return counts.index[0] if len(counts) else float('nan')

        self.most_frequent_ = pd.Series([top_value(X[c]) for c in X],
                                        index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the fitted values."""
        # Drop the NaN placeholders so fillna simply skips columns for which
        # no fill value could be learned.
        return X.fillna(self.most_frequent_.dropna())

In [0]:
# Preparing the dataset with some pipelines.

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Separate the label column from the features.
X = df.drop('Survived', axis=1)
y_train = df['Survived']

# NOTE(review): PassengerId looks like a row identifier rather than a
# predictive signal - consider dropping it from the numeric features.
num_attribs = ['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare']
# NOTE(review): Cabin is missing in several of the rows shown by df.head();
# most-frequent imputation plus one-hot encoding may not suit it - verify.
cat_attribs = ['Pclass', 'Sex', 'Embarked', 'Cabin']
# Free-text columns; neither pipeline below uses them.
str_attribs = ['Name', 'Ticket']

# Transformer for numerical attributes: select, fill gaps with the median,
# then standardize.
num_pipeline = Pipeline([
    ('num_attrib_selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Transformer for categorical attributes: select, fill gaps with the most
# frequent value, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ('cat_attrib_selector', DataFrameSelector(cat_attribs)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes categories that were not seen during
    # fit as all zeros instead of raising, so the fitted pipeline can also
    # transform data (e.g. test.csv) containing new category values.
    # NOTE: scikit-learn >= 1.2 renames `sparse` to `sparse_output`.
    ('one_hot_encoder', OneHotEncoder(categories='auto', sparse=False,
                                      handle_unknown='ignore')),
])

# Concatenate the numeric and categorical outputs column-wise.
feature_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

X_train = feature_pipeline.fit_transform(X)

In [150]:
# First trying Stochastic Gradient Descent, which gets 70%-85% accuracy.

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# max_iter is an iteration count and must be an integer (1e3 is a float,
# which recent scikit-learn versions reject). random_state fixes the
# shuffling so the fold scores are reproducible from run to run.
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)

cross_val_score(sgd_clf, X_train, y_train, cv=5)

array([0.7150838 , 0.70949721, 0.75280899, 0.78089888, 0.70621469])

In [151]:
# Trying a RandomForestClassifier, a more complex model,
# it appears to improve the accuracy, suggesting that
# the previous model was underfitting slightly.

from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's randomness so the fold scores are
# reproducible from run to run.
forest_clf = RandomForestClassifier(n_estimators=100, max_features=5,
                                    random_state=42)

cross_val_score(forest_clf, X_train, y_train, cv=5)

array([0.74301676, 0.7877095 , 0.85393258, 0.83707865, 0.85875706])

In [166]:
# Grid searching using RandomForestClassifier. The best model
# gets 81%-82% accuracy, as good as the author's solution.


from sklearn.model_selection import GridSearchCV

# Candidate settings: 2 forest sizes x 3 max_features values = 6 combinations.
param_grid = [{
    'n_estimators': [10, 100],
    'max_features': [5, 10, 15],
}]

# Seed the forests so that re-running the search compares the candidates on
# the same footing and selects the same parameters each time.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_features': 10, 'n_estimators': 100}
0.813692480359147
