In [1]:
#pip install kaggle --upgrade

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
from fastai import *
from fastai.tabular import *
#import kaggle
from pathlib import Path
import pandas as pd

In [4]:
path = Path('data')
path.mkdir(parents=True, exist_ok=True)

In [5]:
#! kaggle competitions download -c titanic -p {path}

# Load data #
If data is allready preprocessed: skip to __Prepare DataFrame__

In [6]:
df_train = pd.read_csv(path/'train.csv')

df_test = pd.read_csv(path/'test.csv')
test_id = df_test['PassengerId'] # Save PassengerIds for submission

Join train and validation set for simpler prepocess.

In [7]:
df_train['is_test'] = False
df_test['is_test'] = True

df = pd.concat([df_train, df_test], sort = False)

In [8]:
print(df_train.shape, df_test.shape, df.shape)
df.tail()

(891, 13) (418, 12) (1309, 13)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_test
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S,True
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,True
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S,True
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S,True
417,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C,True


## Preprocess  ##
Some of the preprocessing is copied from __Mathias Thorsen__ (https://www.kaggle.com/phithor/titanic-using-fast-ai-tabular), especially the __title__ & the __age__.

Only the last name & the title _might_ be relevant.

In [9]:
df['LName'] = df['Name'].apply(lambda name: name.split(',')[0])
df['Title'] = df['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())

Cluster titles

In [10]:
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty",
    "Dona":"Dona",
}

df['Title'] = df.Title.map(Title_Dictionary)

Only keep classes that might have relevance.

In [11]:
df.drop(['Name','Ticket','PassengerId'], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,is_test,LName,Title
0,0.0,3,male,22.0,1,0,7.25,,S,False,Braund,Mr
1,1.0,1,female,38.0,1,0,71.2833,C85,C,False,Cumings,Mrs
2,1.0,3,female,26.0,0,0,7.925,,S,False,Heikkinen,Miss
3,1.0,1,female,35.0,1,0,53.1,C123,S,False,Futrelle,Mrs
4,0.0,3,male,35.0,0,0,8.05,,S,False,Allen,Mr


## Check for missings ##

In [13]:
df.isnull().sum()

Survived     418
Pclass         0
Sex            0
Age          263
SibSp          0
Parch          0
Fare           1
Cabin       1014
Embarked       2
is_test        0
LName          0
Title          0
dtype: int64

Cluster the cabins, only keep the first letter.

In [14]:
df['Cabin'] = df['Cabin'].fillna(value = 'N')
df['Cabin'] = df['Cabin'].apply(lambda x: x[0])

'Fare' has no missing in the train set, but in the test set, so it has to be filled manually.

In [15]:
df['Fare'] = df['Fare'].fillna(value = df['Fare'].mean())

Fill the __Age__ by subgroups.

In [16]:
df_grouped = df.groupby(['Sex','Pclass','Title']).mean()['Age']
df_grouped = df_grouped.reset_index()[['Sex', 'Pclass', 'Title', 'Age']]

def fill_Age(row):
    condition = ((df_grouped['Sex'] == row['Sex']) &
                 (df_grouped['Pclass'] == row['Pclass']) &
                 (df_grouped['Title'] == row['Title'])
                )
    return df_grouped[condition]['Age'].values[0]

In [17]:
df['Age'] = df.apply(lambda x: fill_Age(x) if np.isnan(x['Age']) else x['Age'], axis = 1)

## Check for missings ##
All other missings can be handled by the fastai DataLoader

In [18]:
df.isnull().sum()

Survived    418
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin         0
Embarked      2
is_test       0
LName         0
Title         0
dtype: int64

In [19]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,is_test,LName,Title
0,0.0,3,male,22.0,1,0,7.25,N,S,False,Braund,Mr
1,1.0,1,female,38.0,1,0,71.2833,C,C,False,Cumings,Mrs
2,1.0,3,female,26.0,0,0,7.925,N,S,False,Heikkinen,Miss
3,1.0,1,female,35.0,1,0,53.1,C,S,False,Futrelle,Mrs
4,0.0,3,male,35.0,0,0,8.05,N,S,False,Allen,Mr


#### Save

In [20]:
df.to_csv(path /'titanic_preproc.csv', index = False)

# Prepare Dataframe #

Split prepocessed dataframe from its 'is_test' column.

In [21]:
df = pd.read_csv(path /'titanic_preproc.csv')

In [22]:
## rmv
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,is_test,LName,Title
0,0.0,3,male,22.0,1,0,7.25,N,S,False,Braund,Mr
1,1.0,1,female,38.0,1,0,71.2833,C,C,False,Cumings,Mrs
2,1.0,3,female,26.0,0,0,7.925,N,S,False,Heikkinen,Miss
3,1.0,1,female,35.0,1,0,53.1,C,S,False,Futrelle,Mrs
4,0.0,3,male,35.0,0,0,8.05,N,S,False,Allen,Mr


In [23]:
df_train = df[df['is_test'] == False].drop('is_test', axis = 1)
df_test = df[df['is_test'] == True].drop(['is_test','Survived'], axis = 1)

In [24]:
df_train['Survived'] = df_train['Survived'].astype(int)

In [25]:
dep_var = 'Survived' # target variable
cat_names = ['LName','Pclass','Sex','SibSp','Parch','Cabin','Title']
cont_names = ['Age','Fare']
procs = [FillMissing, Categorify, Normalize]
t_procs = [partial(FillMissing,test=True), partial(Categorify,test=True),partial(Normalize,test=True)]

In [26]:
test = TabularList.from_df(df_test, path='.', cat_names=cat_names, cont_names=cont_names, procs = t_procs)

In [27]:
data = (TabularList.from_df(df_train, path = path, cat_names=cat_names, cont_names=cont_names, procs = procs)
        .split_by_rand_pct(0) # Force the DataBunch to learn on all data. Hyperparameters come from a random search.
        .label_from_df(cols = dep_var)
        .add_test(test)
        .databunch())

In [28]:
data.show_batch(rows = 15)

LName,Pclass,Sex,SibSp,Parch,Cabin,Title,Age,Fare,target
Vovk,3,male,0,0,N,Mr,-0.55,-0.4892,0
Tobin,3,male,0,0,F,Mr,-0.0827,-0.4921,0
Porter,1,male,0,0,C,Mr,1.2989,0.3984,0
Wick,1,female,0,2,C,Miss,0.1156,2.6696,1
Sutehall,3,male,0,0,N,Mr,-0.3281,-0.5062,0
Klaber,1,male,0,0,C,Mr,0.8885,-0.1138,0
Sage,3,male,8,2,N,Mr,-0.0827,0.7515,0
Newell,1,male,0,2,D,Mr,2.1124,1.6314,0
Lobb,3,male,1,0,N,Mr,0.0416,-0.3241,0
McNamee,3,male,1,0,N,Mr,-0.4021,-0.3241,0


# Learner #

### Hyperparameters ###
These were found by a random search ([Titanic_TabularLearner_RndSearch](Titanic_TabularLearner-RndSearch.ipynb)).

In [29]:
y_range = [0, 1.2]

lr = 0.097620
wd = 0.014
emb_drop = 0.04
lyrs = [100,200,50]
p = 0.3
opt_func = optim.Adam

learn = tabular_learner(data,
                        layers = lyrs,
                        metrics = accuracy,
                        ps = p,
                        emb_drop = emb_drop,
                        y_range = y_range,
                        opt_func = opt_func
                       )

learn.fit_one_cycle(10, slice(lr), wd)

epoch,train_loss,valid_loss,accuracy,time
0,0.598469,#na#,00:22,
1,0.53444,#na#,00:21,
2,0.503344,#na#,00:21,
3,0.468726,#na#,00:21,
4,0.446109,#na#,00:22,
5,0.427383,#na#,00:21,
6,0.40892,#na#,00:21,
7,0.395605,#na#,00:21,
8,0.380642,#na#,00:21,
9,0.368834,#na#,00:22,


## Predict the labels of the test set.

In [30]:
predictions, _ = learn.get_preds(DatasetType.Test)
labels = np.argmax(predictions, 1)

Save the predicted labels and submit to kaggle

In [31]:
test_id = pd.read_csv(path / 'test.csv')['PassengerId'] # Save PassengerIds for submission

submission = pd.DataFrame({'PassengerId': test_id, 'Survived': labels})
submission.to_csv(path / 'submission_rndSearch_all.csv', index = False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [32]:
#! kaggle competitions submit -c titanic -f submission_rndSearch_all.csv -m "fastai TabularLearner, hyperprams by random search, train on all data"

## Results: ##
88% __accuracy__ on the validation set (20% of the data). 

81.81% __accuracy__ in the competition.

__Rank__: 542 (as of May2020); __Top 2.8%__