# Titanic dataset classification

## Setup

In [1]:
using Revise
using Titanic
using DataFrames
using Statistics
using StatsBase
using StatsPlots
using Flux
using Plots

In [2]:
# Select the GR plotting backend and apply the :lime theme to all subsequent plots.
gr()
theme(:lime)

In [3]:
# Load the training set via the project's CSV helper, then show per-column summary
# statistics (mean/min/median/max, number of missing values, element type).
df = read_csv_data("../data/train.csv")
describe(df)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,PassengerId,446.0,1,446.0,891,0,Int64
2,Survived,0.383838,0,0.0,1,0,Int64
3,Pclass,2.30864,1,3.0,3,0,Int64
4,Name,,"Abbing, Mr. Anthony",,"van Melkebeke, Mr. Philemon",0,String
5,Sex,,female,,male,0,String7
6,Age,29.6991,0.42,28.0,80.0,177,"Union{Missing, Float64}"
7,SibSp,0.523008,0,0.0,8,0,Int64
8,Parch,0.381594,0,0.0,6,0,Int64
9,Ticket,,110152,,WE/P 5735,0,String31
10,Fare,32.2042,0.0,14.4542,512.329,0,Float64


## Data (pre-)processing

* target variable
    * survived
* ordinal properties (numerical)
    * pclass
    * age
    * sibsp
    * parch
    * fare
* nominal properties:
    * sex
    * name
    * ticket
    * cabin
    * embarked

### Couple of graphs

In [None]:
# Histogram of passenger ages; `bins = :scott` lets the plotting library pick the bin
# width automatically, and `fillalpha` makes the bars semi-transparent.
@df df histogram(:Age, bins = :scott, fillalpha = 0.4)

In [None]:
# Re-initialise the GR backend with a 600x500 canvas, then draw a correlation plot of
# the target (Survived) against Fare.
gr(size = (600, 500))
@df df corrplot([:Survived :Fare], grid = false)

### Missing values

How many values are missing in each column?

In [None]:
# For every column, report (fraction of rows missing, absolute number of missing rows).
Dict(
    map(names(df)) do col
        n_missing = count(ismissing, df[!, col])
        col => (n_missing / nrow(df), n_missing)
    end,
)

Replace missing values in Age with the column median.

In [4]:
# Compute the median of Age with the project's column helper, then substitute it for
# every missing entry. After this step the Age column is shown as plain Float64.
# NOTE(review): presumably apply_to_cols skips missing values before calling `median`
# — confirm in the Titanic package.
col_median = apply_to_cols(df, :Age, median)
df = replace_in_cols(df, :Age, missing, col_median)

Row,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String7,Float64,Int64,Int64,String31,Float64,String15,String1
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,missing,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,missing,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,missing,S
6,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,missing,Q
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,missing,S
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,missing,S
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,missing,C


Replace missing values in Embarked with the most common city.

In [5]:
# Find the most frequent Embarked value with the project's `most_common` helper and use
# it to fill the missing entries of that column.
col_city = apply_to_cols(df, :Embarked, most_common)
df = replace_in_cols(df, :Embarked, missing, col_city)

Row,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String7,Float64,Int64,Int64,String31,Float64,String15,String1
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,missing,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,missing,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,missing,S
6,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,missing,Q
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,missing,S
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,missing,S
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,missing,C


Replace missing values in Cabin with an "unknown" marker and strip the cabin numbers, leaving only the cabin's class letter.

In [6]:
# Distinct leading letters among the non-missing Cabin entries.
unique([first(cabin) for cabin in skipmissing(df.Cabin)])

8-element Vector{Char}:
 'C': ASCII/Unicode U+0043 (category Lu: Letter, uppercase)
 'E': ASCII/Unicode U+0045 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'D': ASCII/Unicode U+0044 (category Lu: Letter, uppercase)
 'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
 'B': ASCII/Unicode U+0042 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'T': ASCII/Unicode U+0054 (category Lu: Letter, uppercase)

Use 'N' as the replacement value; it does not collide with any of the existing cabin letters listed above.

In [7]:
# Fill missing cabins with the placeholder "N", then reduce every cabin entry with
# `strip_cabin_numbers` (the resulting column is shown as Char) and store the result
# back into the Cabin column.
df = replace_in_cols(df, :Cabin, missing, "N")
col_cabin = apply_to_cols(df, :Cabin, strip_cabin_numbers)
df.Cabin = col_cabin
df

Row,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String7,Float64,Int64,Int64,String31,Float64,Char,String1
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,N,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,N,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,N,S
6,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,N,Q
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E,S
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,N,S
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,N,S
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,N,C


### Names to titles

In [8]:
# Replace each passenger's full name with a title-group code; the group definitions
# come from `get_title_groups`. The resulting Name column holds short codes such as
# "D" and "R".
groups = get_title_groups()
df = replace_names_with_title_categories(df, groups)

Row,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String7,Float64,Int64,Int64,String31,Float64,Char,String1
1,1,0,3,D,male,22.0,1,0,A/5 21171,7.25,N,S
2,2,1,1,D,female,38.0,1,0,PC 17599,71.2833,C,C
3,3,1,3,D,female,26.0,0,0,STON/O2. 3101282,7.925,N,S
4,4,1,1,D,female,35.0,1,0,113803,53.1,C,S
5,5,0,3,D,male,35.0,0,0,373450,8.05,N,S
6,6,0,3,D,male,28.0,0,0,330877,8.4583,N,Q
7,7,0,1,D,male,54.0,0,0,17463,51.8625,E,S
8,8,0,3,R,male,2.0,3,1,349909,21.075,N,S
9,9,1,3,D,female,27.0,0,2,347742,11.1333,N,S
10,10,1,2,D,female,14.0,1,0,237736,30.0708,N,C


### Remove columns

In [9]:
# Drop PassengerId and Ticket; every other column is kept for modelling.
df = select(df, Not([:PassengerId, :Ticket]))

Row,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,String,String7,Float64,Int64,Int64,Float64,Char,String1
1,0,3,D,male,22.0,1,0,7.25,N,S
2,1,1,D,female,38.0,1,0,71.2833,C,C
3,1,3,D,female,26.0,0,0,7.925,N,S
4,1,1,D,female,35.0,1,0,53.1,C,S
5,0,3,D,male,35.0,0,0,8.05,N,S
6,0,3,D,male,28.0,0,0,8.4583,N,Q
7,0,1,D,male,54.0,0,0,51.8625,E,S
8,0,3,R,male,2.0,3,1,21.075,N,S
9,1,3,D,female,27.0,0,2,11.1333,N,S
10,1,2,D,female,14.0,1,0,30.0708,N,C


### Nominal to Categorical

In [10]:
# Convert the remaining nominal columns to integer codes with the project's
# `categorize` helper (Name, Sex, Cabin and Embarked come out as Int64).
df = categorize(df)

Row,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Float64,Int64,Int64,Float64,Int64,Int64
1,0,3,0,0,22.0,1,0,7.25,0,0
2,1,1,0,1,38.0,1,0,71.2833,1,1
3,1,3,0,1,26.0,0,0,7.925,0,0
4,1,1,0,1,35.0,1,0,53.1,1,0
5,0,3,0,0,35.0,0,0,8.05,0,0
6,0,3,0,0,28.0,0,0,8.4583,0,2
7,0,1,0,0,54.0,0,0,51.8625,2,0
8,0,3,1,0,2.0,3,1,21.075,0,0
9,1,3,0,1,27.0,0,2,11.1333,0,0
10,1,2,0,1,14.0,1,0,30.0708,0,1


### To Onehot

In [None]:
# One-hot encode the listed columns with the project's helper.
# NOTE(review): `remove_original=true` presumably drops the source columns after
# encoding — confirm in to_onehot, since later cells still refer to :Cabin.
df = to_onehot(df, [:Name, :SibSp, :Parch, :Cabin]; remove_original=true)

### Corr plot

In [None]:
# Enlarge the canvas to 1000x1000 and draw pairwise correlation plots for the target
# and four features.
# NOTE(review): if the `to_onehot` cell was run with remove_original=true, :Cabin may
# no longer exist in df and this call would fail — confirm against to_onehot's output.
gr(size = (1000, 1000))
@df df corrplot([:Survived :Fare :Age :Sex :Cabin], grid = false)

### Correlation matrix

In [None]:
# Heatmap of the full correlation matrix, with column names as tick labels on both
# axes; the y axis is flipped so the first column sits at the top.
heatmap(
    cor(Matrix(df));
    ticks = (1:ncol(df), names(df)),
    xrotation = 30,
    yflip = true,
)

### Standardize

In [11]:
# `standartize` is applied to the whole frame, so keep the raw 0/1 target aside and
# write it back afterwards; only the feature columns end up standardized.
survived = df.Survived
df = standartize(df)
df.Survived = survived
df

Row,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,0,0.826913,-0.259015,-0.737281,-0.565419,0.43255,-0.473408,-0.502163,-0.447043,-0.568518
2,1,-1.56523,-0.259015,1.35481,0.663488,0.43255,-0.473408,0.786404,0.103877,1.00462
3,1,0.826913,-0.259015,1.35481,-0.258192,-0.474279,-0.473408,-0.48858,-0.447043,-0.568518
4,1,-1.56523,-0.259015,1.35481,0.433068,0.43255,-0.473408,0.420494,0.103877,-0.568518
5,0,0.826913,-0.259015,-0.737281,0.433068,-0.474279,-0.473408,-0.486064,-0.447043,-0.568518
6,0,0.826913,-0.259015,-0.737281,-0.104579,-0.474279,-0.473408,-0.477848,-0.447043,2.57775
7,0,-1.56523,-0.259015,-0.737281,1.8924,-0.474279,-0.473408,0.395591,0.654798,-0.568518
8,0,0.826913,2.59015,-0.737281,-2.10155,2.24621,0.767199,-0.223957,-0.447043,-0.568518
9,1,0.826913,-0.259015,1.35481,-0.181385,-0.474279,2.00781,-0.424018,-0.447043,-0.568518
10,1,-0.369158,-0.259015,1.35481,-1.17987,0.43255,-0.473408,-0.0429314,-0.447043,1.00462


## Splitting the dataset

In [12]:
# Randomly partition df into train / validation / test parts in the proportions
# 0.6 / 0.2 / 0.2.
# NOTE(review): no RNG seed is visible here, so the split (and the accuracies reported
# further down) may change between runs — confirm whether random_split seeds internally.
trn, val, tst = random_split(df, [0.6, 0.2, 0.2])

3-element Vector{Any}:
 [1m534×10 DataFrame[0m
[1m Row [0m│[1m Survived [0m[1m Pclass    [0m[1m Name      [0m[1m Sex       [0m[1m Age       [0m[1m SibSp     [0m[1m Parch [0m ⋯
     │[90m Int64    [0m[90m Float64   [0m[90m Float64   [0m[90m Float64   [0m[90m Float64   [0m[90m Float64   [0m[90m Float6[0m ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │        0   0.826913  -0.259015  -0.737281  -0.411805  -0.474279  -0.473 ⋯
   2 │        0   0.826913  -0.259015   1.35481   -0.719032   0.43255   -0.473
   3 │        0   0.826913  -0.259015  -0.737281  -0.104579  -0.474279  -0.473
   4 │        0   0.826913  -0.259015  -0.737281  -0.565419  -0.474279  -0.473
   5 │        0   0.826913  -0.259015  -0.737281  -0.104579  -0.474279  -0.473 ⋯
   6 │        1  -1.56523   -0.259015   1.35481    1.66198    0.43255   -0.473
   7 │        1  -1.56523   -0.259015   1.35481   -0.872646   1.33938    2.007
   8 │        0  -0.3691

## Models

### K-nn

In [None]:
# NOTE(review): this repeats the earlier standardization cell. trn/val/tst were already
# produced by random_split before this point, so re-standardizing df here does not
# appear to feed into the models below — confirm and consider dropping the cell.
survived = df[!, :Survived]
df = standartize(df)
df.Survived = survived
df

In [None]:
# Fit a K-nn classifier (n = 5, `Titanic.l2` as the distance metric) on two features
# only, Fare and Sex, then score it on the validation split.
knn = K_nn(; n = 5, metric = Titanic.l2)
model_fit!(knn, trn[!, [:Fare, :Sex]], trn[!, :Survived])
val_X, val_y = val[!, [:Fare, :Sex]], val[!, :Survived]
val_preds = model_predict(knn, val_X)
# Predictions first, labels second: the argument order used by every other
# `accuracy` call in this notebook.
accuracy(val_preds, val_y)

In [None]:
# Score the fitted K-nn model on the held-out test split (same two features).
tst_X = tst[!, [:Fare, :Sex]]
tst_y = tst.Survived
accuracy(model_predict(knn, tst_X), tst_y)

### Logistic regression

In [None]:
# Logistic regression on all feature columns (everything except Survived), scored on
# the validation split.
logreg = Log_reg()
trn_X = trn[!, Not(:Survived)]
trn_y = trn.Survived
model_fit!(logreg, trn_X, trn_y)
val_X = val[!, Not(:Survived)]
val_y = val.Survived
accuracy(model_predict(logreg, val_X), val_y)

In [None]:
# Score the fitted logistic-regression model on the held-out test split.
tst_X = tst[!, Not(:Survived)]
tst_y = tst.Survived
accuracy(model_predict(logreg, tst_X), tst_y)

### NN

In [45]:
# Features and labels for training and validation; Survived is the label column.
trn_X = trn[!, Not(:Survived)]
trn_y = trn.Survived
val_X = val[!, Not(:Survived)]
val_y = val.Survived

# Hyper-parameters handed to the project's Neural_network wrapper.
args = Args(; lr = 0.0005, batchsize = 16, epochs = 30, ratios = [0.8, 0.2])

# Fully connected network: the input width follows the number of feature columns and
# the final layer has two output units.
m = Chain(
    Dense(ncol(trn_X), 32, relu),
    Dense(32, 32, relu),
    Dense(32, 32, sigmoid),
    Dense(32, 32, relu),
    Dense(32, 32, sigmoid),
    Dense(32, 16, sigmoid),
    Dense(16, 8, relu),
    Dense(8, 2),
)
nn = Neural_network(; args = args, m = m)
model_fit!(nn, trn_X, trn_y, val_X, val_y; verbose = false)
accuracy(model_predict(nn, val_X), val_y)

0.8258426966292135

In [46]:
# Score the trained neural network on the held-out test split.
tst_X = tst[!, Not(:Survived)]
tst_y = tst.Survived
accuracy(model_predict(nn, tst_X), tst_y)

0.848314606741573

### Decision tree

In [47]:
# Decision tree limited to depth 5, using the package's entropy criterion for splits.
dt = Decision_tree(; max_depth = 5, criterion = entropy_local)

Decision_tree(5, Titanic.entropy_local, nothing)

In [48]:
# Fit the decision tree on all feature columns and score it on the validation split.
trn_X = trn[!, Not(:Survived)]
trn_y = trn.Survived
model_fit!(dt, trn_X, trn_y)
val_X = val[!, Not(:Survived)]
val_y = val.Survived
accuracy(model_predict(dt, val_X), val_y)

0.7752808988764045

In [49]:
# Score the fitted decision tree on the held-out test split.
tst_X = tst[!, Not(:Survived)]
tst_y = tst.Survived
accuracy(model_predict(dt, tst_X), tst_y)

0.797752808988764