# Titanic dataset classification

## Setup

In [None]:
using Pkg
Pkg.activate(".")

In [None]:
using Titanic
using DataFrames
using Statistics
using StatsBase
using StatsPlots
using Flux
using Plots

In [None]:
# Select the GR plotting backend and apply the :lime theme to subsequent plots.
gr()
theme(:lime)

In [None]:
# Load the training data from ../data/train.csv. `read_csv_data` is not defined
# in this notebook; presumably it comes from the Titanic package — confirm.
df = read_csv_data("../data/train.csv")
# Show a per-column summary of the loaded table.
describe(df)

## Data (pre-)processing

* target variable
    * survived
* ordinal properties (numerical):
    * pclass
    * age
    * sibsp
    * parch
    * fare
* nominal properties:
    * sex
    * name
    * ticket
    * cabin
    * embarked

### A couple of graphs

In [None]:
# Histogram of the Age column; bins are chosen with the :scott rule and the
# bars are drawn semi-transparent (fillalpha = 0.4).
@df df histogram(:Age, bins = :scott, fillalpha = 0.4)

In [None]:
# Set a 600x500 default figure size on the GR backend, then draw the
# correlation plot of Survived against Fare.
gr(size = (600, 500))
@df df corrplot([:Survived :Fare], grid = false)

### Missing values

How many values are missing in each column?

In [None]:
# Map every column name to (fraction of missing entries, number of missing entries).
Dict(map(names(df)) do col
    n_missing = count(ismissing, df[!, col])
    return (col, (n_missing / nrow(df), n_missing))
end)

Replace the missing values in `Age` with the column median.

In [None]:
# Per the notebook text, this fills missing Age values with the column median.
# `apply_to_cols` / `replace_in_cols` are project helpers not shown here;
# presumably the first applies `median` to the Age column (how it treats
# `missing` is not visible — confirm) and the second substitutes the result.
col_median = apply_to_cols(df, :Age, median)
df = replace_in_cols(df, :Age, missing, col_median)

Replace the missing values in `Embarked` with the most common port of embarkation.

In [None]:
# Per the notebook text, this fills missing Embarked values with the most
# frequent one. `most_common` is not defined in this notebook — presumably a
# project helper returning the modal value; confirm.
col_city = apply_to_cols(df, :Embarked, most_common)
df = replace_in_cols(df, :Embarked, missing, col_city)

Replace the missing values in `Cabin` with an "unknown" marker and strip the cabin numbers, keeping only the cabin class letter.

In [None]:
# Distinct first characters of all non-missing Cabin entries.
unique(cabin[begin] for cabin in skipmissing(df.Cabin))

Use `"N"` as the replacement for missing cabins (it does not clash with any existing cabin letter listed above).

In [None]:
# Mark unknown cabins with "N" first so that every entry is a string.
df = replace_in_cols(df, :Cabin, missing, "N")
# Per the notebook text, `strip_cabin_numbers` keeps only the cabin class;
# the transformed column then replaces the original one.
col_cabin = apply_to_cols(df, :Cabin, strip_cabin_numbers)
df.Cabin = col_cabin
df

### Names to titles

In [None]:
# Replace passenger names by title categories. Both helpers are defined outside
# this notebook; presumably `groups` maps raw titles to a smaller set of
# categories — confirm against the Titanic package.
groups = get_title_groups()
df = replace_names_with_title_categories(df, groups)

### Remove columns

In [None]:
# Keep every column except PassengerId and Ticket.
df = select(df, Not([:PassengerId, :Ticket]))

### Nominal to Categorical

In [None]:
# `categorize` is a project helper not shown here; presumably it encodes the
# nominal columns so they can be used numerically — TODO confirm the encoding.
df = categorize(df)

### To Onehot

In [None]:
# One-hot encode Name, SibSp, Parch and Cabin. `remove_original=true` presumably
# drops the source columns once the indicator columns exist — confirm in
# `to_onehot`.
df = to_onehot(df, [:Name, :SibSp, :Parch, :Cabin]; remove_original=true)

### Corr plot

In [None]:
# Use a larger canvas for the multi-variable correlation plot.
gr(size = (1000, 1000))
# :Cabin is not plotted here: the one-hot step above was called with
# `remove_original=true` for :Cabin, so a single :Cabin column is presumably no
# longer present in `df` and referencing it would break the plot.
# NOTE(review): confirm against `to_onehot`; if the original column is kept,
# :Cabin can be added back to the list.
@df df corrplot([:Survived :Fare :Age :Sex], grid = false)

### Correlation matrix

In [None]:
# Heatmap of the pairwise correlation matrix of all columns, with the column
# names as tick labels on both axes.
let labels = names(df)
    heatmap(
        cor(Matrix(df));
        ticks=(1:length(labels), labels),
        xrotation=30,
        yflip=true,
    )
end

### Standardize

In [None]:
# Keep a reference to the target column before scaling.
survived = df[!, :Survived]
# `standartize` is a project helper (spelling as defined there); presumably it
# rescales every column — confirm the exact transformation.
df = standartize(df)
# Put the unscaled target back so the labels stay as they were.
df.Survived = survived
df

## Splitting the dataset

In [None]:
# Split into training / validation / test parts with ratios 0.6 / 0.2 / 0.2.
# NOTE(review): no RNG seed is visible here, so the split may differ between
# runs — check whether `random_split` accepts a seed or RNG.
trn, val, tst = random_split(df, [0.6, 0.2, 0.2])

## Models

### K-nn

In [None]:
# NOTE(review): this cell repeats the standardization already done before the
# split. `trn`, `val` and `tst` were created earlier and are not reassigned
# here, so this presumably has no effect on the models below — confirm and
# consider removing the cell.
survived = df[!, :Survived]
df = standartize(df)
df.Survived = survived
df

In [None]:
# k-nearest-neighbours model with n = 5 and the project's `l2` metric, trained
# on the Fare and Sex columns only.
knn = K_nn(; n=5, metric=Titanic.l2)
model_fit!(knn, trn[!, [:Fare, :Sex]], trn[!, :Survived])

# Validation accuracy. The arguments are passed as (predictions, truth), which
# is the order used by every other `accuracy` call in this notebook.
val_X, val_y = val[!, [:Fare, :Sex]], val[!, :Survived]
val_preds = model_predict(knn, val_X)
accuracy(val_preds, val_y)

In [None]:
# Test-set accuracy of the k-NN model, using the same two feature columns
# (Fare, Sex) it was fitted on.
tst_X, tst_y = tst[!, [:Fare, :Sex]], tst[!, :Survived]
accuracy(model_predict(knn, tst_X), tst_y)

### Logistic regression

In [None]:
# Logistic regression with the project's default settings, fitted on every
# column except the Survived target.
logreg = Log_reg()
trn_X, trn_y = trn[!, Not(:Survived)], trn[!, :Survived]
model_fit!(logreg, trn_X, trn_y)
# Accuracy on the validation split.
val_X, val_y = val[!, Not(:Survived)], val[!, :Survived]
accuracy(model_predict(logreg, val_X), val_y)

In [None]:
# Accuracy of the logistic regression model on the test split.
tst_X, tst_y = tst[!, Not(:Survived)], tst[!, :Survived]
accuracy(model_predict(logreg, tst_X), tst_y)

### NN

In [None]:
# Features are all columns except the Survived target.
trn_X, trn_y = trn[!, Not(:Survived)], trn[!, :Survived]
val_X, val_y = val[!, Not(:Survived)], val[!, :Survived]

# Training hyper-parameters passed to the project's `Neural_network` wrapper.
args = Args(; lr=0.0005, batchsize=16, epochs=30, ratios=[0.8, 0.2])

# Fully connected network: the input width equals the number of feature
# columns and the final layer has 2 outputs.
m = Chain(
    Dense(ncol(trn_X), 32, relu),
    Dense(32, 32, relu),
    Dense(32, 32, sigmoid),
    Dense(32, 32, relu),
    Dense(32, 32, sigmoid),
    Dense(32, 16, sigmoid),
    Dense(16, 8, relu),
    Dense(8, 2),
)
nn = Neural_network(; args=args, m=m)
model_fit!(nn, trn_X, trn_y, val_X, val_y; verbose=false)

# Accuracy on the validation split.
accuracy(model_predict(nn, val_X), val_y)

In [None]:
# Accuracy of the neural network on the test split.
tst_X, tst_y = tst[!, Not(:Survived)], tst[!, :Survived]
accuracy(model_predict(nn, tst_X), tst_y)

### Decision tree

In [None]:
# Decision tree limited to depth 5, using the project's `entropy_local`
# as the split criterion.
dt = Decision_tree(; max_depth=5, criterion=entropy_local)

In [None]:
# Fit the decision tree on every column except the Survived target.
trn_X, trn_y = trn[!, Not(:Survived)], trn[!, :Survived]
model_fit!(dt, trn_X, trn_y)
# Accuracy on the validation split.
val_X, val_y = val[!, Not(:Survived)], val[!, :Survived]
accuracy(model_predict(dt, val_X), val_y)

In [None]:
# Accuracy of the decision tree on the test split.
tst_X, tst_y = tst[!, Not(:Survived)], tst[!, :Survived]
accuracy(model_predict(dt, tst_X), tst_y)