# Titanic dataset classification

## Setup

In [None]:
using Revise
using Titanic
using DataFrames
using Statistics
using StatsBase
using Plots
# Select the GR backend for Plots and apply the `:orange` plot theme.
gr()
theme(:orange)

In [None]:
# Load the training set. `read_csv_data` is not defined in this notebook
# (presumably exported by the project's `Titanic` module — confirm).
df = read_csv_data("../data/train.csv")
# Per-column summary of the loaded table, shown as the cell output.
describe(df)

## Data (pre-)processing

* target variable
    * survived
* ordinal properties (numerical):
    * pclass
    * age
    * sibsp
    * parch
    * fare
* nominal properties:
    * sex
    * name
    * ticket
    * cabin
    * home.dest
    * embarked

### Missing values

How many values are missing in each column?

In [None]:
# Map every column name to a tuple of
# (fraction of rows that are present, number of rows that are missing).
Dict(
    col => begin
        present = count(!ismissing, df[!, col])
        (present / nrow(df), nrow(df) - present)
    end for col in names(df)
)

Replace missing values in `Age` with the column median.

In [None]:
# Obtain the replacement value for `Age` by passing `median` to the project helper
# `apply_to_cols`, then hand it to `replace_in_cols` as the substitute for `missing`.
# NOTE(review): `median` itself returns `missing` when its input contains `missing`;
# presumably `apply_to_cols` skips missing values first — confirm.
col_median = apply_to_cols(df, :Age, median)
df = replace_in_cols(df, :Age, missing, col_median)

Replace missing values in `Embarked` with the most common city.

In [None]:
# Obtain the replacement value for `Embarked` by passing `most_common` to the project
# helper `apply_to_cols`, then hand it to `replace_in_cols` as the substitute for
# `missing`.
# NOTE(review): `most_common` is not defined in this notebook; presumably it returns
# the most frequent non-missing value — confirm.
col_city = apply_to_cols(df, :Embarked, most_common)
df = replace_in_cols(df, :Embarked, missing, col_city)

Replace missing values in `Cabin` with an "unknown" marker, and strip the cabin numbers so that only the cabin class (the leading letter) remains.

In [None]:
# Distinct leading characters among the non-missing `Cabin` entries,
# in order of first appearance.
unique(cabin[begin] for cabin in skipmissing(df.Cabin))

Use 'N' as the replacement value for missing cabins.

In [None]:
# Substitute "N" for missing `Cabin` entries.
df = replace_in_cols(df, :Cabin, missing, "N")
# Run the project helper `strip_cabin_numbers` over the column and store the result
# back in place of the original `Cabin` column.
col_cabin = apply_to_cols(df, :Cabin, strip_cabin_numbers)
df.Cabin = col_cabin
# Show the updated table as the cell output.
df

### Names to titles

In [None]:
# Fetch the title groups from the project helper and use them to replace passenger
# names with title categories.
# NOTE(review): both helpers are defined outside this notebook; the exact shape of
# `groups` and which column is rewritten should be confirmed against their source.
groups = get_title_groups()
df = replace_names_with_title_categories(df, groups)

### Remove columns

In [None]:
# Keep every column except `PassengerId` and `Ticket` (copying the remaining columns).
df = df[:, Not([:PassengerId, :Ticket])]

### Nominal to Categorical

In [None]:
# Convert the table with the project helper `categorize`.
# NOTE(review): presumably this encodes the nominal columns numerically so that
# `Matrix(df)` and `cor` work in the next cell — confirm against the helper.
df = categorize(df)

### Correlation matrix

In [None]:
# Heatmap of the correlation matrix between the columns of `df`.
# Both axes are labelled with the column names; the y-axis is flipped.
heatmap(
    cor(Matrix(df));
    ticks=(1:ncol(df), names(df)),
    xrotation=30,
    yflip=true,
)

### Standardize

In [None]:
# Set the target column aside before transforming the table, then put the original
# values back afterwards so that `Survived` is not altered by `standartize`.
survived = df.Survived
df = standartize(df)
df[!, :Survived] = survived
# Show the resulting table as the cell output.
df

## Splitting the dataset

In [None]:
# Split the table into three parts (training, validation, test) using the
# proportions 0.6 / 0.2 / 0.2 passed to the project helper `random_split`.
# NOTE(review): no RNG seed is set in this notebook, so the split presumably differs
# between runs — confirm whether `random_split` accepts a seed or an RNG.
trn, val, tst = random_split(df, [0.6, 0.2, 0.2])

## Models

### k-NN

In [None]:
# Build a k-NN model with n=50 and the "l2" metric, then fit it on the `Fare` and
# `Sex` columns of the training split.
knn = K_nn(; n=50, metric="l2")
model_fit(knn, trn[!, [:Fare, :Sex]], trn.Survived)

# Predict on the validation split with the same two features and report accuracy.
val_X = val[!, [:Fare, :Sex]]
val_y = val.Survived
val_preds = model_predict(knn, val_X)
accuracy(val_y, val_preds)

In [None]:
# Evaluate the fitted k-NN model on the test split, using the same two features
# (`Fare`, `Sex`) it was trained on.
tst_X, tst_y = tst[!, [:Fare, :Sex]], tst[!, :Survived]
# Pass (labels, predictions) in the same order as the validation call
# `accuracy(val_y, val_preds)`, so both evaluations use `accuracy` consistently.
accuracy(tst_y, model_predict(knn, tst_X))

## Final evaluation and submission