### Initiate Environment

In [70]:
using Pkg
Pkg.activate(".")

using Revise
using TitanicClassifier
using CSV
using DataFrames
using Statistics

[32m[1m  Activating[22m[39m project at `~/CTU/SEM_5/JUL/TitanicClassifier/examples`


### Data Analysis

In [2]:
data = CSV.read("../data/train.csv", DataFrame);

In [64]:
incomplete_cols = Dict()

for row in eachrow(data)
    for name in names(data)
        if(ismissing(row[name]))
            incomplete_cols[name] = get!(incomplete_cols, name, 0) + 1
        end
    end
end

for entry in incomplete_cols
    print("Missing ", entry[2], " out of ", nrow(data), " entries (", round(entry[2]/nrow(data); sigdigits=2), "%) for column ", entry[1], "\n")
end

Missing 177 out of 891 entries (0.2%) for column Age
Missing 2 out of 891 entries (0.0022%) for column Embarked
Missing 687 out of 891 entries (0.77%) for column Cabin


We need to fill missing columns with some data in order for our model to work. For cabin the number of missing entries
is great and there is no clear method to impute the missing data - it might be possible to infer the cabin data from ticket 
and embarked features, however I chose just to introduce an unknown token for the missing entries. Additionally since one of the sources for cabin data is the recollection of survivors it seems that unknown token holds information for survival inference. Age and embarked feature I will impute from the other features.

### Data Preprocessing

In [9]:
freq = title_frequencies(data)

Dict{Any, Any} with 17 entries:
  "Jonkheer"     => 1
  "Don"          => 1
  "the Countess" => 1
  "Master"       => 40
  "Miss"         => 182
  "Capt"         => 1
  "Mrs"          => 125
  "Rev"          => 6
  "Major"        => 2
  "Mr"           => 517
  "Mme"          => 1
  "Mlle"         => 2
  "Col"          => 2
  "Lady"         => 1
  "Sir"          => 1
  "Ms"           => 1
  "Dr"           => 7

In [4]:
replace_rules = [[["Dr", "Rev", "Col", "Major", "Capt"], "Officer"],
                 [["Jonkheer", "Countess", "Sir", "Lady", "Don"], "Royalty"], 
                 [["Mlle"], "Miss"], [["Ms"], "Miss"], [["Mme"],"Mrs"]]

Mr

In [6]:
extract_deck_from_cabin(cabin) = cabin[1]

decks = Dict()

id = 1
for row in eachrow(data)
    if !ismissing(row["Cabin"]) && !haskey(decks, extract_deck_from_cabin(row["Cabin"]))
        decks[extract_deck_from_cabin(row["Cabin"])] = id
        id+=1
    end
end
decks['U'] = id
print(decks)

Dict{Any, Any}('B' => 6, 'U' => 9, 'D' => 4, 'A' => 5, 'E' => 2, 'G' => 3, 'T' => 8, 'F' => 7, 'C' => 1)

In [7]:
embarks = Dict()

id = 1
for row in eachrow(data)
    if !ismissing(row["Embarked"]) && !haskey(embarks, row["Embarked"])
        embarks[row["Embarked"]] = id
        id+=1
    end
end
print(embarks)

Dict{Any, Any}(

String1("Q") => 3, String1("S") => 1, String1("C") => 2)

In [8]:
tickets = Set()

function extract_ticket_num(ticket) 
    temp = split(ticket, " ")
    if (length(temp) == 1 && temp[1]=="LINE")
        return -1
    else 
        return parse(Int64, last(temp))
    end
end

for row in eachrow(data)
    push!(tickets, extract_ticket_num(row["Ticket"]))
end

sorted_nums = sort(collect(tickets))

ticket_idx = Dict()
for i in 1:length(sorted_nums)
    ticket_idx[sorted_nums[i]] = i
end

In [18]:

title_tokens = Dict()

i = 0

replace_rules = [[["Dr", "Rev", "Col", "Major", "Capt"], "Officer"],
[["Jonkheer", "Countess", "Sir", "Lady", "Don"], "Royalty"], 
[["Mlle"], "Miss"], [["Ms"], "Miss"], [["Mme"],"Mrs"]]

for (title, value) in freq
    if !haskey(title_tokens, get_title_token(title, replace_rules))
        if(get_title_token(title, replace_rules)=="Unknown")
            print(title)
        end
        title_tokens[get_title_token(title, replace_rules)] = i
        i+=1
    end
end

print(title_tokens)

Dict{Any, Any}(

"Miss" => 2, "Master" => 1, "Officer" => 3, "Royalty" => 0, "Mrs" => 4, "Mr" => 5)

In [28]:
processed_data = copy(data)

enumerate_sex(str) = str == "male" ? 0 : 1

transform!(processed_data, :Sex => ByRow(sex -> enumerate_sex(sex)) => :Sex)
transform!(processed_data, :Cabin => ByRow(cabin -> ismissing(cabin) ? decks['U'] : decks[extract_deck_from_cabin(cabin)]) => :Cabin)
transform!(processed_data, :Ticket => ByRow(ticket -> ticket_idx[extract_ticket_num(ticket)]) => :Ticket)
transform!(processed_data, :Embarked => ByRow(embarked -> ismissing(embarked) ? missing : embarks[embarked]) => :Embarked)
transform!(processed_data, :Name => ByRow(name -> title_tokens[get_title_token(name, replace_rules)]) => :Name)


select!(processed_data, Not("PassengerId"))

print(first(processed_data, 5))



[1m5×11 DataFrame[0m
[1m Row [0m│[1m Survived [0m[1m Pclass [0m[1m Name  [0m[1m Sex   [0m[1m Age      [0m[1m SibSp [0m[1m Parch [0m[1m Ticket [0m[1m Fare    [0m[1m Cabin [0m[1m Embarked [0m
     │[90m Int64    [0m[90m Int64  [0m[90m Int64 [0m[90m Int64 [0m[90m Float64? [0m[90m Int64 [0m[90m Int64 [0m[90m Int64  [0m[90m Float64 [0m[90m Int64 [0m[90m Int64?   [0m
─────┼──────────────────────────────────────────────────────────────────────────────────────────
   1 │        0       3      5      0      22.0      1      0     227   7.25        9         1
   2 │        1       1      4      1      38.0      1      0     195  71.2833      1         2
   3 │        1       3      2      1      26.0      0      0     658   7.925       9         1
   4 │        1       1      4      1      35.0      1      0     350  53.1         1         1
   5 │        0       3      5      0      35.0      0      0     620   8.05        9         1

In [80]:
missing_filter(col_name) = df -> !ismissing(df[col_name])

filtered = filter(missing_filter("Embarked"), filter(missing_filter("Age"), processed_data))

display(DataFrame(cor(Matrix(filtered)), ["Survived", "Pclass", "Title", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]))

Row,Survived,Pclass,Title,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,1.0,-0.356462,-0.426484,0.536762,-0.0824459,-0.015523,0.0952653,-0.182137,0.2661,-0.279494,0.108517
2,-0.356462,1.0,0.0863682,-0.150826,-0.365902,0.0651871,0.0236661,0.371343,-0.552893,0.691132,-0.108502
3,-0.426484,0.0863682,1.0,-0.600639,0.386344,-0.315565,-0.274225,0.0310607,-0.149062,0.0578926,-0.137506
4,0.536762,-0.150826,-0.600639,1.0,-0.0990372,0.106296,0.249543,-0.0897378,0.182457,-0.135285,0.0971287
5,-0.0824459,-0.365902,0.386344,-0.0990372,1.0,-0.307351,-0.187896,-0.122777,0.0931425,-0.245705,0.0121857
6,-0.015523,0.0651871,-0.315565,0.106296,-0.307351,1.0,0.383338,0.0277467,0.13986,0.00548101,0.00402065
7,0.0952653,0.0236661,-0.274225,0.249543,-0.187896,0.383338,1.0,-0.0128203,0.206624,-0.0230722,-0.0140824
8,-0.182137,0.371343,0.0310607,-0.0897378,-0.122777,0.0277467,-0.0128203,1.0,-0.215444,0.227769,-0.155978
9,0.2661,-0.552893,-0.149062,0.182457,0.0931425,0.13986,0.206624,-0.215444,1.0,-0.458929,0.176859
10,-0.279494,0.691132,0.0578926,-0.135285,-0.245705,0.00548101,-0.0230722,0.227769,-0.458929,1.0,-0.117895


Here we can see that Embarked feature doesn't correlate particularly strongly with any of our modified features, therefore I will impute it simply by replacing missing values with the most common value. On the other hand age correlates moderately strongly with
Title, Pclass, Sibsp and Cabin. Of these I will use Title and Pclass to impute Age since these are the two strongest correlated
features with Age.