### Initiate Environment

In [1]:
using Pkg
Pkg.activate(".")

using Revise
using TitanicClassifier
using CSV
using DataFrames

[32m[1m  Activating[22m[39m project at `~/CTU/SEM_5/JUL/TitanicClassifier/examples`


### Data Analysis

In [2]:
data = CSV.read("../data/train.csv", DataFrame);

In [64]:
incomplete_cols = Dict()

for row in eachrow(data)
    for name in names(data)
        if(ismissing(row[name]))
            incomplete_cols[name] = get!(incomplete_cols, name, 0) + 1
        end
    end
end

for entry in incomplete_cols
    print("Missing ", entry[2], " out of ", nrow(data), " entries (", round(entry[2]/nrow(data); sigdigits=2), "%) for column ", entry[1], "\n")
end

Missing 177 out of 891 entries (0.2%) for column Age
Missing 2 out of 891 entries (0.0022%) for column Embarked
Missing 687 out of 891 entries (0.77%) for column Cabin


We need to fill missing columns with some data in order for our model to work. For cabin the number of missing entries
is great and there is no clear method to impute the missing data - it might be possible to infer the cabin data from ticket 
and embarked features, however I chose just to introduce an unknown token for the missing entries. Additionally since one of the sources for cabin data is the recollection of survivors it seems that unknown token holds information for survival inference. Age and embarked feature I will impute from the other features.

### Data Preprocessing

In [37]:
freq = title_frequencies(data)

Dict{Any, Any} with 17 entries:
  "Jonkheer"     => 1
  "Don"          => 1
  "the Countess" => 1
  "Master"       => 40
  "Miss"         => 182
  "Capt"         => 1
  "Mrs"          => 125
  "Rev"          => 6
  "Major"        => 2
  "Mr"           => 517
  "Mme"          => 1
  "Mlle"         => 2
  "Col"          => 2
  "Lady"         => 1
  "Sir"          => 1
  "Ms"           => 1
  "Dr"           => 7

In [13]:
replace_rules = [[["Dr", "Rev", "Col", "Major", "Capt"], "Officer"],
                 [["Jonkheer", "Countess", "Sir", "Lady", "Don"], "Royalty"], 
                 [["Mlle"], "Miss"], [["Ms"], "Miss"], [["Mme"],"Mrs"]]

for row in eachrow(data)
    print(TitanicClassifier.get_title_token(row, replace_rules))
    break
end

Mr

In [39]:
function missing_filter(df)
    for col in df
        if ismissing(col)
            return false
        end
    end
    return true
end

filter(missing_filter, data[1:5,:])

Row,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String7,Float64?,Int64,Int64,String31,Float64,String15?,String1?
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [114]:
extract_deck_from_cabin(cabin) = cabin[1]

decks = Dict()

id = 1
for row in eachrow(data)
    if !ismissing(row["Cabin"]) && !haskey(decks, extract_deck_from_cabin(row["Cabin"]))
        decks[extract_deck_from_cabin(row["Cabin"])] = id
        id+=1
    end
end
decks['U'] = id
print(decks)

Dict{Any, Any}('B' => 6, 'U' => 9, 'D' => 4, 'A' => 5, 'E' => 2, 'G' => 3, 'T' => 8, 'F' => 7, 'C' => 1)

In [123]:
embarks = Dict()

id = 1
for row in eachrow(data)
    if !ismissing(row["Embarked"]) && !haskey(embarks, row["Embarked"])
        embarks[row["Embarked"]] = id
        id+=1
    end
end
print(embarks)

Dict{Any, Any}(String1("Q") => 3, String1("S") => 1, String1("C") => 2)

In [124]:
tickets = Set()

for row in eachrow(data)
    temp = split(row["Ticket"], " ")
    if (length(temp) == 1 && temp[1]=="LINE")
        push!(tickets, -1)
    else 
        push!(tickets, parse(Int64, last(temp)))
    end
end

sorted_nums = sort(collect(tickets))

ticket_idx = Dict()
for i in 1:length(sorted_nums)
    ticket_idx[sorted_nums[i]] = i
end

print(ticket_idx)

Dict{Any, Any}(315094 => 433, 4135 => 96, 250648 => 415, 3101296 => 670, 370376 => 614, 3337 => 83, 4136 => 97, 4348 => 100, 237671 => 382, 392082 => 634, 17463 => 172, 349249 => 550, 3101272 => 648, 3540 => 90, 113783 => 339, 14263 => 158, 113051 => 323, 244361 => 399, 17610 => 203, 345764 => 466, 219533 => 356, 27849 => 240, 34244 => 275, 34068 => 273, 3101269 => 646, 17318 => 168, 113043 => 321, 239855 => 388, 110813 => 305, 17582 => 187, 330931 => 446, 3101305 => 672, 347083 => 497, 343095 => 459, 343120 => 460, 349224 => 531, 392089 => 637, 345778 => 474, 12460 => 141, 19950 => 221, 315090 => 431, 362316 => 579, 11771 => 136, 367228 => 598, 113510 => 331, 110564 => 304, 211536 => 354, 3381 => 84, 237736 => 383, 315097 => 435, 363291 => 580, 29103 => 254, 244278 => 396, 364499 => 584, 233639 => 370, 230136 => 365, 349253 => 553, 220367 => 357, 345774 => 472, 29751 => 263, 248740 => 409, 1585 => 10, 17764 => 214, 349234 => 537, 383121 => 629, 2690 => 68, 17466 => 175, 348121 => 508,