### Preparation

In [None]:
include("../../src/modeling/embedded_correlation_lib.jl")
include("../../src/modeling/correlation_lib.jl")
include("../../src/modeling/util.jl")
using SQLite
using CSV
using DataFrames: DataFrame, Missing
using Statistics, LinearAlgebra, Logging, Random
using Gen

# Simple rent model
- Columns are ```state, city, zip, rent```
- Joint probability distribution function:

$Pr(state, city, zip, rent) = Pr(state)\,Pr(city|state)\,Pr(zip|city)\,Pr(rent|city)$
- Can be interpreted as a Bayesian network
- Current assumptions:
    - High cardinality
    - Occurrence (frequency) information carries little meaning
    - No ground truth about $dom(X)$ and $P(X)$

### Read and parse data

In [None]:
# Column roles used throughout the notebook.
CAT_ATTRS = [:state, :city]
NUM_ATTRS = [:zip, :rent]

# Shared prefix of the per-attribute embedding files
# (`<prefix><attr>_meta.tsv` and `<prefix><attr>_vec.tsv`).
TSV_PATH_PREFIX = "../../data/tsv/20191218_simple_5_oversampling/num_zip/simple_5_os_"

# Read one embedding table per categorical attribute and merge them into a single
# dictionary.
CAT_EMBEDDING_DICT = merge((map(CAT_ATTRS) do cat_attr
    read_tsv("$(TSV_PATH_PREFIX)$(cat_attr)_meta.tsv",
             "$(TSV_PATH_PREFIX)$(cat_attr)_vec.tsv")
end)...)

# Load the CSV into an in-memory SQLite database and read it back as a DataFrame.
db = SQLite.DB()
rent_table = CSV.File("../../data/rent_data/simple_rent_5_per_city.csv") |>
    SQLite.load!(db, "rent_table")
df = DataFrame(SQLite.Query(db, "SELECT * FROM rent_table"))

# Second table derived from `df` and the embedding dictionary by `replace_with_emb`.
emb_df = replace_with_emb(df, CAT_ATTRS, NUM_ATTRS, CAT_EMBEDDING_DICT)

# Last expression of the cell: display the raw table.
df

### Model with categorical variables
- ```categorical_co_occurrence("Starnberg", rent)``` models $Pr(rent|city)$ (in the code below, the rent draw itself is performed by ```numerical_co_occurrence```, conditioned on the categorical city):
    - It looks up the entries with ```Starnberg``` in the given database;
    - It samples rent from a probability distribution whose mean and variance are calculated from the entries found;
    - If, for example, only one entry is found, no variance can be estimated from it. The function therefore requires a "minimum variance" hyperparameter; otherwise it falls back to the variance of rent over the whole database.

In [None]:
# Generative model of one rent row that conditions on the raw categorical values.
#
# Draw order: state, then city given state, then zip given city, then rent given city.
# The state choice is recorded at `:state => :realization`; the three conditional
# draws are recorded under `:city`, `:zip` and `:rent`.
#
# Arguments:
# - `df`: table whose `state` column is read here; the symbols `:state`, `:city`,
#   `:zip` and `:rent` are handed to the co-occurrence helpers together with `df`.
#
# The body ends with a logging call rather than an explicit return value.
# NOTE(review): the trailing Bool arguments of the co-occurrence helpers are passed
# through unchanged; their meaning is defined in the included library files.
@gen function simple_rent_plain_model(df)
    @info "-----------------SIMPLEPLAIN"

    # State prior weighted by how often each state occurs in `df`.
    # Count-agnostic alternative kept from the original notebook:
    #     state = uniform_categorical(state_values)
    state_values = unique(df.state)
    state_counts = map(s -> sum(df.state .== s), state_values)
    state_weights = LinearAlgebra.normalize(state_counts, 1)
    state = @trace(categorical_named(state_values, state_weights), :state => :realization)
    @info "$state"

    # city | state
    city = @trace(
        categorical_co_occurrence(df, [:state], ["categorical"], :city, [state], true),
        :city)
    @info "$city"

    # zip | city
    zip = @trace(
        categorical_co_occurrence(df, [:city], ["categorical"], :zip, [city], true),
        :zip)
    @info "$zip"

    # rent | city
    total_rent = @trace(
        numerical_co_occurrence(df, [:city], ["categorical"], :rent, [city], false, true),
        :rent)
    @info "Totally $total_rent"
end;

### Model with embedded categorical variables
- ```embedding_co_occurrence("Starnberg", rent)```:
    - It seeks entries with ```Starnberg``` in the given database;
    - If the number of entries found is less than $k$, the mandatory size of the neighborhood, the function looks for the nearest neighbors of ```Starnberg``` by cosine distance:
    $x^{*}_{city} = \arg\min_{x_{city} \in dom(city)} \mathrm{cos\_dist}(\mathrm{Starnberg}, x_{city})$
    - It samples rent from a probability distribution whose mean and variance are calculated from the entries found above.

In [None]:
# Generative model of one rent row that conditions through embeddings.
#
# Draw order: state, then city given state, then zip given city, then rent given city.
# The state choice is recorded at `:state => :realization`; the three conditional
# draws go through `embedding_co_occurrence` and are recorded under `:city`, `:zip`
# and `:rent`.
#
# Arguments:
# - `df`: table whose `state` column is read here and which is forwarded to the helper.
# - `emb_df`, `emb_dict`: forwarded unchanged to `embedding_co_occurrence`.
#
# The body ends with a logging call rather than an explicit return value.
@gen function simple_rent_emb_model(df, emb_df, emb_dict)
    @info "-----------------SIMPLEEMB"

    # State prior weighted by how often each state occurs in `df`.
    # Count-agnostic alternative kept from the original notebook:
    #     state = uniform_categorical(state_values)
    state_values = unique(df.state)
    state_counts = map(s -> sum(df.state .== s), state_values)
    state_weights = LinearAlgebra.normalize(state_counts, 1)
    state = @trace(categorical_named(state_values, state_weights), :state => :realization)
    @info "$state"

    # city | state. The neighborhood size is a hyperparameter; a size of 1 means
    # no embedding-based neighbors are wanted.
    city_k = 1
    city = @trace(
        embedding_co_occurrence(df, emb_df, emb_dict, [:state], ["embedding"],
                                :city, "categorical", [state], city_k),
        :city)
    @info "$city"

    # zip | city, again without embedding-based neighbors.
    zip_k = 1
    zip = @trace(
        embedding_co_occurrence(df, emb_df, emb_dict, [:city], ["embedding"],
                                :zip, "categorical", [city], zip_k),
        :zip)
    @info "$zip"

    # rent | city, using a neighborhood of size 5.
    rent_k = 5
    total_rent = @trace(
        embedding_co_occurrence(df, emb_df, emb_dict, [:city], ["embedding"],
                                :rent, "numerical", [city], rent_k),
        :rent)
    @info "Totally $total_rent"
end;

###  Abnormalities in the dataset
- Each city in $dom(city)$ normally appears 5 times
- ```Berlin``` occurs only once
- ```Frankfurt``` occurs once, with the rent of FaM (Frankfurt am Main)
- ```Starnberg``` occurs once with a high rent

In [None]:
# Disable log records at level 10 and below, which covers the models' `@info` output.
disable_logging(LogLevel(10))

# Score the table under the embedding-aware model.
# NOTE(review): judging by its name, the helper presumably returns the 5 least probable
# rows with their scores — confirm in the included library files.
emb_traces, emb_scores = k_most_improbable_neo(
    5,
    df,
    df,
    [CAT_ATTRS; NUM_ATTRS],
    simple_rent_emb_model,
    (df, emb_df, CAT_EMBEDDING_DICT),
)

# Same call for the plain categorical model, which takes only `df` as its argument.
plain_traces, plain_scores = k_most_improbable_neo(
    5,
    df,
    df,
    [CAT_ATTRS; NUM_ATTRS],
    simple_rent_plain_model,
    (df,),
)

# Last expression of the cell: display the first three rows of the table.
df[1:3, :]

### Result without embeddings:

In [None]:
# Print every attribute recorded in each plain-model trace, followed by its score.
for (rank, trace) in enumerate(plain_traces)
    println("---------------")
    # Each attribute value is read from the trace at `attr => :realization`.
    foreach([CAT_ATTRS; NUM_ATTRS]) do attr
        println("$(attr): $(trace[attr => :realization])")
    end
    println(plain_scores[rank])
end

### With embeddings:

In [None]:
# Print every attribute recorded in each embedding-model trace, followed by its score.
for (rank, trace) in enumerate(emb_traces)
    println("---------------")
    # Each attribute value is read from the trace at `attr => :realization`.
    foreach([CAT_ATTRS; NUM_ATTRS]) do attr
        println("$(attr): $(trace[attr => :realization])")
    end
    println(emb_scores[rank])
end