### Preparation

In [None]:
cd("C:\\git\\lets_learn\\infclean\\src\\")
include("modeling\\correlation_lib.jl")
include("modeling\\util.jl")
using SQLite
using CSV
using DataFrames: DataFrame, Missing, showall
using Statistics, LinearAlgebra, Logging, Random
using Gen

# Work from the directory holding the Mercateo data so that the relative CSV
# file name used below resolves.
cd("C:\\git\\lets_learn\\infclean\\data\\mercateo\\")

# Attribute (column) names of the EK dataset. Not referenced by the other cells
# visible in this notebook - presumably consumed by the included helper files.
ATTRIBUTES = [
    :catalog_id, :article_id, :destination, :lower_bound, :ek_amount,
    :vk_amount, :currency, :unit, :tax, :set_id,
]

# `SQLite.DB()` without a file argument opens a database that is not backed by
# a file given here; the rows of patched.csv are loaded into it as table
# "patched_table".
db = SQLite.DB()
patched_table = SQLite.load!(CSV.File("patched.csv"), db, "patched_table")
;

In [None]:
# Build the working table from the "patched_table" SQLite table.
# `parse_ek_data` is defined outside this cell (presumably in modeling\util.jl).
patched_df = parse_ek_data(db, :patched_table)
# Preview at most the first 20 rows. The upper bound is clamped to the row count
# so that a table with fewer than 20 rows does not raise a BoundsError.
patched_df[1:min(20, size(patched_df, 1)), :]

# The EK Model
The idea is similar to the rent model: to build a top-down sampler that incorporates correlation knowledge.

__However, the occurrence statistics do not give much information on correctness, i.e. a catalog_id isn't necessarily incorrect just because it appears sparsely in the observation dataset.__

In [None]:
# Generative model (Gen `@gen` function) for a single EK record.
#
# Attributes are sampled top-down; each draw is conditioned on attributes that
# were sampled before it, using the helper distributions `uniformly_categorical`
# and `co_occurrence` (defined outside this cell):
#   catalog_id                -> article_id
#   (catalog_id, article_id)  -> destination, lower_bound, set_id
#   destination               -> currency
#   article_id                -> unit
#   (article_id, destination) -> tax
# Finally `ek_amount` is drawn from `half_normal`, parameterised by the mean and
# the standard deviation of three `numerical_functional_dependency` suggestions.
#
# Argument:
#   realization_df - table handed unchanged to every helper call.
# Trace addresses: :catalog_id, :article_id, :destination, :lower_bound,
#   :currency, :unit, :tax, :set_id and :ek_amount => :realization.
# The last statement is an `@info` call; no explicit value is returned.
# NOTE(review): the meaning of the trailing Bool flags passed to the helpers is
# not visible from here - confirm against their definitions.
@gen function ek_model(realization_df)
    @info "-----------------ADV"

    # 1. catalog_id
    catalog_id = @trace(uniformly_categorical(realization_df, :catalog_id), :catalog_id)
    @info "CID: $catalog_id"
    cid_key = String(catalog_id)

    # 2. article_id, conditioned on the sampled catalog_id
    article_id = @trace(
        co_occurrence(realization_df, [:catalog_id], :article_id, [cid_key], false),
        :article_id)
    @info "AID: $article_id"
    aid_key = String(article_id)

    # 3. destination: the same product from the same supplier is assumed to
    # come from the same country.
    destination = @trace(
        co_occurrence(realization_df, [:catalog_id, :article_id], :destination,
                      [cid_key, aid_key], false),
        :destination)
    @info "Destination: $destination"

    # 4. lower_bound, conditioned on (catalog_id, article_id).
    # For the same article from the same supplier, a higher lower bound should
    # imply a lower ek. => Vertical learning
    lower_bound = @trace(
        co_occurrence(realization_df, [:catalog_id, :article_id], :lower_bound,
                      [cid_key, aid_key], false),
        :lower_bound)
    @info "LB: $lower_bound"

    # 5. currency, conditioned on destination
    dest_key = String(destination)
    currency = @trace(
        co_occurrence(realization_df, [:destination], :currency, [dest_key], false),
        :currency)
    @info "Currency: $currency"

    # 6. unit, conditioned on article_id
    unit = @trace(
        co_occurrence(realization_df, [:article_id], :unit, [aid_key], false),
        :unit)
    @info "Unit: $unit"

    # 7. tax, conditioned on (article_id, destination)
    tax = @trace(
        co_occurrence(realization_df, [:article_id, :destination], :tax,
                      [aid_key, dest_key], false),
        :tax)
    @info "Tax: $tax"

    # 8. set_id, conditioned on (catalog_id, article_id)
    set_id = @trace(
        co_occurrence(realization_df, [:catalog_id, :article_id], :set_id,
                      [cid_key, aid_key], false),
        :set_id)
    @info "SID: $set_id"

    # 9. ek_amount
    # ek is heavily correlated with the other attributes. Several functional
    # dependencies each suggest an ek value; the final ek is sampled using the
    # mean and std of those suggestions.
    # TODO replace these FDs with embeddings
    unit_key = String(unit)

    # Suggestion from (article_id, unit)
    aid_unit_ek = numerical_functional_dependency(
        realization_df, [:article_id, :unit], :ek_amount,
        [aid_key, unit_key], true, false)

    # Suggestion from (article_id, catalog_id, unit)
    aid_cid_unit_ek = numerical_functional_dependency(
        realization_df, [:article_id, :catalog_id, :unit], :ek_amount,
        [aid_key, cid_key, unit_key], true, false)

    # Suggestion from (set_id, unit)
    sid_key = String(set_id)
    set_id_unit_ek = numerical_functional_dependency(
        realization_df, [:set_id, :unit], :ek_amount,
        [sid_key, unit_key], true, false)

    ek_center = mean([aid_unit_ek, aid_cid_unit_ek, set_id_unit_ek])
    # Floor of 0.01 keeps the spread positive when the suggestions coincide.
    ek_spread = max(std([aid_cid_unit_ek, aid_unit_ek, set_id_unit_ek]), 0.01)
    ek_amount = @trace(half_normal(ek_center, ek_spread), :ek_amount => :realization)
    @info "EK: $ek_amount"
end
;

### Evaluating log-likelihood

In [None]:
# `disable_logging(level)` turns off log records at `level` and below. With
# LogLevel(-1) the `@info` records (level 0) emitted inside `ek_model` are
# therefore not suppressed by this call.
disable_logging(LogLevel(-1))

# Score up to the first five observed rows under the model. The range is clamped
# to the row count so that a table with fewer than five rows does not raise a
# BoundsError.
for i in 1:min(5, size(patched_df, 1))
    # `make_constraints` (defined outside this cell) turns one observed row into
    # the constraints passed to `Gen.generate`.
    constraints = make_constraints(patched_df[i, :])
    # `generate` returns (trace, weight); only the weight is reported here.
    (trace, weight) = Gen.generate(ek_model, (patched_df,), constraints)
    println("Loglikelihood: $weight")
end

### General problems
- Currently the dataset is unsorted. Comparing log-likelihoods across, e.g., different product categories might not be meaningful, because, e.g., the price of smartphones could have a higher variance than that of printing paper.

    $\to$ Pre-processing the dataset would be meaningful (e.g. partitioning)
    
    
- As occurrence statistics have no real meaning, we may need more information from other sources. For example:

    $\to$ Use more columns (keyword, ean, manufacturer etc.)
    
    $\to$ Capture correlation between categorical columns by learning vector representations
    
    
- Set_id is currently giving too much information. The model is relying on it completely.

    $\to$ Replace it with data columns from which set_id is derived.