# Similarity metrics
* Computes similarity matrices M, where M[i, j] is the similarity between items i and j

In [None]:
import NBInclude: @nbinclude
@nbinclude("../TrainingAlpha.ipynb");

In [None]:
# Run configuration. The values are empty placeholders here; presumably they are filled in
# by whatever executes this notebook (TODO confirm).
version = "" # used in the output path handed to write_params
dataset = "" # passed to get_watch_similarity and used in the output path
medium = "" # passed to get_watch_similarity and used in the output path
metric = "" # passed to get_watch_similarity and used in the output path

In [None]:
import SparseArrays

## Compute similarity matrices

In [None]:
"""
    get_data(dataset, medium, metric, recent_years)

Load the "train" split for `dataset`/`medium`, convert it with `as_metric`, and return
the sparse matrices `(M, Mt)` built from it.

Only rows with `updated_at >= 1 - get_timestamp(Dates.Day(365)) * recent_years` are kept.
NOTE(review): this looks like a "last `recent_years` years" window on timestamps whose
most recent value is 1 — confirm against `get_timestamp`.

# Returns
- `M`: `num_items(medium) × num_users(dataset)` sparse matrix indexed `[itemid, userid]`
  holding `df.metric`.
- `Mt`: the same entries with the axes swapped, indexed `[userid, itemid]`.
"""
function get_data(dataset, medium, metric, recent_years)
    columns = [:userid, :itemid, :status, :updated_at]
    df = as_metric(get_split(dataset, "train", medium, columns), metric)

    # Drop every row older than the cutoff.
    cutoff = 1 - get_timestamp(Dates.Day(365)) * recent_years
    df = subset(df, df.updated_at .>= cutoff)

    n_items = num_items(medium)
    n_users = num_users(dataset)
    # Both orientations are built from the same triplets, so callers can slice either
    # one by column.
    M = SparseArrays.sparse(df.itemid, df.userid, df.metric, n_items, n_users)
    Mt = SparseArrays.sparse(df.userid, df.itemid, df.metric, n_users, n_items)
    return M, Mt
end;

In [None]:
"""
    get_watch_similarity(dataset, medium, metric, recent_years)

Return a dense `Float32` item-item matrix `W` of intersection-over-union similarities
computed from the matrices produced by `get_data`.

For every item `i` whose row total in `M` is positive, column `i` of `W` is
`overlap ./ combined`, where `overlap = M * Mt[:, i]` and
`combined = (totals - overlap) .+ totals[i]`. Columns of items with a non-positive total
stay zero. NOTE(review): this equals the Jaccard index only if the metric values are 0/1
indicators — verify against `as_metric`.
"""
function get_watch_similarity(dataset, medium, metric, recent_years)
    M, Mt = get_data(dataset, medium, metric, recent_years)
    n_items = num_items(medium)
    W = zeros(Float32, n_items, n_items)
    # Per-item row sums of M (an n_items × 1 array).
    totals = sum(M, dims = 2)
    @showprogress for i = 1:n_items
        if totals[i] > 0
            # Column i of Mt is item i's values across users, so this product gives the
            # overlap of item i with every item at once.
            overlap = M * Mt[:, i]
            combined = (totals - overlap) .+ totals[i]
            W[:, i] = overlap ./ combined
        end
    end
    return W
end;

In [None]:
"""
    prune_entries!(M::Matrix, max_nonzero::Integer)

Zero out, in place, all but the `max_nonzero` largest entries of every column of `M`,
and return `M`.

Columns are processed in parallel with `Threads.@threads`; each iteration only touches
its own column. Ties are resolved by `sortperm`'s ordering, so among equal values the
entries appearing earlier in the ascending permutation are the ones zeroed.

# Arguments
- `M`: dense matrix to prune; modified in place.
- `max_nonzero`: number of entries to keep per column. If it is at least `size(M, 1)`,
  `M` is left untouched.

# Throws
- `ArgumentError` if `max_nonzero` is negative.
"""
function prune_entries!(M::Matrix, max_nonzero::Integer)
    max_nonzero >= 0 ||
        throw(ArgumentError("max_nonzero must be non-negative, got $max_nonzero"))
    num_pruned = size(M, 1) - max_nonzero
    # Nothing can be pruned when each column already fits within the budget, so skip the
    # per-column sorting entirely.
    num_pruned <= 0 && return M
    @showprogress Threads.@threads for i in axes(M, 2)
        # Ascending permutation: the leading `num_pruned` indices are the smallest
        # entries. A view avoids copying the column just to sort it.
        order = sortperm(view(M, :, i))
        M[view(order, 1:num_pruned), i] .= 0
    end
    return M
end;

In [None]:
# Build the item-item similarity matrix with `recent_years = 10`.
W = get_watch_similarity(dataset, medium, metric, 10)
prune_entries!(W, 1000) # to reduce the storage cost
# W is already a dense Matrix, so convert it directly; an intermediate `collect` would
# only allocate a second full-size copy.
W = convert.(Float16, W)
write_params(Dict("S" => W), "nondirectional/$version/$dataset/$medium/$metric/similarity");