# Similarity metrics
* Computes similarity matrices M, where M[i, j] is the similarity between items i and j

In [None]:
# Medium to process; read as a global by the functions defined in this notebook.
medium = "anime"

In [None]:
# NOTE(review): `name` is not referenced in the visible part of this notebook;
# presumably it is consumed by helpers loaded from Alpha.ipynb — confirm.
const name = "$medium/all/SimilarityMetrics";

In [None]:
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");

In [None]:
using CSV
using DataFrames
using SparseArrays

## Compute similarity matrices

In [None]:
"""
    sparset(df)

Build a sparse matrix with one row per user and one column per item, whose
entries are taken from `df.rating` at positions `(df.user, df.item)`.
"""
function sparset(df)
    n_users = num_users(medium)
    n_items = num_items(medium)
    return sparse(df.user, df.item, df.rating, n_users, n_items)
end;

"""
    get_data(recent_years)

Load the implicit training, validation and test splits for every task in
`ALL_TASKS`, concatenate them, and keep only the rows whose timestamp is at
least `1 - year_in_timestamp_units(medium) * recent_years`.

Returns the pair `(sparse(data), sparset(data))` built from the filtered rows.
NOTE(review): `get_split`, `cat`, `filter(data, mask)` and `sparse(data)` are not
defined in this section; presumably they come from Alpha.ipynb — confirm.
"""
function get_data(recent_years)
    partitions = [
        get_split(part, task, content, medium; fields = [:user, :item, :timestamp, :rating]) for
        part in ["training", "validation", "test"] for task in ALL_TASKS for
        content in ["implicit"]
    ]
    combined = reduce(cat, partitions)
    cutoff = 1 - year_in_timestamp_units(medium) * recent_years
    recent = filter(combined, combined.timestamp .>= cutoff)
    return sparse(recent), sparset(recent)
end;

In [None]:
"""
    get_watch_similarity(; recent_years)

Return a dense `Float32` item-by-item matrix whose entry `[j, i]` is the size of
the intersection divided by the size of the union of the watch sets of items
`j` and `i`, using the data returned by `get_data(recent_years)`.

Columns for items with a zero watch count are left at zero.
"""
function get_watch_similarity(; recent_years)
    # `by_item` has one row per item; `by_user` is the user-by-item matrix from `sparset`.
    by_item, by_user = get_data(recent_years)
    n_items = num_items(medium)
    similarity = zeros(Float32, n_items, n_items)
    watch_counts = sum(by_item, dims = 2)
    @showprogress for item = 1:n_items
        if watch_counts[item] > 0
            # Co-watch counts between `item` and every other item.
            shared = by_item * by_user[:, item]
            # |A ∪ B| = |A| + |B| - |A ∩ B|
            combined = (watch_counts - shared) .+ watch_counts[item]
            similarity[:, item] = shared ./ combined
        end
    end
    return similarity
end;

In [None]:
"""
    todense(S)

Convert every element of `S` to `Float16`. Sparse arrays are first materialized
into a dense array with `collect`.
"""
function todense(S)
    return convert.(Float16, S)
end

function todense(S::AbstractSparseArray)
    dense = collect(S)
    return todense(dense)
end

"""
    save_similarity_matrix(name, S)

Convert `S` with `todense` and hand it to `write_params` under the key `"S"`,
passing `name` as the output directory argument.
"""
function save_similarity_matrix(name, S)
    params = Dict("S" => todense(S))
    return write_params(params, name)
end;

In [None]:
# Watch-set similarity (intersection / union) computed with `recent_years = 5`,
# then saved under "$medium/all/WatchSimilarity".
watch_sim = get_watch_similarity(recent_years = 5)
save_similarity_matrix("$medium/all/WatchSimilarity", watch_sim)

In [None]:
"""
    get_media(medium::String)

Read `processed_data/<medium>.csv` (resolved through `get_data_path`) into a
`DataFrame` and return a copy of its `<medium>_id`, `genres` and `tags` columns.
"""
function get_media(medium::String)
    path = get_data_path("processed_data/$medium.csv")
    table = CSV.File(path; ntasks = 1, stringtype = String)
    wanted = [Symbol("$(medium)_id"), :genres, :tags]
    return DataFrame(table)[:, wanted]
end;

In [None]:
"""
    get_content_similarity(col)

Return a dense `Float32` matrix `W` with `num_items(medium)` rows and columns,
where `W[i, j]` is the Jaccard similarity (intersection size / union size) of
the feature sets of items `i` and `j`, read from column `col` (e.g. `:genres`
or `:tags`) of the processed media table.

Each cell of `col` is parsed by dropping its first and last character and
splitting the rest on `", "`. An item with no features is similar only to
itself: `W[i, i] = 1` and `W[i, j] = 0` for every other `j`.
"""
function get_content_similarity(col)
    GC.gc()
    media = get_media(medium)
    uids = DataFrame(CSV.File(get_data_path("processed_data/$(medium)_to_uid.csv")))
    joined = innerjoin(uids, media, on = "$(medium)id" => "$(medium)_id")

    # `keepempty = false` makes an empty list ("[]") parse to an empty set.
    # Without it, `split("", ", ")` yields `[""]`, so every feature-less item
    # would share the feature "" and be scored as identical to the others.
    get_feature_list(raw) = Set{String}(split(raw[2:end-1], ", "; keepempty = false))

    n_items = num_items(medium)
    features = [Set{String}() for _ = 1:n_items]
    # `joined[!, col]` references the column; `joined[:, col]` would copy it per row.
    raw_features = joined[!, col]
    item_uids = joined.uid
    @showprogress for i in eachindex(item_uids)
        # uid is shifted by one to index `features` (presumably uids are 0-based).
        features[item_uids[i]+1] = get_feature_list(raw_features[i])
    end

    W = zeros(Float32, n_items, n_items)
    @tprogress Threads.@threads for i = 1:n_items
        for j = 1:n_items
            # |A ∩ B| by membership counting; |A ∪ B| = |A| + |B| - |A ∩ B|.
            shared = count(in(features[j]), features[i])
            total = length(features[i]) + length(features[j]) - shared
            if total != 0
                W[i, j] = shared / total
            else
                # Both feature sets are empty: similar only to itself.
                W[i, j] = i == j
            end
        end
    end
    return W
end;

In [None]:
# Content similarity from the `genres` column, saved under "$medium/all/GenreSimilarity".
save_similarity_matrix("$medium/all/GenreSimilarity", get_content_similarity(:genres))

In [None]:
# Tag fields rely on anidb tags, which are only populated for anime, so the
# tag-based similarity matrix is computed and saved only for that medium.
if medium == "anime"
    save_similarity_matrix("$medium/all/TagSimilarity", get_content_similarity(:tags))
end