# Similarity metrics
* Computes similarity matrices M, where M[i, j] is the similarity between items i and j

In [None]:
# Identifier for this notebook. It is defined before Alpha.ipynb is included,
# so it is presumably read by that notebook -- TODO confirm.
const name = "all/SimilarityMetrics";

In [None]:
import NBInclude: @nbinclude
@nbinclude("../Alpha.ipynb");

In [None]:
using CSV 
using DataFrames
using SparseArrays

## Compute similarity matrices

In [None]:
"""
    sparset(df)

Assemble a sparse `num_users() × num_items()` matrix from `df`, using
`df.user` as row indices, `df.item` as column indices and `df.rating` as the
stored values. Entries that share the same (user, item) pair are summed, as
is standard for `SparseArrays.sparse`.
"""
function sparset(df)
    rows, cols, vals = df.user, df.item, df.rating
    return sparse(rows, cols, vals, num_users(), num_items())
end;

"""
    get_data(recent_years)

Load the implicit-feedback splits ("training", "validation", "test" crossed
with "random", "temporal"), merge them with `cat`, keep only the rows whose
timestamp is at least `1 - year_in_timestamp_units() * recent_years`, and
return the tuple `(sparse(df), sparset(df))` built from the filtered data.

NOTE(review): the cutoff formula suggests timestamps are scaled so that `1`
is the most recent moment -- confirm against the timestamp preprocessing.
"""
function get_data(recent_years)
    wanted_fields = [:user, :item, :timestamp, :rating]
    # The split name varies slowest and the content type fastest.
    pieces = [
        get_split(part, task, content; fields = wanted_fields) for
        part in ["training", "validation", "test"] for task in ["random", "temporal"] for
        content in ["implicit"]
    ]
    merged = reduce(cat, pieces)
    cutoff = 1 - year_in_timestamp_units() * recent_years
    recent = filter(merged, merged.timestamp .>= cutoff)
    return sparse(recent), sparset(recent)
end;

In [None]:
"""
    get_watch_similarity(; recent_years)

Return a dense `num_items() × num_items()` `Float32` matrix whose column `i`
holds, for every item, `intersection / union` of its watches with those of
item `i`. The two matrices come from `get_data(recent_years)`.

Columns of items whose row sum in the first matrix is not positive are left
at zero.
"""
function get_watch_similarity(; recent_years)
    by_item, by_user = get_data(recent_years)
    similarity = zeros(Float32, num_items(), num_items())
    # Row sums of the first matrix: one total per item.
    totals = sum(by_item, dims = 2)
    @tprogress Threads.@threads for i = 1:num_items()
        if totals[i] > 0
            # Overlap of every item with item i.
            shared = by_item * by_user[:, i]
            # |A ∪ B| = |A| + |B| - |A ∩ B|
            combined = (totals - shared) .+ totals[i]
            similarity[:, i] = shared ./ combined
        end
    end
    return similarity
end;

In [None]:
"""
    save_similarity_matrix(name, S)

Materialize `S` with `collect`, convert its elements to `Float32`, and hand
the result to `write_params` under the key `"S"`, with `name` as the output
location.
"""
function save_similarity_matrix(name, S)
    dense = collect(S)
    params = Dict("S" => convert.(Float32, dense))
    return write_params(params, name)
end;

In [None]:
"""
    nonparametric(W)

Turn a square similarity matrix into a rank-based one.

Within each column of `W` the entries are replaced by their rank in
descending order (largest value gets rank 1). The ranks are then inverted
(`1 / rank`) and the result is symmetrized by averaging it with its
transpose. `W` itself is not modified.

The `0.5` literal is a `Float64`, so a `Float32` input yields a `Float64`
result.
"""
function nonparametric(W)
    # Each stage gets its own single-assignment name. Reassigning a variable
    # that the threaded loop's closure captures would force Julia to box it.
    ranks = copy(W)
    @tprogress Threads.@threads for i = 1:size(ranks, 2)
        # `sortperm` only reads the column, so a view avoids copying it;
        # `invperm` of the sorting permutation gives each entry's rank.
        ranks[:, i] = invperm(sortperm(view(ranks, :, i), rev = true))
    end
    reciprocal = 1 ./ ranks
    return 0.5 * (reciprocal + reciprocal')
end;

In [None]:
# Build the watch-overlap similarity with `recent_years = 5`, then save both
# the raw matrix and its rank-based transform from `nonparametric`.
watch_sim = get_watch_similarity(recent_years = 5)
save_similarity_matrix("all/WatchSimilarity", watch_sim)
save_similarity_matrix("all/WatchSimilarityNonparametric", nonparametric(watch_sim))

In [None]:
"""
    get_anime()

Read `processed_data/anime.csv` (single parsing task, string columns as
`String`) and return a `DataFrame` holding only the `anime_id`, `genres` and
`tags` columns.
"""
function get_anime()
    path = get_data_path("processed_data/anime.csv")
    table = CSV.File(path; ntasks = 1, stringtype = String)
    return DataFrame(table)[:, [:anime_id, :genres, :tags]]
end;

In [None]:
"""
    get_content_similarity(col)

Return a dense `num_items() × num_items()` `Float32` matrix of Jaccard
similarities (`|A ∩ B| / |A ∪ B|`) between the feature sets of every pair of
items.

Feature sets are parsed from column `col` (e.g. `:genres` or `:tags`) of the
anime metadata, joined to item ids through `processed_data/anime_to_uid.csv`.
Each raw string has its first and last character dropped and is split on
`","`. Items with no metadata row keep an empty feature set.

When both feature sets are empty the similarity is defined as `0` rather
than the `NaN` that `0 / 0` would produce.

NOTE(review): tokens are not whitespace-stripped, so if the raw strings use
`", "` as separator the same feature can appear with and without a leading
space -- confirm against the CSV format.
"""
function get_content_similarity(col)
    uids = DataFrame(CSV.File(get_data_path("processed_data/anime_to_uid.csv")))
    # Bind the join result to its own name: a variable that is reassigned and
    # then captured by a threaded loop's closure would be boxed.
    joined = innerjoin(uids, get_anime(), on = "animeid" => "anime_id")
    item_uids = joined.uid
    # Extract the column once; indexing `joined[:, col]` inside the loop would
    # copy the whole column on every iteration.
    raw = joined[:, col]
    # Drop the first and last character (presumably enclosing brackets), then
    # split the remainder on commas.
    parse_features(s) = Set(split(s[2:end-1], ","))
    features = [Set{SubString{String}}() for _ = 1:num_items()]
    # uids are shifted by one to index into `features`.
    # NOTE(review): assumes every uid occurs at most once, otherwise two
    # iterations would write the same slot -- confirm.
    @tprogress Threads.@threads for i = 1:length(item_uids)
        features[item_uids[i]+1] = parse_features(raw[i])
    end
    W = zeros(Float32, num_items(), num_items())
    # The outer loop runs over columns so each task writes contiguous memory.
    @tprogress Threads.@threads for j = 1:num_items()
        fj = features[j]
        for i = 1:num_items()
            fi = features[i]
            # |A ∩ B| by counting, and |A ∪ B| = |A| + |B| - |A ∩ B|; this
            # avoids allocating two temporary sets per pair.
            shared = count(in(fj), fi)
            total = length(fi) + length(fj) - shared
            # `W` is zero-initialized, so pairs of empty sets stay at 0.
            if total > 0
                W[i, j] = shared / total
            end
        end
    end
    return W
end;

In [None]:
# Compute the content similarity from the `:genres` column and save it.
save_similarity_matrix("all/GenreSimilarity", get_content_similarity(:genres))

In [None]:
# Compute the content similarity from the `:tags` column and save it.
save_similarity_matrix("all/TagSimilarity", get_content_similarity(:tags))