In [1]:
# Path-like identifier for this notebook.
# NOTE(review): presumably consumed by helpers pulled in from Alpha.ipynb — confirm.
name = "../../data/recommendations/ItemSimilarity"
# Alphas handed to `get_residuals` when each data split is loaded further down.
residual_alphas = ["UserItemBiases"];

In [2]:
using NBInclude
@nbinclude("../TrainingAlphas/Alpha.ipynb");

In [3]:
using SparseArrays
using DataFrames

In [4]:
# Read the anime_id/uid table and the raw anime table, then join them on
# `anime_id`; an inner join keeps only the ids present in both tables.
# NOTE(review): `ntasks = 1` presumably forces single-task parsing of the raw
# file — confirm why it is required.
anime_to_uid = CSV.File("../../data/processed_data/anime_to_uid.csv") |> DataFrame;
anime = CSV.File("../../data/raw_data/anime.csv"; ntasks = 1) |> DataFrame;
anime_to_uid = innerjoin(anime_to_uid, anime; on = "anime_id");

## Rating correlation

In [5]:
# Norm (as computed by `norm`) of every column of `R`. Columns whose norm is
# zero are reported as 1 so that callers can divide by the result safely.
# The function is wrapped in `@memoize`.
@memoize function get_norms(R)
    column_norms = [norm(column) for column in eachslice(R, dims = 2)]
    for i in eachindex(column_norms)
        # prevent division by 0
        if column_norms[i] == 0
            column_norms[i] = 1
        end
    end
    column_norms
end;

"""
    get_correlation_neighborhood(item, R, K)

Return the `K` columns of `R` most similar to column `item`, together with
their similarity weights.

The weight of column `j` is the dot product of columns `item` and `j` divided
by both column norms. Norms come from `get_norms(R)`, which substitutes 1 for
zero norms, so all-zero columns get weight 0 instead of `NaN`.

# Arguments
- `item`: index of the query column of `R`.
- `R`: matrix whose columns are compared with each other.
- `K`: number of neighbours requested. Values larger than the number of
  columns return every column; values `<= 0` return empty results.

# Returns
A tuple `(order, weights)`: the column indices sorted by decreasing weight and
the corresponding weights.
"""
function get_correlation_neighborhood(item, R, K)
    norms = get_norms(R)
    weights = vec(R[:, item]' * R) ./ norms ./ norms[item]
    # Clamp the request: slicing `1:K` past the end used to throw a BoundsError
    # whenever more neighbours were requested than there are columns.
    k = clamp(K, 0, length(weights))
    order = sortperm(weights, rev = true)[1:k]
    return order, weights[order]
end

# Residuals of each data split with respect to `residual_alphas`.
const training = get_residuals("training", residual_alphas);
const validation = get_residuals("validation", residual_alphas);
const test = get_residuals("test", residual_alphas);

# All three splits concatenated (training, then validation, then test) into a
# single dataset.
const allsplits = RatingsDataset(
    vcat(training.user, validation.user, test.user),
    vcat(training.item, validation.item, test.item),
    vcat(training.rating, validation.rating, test.rating),
)

# Sparse user × item matrix of ratings, sized by the largest user and item
# index present in `allsplits`.
R = let d = allsplits
    sparse(d.user, d.item, d.rating, maximum(d.user), maximum(d.item))
end;

## Embedding distance

In [6]:
"""
    get_embedding_neighborhood(item, kernel, K, p)

Return the `K` columns of `kernel` closest to column `item` under the `p`-norm,
together with their weights.

# Arguments
- `item`: index of the query column of `kernel`.
- `kernel`: matrix whose columns are the vectors being compared.
- `K`: number of neighbours requested. Values larger than the number of
  columns return every column; values `<= 0` return empty results.
- `p`: order of the norm, passed straight to `norm`.

# Returns
A tuple `(order, weights)`: the column indices sorted by increasing distance
and the matching weights, where a weight is the negated distance (so larger
means more similar).
"""
function get_embedding_neighborhood(item, kernel, K, p)
    target = kernel[:, item]
    # Per-column differences avoid materialising a full copy of `kernel`.
    dists = [norm(column .- target, p) for column in eachcol(kernel)]
    weights = -dists
    # Clamp the request: slicing `1:K` past the end used to throw a BoundsError
    # whenever more neighbours were requested than there are columns.
    k = clamp(K, 0, length(dists))
    order = sortperm(dists)[1:k]
    return order, weights[order]
end

"""
    normalize_rows(X)

Standardise every row of `X`: subtract the row mean and divide by the row
standard deviation (as computed by `std`).

Rows whose standard deviation is zero (constant rows) or `NaN` (e.g. a single
column) are divided by 1 instead, so they come out as zeros rather than `NaN`.

# Returns
A matrix with the same shape as `X`.
"""
function normalize_rows(X)
    μ = mean(X, dims = 2)
    σ = std(X, dims = 2)
    # prevent division by 0 (and by the NaN `std` yields for one observation)
    σ = map(s -> (iszero(s) || isnan(s)) ? one(s) : s, σ)
    return (X .- μ) ./ σ
end

# Stack the transposed "A" parameters of the three matrix-factorization models
# on top of each other, then standardise every row of the stacked matrix.
# NOTE(review): presumably "A" is items × factors, so after the transpose each
# column is one item — confirm against `read_params`.
item_embeddings = normalize_rows(
    reduce(
        vcat,
        map(
            alpha -> collect(read_params(alpha)["A"]'),
            ["MatrixFactorization.10", "MatrixFactorization.20", "MatrixFactorization.40"],
        ),
    ),
);

# Compute Neighborhoods

In [7]:
# uid of the single row whose anime_id is 43691 (`only` throws unless exactly
# one row matches).
uid = only(anime_to_uid.uid[anime_to_uid.anime_id .== 43691])

9939

In [8]:
# Top-30 neighbours by rating correlation. `uid + 1` is used as the column
# index into `R`; `get_series` later subtracts 1 to get back to a uid.
items, weights = get_correlation_neighborhood(uid + 1, R, 30);
corr_df = DataFrame(anime = items, weight = weights);

In [9]:
# Top-30 neighbours by Euclidean (p = 2) distance between item embeddings.
# `uid + 1` is used as the column index into `item_embeddings`.
# Note: this rebinds the `items` / `weights` globals from the previous cell.
items, weights = get_embedding_neighborhood(uid + 1, item_embeddings, 30, 2);
embed_df = DataFrame(anime = items, weight = weights);

# Display Similar Series

In [10]:
# Attach anime metadata to a neighbourhood table.
#
# `df` needs an `anime` column (index, equal to uid + 1) and a `weight` column.
# The input is left untouched: a copy gets a `uid` column (`anime - 1`), is
# inner-joined with the global `anime_to_uid` on "uid", and the result is
# returned sorted by decreasing weight.
function get_series(df)
    neighbours = copy(df)
    neighbours.uid = neighbours.anime .- 1
    joined = innerjoin(neighbours, anime_to_uid; on = "uid")
    return sort(joined, :weight; rev = true)
end;

In [11]:
# Set the LINES environment variable to 1024.
# NOTE(review): presumably so the tables displayed below are not truncated — confirm.
ENV["LINES"] = 1024;

In [12]:
# Display the rating-correlation neighbours with their metadata, best match first.
get_series(corr_df)

Unnamed: 0_level_0,anime,weight,uid,anime_id,title
Unnamed: 0_level_1,Int64,Float64,Int64,Int64,String
1,9940,1.0,9939,43691,Kageki Shoujo!!
2,11003,0.0968749,11002,46093,Shiroi Suna no Aquatope
3,7504,0.094958,7503,40685,Super Cub
4,7653,0.09163,7652,40870,SSSS.Dynazenon
5,9694,0.0881485,9693,43439,Shadows House
6,7916,0.0864715,7915,41169,Love Live! Superstar!!
7,4947,0.0716348,4946,37890,Oshi ga Budoukan Ittekuretara Shinu
8,7456,0.0704129,7455,40620,Uramichi Oniisan
9,9314,0.0701826,9313,42941,Uma Musume: Pretty Derby (TV) Season 2
10,6814,0.0687792,6813,39893,Muteking the Dancing Hero


In [13]:
# Display the embedding-distance neighbours with their metadata, closest first
# (weights are negated distances, so 0 is an exact match).
get_series(embed_df)

Unnamed: 0_level_0,anime,weight,uid,anime_id,title
Unnamed: 0_level_1,Int64,Float64,Int64,Int64,String
1,9940,-0.0,9939,43691,Kageki Shoujo!!
2,4947,-8.01535,4946,37890,Oshi ga Budoukan Ittekuretara Shinu
3,1577,-8.03271,1576,1592,Hataraki Man
4,7359,-8.06529,7358,40513,Nami yo Kiitekure
5,9694,-8.20601,9693,43439,Shadows House
6,2985,-8.2532,2984,35777,Rilakkuma to Kaoru-san
7,5231,-8.3438,5230,38192,Sakugan
8,794,-8.43175,793,798,Yomigaeru Sora: Rescue Wings
9,15606,-8.53415,15605,29831,Tamayura: Sotsugyou Shashin Part 4 - Ashita
10,2018,-8.54144,2017,2035,Eikoku Koi Monogatari Emma: Molders-hen
