In [1]:
# NOTE(review): `name` is not referenced in the visible cells; presumably it is consumed by
# helpers pulled in from Alpha.ipynb (e.g. as an output location) — confirm.
name = "../../data/recommendations/ItemSimilarity"
# Alphas handed to `get_residuals` when the training/validation/test splits are loaded.
residual_alphas = ["UserItemBiases"];

In [2]:
using NBInclude
@nbinclude("../TrainingAlphas/Alpha.ipynb");

In [3]:
using SparseArrays
using DataFrames

In [4]:
# Load the anime_id -> uid mapping and the raw anime metadata, then attach the metadata
# to the mapping so each uid row also carries the columns of anime.csv.
anime_to_uid = CSV.File("../../data/processed_data/anime_to_uid.csv") |> DataFrame;
anime = CSV.File("../../data/raw_data/anime.csv") |> DataFrame;
anime_to_uid = innerjoin(anime_to_uid, anime; on = "anime_id");

## Rating correlation

In [5]:
# Norm of every column of `R` (one entry per slice along dimension 2).
# Zero norms are replaced by 1 so callers can divide by the result safely.
# The result is memoized, so repeated calls with the same `R` reuse the cached vector.
@memoize function get_norms(R)
    column_norms = [norm(column) for column in eachslice(R, dims = 2)]
    # prevent division by 0
    map!(x -> x == 0 ? one(x) : x, column_norms, column_norms)
    column_norms
end;

"""
    get_correlation_neighborhood(item, R, K)

Return the columns of `R` most similar to column `item`.

The similarity of column `j` is the dot product of columns `item` and `j` divided by the
norms of both columns (norms come from `get_norms`, which substitutes 1 for zero norms).

# Arguments
- `item`: index of the query column of `R`.
- `R`: matrix whose columns are compared.
- `K`: number of neighbors requested; if it exceeds the number of columns, all columns
  are returned instead of throwing a `BoundsError`.

# Returns
A tuple `(indices, weights)` sorted by decreasing similarity. The query column itself is
not excluded from the result.
"""
function get_correlation_neighborhood(item, R, K)
    norms = get_norms(R)
    weights = vec(R[:, item]' * R) ./ norms ./ norms[item]
    # Never ask for more neighbors than there are columns.
    k = min(K, length(weights))
    order = sortperm(weights, rev = true)[1:k]
    return order, weights[order]
end

# Residual ratings of each split, computed against `residual_alphas`.
const training = get_residuals("training", residual_alphas);
const validation = get_residuals("validation", residual_alphas);
const test = get_residuals("test", residual_alphas);

# All three splits concatenated field by field into a single dataset.
const allsplits = RatingsDataset(
    vcat(training.user, validation.user, test.user),
    vcat(training.item, validation.item, test.item),
    vcat(training.rating, validation.rating, test.rating),
)

# Sparse matrix with users as rows and items as columns, sized by the largest index seen.
R = let users = allsplits.user, items = allsplits.item
    sparse(users, items, allsplits.rating, maximum(users), maximum(items))
end;

## Embedding distance

In [14]:
"""
    get_embedding_neighborhood(item, kernel, K, p)

Return the columns of `kernel` closest to column `item` under the `p`-norm.

# Arguments
- `item`: index of the query column of `kernel`.
- `kernel`: matrix whose columns are the embeddings being compared.
- `K`: number of neighbors requested; if it exceeds the number of columns, all columns
  are returned instead of throwing a `BoundsError`.
- `p`: order of the norm passed to `norm`.

# Returns
A tuple `(indices, weights)` ordered from nearest to farthest, where each weight is the
negated distance (so larger means more similar). The query column itself is not excluded.
"""
function get_embedding_neighborhood(item, kernel, K, p)
    dists = map(x -> norm(x, p), eachslice(kernel .- kernel[:, item], dims = 2))
    weights = -dists
    # Never ask for more neighbors than there are columns.
    k = min(K, length(dists))
    order = sortperm(dists)[1:k]
    return order, weights[order]
end

"""
    normalize_rows(X)

Standardize every row of `X` by subtracting the row mean and dividing by the row
standard deviation.

Rows with zero standard deviation (constant rows) are divided by 1 instead, so they
come out as all zeros rather than `NaN`.
"""
function normalize_rows(X)
    μ = mean(X, dims = 2)
    σ = std(X, dims = 2)
    # prevent division by 0 for constant rows
    σ = map(s -> iszero(s) ? one(s) : s, σ)
    return (X .- μ) ./ σ
end

# Stack the transposed "A" parameters of the three matrix-factorization models on top of
# each other, then standardize every row of the stacked matrix.
item_embeddings = normalize_rows(
    mapreduce(
        alpha -> collect(read_params(alpha)["A"]'),
        vcat,
        ("MatrixFactorization.10", "MatrixFactorization.20", "MatrixFactorization.40"),
    ),
);

# Compute Neighborhoods

In [21]:
# uid of the query series (anime_id 721); `only` throws unless exactly one row matches.
uid = only(anime_to_uid.uid[anime_to_uid.anime_id .== 721]);

In [22]:
# Item indices are offset by one from uids (get_series undoes this with `.- 1`),
# hence `uid + 1`. Keep the 30 most similar items by rating similarity.
items, weights = get_correlation_neighborhood(uid + 1, R, 30);
corr_df = DataFrame(anime = items, weight = weights);

In [23]:
# Same query item (`uid + 1`), but neighbors are the 30 nearest columns of
# `item_embeddings` under the 2-norm; weights are negated distances.
items, weights = get_embedding_neighborhood(uid + 1, item_embeddings, 30, 2);
embed_df = DataFrame(anime = items, weight = weights);

# Display Similar Series

In [24]:
"""
    get_series(df)

Attach series metadata to a neighborhood table.

`df` must have an `anime` column (item index) and a `weight` column. A `uid` column is
derived as `anime - 1`, the table is inner-joined with the global `anime_to_uid` on
`uid`, and the rows are returned sorted by decreasing `weight`. `df` is not modified.
"""
function get_series(df)
    with_uid = copy(df)
    with_uid.uid = with_uid.anime .- 1
    joined = innerjoin(with_uid, anime_to_uid; on = "uid")
    return sort(joined, :weight; rev = true)
end;

In [25]:
# Display the rating-similarity neighbors together with their series metadata.
get_series(corr_df)

Unnamed: 0_level_0,anime,weight,uid,anime_id,title,medium
Unnamed: 0_level_1,Int64,Float64,Int64,Int64,String,String7
1,714,1.0,713,721,Princess Tutu,tv
2,437,0.144456,436,440,Shoujo Kakumei Utena,tv
3,386,0.13854,385,387,Haibane Renmei,tv
4,337,0.122711,336,338,Versailles no Bara,tv
5,1021,0.118006,1020,1033,Sennen Joyuu,movie
6,3640,0.113799,3639,3701,Kaiba,tv
7,2147,0.112829,2146,2164,Dennou Coil,tv
8,7393,0.10689,7392,7785,Yojouhan Shinwa Taikei,tv
9,3252,0.101058,3251,3297,Aria the Origination,tv
10,950,0.0946853,949,962,Aria the Natural,tv


In [26]:
# Display the embedding-distance neighbors together with their series metadata.
get_series(embed_df)

Unnamed: 0_level_0,anime,weight,uid,anime_id,title
Unnamed: 0_level_1,Int64,Float64,Int64,Int64,String
1,714,-0.0,713,721,Princess Tutu
2,2147,-10.5154,2146,2164,Dennou Coil
3,337,-12.7728,336,338,Versailles no Bara
4,452,-13.6509,451,455,Fantastic Children
5,477,-13.8412,476,95,Turn A Gundam
6,81,-13.9177,80,82,Mobile Suit Gundam 0080: War in the Pocket
7,866,-14.3207,865,878,Zegapain
8,786,-14.5065,785,795,Oniisama e...
9,2385,-14.508,2384,2402,Ashita no Joe
10,328,-14.565,327,329,Planetes
