In [1]:
# Identifier string for this notebook. It is not referenced anywhere else in the visible
# cells; presumably it is read by the helpers included from
# "../TrainingAlphas/Alpha.ipynb" — TODO confirm.
const name = "all/ItemSimilarity";

In [1]:
using CSV
using DataFrames
using LinearAlgebra
import NBInclude: @nbinclude
using SparseArrays
using StatsBase: cor
using Statistics: mean
@nbinclude("../TrainingAlphas/Alpha.ipynb");

In [2]:
# Read the uid lookup table and the anime metadata table, then keep only the lookup rows
# whose `animeid` matches an `anime_id` in the metadata (inner join), so every remaining
# uid row carries the metadata columns as well.
anime_to_uid = CSV.File("../../data/processed_data/anime_to_uid.csv") |> DataFrame;
anime =
    CSV.File("../../data/processed_data/anime.csv"; ntasks = 1, stringtype = String) |>
    DataFrame
anime_to_uid = innerjoin(anime_to_uid, anime; on = "animeid" => "anime_id");

In [3]:
"""
    sparset(df)

Build a `num_users() × num_items()` sparse matrix from `df`, using `df.user` as row
indices, `df.item` as column indices and `df.rating` as the stored values.

Repeated `(user, item)` pairs are combined by `sparse`'s default rule for duplicate
coordinates.
"""
function sparset(df)
    rows = df.user
    cols = df.item
    vals = df.rating
    nrows = num_users()
    ncols = num_items()
    return sparse(rows, cols, vals, nrows, ncols)
end;

"""
    get_data()

Fetch every combination of the three split names, the two split kinds and the single
`"implicit"` content type via `get_split`, merge them with `cat`, and return the pair
`(sparse(df), sparset(df))` built from the merged data.

The combinations are visited with the split name varying slowest and the content type
varying fastest.
"""
function get_data()
    combos = [
        (x, y, z) for x in ["training", "validation", "test"] for
        y in ["random", "temporal"] for z in ["implicit"]
    ]
    # Load all pieces first, then merge them in the same left-to-right order.
    parts = map(combos) do (x, y, z)
        get_split(x, y, z; fields = [:user, :item, :timestamp, :rating])
    end
    df = reduce(cat, parts)
    return sparse(df), sparset(df)
end;

get_data (generic function with 1 method)

In [4]:
# M is the result of `sparse(df)` and Mt the result of `sparset(df)` (see `get_data`).
# Mt has one row per user and one column per item; the recorded output of this cell shows
# M with the two dimensions swapped.
M, Mt = get_data();

(sparse(Int32[16, 452, 2657, 4872, 6504, 6679, 7532, 8491, 8882, 9057  …  17838, 18092, 18610, 19243, 20703, 20872, 21232, 21283, 21643, 22167], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  5452310, 5452310, 5452310, 5452310, 5452310, 5452310, 5452310, 5452310, 5452310, 5452310], Float32[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 24182, 5452310), sparse(Int32[24703, 48418, 57472, 69660, 173308, 182565, 191301, 200983, 206037, 227644  …  2433513, 2547988, 2667024, 3938330, 3950768, 3993419, 4535938, 5100094, 5419981, 5443334], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  24182, 24182, 24182, 24182, 24182, 24182, 24182, 24182, 24182, 24182], Float32[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0  …  1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 5452310, 24182))

In [6]:
# Dense item-by-item similarity matrix. Column i is filled by iteration i of the loop
# below; columns for which `counts[i]` is not positive are left as all zeros.
W = zeros(Float32, num_items(), num_items());
# Row sums of M. Because of `dims = 2` this is an (nrows × 1) matrix, not a vector.
counts = sum(M, dims = 2)
# Each iteration writes only to its own column of W, so iterations do not write to the
# same memory. `@tprogress` is defined outside this view (presumably progress reporting,
# judging by the recorded output — confirm).
@tprogress Threads.@threads for i = 1:num_items()
    if counts[i] > 0
        # Entry j is the dot product of row j of M with column i of Mt.
        intersection = M * Mt[:, i]
        # Entry j is counts[j] + counts[i] - intersection[j].
        # NOTE(review): the local name `union` hides `Base.union` inside the loop body.
        union = (counts - intersection) .+ counts[i]
        # W[j, i] = intersection[j] / union[j]. If every stored value of M is 1 (as in
        # the recorded output of the previous cell — confirm), this is the Jaccard index
        # between the user sets of items j and i.
        W[:, i] = intersection ./ union
    end
end

[32mProgress: 100%|███████████████████████████| Time: 0:01:53 ( 0.11  s/it)[39m


In [7]:
"""
    get_neighborhood(item, W, K; transpose = false)

Return the (at most) `K` largest entries of column `item` of `W`.

# Arguments
- `item`: column index into `W`.
- `W`: matrix whose column `item` holds one weight per row.
- `K`: number of neighbors requested. If `K` exceeds the number of rows, every row is
  returned instead of raising a `BoundsError`.

# Keywords
- `transpose`: accepted for backward compatibility; it is currently ignored.

# Returns
A tuple `(indices, weights)`: the row indices ordered by decreasing weight, and the
corresponding weights.
"""
function get_neighborhood(item, W, K; transpose = false)
    weights = W[:, item]
    # Clamp the request so that asking for more neighbors than exist is not an error.
    k = min(K, length(weights))
    top = sortperm(weights, rev = true)[1:k]
    return top, weights[top]
end;

In [8]:
"""
    get_series(df)

Attach series metadata to a table of neighbors.

`df` must have an `anime` column and a `weight` column. A copy of `df` gets a `uid`
column equal to `anime - 1`, is inner-joined with the global `anime_to_uid` table on
`uid`, and the result is returned sorted by `weight` in descending order. The input `df`
is not modified.
"""
function get_series(df)
    with_uid = copy(df)
    # `anime` holds indices that are one greater than the uid used in `anime_to_uid`.
    with_uid.uid = with_uid.anime .- 1
    joined = innerjoin(with_uid, anime_to_uid, on = "uid")
    return sort(joined, :weight, rev = true)
end;

In [9]:
"""
    display(df::DataFrame)

Write the `title`, `weight` and `uid` columns of `df` to `stdout` as an HTML table.

Column headers are the column names with underscores replaced by spaces and converted
to title case; the row-number column is labelled "Rank".

NOTE(review): this is declared as a plain `display` function in the notebook's module
rather than as a method of `Base.display` — confirm that this is intended.
"""
function display(df::DataFrame)
    shown = select(df, :title, :weight, :uid)
    headers = [titlecase(Base.replace(col, "_" => " ")) for col in names(shown)]
    # Options forwarded to the `text/html` `show` method for the table.
    options = (
        allow_html_in_cells = true,
        header = headers,
        nosubheader = true,
        row_number_column_title = "Rank",
        top_left_str = "",
    )
    return Base.show(stdout, MIME("text/html"), shown; options...)
end;

In [10]:
"""
    display_related_series(mal_uid; K = 100)

Display the `K` series most similar to the series whose `animeid` is `mal_uid`.

The id is translated to a uid through the global `anime_to_uid` table, the neighbors are
read from column `uid + 1` of the global similarity matrix `W`, and the result is
rendered with `display`.

# Keywords
- `K`: number of neighbors to show (default `100`, the previously hard-coded value).

# Throws
- `ArgumentError` if `mal_uid` does not match exactly one row of `anime_to_uid`.
"""
function display_related_series(mal_uid; K = 100)
    matches = anime_to_uid[anime_to_uid.animeid .== mal_uid, :uid]
    if length(matches) != 1
        throw(
            ArgumentError(
                "expected exactly one series with animeid $(mal_uid), " *
                "found $(length(matches))",
            ),
        )
    end
    uid = only(matches)
    # Columns of W are 1-based while uids are 0-based, hence the `+ 1`.
    items, weights = get_neighborhood(uid + 1, W, K)
    df = DataFrame(anime = items, weight = weights)
    return get_series(df) |> display
end;

In [11]:
# Set both DataFrames display-limit environment variables to 300 (presumably so the
# tables rendered in this notebook are not truncated — confirm against DataFrames docs).
for key in ("DATAFRAMES_COLUMNS", "DATAFRAMES_ROWS")
    ENV[key] = 300
end

In [12]:
# Good series to test (ids to pass to display_related_series): 721, 290, 189, 39535, 82, 12467, 9756, 37521, 49387

In [15]:
# Show the nearest neighbors of the series with animeid 39535; in the recorded output
# below the series itself ranks first with weight 1.0.
display_related_series(39535)

Rank,Title,Weight,Uid
1,Mushoku Tensei: Isekai Ittara Honki Dasu,1.0,12888
2,Mushoku Tensei: Isekai Ittara Honki Dasu Part 2,0.618675,4232
3,Tensei shitara Slime Datta Ken 2nd Season,0.444997,16719
4,Tensei shitara Slime Datta Ken,0.41715,17087
5,Tate no Yuusha no Nariagari,0.407717,10033
6,Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season,0.382462,23922
7,Tensei shitara Slime Datta Ken 2nd Season Part 2,0.37739,22006
8,Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season Part 2,0.370387,22106
9,Horimiya,0.368468,23264
10,"Maou Gakuin no Futekigousha: Shijou Saikyou no Maou no Shiso, Tensei shite Shison-tachi no Gakkou e Kayou",0.359462,944


In [14]:
# Show the nearest neighbors of the series with animeid 37521; in the recorded output
# below the series itself ranks first with weight 1.0.
display_related_series(37521)

[32mProgress: 100%|███████████████████████████| Time: 0:06:09 ( 0.37  s/it)[39m


Rank,Title,Weight,Uid
1,Vinland Saga,1.0,13136
2,Dr. Stone,0.554619,18603
3,Kimetsu no Yaiba,0.547818,20871
4,Shingeki no Kyojin Season 3 Part 2,0.538515,4871
5,Jujutsu Kaisen,0.53395,4154
6,Shingeki no Kyojin Season 3,0.531189,10849
7,Mob Psycho 100 II,0.519075,10165
8,Dororo,0.515613,14754
9,Shingeki no Kyojin Season 2,0.513987,14992
10,Yakusoku no Neverland,0.513228,7316


In [15]:
# 1	Princess Tutu	1.0
# 2	Shoujo Kakumei Utena	0.254906
# 3	Full Moon wo Sagashite	0.228932
# 4	Kuragehime	0.225228
# 5	Mawaru Penguindrum	0.220572
# 6	Cardcaptor Sakura	0.210011
# 7	Tokyo Mew Mew	0.208115
# 8	Shugo Chara!	0.207744
# 9	Shoujo Kakumei Utena: Adolescence Mokushiroku	0.207724
# 10	Kaiba	0.202521
# 11	Haibane Renmei	0.201937
# 12	Hirogaru Sky! Precure	0.200713
# 13	Versailles no Bara	0.196705
# 14	Cardcaptor Sakura Movie 2: Fuuin Sareta Card	0.193909
# 15	Delicious Party♡Precure	0.192223
# 16	Aria the Animation	0.191062
# 17	Kaleido Star	0.190768
# 18	Ojamajo Doremi	0.186161
# 19	Kareshi Kanojo no Jijou	0.182229
# 20	Dennou Coil	0.17748
# 21	Gakuen Alice	0.176189
# 22	Top wo Nerae! Gunbuster	0.176024
# 23	Flip Flappers	0.174974
# 24	Sarazanmai	0.174411
# 25	Sennen Joyuu	0.173716
# 26	Sayonara Zetsubou Sensei	0.17254
# 27	Cardcaptor Sakura Movie 1	0.171731
# 28	Kodomo no Omocha (TV)	0.171682
# 29	Gankutsuou	0.170489
# 30	Jinrui wa Suitai Shimashita	0.169933