# Item Collaborative Filtering
* See `ItemCFBase.ipynb` for algorithm details
* We construct a graph $G$ where the vertices are items and edges are relations (sequel, prequel, etc.)
* $w_{ij} = 1 / d_{ij}$, where $d_{ij}$ is the length of the shortest path between items $i,j$ in $G$ (and $w_{ij} = 0$ when no path exists)

In [1]:
# Identifier of this model family. get_similarity_matrix_outdir interpolates it
# as the leading path component of the cached similarity-matrix directory.
name = "ItemCFRelated";

In [2]:
using NBInclude
@nbinclude("ItemCFBase.ipynb");

In [3]:
using DataFrames

## Compute similarity matrices

In [4]:
function get_similarity_matrix(relationships)
    # Builds the sparse adjacency matrix of the item relation graph, keeping only
    # the rows of related_series.csv whose `relationship` is in `relationships`.
    # `source`/`target` are shifted by one to obtain row/column indices, and the
    # matrix is square with side equal to the largest item id of the training split.
    edges = DataFrame(CSV.File("../../data/processed_data/related_series.csv"))
    edges = filter(row -> row.relationship ∈ relationships, edges)
    n = maximum(get_split("training").item)
    rows = edges.source .+ 1
    cols = edges.target .+ 1
    # every kept edge carries weight 1.0 (repeated index pairs are summed by `sparse`)
    weights = fill(1.0, length(edges.source))
    return sparse(rows, cols, weights, n, n)
end;

In [5]:
function all_pairs_shortest_paths(S)
    # All-pairs shortest paths for a graph whose edges all have weight 1.
    # `walks` holds I * S^hops; a stored entry (i, j) means a walk of `hops` edges
    # from i to j exists (assuming the entries of S are non-negative), so the
    # first `hops` at which (i, j) shows up is the distance.
    # Returns a dense matrix: 0 on the diagonal, Inf for pairs never reached.
    n = size(S, 1)
    dists = fill(Inf, size(S)...)
    for v in 1:n
        dists[v, v] = 0
    end
    walks = I(n)

    @showprogress for hops in 1:n
        walks = walks * S
        rows, cols, _ = findnz(walks)
        improved = false
        for (i, j) in zip(rows, cols)
            if dists[i, j] > hops
                dists[i, j] = hops
                improved = true
            end
        end
        # no pair received a new distance at this length, so stop early
        improved || break
    end
    return dists
end;

In [6]:
function get_similarity_matrix_outdir(relationships)
    # Returns the output directory name for the similarity matrix of
    # `relationships`. The name is derived from the global `name` and the hash of
    # `relationships`; if nothing exists at that path under ../../data/alphas the
    # matrix is generated and written first, otherwise the stored copy is reused.
    outdir = "$name/$(hash(relationships))"
    if !ispath("../../data/alphas/$outdir")
        @debug "generating similarity matrix for relationships $relationships"
        adjacency = get_similarity_matrix(relationships)
        # similarity is the reciprocal of the hop distance; unreachable pairs (Inf) become 0
        similarity = collect(1 ./ all_pairs_shortest_paths(adjacency))
        write_params(Dict("S" => convert.(Float32, similarity)), outdir = outdir)
    end
    return outdir
end;

## Setup hyperparameters

In [7]:
# Relationship labels known to this notebook.
# NOTE(review): presumably the full set of values in the `relationship` column of
# related_series.csv — confirm against the data. Not referenced by the other
# cells visible here.
all_relations = [
    "side_story",
    "summary",
    "parent_story",
    "sequel",
    "prequel",
    "character",
    "alternative_version",
    "other",
    "spin_off",
    "alternative_setting",
    "full_story",
];

In [8]:
# Used to filter recommendations
# TODO always use strict related
# Same labels as all_relations except "character" and "other".
# The element order matters: get_similarity_matrix_outdir hashes this vector to
# name its cache directory, so reordering it points at a different directory.
strict_relations = [
    "side_story",
    "summary",
    "parent_story",
    "sequel",
    "prequel",
    "alternative_version",
    "spin_off",
    "alternative_setting",
    "full_story",
]

# Build (or find on disk) the similarity matrix for the strict relations up front.
get_similarity_matrix_outdir(strict_relations);

In [9]:
@memoize function max_neighborhood_size(relation)
    # Returns the largest per-column count of neighbours in the stored similarity
    # matrix for `relation`. An entry counts as a neighbour when its reciprocal
    # (the hop distance) is neither 0 nor Inf.
    S = read_params(get_similarity_matrix_outdir(relation))["S"]
    is_neighbor(s) = (d = 1 / s; d != 0 && d != Inf)
    neighborhood_size = 0
    for col in eachcol(S)
        neighborhood_size = max(neighborhood_size, count(is_neighbor, col))
    end
    @debug "using neighborhood size $neighborhood_size"
    neighborhood_size
end;

In [10]:
# Converts integer-valued numbers to Int and returns anything else unchanged.
# The literals it is applied to below are already Int, so there it is a no-op.
downcast_to_int(x) = isinteger(x) ? Int(x) : x
# Flat list of names passed as training/validation residuals below.
# The line breaks inside the outer brackets concatenate the pieces vertically,
# so the result is a single vector of strings rather than a vector of vectors.
# NOTE(review): presumably these are names of previously trained models — confirm.
alphas = [
    ["UserItemBiases"]
    ["ItemCF.$K" for K in downcast_to_int.([2^4, 2^6, 2^8, 2^10])]
    ["ItemCFResid.$K" for K in downcast_to_int.([2^4, 2^6, 2^8, 2^10])]
    ["MatrixFactorization.$K" for K in downcast_to_int.([10, 20, 40])]
]
# One cf_params entry per (relation set, label) pair; currently only the strict set.
# Inside the comprehension the loop variable `name` shadows the global `name`,
# so the model name evaluates to "ItemCFRelated.strict_relations".
# S is the cache directory of the similarity matrix and K the largest
# neighbourhood found in it.
# NOTE(review): λ looks like a tuned starting point for optimize_model — confirm.
params = [
    cf_params(
        name = "ItemCFRelated.$name",
        training_residuals = alphas,
        validation_residuals = alphas,
        neighborhood_type = "abs",
        S = get_similarity_matrix_outdir(relation),
        K = max_neighborhood_size(relation),
        λ = [1.0483368356243072, 0.7720700833331772, 19.305513975900805],
    ) for (relation, name) in zip([strict_relations], ["strict_relations"])
];

[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220113 02:03:26 using neighborhood size 106


## Train models

In [11]:
# Run optimize_model on every parameter set, in order.
foreach(optimize_model, params)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:30[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:04[39m
[32mProgress: 100%|███████████████████████████| Time: 0:00:44 (44.59 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220113 02:07:38 loss: Dual{ForwardDiff.Tag{var"#validation_mse#22"{var"#19#21"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float64}}(1.2006332759222358,-2.005628174542259e-9,-3.803004630376038e-9,-1.7813250575267674e-10) β: Dual{ForwardDiff.Tag{var"#validation_mse#22"{var"#19#21"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float64}}(2.6645278324984916,0.21301474412429802,-0.42163083785225375,0.12241858063563921): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#22"{var"#19#21"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Fl

Iter     Function value   Gradient norm 
     0     1.200633e+00     3.803005e-09
 * Current step size: 1.0
 * time: 0.026339054107666016
 * g(x): [-2.005628174542259e-9, -3.803004630376038e-9, -1.7813250575267674e-10]
 * x: [1.0483368356243072, 0.7720700833331772, 19.305513975900805]


[32mProgress: 100%|███████████████████████████| Time: 0:19:02 ( 1.08  s/it)[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:02:25[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220113 02:31:22 training set: RMSE 0.9963327635221425 MAE 0.7292021808155242 R2 0.013598617791763412
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:18[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220113 02:31:52 validation set: RMSE 1.095734126475139 MAE 0.7912385115863292 R2 0.005462716622757435
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:18[39m
