# Item Collaborative Filtering
* See `ItemCF.Base.ipynb` for algorithm details
* We construct a graph $G$ where the vertices are items and edges are relations (sequel, prequel, etc.)
* $w_{ij}$ is the reciprocal of the length of the shortest path between items $i,j$ in $G$ ($w_{ij} = 0$ when no path exists)

In [1]:
# Identifier of this model family; get_similarity_matrix_outdir uses it as the
# leading path component of the directory in which similarity matrices are stored.
name = "ItemCF.Related";

In [2]:
using NBInclude
@nbinclude("ItemCF.Base.ipynb");

In [3]:
using DataFrames

## Compute similarity matrices

In [4]:
function get_similarity_matrix(relationships)
    # Adjacency matrix of the relation graph restricted to `relationships`.
    # Every row of related_series.csv whose `relationship` is in `relationships`
    # contributes 1.0 to entry (source + 1, target + 1). The result is a square sparse
    # matrix whose side is the largest item id found in the training split.
    # NOTE(review): the `.+ 1` presumably maps 0-based ids to 1-based indices — confirm.
    edges = DataFrame(CSV.File("../../data/processed_data/related_series.csv"))
    edges = filter(row -> row.relationship ∈ relationships, edges)
    num_items = maximum(get_split("training").item)
    rows = edges.source .+ 1
    cols = edges.target .+ 1
    weights = ones(length(rows))
    sparse(rows, cols, weights, num_items, num_items)
end;

In [5]:
function all_pairs_shortest_paths(S)
    # Hop-count distance between every ordered pair of vertices of the graph whose
    # adjacency matrix is S. Relies on all edges having weight 1: the stored entries of
    # S^k mark the pairs joined by a walk of exactly k edges, so the first k at which
    # (i, j) appears is its shortest-path length.
    # Returns a dense matrix with 0 on the diagonal and Inf for unreachable pairs.
    n = size(S, 1)
    dists = fill(Inf, size(S))
    for v in 1:n
        dists[v, v] = 0
    end

    walks = I(n)  # S^0; holds S^epoch after each multiplication below
    @showprogress for epoch in 1:n
        walks = walks * S
        rows, cols, _ = findnz(walks)
        updated = false
        for (i, j) in zip(rows, cols)
            if epoch < dists[i, j]
                dists[i, j] = epoch
                updated = true
            end
        end
        # if no pair was first reached at this length, none can be reached at a longer one
        updated || break
    end
    dists
end;

In [6]:
function get_similarity_matrix_outdir(relationships)
    # Returns the output directory name for the similarity matrix of `relationships`.
    # The matrix is only computed and handed to write_params when
    # ../../data/alphas/<outdir> does not exist yet; otherwise the stored copy is reused.
    outdir = "$name/$(hash(relationships))"
    if !ispath("../../data/alphas/$outdir")
        @debug "generating similarity matrix for relationships $relationships"
        adjacency = get_similarity_matrix(relationships)
        # similarity is the reciprocal hop distance, so unreachable pairs (Inf) become 0
        similarity = Float32.(1 ./ all_pairs_shortest_paths(adjacency))
        write_params(Dict("S" => similarity), outdir = outdir)
    end
    outdir
end;

## Setup hyperparameters

In [7]:
# Full list of relationship labels.
# NOTE(review): not referenced by any later cell visible in this notebook; presumably
# these are the values of the `relationship` column in related_series.csv — confirm.
all_relations = [
    "side_story",
    "summary",
    "parent_story",
    "sequel",
    "prequel",
    "character",
    "alternative_version",
    "other",
    "spin_off",
    "alternative_setting",
    "full_story",
];

In [8]:
# The entries of `all_relations` except "character" and "other".
# NOTE: keep the order of the entries stable — get_similarity_matrix_outdir derives the
# storage directory from hash(relationships), and the hash of a vector depends on order.
strict_relations = [
    "side_story",
    "summary",
    "parent_story",
    "sequel",
    "prequel",
    "alternative_version",
    "spin_off",
    "alternative_setting",
    "full_story",
]

# Generate the similarity matrix for these relations now, unless it is already on disk.
get_similarity_matrix_outdir(strict_relations);

[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 01:58:02 generating similarity matrix for relationships ["side_story", "summary", "parent_story", "sequel", "prequel", "alternative_version", "spin_off", "alternative_setting", "full_story"]
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:00[39m


In [9]:
@memoize function max_neighborhood_size(relation)
    # Largest number of neighbors any single item has in the stored similarity matrix.
    # The similarities are inverted back into distances; an entry counts as a neighbor
    # when its distance is neither 0 nor Inf.
    similarity = read_params(get_similarity_matrix_outdir(relation))["S"]
    distances = 1 ./ similarity
    largest = 0
    for column in eachcol(distances)
        neighbors = count(d -> d != 0 && d != Inf, column)
        largest = max(largest, neighbors)
    end
    neighborhood_size = Int(round(largest))
    @debug "using neighborhood size $neighborhood_size"
    neighborhood_size
end;

In [10]:
# Names passed to cf_params as both the training and the validation residuals.
alphas = ["UserItemBiases"]
# One cf_params per (relation list, label) pair; currently only strict_relations.
# NOTE: the generator variable `name` is local to the comprehension and shadows the
# global `name` only inside it, so the model name is "ItemCF.Related.strict_relations".
params = [
    cf_params(
        name = "ItemCF.Related.$name",
        training_residuals = alphas,
        validation_residuals = alphas,
        neighborhood_type = "abs",
        # directory holding the similarity matrix (generated on first use)
        S = get_similarity_matrix_outdir(relation),
        # largest per-item neighbor count found in that matrix
        K = max_neighborhood_size(relation),
        # NOTE(review): appears to be the optimizer's starting point (it matches `x` at
        # iteration 0 in the training log) — confirm against cf_params
        λ = [1.0, 1.0, 0.0],
    ) for (relation, name) in zip([strict_relations], ["strict_relations"])
];

[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 01:58:50 using neighborhood size 110


## Train models

In [11]:
# Run optimize_model on every parameter set, in order.
foreach(optimize_model, params)

[32mProgress: 100%|███████████████████████████| Time: 0:00:50 (59.24 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 02:00:19 loss: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(1.5211096,-0.0020700553,-0.001999576,-0.027935123) β: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(0.63870615,-0.006071348,-0.43168324,0.6131715): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}, Float32, 3}[Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, F

Iter     Function value   Gradient norm 
     0     1.521110e+00     2.793512e-02
 * Current step size: 1.0
 * time: 0.025377988815307617
 * g(x): Float32[-0.0020700553, -0.001999576, -0.027935123]
 * x: Float32[1.0, 1.0, 0.0]


[32mProgress: 100%|███████████████████████████| Time: 0:00:59 (69.21 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 02:01:33 loss: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(1.5203918,-0.0021926416,-0.001909873,-0.023097234) β: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(0.65465945,-0.0018985504,-0.44283253,0.5931593): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}, Float32, 3}[Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, 

     1     1.516914e+00     2.501513e-03
 * Current step size: 21.522358
 * time: 288.7065348625183
 * g(x): Float32[0.002169793, 0.002501513, 0.0011772336]
 * x: Float32[1.0445524, 1.0430356, 0.6012297]


[32mProgress: 100%|███████████████████████████| Time: 0:01:00 (70.07 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 02:06:21 loss: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(1.5168695,0.0016061242,-0.004722,0.00086706184) β: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(0.93977225,0.03423253,-0.6158352,0.45713684): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}, Float32, 3}[Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float3

     2     1.516842e+00     1.935337e-03
 * Current step size: 0.6163483
 * time: 435.81960892677307
 * g(x): Float32[0.0018230841, -0.0019353367, 0.0009894115]
 * x: Float32[1.019135, 1.0136931, 0.59015745]


[32mProgress: 100%|███████████████████████████| Time: 0:00:56 (66.27 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 02:08:45 loss: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(1.5167816,0.0014016968,-0.0014443476,0.00058469875) β: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(0.9185162,0.03334869,-0.6225212,0.45282286): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}, Float32, 3}[Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Flo

     3     1.516724e+00     4.776783e-04
 * Current step size: 3.1558614
 * time: 651.0100538730621
 * g(x): Float32[0.00047767832, -0.00037223616, -0.00040354568]
 * x: Float32[0.9471992, 1.0222298, 0.5127935]


[32mProgress: 100%|███████████████████████████| Time: 0:00:57 (67.02 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 02:12:20 loss: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(1.5167129,0.00031499506,-0.00013886952,-0.0002860257) β: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(0.8938899,0.03427021,-0.6138445,0.4520225): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}, Float32, 3}[Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Fl

     4     1.516704e+00     3.078677e-04
 * Current step size: 2.9144764
 * time: 869.15984582901
 * g(x): Float32[-1.3308107f-5, 0.00030786774, -6.466942f-5]
 * x: Float32[0.89885885, 1.0266476, 0.5512179]


[32mProgress: 100%|███████████████████████████| Time: 0:00:58 (68.29 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 02:16:01 loss: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(1.5167035,-5.374185e-7,6.480271e-6,-1.841609e-6) β: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(0.9053785,0.0376239,-0.6225517,0.44599685): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}, Float32, 3}[Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32

     5     1.516703e+00     7.682917e-07
 * Current step size: 1.0240691
 * time: 1084.5161278247833
 * g(x): Float32[-2.335538f-7, -7.6829167f-7, -3.4081071f-7]
 * x: Float32[0.898217, 1.0246882, 0.5556753]


[32mProgress: 100%|███████████████████████████| Time: 0:00:59 (68.89 ms/it)[39m
[38;5;4m[1m[ [22m[39m[38;5;4m[1mDebug: [22m[39m20220225 02:19:36 loss: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(1.5167035,4.6920197e-9,-2.7230298e-9,-4.3699617e-9) β: Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}}(0.90545976,0.037632715,-0.6225681,0.44599313): λ ForwardDiff.Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, Float32}, Float32, 3}[Dual{ForwardDiff.Tag{var"#validation_mse#23"{var"#20#22"{cf_params, Dict{String, typeof(get_abs_neighborhood)}, Int64, Matrix{Float32}}, RatingsDataset, RatingsDataset}, F

     6     1.516703e+00     6.026536e-09
 * Current step size: 1.0038815
 * time: 1294.6676819324493
 * g(x): Float32[6.0265357f-9, -3.0624443f-9, -3.4481427f-9]
 * x: Float32[0.8982284, 1.0246936, 0.55569655]


[32mProgress: 100%|███████████████████████████| Time: 1:05:39 ( 3.35  s/it)[39m
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220225 03:33:12 training set: RMSE 1.1863704023226231 MAE 0.8649476243687343 R2 0.1456223726621052
[38;5;6m[1m[ [22m[39m[38;5;6m[1mInfo: [22m[39m20220225 03:33:22 validation set: RMSE 1.2315451510337065 MAE 0.8972142383674298 R2 0.12971873901347897
