# Additional experiment
A Method to Improve Serendipity of Recommendation Lists Based on Collaborative Metric Learning (Yoneda2023)

# Import

In [1]:
import sys
import os
from datetime import datetime

# Add the directory three levels up to the import search path, once only,
# so re-running this cell does not append duplicate entries.
# NOTE(review): presumably this is what makes the project-local `modules`
# package imported below resolvable — confirm against the repository layout.
root_dir = '../../../'
if root_dir not in sys.path:
    sys.path.append(root_dir)

import torch
from torch import optim
import pandas as pd

pd.set_option('display.max_columns', 100)

from modules import losses, models, samplers, searches, evaluators, trainers, datasets, distributions

# Output settings

In [2]:
# Whether the result cell should also write each score table to a CSV file.
out_to_file = True
# Directory (relative to this notebook) that receives the CSV files.
out_dir = '../../out/comparison2/ml_20m/'

# Create the directory tree if needed. `exist_ok=True` avoids the
# check-then-create race of `os.path.exists` + `os.makedirs` and is a no-op
# when the directory is already there.
os.makedirs(out_dir, exist_ok=True)

# Dataset

In [3]:
# Load the dataset wrapper and read its size statistics in one go.
dataset = datasets.ML20m()
n_user, n_item, n_feedback = dataset.n_user, dataset.n_item, dataset.n_pos_pairs

# Split into train / test sets.
# NOTE(review): `neg_pair_weight=10` is forwarded as-is; its exact meaning is
# defined by `get_train_and_test_set` — confirm there.
train_set, test_set = dataset.get_train_and_test_set(
    neg_pair_weight=10,
)

In [4]:
# Report the dataset size statistics, one per line.
print(
    f'n_user = {n_user}',
    f'n_item = {n_item}',
    f'n_feedback = {n_feedback}',
    sep='\n',
)

n_user = 137330
n_item = 20720
n_feedback = 9993294


# Device

In [5]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Convert both splits to tensors on the selected device: the training set as
# int64 (LongTensor), the test set as float32 (FloatTensor).
train_set = torch.LongTensor(train_set).to(device)
test_set = torch.FloatTensor(test_set).to(device)

In [6]:
# Show which device was selected so the run environment is recorded in the output.
print(device)

cuda:0


# Evaluator

In [7]:
# Sizes K of the recommendation list at which every metric is evaluated.
ks = [10]

# Metric display name -> scoring function, in the order they should be reported.
score_function_dict = dict([
    ("Recall", evaluators.recall),
    ("Unpopularity", evaluators.unpopularity),
    ("Serendipity", evaluators.serendipity),
    ("Long-tail rate", evaluators.longtail_rate),
])

# Evaluator built from the test set, the metric table and the list sizes.
userwise = evaluators.UserwiseEvaluator(test_set, score_function_dict, ks)

# Sampler

In [8]:
# Sampler over the training set, shared by all trainers created below.
# NOTE(review): `strict_negative=False` is forwarded as-is; its semantics are
# defined by `BaseSampler` — confirm there.
sampler = samplers.BaseSampler(
    train_set,
    n_user,
    n_item,
    device=device,
    strict_negative=False,
)

# Model

In [9]:
# Hyperparameters
lr = 1e-3               # learning rate handed to Adam
n_dim = 10              # third argument of CollaborativeMetricLearning
n_batch = 256           # passed to trainer.fit together with n_epoch
n_epoch = 50
no_progressbar = True   # forwarded to BaseTrainer
b_li = [0.2, 0.5, 0.9]  # values of `b` to compare, one model per value

feedback_num_df = dataset.feedback_num_train_data()


def _build_run(b):
    """Create the model, optimizer, loss, trainer and search object for one `b`.

    The objects are constructed in the order model -> optimizer -> criterion
    -> trainer -> search and returned in a dict under the keys "model",
    "optimizer", "criterion", "trainer" and "search".
    """
    model = models.CollaborativeMetricLearning(n_user, n_item, n_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = losses.MinorTripletLoss(
        feedback_num_df, margin=1, a=1, b=b, device=device
    ).to(device)
    trainer = trainers.BaseTrainer(model, optimizer, criterion, sampler, no_progressbar)
    return dict(
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        trainer=trainer,
        search=searches.NearestNeighborhood(model),
    )


# One entry per value in b_li, in the same order.
model_dict_li = [_build_run(b) for b in b_li]

# Training

In [10]:
# Train every configured model in turn with the shared batch size and epoch count.
for run in model_dict_li:
    trainer = run["trainer"]
    trainer.fit(n_batch, n_epoch)

# Result

In [11]:
# Validate each trained model and collect its score table, keeping the order
# of model_dict_li.
re_li = []
for run in model_dict_li:
    trainer = run["trainer"]
    trainer.valid(run["search"], userwise)
    re_li.append(trainer.valid_scores)

In [12]:
# Take the timestamp once so every file written by this run shares it.
run_stamp = datetime.now().strftime("%Y_%m_%d_%H%M")

# re_li holds one score table per entry of b_li, in the same order. The value
# of `b` is part of the file name: a name built from the minute-resolution
# timestamp alone is identical for all tables, so each write would overwrite
# the previous one and only the last result would survive on disk.
for b, scores in zip(b_li, re_li):
    if out_to_file:
        file_name = f"minor_b{b}_{run_stamp}.csv"
        scores.to_csv(os.path.join(out_dir, file_name), index=False)

    display(scores)

Unnamed: 0,Recall@10,Unpopularity@10,Serendipity@10,Long-tail rate@10
0,0.45578,927.854191,2.528851,0.141129


Unnamed: 0,Recall@10,Unpopularity@10,Serendipity@10,Long-tail rate@10
0,0.407139,1128.957548,2.729991,0.203497


Unnamed: 0,Recall@10,Unpopularity@10,Serendipity@10,Long-tail rate@10
0,0.35068,2086.446385,2.325041,0.393344
