In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder 
from recbole.config import Config
from recbole.data import create_dataset, data_preparation

from logging import getLogger
from recbole.model.general_recommender import BPR
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

import torch
from recbole.data.interaction import Interaction

import pickle

## Train General Recommendation

In [2]:
# configurations initialization
k = 10

config_dict = {
    #environment settings
    'seed': 1234,
    'reproducibility': True,
    'data_path': '/Users/andreafrasson/Desktop/tesi/code', 
    'topk': k
}

config = Config(model='BPR', dataset='foursquare', config_file_list=['foursquare_general.yaml'], config_dict = config_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)
logger = getLogger()

# write config info into log
logger.info(config)

# dataset creating and filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

16 May 14:44    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 1234
state = INFO
reproducibility = True
data_path = /Users/andreafrasson/Desktop/tesi/code/foursquare
checkpoint_dir =

 saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 4096
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = uid
ITEM_ID_FIELD = venue_id
RATING_FIELD = rating
TIME_FIELD = timestamp
seq_len = None
LABEL_FIELD = label
thres

In [3]:
# model loading and initialization
model = BPR(config, train_data.dataset).to(config['device'])
logger.info(model)

16 May 14:44    INFO  BPR(
  (user_embedding): Embedding(1084, 64)
  (item_embedding): Embedding(27899, 64)
  (loss): BPRLoss()
)
Trainable parameters: 1854912


In [4]:
# trainer loading and initialization
trainer = Trainer(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

# model evaluation
test_result = trainer.evaluate(test_data)
print(test_result)

16 May 14:44    INFO  epoch 0 training [time: 0.25s, train loss: 17.9465]
16 May 14:44    INFO  epoch 0 evaluating [time: 0.81s, valid_score: 0.290200]
16 May 14:44    INFO  valid result: 
recall@10 : 0.3398    mrr@10 : 0.2902    ndcg@10 : 0.3021    hit@10 : 0.3398    precision@10 : 0.034
16 May 14:44    INFO  Saving current: saved/BPR-May-16-2024_14-44-31.pth
16 May 14:44    INFO  epoch 1 training [time: 0.29s, train loss: 17.6531]
16 May 14:44    INFO  epoch 1 evaluating [time: 0.85s, valid_score: 0.308000]
16 May 14:44    INFO  valid result: 
recall@10 : 0.362    mrr@10 : 0.308    ndcg@10 : 0.3208    hit@10 : 0.362    precision@10 : 0.0362
16 May 14:44    INFO  Saving current: saved/BPR-May-16-2024_14-44-31.pth
16 May 14:44    INFO  epoch 2 training [time: 0.28s, train loss: 17.1217]
16 May 14:44    INFO  epoch 2 evaluating [time: 0.86s, valid_score: 0.318400]
16 May 14:44    INFO  valid result: 
recall@10 : 0.3721    mrr@10 : 0.3184    ndcg@10 : 0.331    hit@10 : 0.3721    precisio

OrderedDict([('recall@10', 0.398), ('mrr@10', 0.3578), ('ndcg@10', 0.3673), ('hit@10', 0.398), ('precision@10', 0.0398)])


## Make prediction

In [5]:
def get_history(inter):
    visits = []
    set_uid = set(inter['uid:token'])
    for u in set_uid:
        visits.append(inter[inter['uid:token'] == u]['venue_id:token'].values.tolist())
    
    return visits

In [29]:
inter = pd.read_csv('inter.csv')

visits = get_history(inter)

In [38]:
#make prediction for users

unique_users = list(set(inter['uid:token']))
unique_locations = list(set(inter['venue_id:token']))


input_inter = Interaction({
    'uid': torch.tensor(unique_users),
    'venue_id': torch.tensor(visits),
})

with torch.no_grad():
    scores = model.full_sort_predict(input_inter).reshape((len(unique_users), len(unique_locations)+1))

print(scores.shape)

torch.Size([1083, 27899])
tensor([[ 0.0197,  0.8055,  0.3794,  ...,  0.0839,  0.0364,  0.0106],
        [-0.0046,  0.1129,  0.0357,  ...,  0.0159,  0.0172, -0.0300],
        [-0.0136,  0.0353,  0.0997,  ..., -0.0488, -0.0621, -0.0810],
        ...,
        [ 0.0136,  0.0411, -0.0996,  ...,  0.0281,  0.0257,  0.0543],
        [ 0.0020, -0.1165,  0.0610,  ..., -0.0912, -0.0512, -0.0758],
        [-0.0322,  0.0715,  0.1625,  ...,  0.3819,  0.4844,  0.3862]])


In [81]:
# get the 10 items with highest scores
rec_list = np.argsort(scores, axis = 1)[:, -k:]

In [149]:
# select one item at random for each user
def random_choice(a):
    return int(np.random.choice(a, 1))

random_item = np.apply_along_axis(random_choice, 1, rec_list)

  return int(np.random.choice(a, 1))


In [150]:
poi = pd.read_csv('foursquare/foursquare.item', sep = '\t')

with open('saved_dictionary.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

In [151]:
for i in random_item:
    cat = poi[poi['venue_id:token'] == int(i)]['venue_category_name:token'].sample(1).values
    print(cat, loaded_dict[int(cat)])

  print(cat, loaded_dict[int(cat)])


[14] Asian Restaurant
[165] Office
[22] Bar
[3] American Restaurant
[26] Bike Shop
[22] Bar
[159] Music Venue
[54] Coffee Shop
[114] Gym / Fitness Center
[205] Seafood Restaurant
[36] Burger Joint
[97] French Restaurant
[71] Dim Sum Restaurant
[3] American Restaurant
[51] Church
[3] American Restaurant
[169] Paper / Office Supplies Store
[159] Music Venue
[210] Snack Place
[22] Bar
[39] Caf�
[22] Bar
[22] Bar
[22] Bar
[22] Bar
[3] American Restaurant
[39] Caf�
[141] Medical Center
[168] Outdoors & Recreation
[128] Italian Restaurant
[22] Bar
[22] Bar
[224] Sushi Restaurant
[177] Pizza Place
[68] Department Store
[86] Fast Food Restaurant
[234] Thai Restaurant
[141] Medical Center
[128] Italian Restaurant
[22] Bar
[22] Bar
[159] Music Venue
[82] Event Space
[77] Eastern European Restaurant
[22] Bar
[39] Caf�
[50] Chinese Restaurant
[177] Pizza Place
[22] Bar
[22] Bar
[166] Other Great Outdoors
[141] Medical Center
[159] Music Venue
[199] Sandwich Place
[165] Office
[39] Caf�
[165] Offic

## Add new values in the dataset

In [190]:
inter = pd.read_csv('inter.csv')

In [191]:
current_time = max(inter['timestamp:token'])+1

In [192]:
new_locations = pd.DataFrame({'uid:token': unique_users, 'venue_id:token':random_item.tolist(), 'timestamp:token':[current_time]*len(random_item)}, columns=['uid:token', 'venue_id:token', 'timestamp:token'])

In [193]:
inter = pd.concat([inter, new_locations], axis = 0).reset_index(drop = True)
inter.sort_values(by=['uid:token', 'timestamp:token'], inplace=True)

In [195]:
inter

Unnamed: 0,uid:token,venue_id:token,timestamp:token
0,1,33236,0
1,1,791,1
2,1,1511,2
3,1,6720,3
4,1,255,4
...,...,...,...
108296,1083,1313,96
108297,1083,4567,97
108298,1083,4127,98
108299,1083,1313,99
