In [3]:
import pandas as pd
import numpy as np
from recbole.config import Config
from recbole.data import create_dataset, data_preparation

from logging import getLogger
from recbole.model.general_recommender import BPR
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

import torch
from recbole.data.interaction import Interaction

import os

In [4]:
from sklearn.preprocessing import LabelEncoder
import pickle

# build the first atomic files from the foursquare dataset
def preprocess():
    """Build the initial RecBole atomic files from the raw Foursquare CSV.

    Reads ``data/foursquare_complete.csv`` and writes, relative to the current
    working directory:

    * ``name_category.pkl``  -- dict mapping category code -> category name
    * ``id_category.pkl``    -- dict mapping venue code -> original venue id
    * ``foursquare/foursquare.inter`` -- columns uid, venue_id, timestamp
    * ``foursquare/foursquare.item``  -- distinct (venue_id, venue_category_name)
    * ``foursquare/foursquare.user``  -- distinct uid

    Every user's trajectory is truncated to the length of the shortest one,
    and ``timestamp`` is replaced by the visit's position (0, 1, 2, ...) inside
    that truncated trajectory.

    Returns:
        None. The function only produces files.
    """
    foursquare = pd.read_csv('data/foursquare_complete.csv')

    # Encode the category names as integer codes and persist code -> name.
    category_encoder = LabelEncoder()
    foursquare['venue_category_name'] = category_encoder.fit_transform(
        foursquare['venue_category_name']
    )
    with open('name_category.pkl', 'wb') as f:
        pickle.dump(dict(enumerate(category_encoder.classes_)), f)

    # Equal length for every trajectory: keep only the first `min_len` rows
    # of each user, where `min_len` is the shortest trajectory in the data.
    visits_per_user = foursquare.groupby('uid').size()
    min_len = int(visits_per_user.min()) if len(visits_per_user) else 0
    new_df = (
        foursquare.groupby('uid', sort=False)
        .head(min_len)
        # stable sort: rows end up grouped by user, original order kept inside
        .sort_values('uid', kind='stable')
        .reset_index(drop=True)
    )

    # Positional timestamp per user. Derived from the data instead of a fixed
    # 0..99 range, so it stays valid for any trajectory length.
    new_df['timestamp'] = new_df.groupby('uid').cumcount()

    # Columns needed by RecBole, renamed to the "<field>:<type>" convention.
    red_df = new_df[['uid', 'venue_id', 'timestamp', 'venue_category_name']].copy()
    red_df.columns = ('uid:token', 'venue_id:token', 'timestamp:token', 'venue_category_name:token')

    # Encode the venue ids as integer codes and persist code -> original id.
    venue_encoder = LabelEncoder()
    red_df['venue_id:token'] = venue_encoder.fit_transform(red_df['venue_id:token'])
    with open('id_category.pkl', 'wb') as f:
        pickle.dump(dict(enumerate(venue_encoder.classes_)), f)

    # The atomic files live in a sub-directory that may not exist yet.
    os.makedirs('foursquare', exist_ok=True)

    # interaction file
    red_df[['uid:token', 'venue_id:token', 'timestamp:token']].to_csv(
        'foursquare/foursquare.inter', index=False, sep='\t'
    )

    # item file
    items = red_df[['venue_id:token', 'venue_category_name:token']].drop_duplicates()
    items.to_csv('foursquare/foursquare.item', index=False, sep='\t')

    # user file (one row per distinct user, in order of first appearance)
    pd.DataFrame({'uid:token': red_df['uid:token'].unique()}).to_csv(
        'foursquare/foursquare.user', index=False, sep='\t'
    )

In [7]:
# training the RS model
def train_model(config_dict, k = 10):
    """Train a BPR recommender on the ``foursquare`` dataset and return it.

    Args:
        config_dict: dict of RecBole settings, passed to ``Config`` together
            with the ``foursquare_general.yaml`` config file.
        k: accepted for interface compatibility; not read by this function.

    Returns:
        The trained ``BPR`` model. The test-set metrics are printed as a
        side effect.
    """
    cfg = Config(
        model='BPR',
        dataset='foursquare',
        config_file_list=['foursquare_general.yaml'],
        config_dict=config_dict,
    )

    # Seed the RNGs before any dataset or model object is built.
    init_seed(cfg['seed'], cfg['reproducibility'])

    # Logging: record the resolved configuration.
    init_logger(cfg)
    log = getLogger()
    log.info(cfg)

    # Dataset creation / filtering, then the train / valid / test split.
    full_dataset = create_dataset(cfg)
    log.info(full_dataset)
    train_split, valid_split, test_split = data_preparation(cfg, full_dataset)

    # Model on the configured device.
    recommender = BPR(cfg, train_split.dataset).to(cfg['device'])
    log.info(recommender)

    # Fit, then evaluate on the held-out split.
    fitter = Trainer(cfg, recommender)
    _best_valid_score, _best_valid_result = fitter.fit(train_split, valid_split, verbose=False)

    held_out_metrics = fitter.evaluate(test_split)
    print(held_out_metrics)

    return recommender



# for each user, get the last n (history_length) visits
def get_history(interaction, history_length, users=None):
    """Return the last ``history_length`` visited venues of every user.

    Args:
        interaction: DataFrame with at least the columns ``uid:token`` and
            ``venue_id:token``; each user's rows are taken in frame order.
        history_length: number of most recent visits to keep per user.
            A value <= 0 yields an empty history for every user.
        users: optional iterable of user ids fixing the order of the result.
            When omitted, the distinct values of ``uid:token`` are used, in
            set-iteration order.

    Returns:
        A list with one list of venue ids per user. A user with fewer than
        ``history_length`` visits gets a shorter list; a user absent from
        ``interaction`` gets an empty one.
    """
    if users is None:
        users = set(interaction['uid:token'])

    # `lst[-0:]` is the whole list, so a zero-length history needs a guard.
    if history_length <= 0:
        return [[] for _ in users]

    # One pass over the frame instead of one boolean mask per user.
    venues_by_user = {
        uid: venues.tolist()
        for uid, venues in interaction.groupby('uid:token')['venue_id:token']
    }

    return [venues_by_user.get(u, [])[-history_length:] for u in users]


# predict the next visit for all the users, select the k locations with the highest score
def predict_k(model, users, visits, k):
    """Score every item for every user and return the k best item indices.

    Args:
        model: trained recommender exposing ``full_sort_predict``; if it has
            a ``device`` attribute the inputs are moved there first.
        users: sequence of user ids, one per row of the result.
        visits: per-user visit histories; must be rectangular so that it can
            be turned into a tensor.
        k: number of items to keep per user.

    Returns:
        ``numpy.ndarray`` of shape ``(len(users), min(k, n_items))`` holding
        column indices of the score matrix, ordered by ascending score (the
        best item is in the last column). ``k <= 0`` gives zero columns.
    """
    input_inter = Interaction({
        'uid': torch.tensor(users),
        'venue_id': torch.tensor(visits),
    })

    # Keep the inputs on the same device as the model parameters.
    device = getattr(model, 'device', None)
    if device is not None:
        input_inter = input_inter.to(device)

    with torch.no_grad():
        scores = model.full_sort_predict(input_inter).reshape((len(users), -1))

    # NumPy cannot read a tensor that lives on an accelerator: bring it to
    # host memory before sorting.
    scores = scores.detach().cpu().numpy()

    # NOTE(review): column 0 is presumably RecBole's padding item and the
    # user's already-visited items are not masked here -- confirm that this
    # is the intended candidate set.
    ranking = np.argsort(scores, axis=1)

    # Explicit start index: `[:, -k:]` would return every column for k == 0.
    start = max(ranking.shape[1] - k, 0)
    return ranking[:, start:]



# select one item in an array a, get its correct id
def random_choice(a):
    """Pick one element of ``a`` uniformly at random and return it minus 1.

    Args:
        a: non-empty 1-D array-like of integer item indices.

    Returns:
        The drawn value decreased by one, as a Python ``int``.

    Raises:
        ValueError: propagated from ``np.random.choice`` when ``a`` is empty.
    """
    drawn = np.random.choice(a, 1)
    # Index the single element explicitly: int() on a 1-element array relies
    # on the deprecated ndim > 0 array-to-scalar conversion.
    # NOTE(review): the "- 1" assumes model item index == original id + 1 --
    # TODO confirm against the dataset's id mapping.
    return int(drawn[0]) - 1


# save the new interaction file
def new_atomic_files(interaction):
    """Overwrite ``foursquare/foursquare.inter`` with ``interaction``.

    The write is best-effort: an ordinary failure is reported on stdout and
    does not raise. ``KeyboardInterrupt`` and ``SystemExit`` are not
    intercepted, so the notebook can still be stopped during a save.

    Args:
        interaction: object with a pandas-style ``to_csv`` method.

    Returns:
        ``True`` if the file was written, ``False`` otherwise.
    """
    try:
        interaction.to_csv('foursquare/foursquare.inter', index=False, sep = '\t')
    except Exception as err:
        # Show the cause so that a stale interaction file is not a mystery.
        print('Error saving the interaction file.', repr(err))
        return False

    print('saved')
    return True



In [8]:
# Rebuild the atomic files from the raw dataset before the loop starts.
preprocess()

# RecBole settings shared by every (re)training round.
k = 10
config_dict = dict(
    seed=1234,
    reproducibility=True,
    data_path=os.getcwd(),
    topk=k,
)

m = 3        # the model is retrained every m iterations
MaxIt = 20   # total number of feedback-loop iterations
interaction = pd.read_csv('foursquare/foursquare.inter', sep='\t')

users = list(set(interaction['uid:token']))
locations = list(set(interaction['venue_id:token']))  # not read again in this cell

i = 0

while i < MaxIt:
    print('')
    print('--------- iteration number: ', i)
    print('')

    # `model` and `training_history_length` are refreshed only on retraining
    # iterations and reused as-is in between.
    if not i % m:
        model = train_model(config_dict)
        training_history_length = max(interaction['timestamp:token'])+1
        print(training_history_length)

    # Most recent visits of every user, then the top-k recommendation lists.
    visits = get_history(interaction, training_history_length)
    rec_list = predict_k(model, users, visits, k)

    # One randomly chosen item per user out of its recommendation list.
    random_item = np.apply_along_axis(random_choice, 1, rec_list)

    # The chosen items become each user's next visit.
    current_time = max(interaction['timestamp:token'])+1
    new_locations = pd.DataFrame(
        {
            'uid:token': users,
            'venue_id:token': random_item.tolist(),
            'timestamp:token': [current_time]*len(random_item),
        },
        columns=['uid:token', 'venue_id:token', 'timestamp:token'],
    )

    # Append the new visits and keep the log ordered by user, then time.
    interaction = (
        pd.concat([interaction, new_locations], axis = 0)
        .reset_index(drop = True)
        .sort_values(by=['uid:token', 'timestamp:token'])
    )

    # Persist the grown interaction log for the next training round.
    new_atomic_files(interaction)

    # Number of distinct venues recommended in this iteration.
    card_loc = len(set(new_locations['venue_id:token']))

    i+=1


--------- iteration number:  0



18 May 11:00    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 1234
state = INFO
reproducibility = True
data_path = /Users/andreafrasson/Desktop/tesi/Feedback-Loop-for-POI-RS/foursquare
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 4096
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096

OrderedDict([('recall@10', 0.398), ('mrr@10', 0.3578), ('ndcg@10', 0.3673), ('hit@10', 0.398), ('precision@10', 0.0398)])
100


  return int(r_c) - 1


saved

--------- iteration number:  1



  return int(r_c) - 1


saved

--------- iteration number:  2



  return int(r_c) - 1


saved

--------- iteration number:  3



18 May 11:00    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 1234
state = INFO
reproducibility = True
data_path = /Users/andreafrasson/Desktop/tesi/Feedback-Loop-for-POI-RS/foursquare
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 4096
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'LS': 'valid_and_test'}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096

OrderedDict([('recall@10', 0.0), ('mrr@10', 0.0), ('ndcg@10', 0.0), ('hit@10', 0.0), ('precision@10', 0.0)])
103


  return int(r_c) - 1


saved

--------- iteration number:  4



  return int(r_c) - 1


saved

--------- iteration number:  5



KeyboardInterrupt: 