## Matrix Factorization with BPR

In [1]:
# package initialization

import pandas as pd
import numpy as np
import torch.nn as nn
import os
import torch
from spotlight.datasets import _transport
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.factorization.implicit import ImplicitFactorizationModel
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, ndcg_score

### Read Data

In [2]:
# Relative paths to the raw review files. The London file is the one loaded
# in the preprocessing cell below (read there with a tab separator).
DATASET_LON = '../data/LON-A/London_Attractions_Complete_Review.csv'
# NOTE(review): the New York path is not referenced in the cells shown here.
DATASET_NYC = '../data/NYC-R/New_York_City_Restaurant_Complete_Review.csv'

In [3]:
def sort_by_time(df):
    """Return a new frame with the rows of ``df`` ordered by ``rtime``, smallest first."""
    chronological = df.sort_values('rtime')
    return chronological

In [4]:
def filter_by_occurrence(df, column, threshold):
    """Keep only the rows whose ``column`` value occurs at least ``threshold`` times.

    Rows are grouped by ``column``; every group with fewer than ``threshold``
    rows is dropped as a whole.
    """
    def _is_frequent_enough(group):
        return len(group) >= threshold

    grouped = df.groupby(column)
    return grouped.filter(_is_frequent_enough)

In [5]:
def convert_binary(df):
    """Return a copy of ``df`` whose ``rrate`` column is binarised.

    Every value different from the literal string ``"None"`` becomes ``1.0``
    and every ``"None"`` becomes ``0.0``.  The input frame is left untouched
    (the previous implementation overwrote it in place) and the resulting
    column has a float dtype instead of object dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``rrate`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``rrate`` replaced by 1.0 / 0.0 flags.
    """
    # NOTE(review): missing values (NaN) compare unequal to "None" and are
    # therefore flagged 1.0, exactly as before.  Newer pandas versions may
    # already parse the text "None" as NaN in read_csv -- confirm how the
    # file is loaded if the flags come out as all ones.
    result = df.copy()
    has_rating = result['rrate'] != "None"
    result['rrate'] = has_rating.astype(float)
    return result

In [6]:
def data_preprocess(dataframe):
    """Turn the raw review table into train / validation / test frames.

    Steps, in order: sort by ``rtime``; keep the ``uid_index``, ``iid`` and
    ``rrate`` columns; binarise ``rrate``; drop items and then users with
    fewer than five rows; hold out the last 20% of each user's rows (in time
    order) as the test set; shuffle the remaining rows and split them
    87.5% / 12.5% into training and validation sets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``rtime``, ``uid_index``, ``iid`` and
        ``rrate``.

    Returns
    -------
    (train_df, validation_df, test_df) : tuple of pandas.DataFrame
        The shuffle draws from NumPy's global random state, so seed
        ``np.random`` beforehand for a reproducible split.
    """
    # sort by time (ascending order) so each user's rows are oldest-first
    df = sort_by_time(dataframe)

    # retrieve needed columns; copy so the assignments below never write
    # into a slice of the caller's frame
    df = df[['uid_index', 'iid', 'rrate']].copy()

    # convert ratings into binary flags and make sure the column is numeric
    df = convert_binary(df)
    df['rrate'] = pd.to_numeric(df['rrate'])

    # Retain users/items with at least five ratings only.
    # NOTE(review): users are filtered after items, so removing a user's rows
    # can leave an item with fewer than five rows again.
    df = filter_by_occurrence(df, 'iid', 5)
    df = filter_by_occurrence(df, 'uid_index', 5)

    # for each user, the latest 20% of rows go to the test set; pieces are
    # collected in lists and concatenated once (DataFrame.append copied the
    # whole frame on every call and no longer exists in pandas 2.x)
    train_parts = []
    test_parts = []
    for _, user_rows in df.groupby('uid_index'):
        split_idx = int(len(user_rows) * 0.8)
        train_parts.append(user_rows.iloc[:split_idx])
        test_parts.append(user_rows.iloc[split_idx:])

    # pd.concat rejects an empty list, so fall back to an empty frame that
    # still carries the three columns
    no_rows = df.iloc[:0]
    test_df = pd.concat(test_parts) if test_parts else no_rows
    train_validation_df = pd.concat(train_parts) if train_parts else no_rows

    # shuffle by position, which also works when index labels repeat
    shuffled_positions = np.random.permutation(len(train_validation_df))
    train_validation_df = train_validation_df.iloc[shuffled_positions]

    # first 87.5% of the shuffled rows train the model, the rest validate it
    cutoff = int(len(train_validation_df) * 0.875)
    train_df = train_validation_df.iloc[:cutoff]
    validation_df = train_validation_df.iloc[cutoff:]

    return (train_df, validation_df, test_df)

In [7]:
def _get_my_own(dataset):
    """Pull the ``uid_index``, ``iid`` and ``rrate`` columns out as NumPy arrays.

    Returns a 3-tuple ``(users, items, ratings)`` in that order, each array
    following the row order of ``dataset``.
    """
    wanted_columns = ('uid_index', 'iid', 'rrate')
    return tuple(dataset[name].to_numpy() for name in wanted_columns)

def get_my_own_dataset(data):
    """
    Wrap a review table in a Spotlight ``Interactions`` object.

    Parameters
    ----------

    data:
        table with ``uid_index``, ``iid`` and ``rrate`` columns; the three
        columns are extracted by ``_get_my_own`` and handed to
        ``Interactions`` positionally, in that order

    Returns
    -------

    Interactions: :class:`spotlight.interactions.Interactions`
        instance of the interactions class
    """
    user_ids, item_ids, ratings = _get_my_own(data)
    return Interactions(user_ids, item_ids, ratings)

In [8]:
def random_train_test_split(interactions,
                            test_percentage=0.2,
                            random_state=None):
    """
    Randomly split interactions between training and testing.

    This definition replaces the function of the same name imported from
    ``spotlight.cross_validation`` at the top of the notebook.

    Parameters
    ----------
    interactions: :class:`spotlight.interactions.Interactions`
        The interactions to shuffle.
    test_percentage: float, optional
        The fraction of interactions to place in the test set.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle.
    Returns
    -------
    (train, test): (:class:`spotlight.interactions.Interactions`,
                    :class:`spotlight.interactions.Interactions`)
         A tuple of (train data, test data)
    """
    # The notebook never imports this helper, so calling the function used to
    # fail with NameError; import it where it is needed.
    from spotlight.cross_validation import shuffle_interactions

    def _index_or_none(array, index):
        # Optional fields (ratings, timestamps, weights) may be None.
        return None if array is None else array[index]

    def _subset(index):
        # Build an Interactions object from one slice of the shuffled data,
        # keeping the user/item counts of the full data set.
        return Interactions(interactions.user_ids[index],
                            interactions.item_ids[index],
                            ratings=_index_or_none(interactions.ratings,
                                                   index),
                            timestamps=_index_or_none(interactions.timestamps,
                                                      index),
                            weights=_index_or_none(interactions.weights,
                                                   index),
                            num_users=interactions.num_users,
                            num_items=interactions.num_items)

    interactions = shuffle_interactions(interactions,
                                        random_state=random_state)

    # Everything before the cutoff is training data, the remainder is test data.
    cutoff = int((1.0 - test_percentage) * len(interactions))

    train = _subset(slice(None, cutoff))
    test = _subset(slice(cutoff, None))

    return train, test

In [9]:
# dataset preprocessing

# The London review file is read as tab-separated text even though its name
# ends in .csv.
df = pd.read_csv(DATASET_LON, sep='\t')

# Split into train / validation / test frames and report the shape of each.
train_df, validation_df, test_df = data_preprocess(df)
print("training set size: ", train_df.shape)
print("validation set size: ", validation_df.shape)
print("test set size: ", test_df.shape)

training set size:  (90209, 3)
validation set size:  (12887, 3)
test set size:  (33178, 3)


In [10]:
# Wrap each split in a Spotlight Interactions object (train, validation, test order).
train_dataset, val_dataset, test_dataset = (
    get_my_own_dataset(split_frame)
    for split_frame in (train_df, validation_df, test_df)
)

### BPR‐MF

In [16]:
# Quick BPR fit: three epochs, every other hyper-parameter left at its default.
model = ImplicitFactorizationModel(loss='bpr', n_iter=3)
model.fit(train_dataset)

In [12]:
# BPR matrix factorisation with explicit hyper-parameters, trained on the
# training split. Rebinding `model` replaces any model fitted earlier.
model = ImplicitFactorizationModel(
    loss='bpr',
    embedding_dim=128,   # size of the latent factor vectors
    n_iter=5,            # training epochs
    batch_size=128,      # interactions per minibatch
    l2=1e-9,             # L2 regularisation strength
    learning_rate=1e-3,
)

model.fit(train_dataset)

In [13]:
# Score every (user, item) pair of the validation and test splits with the fitted model.
val_predictions, test_predictions = (
    model.predict(split.user_ids, split.item_ids)
    for split in (val_dataset, test_dataset)
)

In [14]:
# Display the raw validation scores (the output below shows unbounded float32 values).
val_predictions

array([ 9.1717205,  6.0094385,  6.8720193, ...,  2.4474103, 10.612924 ,
        0.6243647], dtype=float32)

In [15]:
def _to_probabilities(scores):
    """Pass raw model scores through a sigmoid and return them as a NumPy array."""
    return nn.Sigmoid()(torch.FloatTensor(scores)).numpy()


# Sigmoid-squashed scores, plus 0/1 flags obtained by thresholding at 0.5.
val_preds = _to_probabilities(val_predictions)
val_preds_b = (val_preds > 0.5).astype(float)

test_preds = _to_probabilities(test_predictions)
test_preds_b = (test_preds > 0.5).astype(float)

In [16]:
# Ground-truth binary labels stored as the ratings of each split.
val_y, test_y = val_dataset.ratings, test_dataset.ratings

### Evaluation

In [20]:
def _score_split(y_true, y_prob):
    """Return ``(roc_auc, log_loss, ndcg_at_5)`` for one split.

    NOTE(review): the whole split is passed to ``ndcg_score`` as a single
    row, so NDCG@5 ranks all interactions together rather than per user.
    """
    labels = np.array(y_true)
    probabilities = np.array(y_prob)
    auc = metrics.roc_auc_score(labels, probabilities)
    loss = metrics.log_loss(y_true, y_prob.astype('float64'))
    ndcg = ndcg_score(np.expand_dims(labels, axis=0),
                      np.expand_dims(probabilities, axis=0),
                      k=5)
    return auc, loss, ndcg


# Validation metrics.
roc_auc, logloss, ndcg_val = _score_split(val_y, val_preds)
print("validation roc_auc:",roc_auc)
print('validation log_loss scores:', logloss)
print('validation NDCG@5 scores:', ndcg_val)

# Test metrics (roc_auc and logloss are rebound to the test values).
roc_auc, logloss, ndcg_test = _score_split(test_y, test_preds)
print("\ntest roc_auc:",roc_auc)
print('test log_loss scores:', logloss)
print('test NDCG@5 scores:', ndcg_test)

validation roc_auc: 0.5034873528958788
validation log_loss scores: 1.5622212511405003
validation NDCG@5 scores: 0.8634146341463413

test roc_auc: 0.5082576589801349
test log_loss scores: 1.6007943182165427
test NDCG@5 scores: 0.8273381294964027
