### Surprise Baseline Method

## 1. Set-up
Import the required packages and declare constants.

In [1]:
# package initialization

import pandas as pd
import numpy as np
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from sklearn import metrics

In [234]:
# consts

# Path of the review dataset to load; it is read as a tab-separated file.
# Swap the commented line in to run the notebook on the LON-A dataset instead.
# DATASET = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
DATASET = '../../data/extracted/NYC-R/New_York_City_Restaurant_Complete_Review.csv'
# Minimum number of rows an item / a user needs in order to be retained
# during preprocessing.
OCCURENCE_THRESHOLD = 5

## 2. Read Dataset
Read the dataset (a tab-separated file, despite the `.csv` extension) into a `pandas.DataFrame`.

In [235]:
# read dataset
# The file is parsed with a tab separator even though its extension is .csv.

df = pd.read_csv(DATASET, sep='\t')

In [236]:
# show the column names and the (rows, columns) shape of the raw dataset

column_names = list(df.columns)
print("Columns: \n", column_names)
print("\nShape: \n", df.shape)

Columns: 
 ['Unnamed: 0', 'Unnamed: 0.1', 'rtime', 'rquote', 'iid', 'rrate', 'rid', 'uage', 'ucity', 'ucountry', 'ugender', 'uhometown', 'uid_index', 'ulevel', 'uname_y', 'usince', 'ustate', 'ustyle', 'iattribute', 'ilocality', 'iname', 'ipopularity', 'ipost', 'iprice', 'irating', 'iregion', 'istreet', 'itag']

Shape: 
 (129964, 28)


## 3. Data Preprocessing

* Retain only users/items with at least five ratings
* Data splitting
  - For each user, hold out the latest 20% of interactions (by time) as the test set
  - Randomly split the remaining data into training (70%) and validation (10%) sets
* Transform the ratings into binary implicit feedback as ground truth, indicating whether the user has interacted with the specific item

In [237]:
def sort_by_time(df):
    """Return ``df`` with its rows ordered chronologically (oldest first).

    The review id ``rid`` is used as the sort key because it is
    auto-incrementing, so ascending ``rid`` stands in for ascending time.
    """
    chronological = df.sort_values(by='rid')
    return chronological

In [238]:
def filter_by_occurrence(df, column, threshold):
    """Keep only the rows whose ``column`` value occurs often enough.

    Rows are grouped by ``column``; a group survives when it contains at
    least ``threshold`` rows. Surviving rows keep their original order.
    """
    def _is_frequent(group):
        return threshold <= len(group)

    grouped = df.groupby(column)
    return grouped.filter(_is_frequent)

In [239]:
def convert_binary(df):
    """Binarise the ``rrate`` column into implicit feedback.

    Every entry equal to the literal string ``"None"`` becomes ``0.0``;
    every other entry (missing values included) becomes ``1.0``.

    The result is built on a copy, so the caller's frame is never modified.
    This avoids chained-assignment warnings when ``df`` is a slice of another
    frame, and avoids writing floats into a string-typed column.

    Returns:
        A new DataFrame identical to ``df`` except that ``rrate`` is a float
        column of zeros and ones.
    """
    # Compute the mask once, before anything is overwritten.
    has_rating = df['rrate'] != "None"

    binary = df.copy()
    binary['rrate'] = has_rating.astype(float)
    return binary

In [240]:
def data_preprocess(dataframe, random_state=None):
    """Turn the raw review table into train / validation / test splits.

    Steps:
      1. order the reviews chronologically (``sort_by_time``);
      2. keep only the ``uid_index``, ``iid`` and ``rrate`` columns;
      3. binarise ``rrate`` (``convert_binary``) and make it numeric;
      4. drop items, then users, with fewer than ``OCCURENCE_THRESHOLD`` rows;
      5. for each user, send the latest 20% of its rows to the test set;
      6. shuffle the remaining rows and cut them 87.5% / 12.5% into training
         and validation (i.e. roughly 70% / 10% of the data overall).

    Note that the user filter runs after the item filter and the two are not
    iterated, so an item may end up below the threshold again.

    Args:
        dataframe: raw reviews; must contain ``rid``, ``uid_index``, ``iid``
            and ``rrate`` columns.
        random_state: optional seed for the shuffle. When ``None`` (default)
            the global NumPy random state is used, as before.

    Returns:
        A tuple ``(train_df, validation_df, test_df)``.
    """
    user_train_share = 0.8      # per user: earliest 80% -> train+validation
    train_share_of_rest = 0.875  # 70 / (70 + 10)

    # sort by time (ascending order) and retrieve needed columns; copy so the
    # column assignment below never writes into a view of the caller's frame
    df = sort_by_time(dataframe)[['uid_index', 'iid', 'rrate']].copy()

    # convert ratings into binary values
    df = convert_binary(df)
    df['rrate'] = pd.to_numeric(df['rrate'])

    # Retain users/items with at least five ratings only
    df = filter_by_occurrence(df, 'iid', OCCURENCE_THRESHOLD)
    df = filter_by_occurrence(df, 'uid_index', OCCURENCE_THRESHOLD)

    # for each user, get its latest 20% ratings as test set; pieces are
    # collected in lists and concatenated once (DataFrame.append was removed
    # in pandas 2.0 and copied the whole frame on every call)
    test_parts = []
    train_validation_parts = []
    for _, user in df.groupby('uid_index'):
        split_idx = int(len(user) * user_train_share)
        train_validation_parts.append(user.iloc[:split_idx])
        test_parts.append(user.iloc[split_idx:])

    empty = df.iloc[0:0]
    test_df = pd.concat(test_parts) if test_parts else empty
    if train_validation_parts:
        train_validation_df = pd.concat(train_validation_parts)
    else:
        train_validation_df = empty

    # shuffle by position, which also works if index labels are not unique
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = rng.permutation(len(train_validation_df))
    train_validation_df = train_validation_df.iloc[order]

    n_train = int(len(train_validation_df) * train_share_of_rest)
    train_df = train_validation_df.iloc[:n_train]
    validation_df = train_validation_df.iloc[n_train:]

    return (train_df, validation_df, test_df)

In [241]:
# run the preprocessing pipeline and report the (rows, columns) of each split

train_df, validation_df, test_df = data_preprocess(df)
for split_label, split_frame in (
    ("training set size: ", train_df),
    ("validation set size: ", validation_df),
    ("test set size: ", test_df),
):
    print(split_label, split_frame.shape)

training set size:  (80722, 3)
validation set size:  (11532, 3)
test set size:  (29517, 3)


## 4. Load into Surprise

In [242]:
# ratings were binarised during preprocessing, hence the 0..1 rating scale
reader = Reader(rating_scale=(0, 1))
# wrap the training frame for Surprise, then build a trainset from all of it
train_data = Dataset.load_from_df(train_df, reader)
train_dataset = train_data.build_full_trainset()

## 5. Model declaration & Fitting

**Prediction**

\begin{equation}
b_{ui} = (\mu + b_u + b_i)
\end{equation}

where $b_{ui}$ is the predicted rating of user $u$ for item $i$, $\mu$ is the global mean rating, and $b_u$ and $b_i$ are the user and item biases

**Optimization objective**

\begin{equation}
\sum_{r_{ui} \in R_{train}} \left(r_{ui} - (\mu + b_u + b_i)\right)^2 +
\lambda \left(b_u^2 + b_i^2 \right)
\end{equation}

**Parameters (using SGD)**
- `reg`: The regularization parameter of the cost function that is optimized, defaults to `0.02`
- `learning_rate`: The learning rate of SGD, defaults to `0.005`
- `n_epochs`: The number of iterations of the SGD procedure, defaults to `20`

model configuration: [Surprise | Baseline configuration](https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#baseline-estimates-configuration)

In [214]:
# consts
# Hyper-parameters passed to the baseline model through bsl_options.
LEARNING_RATE = 0.0005  # SGD learning rate ('learning_rate')
EPOCH = 10  # number of SGD epochs ('n_epochs')
REGULARIZATION = 0  # regularization strength ('reg')

In [195]:
# declare model: baseline estimator whose biases are fitted with SGD,
# configured from the hyper-parameter constants
bsl_options = dict(
    method='sgd',
    learning_rate=LEARNING_RATE,
    n_epochs=EPOCH,
    reg=REGULARIZATION,
)
algorithm = BaselineOnly(bsl_options=bsl_options)

In [200]:
# fit the baseline model on the training set
algorithm.fit(train_dataset)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x27056387d88>

## 6. Prediction

In [15]:
def predict(algorithm, dataframe):
    """Score every (user, item, rating) row of ``dataframe`` with ``algorithm``.

    Args:
        algorithm: a fitted model exposing
            ``predict(uid, iid, r_ui=..., verbose=...)`` whose result has
            ``est`` and ``details['was_impossible']``.
        dataframe: frame whose first three columns are, in order, the user
            id, the item id and the true rating.

    Returns:
        A tuple ``(z, y)``: ``z`` holds the estimated ratings as float32 and
        ``y`` the matching true ratings as integers. Rows whose prediction
        was flagged impossible are left out of both arrays.
    """
    estimates = []
    truths = []
    # itertuples walks the rows once and yields plain Python scalars, instead
    # of building a (dtype-upcasting) Series per cell access
    rows = dataframe.iloc[:, :3].itertuples(index=False, name=None)
    for user, item, rating in rows:
        prediction = algorithm.predict(user, item, r_ui=rating, verbose=False)

        if not prediction.details['was_impossible']:
            estimates.append(prediction.est)
            truths.append(rating)

    # the builtin int replaces the np.int alias removed in NumPy 1.24
    return (np.array(estimates, dtype=np.float32), np.array(truths, dtype=int))

In [201]:
# estimated (z) and true (y) ratings on the validation set
validation_z, validation_y = predict(algorithm, validation_df)

In [202]:
# estimated (z) and true (y) ratings on the test set
test_z, test_y = predict(algorithm, test_df)

## 7. Evaluation

AUC metric

In [27]:
def evaluate_auc(z, y):
    """Return the ROC AUC of the scores ``z`` against the true labels ``y``."""
    auc = metrics.roc_auc_score(y_true=y, y_score=z)
    return auc

In [204]:
# report AUC on the validation and test predictions
for split_label, scores, truth in (
    ("validation AUC: ", validation_z, validation_y),
    ("test AUC: ", test_z, test_y),
):
    print(split_label, evaluate_auc(scores, truth))

validation AUC:  0.9950688258953624
test AUC:  0.9918595553439749


LogLoss metric

In [29]:
# assume parameters z & y are ndarray
def evaluate_logloss(z, y):
    """Return the log loss of the predicted positive-class scores ``z``.

    ``z`` is treated as the probability of class 1; a two-column matrix
    ``[1 - z, z]`` is built from it and scored against the true labels ``y``.

    The label set is passed explicitly as ``[0, 1]`` so the call also works
    when ``y`` happens to contain a single class (otherwise the labels are
    inferred from ``y`` and no longer match the two probability columns).
    """
    # work in float64 so that 1 - z is computed at full precision
    positive = np.asarray(z, dtype=np.float64)
    probabilities = np.column_stack((1.0 - positive, positive))
    return metrics.log_loss(y, probabilities, labels=[0, 1])

In [192]:
# report LogLoss on the validation and test predictions
for split_label, scores, truth in (
    ("validation LogLoss: ", validation_z, validation_y),
    ("test LogLoss: ", test_z, test_y),
):
    print(split_label, evaluate_logloss(scores, truth))

validation LogLoss:  0.05278888783940757
test LogLoss:  0.07449852962532956


NDCG metric

In [31]:
# assume parameters z & y are ndarray
def evaluate_ndcg(z, y):
    """Return NDCG@5 of the scores ``z`` against the true relevances ``y``.

    Both arrays get a leading axis of length one, so the whole set is scored
    as a single ranking rather than one ranking per user.
    """
    true_relevance = np.asarray(y)[np.newaxis, ...]
    predicted_scores = np.asarray(z)[np.newaxis, ...]
    return metrics.ndcg_score(true_relevance, predicted_scores, k=5)

In [193]:
# report NDCG@5 on the validation and test predictions
for split_label, scores, truth in (
    ("validation NDCG@5: ", validation_z, validation_y),
    ("test NDCG@5: ", test_z, test_y),
):
    print(split_label, evaluate_ndcg(scores, truth))

validation NDCG@5:  0.9997153430116709
test NDCG@5:  0.9998784933171323


## Experiments

In [217]:
# consts
# Default hyper-parameters for the experiment sweeps; these values are bound
# as the default arguments of train_regs / train_epochs when those are defined.
LEARNING_RATE = 0.005  # SGD learning rate
EPOCH = 200  # number of SGD epochs
REGULARIZATION = 0  # regularization strength

In [231]:
def train_regs(lr=LEARNING_RATE, epoch=EPOCH, start=0, end=0.06, step=0.02):
    """Train one baseline model per regularization value and score each.

    The regularization values are ``start, start + step, ...`` up to and
    including ``end``. Each value is computed as ``start + i * step`` rather
    than by repeated addition, so accumulated floating-point error cannot
    silently drop the end point.

    Args:
        lr: SGD learning rate.
        epoch: number of SGD epochs.
        start: first regularization value.
        end: last regularization value (inclusive).
        step: positive increment between values.

    Returns:
        A list with one dict per trained model, holding ``epoch``, ``reg``
        and the validation / test AUC, LogLoss and NDCG scores.

    Raises:
        ValueError: if ``step`` is not positive (the sweep would never end).
    """
    if step <= 0:
        raise ValueError("step must be positive")

    # small tolerance so that e.g. (0.3 - 0) / 0.1 still counts the end point
    n_values = int(np.floor((end - start) / step + 1e-9)) + 1

    history = []
    for i in range(max(n_values, 0)):
        reg = start if i == 0 else start + i * step
        bsl_options = {'method': 'sgd',
                       'learning_rate': lr,
                       'n_epochs': epoch,
                       'reg': reg
                       }
        print("Training model using reg ", reg)
        algorithm = BaselineOnly(bsl_options=bsl_options)
        algorithm.fit(train_dataset)
        validation_z, validation_y = predict(algorithm, validation_df)
        test_z, test_y = predict(algorithm, test_df)

        history.append({
            'epoch': epoch,
            'reg': reg,
            'val_auc': evaluate_auc(validation_z, validation_y),
            'test_auc': evaluate_auc(test_z, test_y),
            'val_logloss': evaluate_logloss(validation_z, validation_y),
            'test_logloss': evaluate_logloss(test_z, test_y),
            'val_ndcg': evaluate_ndcg(validation_z, validation_y),
            'test_ndcg': evaluate_ndcg(test_z, test_y)
        })
    return history

In [246]:
# sweep the regularization value over 0, 0.02, 0.04 and 0.06 at lr=0.005, 200 epochs
history = train_regs(lr=0.005, epoch=200, start=0, end=0.06, step=0.02)

Training model using reg  0
Estimating biases using sgd...
Training model using reg  0.02
Estimating biases using sgd...
Training model using reg  0.04
Estimating biases using sgd...
Training model using reg  0.06
Estimating biases using sgd...


In [None]:
# render the sweep results as a markdown table, one row per 'reg' value
print("| reg | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |")
print("|:-- | -- | -- | -- | -- | -- | -- |")
row_format = "| {} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |"
metric_keys = ('val_auc', 'val_logloss', 'val_ndcg',
               'test_auc', 'test_logloss', 'test_ndcg')
for entry in history:
    scores = [entry[key] for key in metric_keys]
    print(row_format.format(entry['reg'], *scores))

In [243]:
def train_epochs(lr=LEARNING_RATE, reg=REGULARIZATION, start=50, end=200, step=50):
    """Train one baseline model per epoch count and score each.

    The epoch counts are ``start, start + step, ...`` and never go past
    ``end``; ``end`` itself is included when the sequence lands on it.
    A negative ``step`` sweeps downwards from ``start`` to ``end``.

    Args:
        lr: SGD learning rate.
        reg: regularization value.
        start: first epoch count.
        end: last epoch count allowed (inclusive).
        step: non-zero increment between epoch counts.

    Returns:
        A list with one dict per trained model, holding ``epoch``, ``reg``
        and the validation / test AUC, LogLoss and NDCG scores.
    """
    # Stop one past `end` in the direction of travel: `end` stays inclusive,
    # but the sweep no longer overshoots it when (end - start) is not a
    # multiple of step.
    stop = end + 1 if step > 0 else end - 1

    history = []
    for epoch in range(start, stop, step):

        bsl_options = {'method': 'sgd',
                       'learning_rate': lr,
                       'n_epochs': epoch,
                       'reg': reg
                       }
        print("Training model using epoch ", epoch)
        algorithm = BaselineOnly(bsl_options=bsl_options)
        algorithm.fit(train_dataset)
        validation_z, validation_y = predict(algorithm, validation_df)
        test_z, test_y = predict(algorithm, test_df)

        history.append({
            'epoch': epoch,
            # recorded so these entries have the same keys as train_regs'
            'reg': reg,
            'val_auc': evaluate_auc(validation_z, validation_y),
            'test_auc': evaluate_auc(test_z, test_y),
            'val_logloss': evaluate_logloss(validation_z, validation_y),
            'test_logloss': evaluate_logloss(test_z, test_y),
            'val_ndcg': evaluate_ndcg(validation_z, validation_y),
            'test_ndcg': evaluate_ndcg(test_z, test_y)
        })
    return history

In [244]:
# sweep the number of epochs over 50, 100, 150 and 200
history = train_epochs(lr=LEARNING_RATE, reg=REGULARIZATION, start=50, end=200, step=50)

Training model using epoch  50
Estimating biases using sgd...
Training model using epoch  100
Estimating biases using sgd...
Training model using epoch  150
Estimating biases using sgd...
Training model using epoch  200
Estimating biases using sgd...


In [None]:
# render the sweep results as a markdown table, one row per epoch count
print("| epochs | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |")
print("|:-- | -- | -- | -- | -- | -- | -- |")
row_format = "| {} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |"
metric_keys = ('val_auc', 'val_logloss', 'val_ndcg',
               'test_auc', 'test_logloss', 'test_ndcg')
for entry in history:
    scores = [entry[key] for key in metric_keys]
    print(row_format.format(entry['epoch'], *scores))

## Experiment Results

LON-A dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| lr=5e-3, epoch=100, reg=0 | 0.99829 | 0.04770 | 0.99973 | 0.99727 | 0.06989 | 0.99989 |
| lr=5e-3, epoch=150, reg=0 | 0.99837 | 0.04543 | 0.99974 | 0.99741 | 0.06559 | 0.99989 |
| **lr=5e-3, epoch=200, reg=0** | 0.99837 | 0.04655 | 0.99975 | 0.99748 | 0.06342 | 0.99990 |
| lr=5e-3, epoch=200, reg=0.02 | 0.99787 | 0.05517 | 0.99964 | 0.99714 | 0.07020 | 0.99986 |
| lr=5e-3, epoch=200, reg=0.04 | 0.99784 | 0.06068 | 0.99952 | 0.99711 | 0.07267 | 0.99982 |
| lr=5e-3, epoch=200, reg=0.06 | 0.99781 | 0.06643 | 0.99939 | 0.99708 | 0.07520 | 0.99977 |

NYC-R dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| lr=5e-3, epoch=100, reg=0 | 0.99868 | 0.13270 | 1.00000 | 0.99873 | 0.14736 | 0.99939 |
| lr=5e-3, epoch=150, reg=0 | 0.99874 | 0.10398 | 0.99950 | 0.99893 | 0.11389 | 0.99958 |
| **lr=5e-3, epoch=200, reg=0** | 0.99880 | 0.08598 | 0.99959 | 0.99901 | 0.09440 | 0.99966 |
| lr=5e-3, epoch=200, reg=0.02 | 0.99958 | 0.02471 | 1.00000 | 0.99924 | 0.02754 | 0.99936 |
| lr=5e-3, epoch=200, reg=0.04 | 0.99955 | 0.03267 | 1.00000 | 0.99922 | 0.03440 | 0.99942 |
| lr=5e-3, epoch=200, reg=0.06 | 0.99952 | 0.04055 | 1.00000 | 0.99918 | 0.04218 | 0.99929 |