# Matrix Factorization

## 1. Set-up
Import the required packages and declare constants.

In [1]:
# package initialization

import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from sklearn import metrics

In [23]:
# consts

# Path of the review dump to load. To run on the LON-A dataset instead,
# comment out the NYC-R line and uncomment the LON-A one.
# DATASET = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
DATASET = '../../data/extracted/NYC-R/New_York_City_Restaurant_Complete_Review.csv'
# Minimum number of rows an item / a user must have to survive preprocessing.
OCCURENCE_THRESHOLD = 5

## 2. Read Dataset
Read the dataset (a tab-separated file) into a `pandas.DataFrame`.

In [24]:
# read dataset
# NOTE: despite the .csv extension, the file is parsed as tab-separated.

df = pd.read_csv(DATASET, sep='\t')

In [25]:
# show which columns were loaded and how many rows/columns there are

print("Columns: \n", df.columns.tolist())
print("\nShape: \n", df.shape)

Columns: 
 ['Unnamed: 0', 'Unnamed: 0.1', 'rtime', 'rquote', 'iid', 'rrate', 'rid', 'uage', 'ucity', 'ucountry', 'ugender', 'uhometown', 'uid_index', 'ulevel', 'uname_y', 'usince', 'ustate', 'ustyle', 'iattribute', 'ilocality', 'iname', 'ipopularity', 'ipost', 'iprice', 'irating', 'iregion', 'istreet', 'itag']

Shape: 
 (129964, 28)


## 3. Data Preprocessing

* Retain only users/items with at least five ratings
* Data splitting
  - Hold out each user's latest 20% of interactions (by time) as the test set
  - Randomly split the remaining data into training (70%) and validation (10%) sets
* Transform the ratings into binary implicit feedback as ground truth, indicating whether the user has interacted with the specific item

In [5]:
def sort_by_time(df):
    """Return a new frame with the rows of ``df`` in ascending ``rid`` order.

    The review id ``rid`` stands in for a timestamp here, on the assumption
    that it is auto-incrementing (so a larger id means a later review).
    """
    chronological = df.sort_values('rid')
    return chronological

In [6]:
def filter_by_occurrence(df, column, threshold):
    """Keep only the rows whose ``column`` value occurs at least ``threshold`` times."""

    def is_frequent(group):
        # a group holds all rows sharing one value of `column`
        return threshold <= len(group)

    grouped = df.groupby(column)
    return grouped.filter(is_frequent)

In [7]:
def convert_binary(df):
    """Return a copy of ``df`` whose ``rrate`` column is binary implicit feedback.

    A row gets ``0.0`` when its ``rrate`` equals the string ``"None"`` and
    ``1.0`` otherwise. The input frame is left untouched, so the function is
    safe to call on a column slice of a larger frame.

    NOTE(review): a missing value (NaN) is not equal to ``"None"`` and is
    therefore labelled ``1.0``. Recent pandas releases reportedly treat the
    literal ``None`` as NaN in ``read_csv`` by default -- confirm how the
    column is loaded before relying on the ``0.0`` class.

    Args:
        df: DataFrame with an ``rrate`` column.

    Returns:
        A new DataFrame with the same rows and columns; ``rrate`` is float.
    """
    converted = df.copy()
    # vectorised comparison: True -> 1.0 (has a rating), False -> 0.0 ("None")
    converted['rrate'] = (converted['rrate'] != "None").astype(float)
    return converted

In [8]:
def data_preprocess(dataframe):
    """Split the raw review table into training, validation and test sets.

    Steps:
      1. order the rows chronologically (``sort_by_time``);
      2. keep the ``uid_index`` / ``iid`` / ``rrate`` columns and binarise the
         rating (``convert_binary``);
      3. drop items, then users, that have fewer than ``OCCURENCE_THRESHOLD``
         rows (the user filter runs last, so an item may end up below the
         threshold again);
      4. per user, hold out the latest 20% of rows as the test set;
      5. shuffle the remaining rows and cut them 87.5% / 12.5% into training
         and validation, i.e. roughly 70% / 10% of the filtered data.

    The shuffle uses NumPy's global random state; seed it beforehand for a
    reproducible split.

    Args:
        dataframe: raw reviews containing at least the columns ``rid``,
            ``uid_index``, ``iid`` and ``rrate``.

    Returns:
        A ``(train_df, validation_df, test_df)`` tuple of DataFrames with the
        columns ``uid_index``, ``iid`` and ``rrate``.
    """
    # sort by time (ascending order)
    df = sort_by_time(dataframe)

    # retrieve needed columns; copy so later writes never target a slice of
    # the caller's frame
    df = df[['uid_index', 'iid', 'rrate']].copy()

    # convert ratings into binary labels
    df = convert_binary(df)
    df['rrate'] = pd.to_numeric(df['rrate'])

    # retain users/items with at least OCCURENCE_THRESHOLD ratings only
    df = filter_by_occurrence(df, 'iid', OCCURENCE_THRESHOLD)
    df = filter_by_occurrence(df, 'uid_index', OCCURENCE_THRESHOLD)

    # for each user, the latest 20% of ratings go to the test set; collect the
    # pieces in lists and concatenate once (DataFrame.append is gone from
    # pandas 2.x and copied the whole frame on every call)
    test_parts = []
    train_validation_parts = []
    for _, user in df.groupby('uid_index'):
        split_idx = int(len(user) * 0.8)
        test_parts.append(user.iloc[split_idx:])
        train_validation_parts.append(user.iloc[:split_idx])

    # pd.concat rejects an empty list, so fall back to an empty frame that
    # still carries the expected columns
    empty = df.iloc[0:0]
    test_df = pd.concat(test_parts) if test_parts else empty
    train_validation_df = (
        pd.concat(train_validation_parts) if train_validation_parts else empty
    )

    # positional shuffle: unlike reindex() it also works with duplicate labels
    shuffled_positions = np.random.permutation(len(train_validation_df))
    train_validation_df = train_validation_df.iloc[shuffled_positions]

    # 0.875 of the remaining 80% is 70% of the data; the rest (10%) validates
    cut = int(len(train_validation_df) * 0.875)
    train_df = train_validation_df.iloc[:cut]
    validation_df = train_validation_df.iloc[cut:]

    return (train_df, validation_df, test_df)

In [26]:
# dataset preprocessing
# NOTE: data_preprocess shuffles without a fixed seed, so the split sizes are
# stable across runs but the rows landing in training vs. validation are not.

train_df, validation_df, test_df = data_preprocess(df)
print("training set size: ", train_df.shape)
print("validation set size: ", validation_df.shape)
print("test set size: ", test_df.shape)

training set size:  (80722, 3)
validation set size:  (11532, 3)
test set size:  (29517, 3)


## 4. Load into Surprise

In [27]:
# ratings were binarised to 0/1 during preprocessing, hence the (0, 1) scale
reader = Reader(rating_scale=(0, 1))
# train_df columns are (uid_index, iid, rrate); every row is used for fitting
train_dataset = Dataset.load_from_df(train_df, reader).build_full_trainset()

## 5. Model declaration & Fitting

The prediction $\hat{r}_{ui}$ is set as:  

\begin{equation}
\hat{r}_{ui} = \mu + b_u + b_i + q_i^Tp_u
\end{equation}

$\mu$ is the global mean rating, $b_u$ is the bias of user $u$, and $b_i$ is the bias of item $i$.  
$q_i$ and $p_u$ are the latent vectors of item $i$ and user $u$, respectively.  

The optimization objective is:  

\begin{equation}
\sum_{r_{ui} \in R_{train}} \left(r_{ui} - \hat{r}_{ui} \right)^2 +
\lambda\left(b_i^2 + b_u^2 + ||q_i||^2 + ||p_u||^2\right)
\end{equation}

**Note:** which parameters are we going to train?  
- bias for item
- bias for user
- elements of user's latent vector
- elements of item's latent vector

Please check [Surprise | Matrix Factorization](https://surprise.readthedocs.io/en/stable/matrix_factorization.html) for more details about parameter tuning.

In [11]:
# parameters for matrix factorization
# here we adopt the defaults of the Surprise library

LATENT_SIZE = 100 # number of latent factors (dimension of the user/item vectors), defaults to 100
EPOCH = 20 # number of training iterations, defaults to 20
BIASED = True # whether to use the user/item bias terms in the prediction, defaults to True
INIT_MEAN = 0 # mean of the normal distribution for factor vectors initialization, defaults to 0
INIT_STDDEV = 0.1 # standard deviation of the normal distribution for factor vectors initialization, defaults to 0.1
LR_ALL = 0.005 # learning rate for all parameters, defaults to 0.005
REG_ALL = 0.02 # regularization term for all parameters, defaults to 0.02

In [12]:
# build the (not yet fitted) SVD model from the hyper-parameter constants
algorithm = SVD(
    n_factors=LATENT_SIZE,
    n_epochs=EPOCH,
    biased=BIASED,
    init_mean=INIT_MEAN,
    init_std_dev=INIT_STDDEV,
    lr_all=LR_ALL,
    reg_all=REG_ALL,
)

In [13]:
# train on the training split only; validation/test rows are never shown to the model
algorithm.fit(train_dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x290291dbfc8>

## 6. Prediction

In [11]:
def predict(algorithm, dataframe):
    """Score every (user, item) row of ``dataframe`` with a fitted model.

    Args:
        algorithm: fitted model exposing
            ``predict(uid, iid, r_ui=..., verbose=...)``; the returned object
            must have an ``est`` attribute and a ``details`` dict with a
            ``'was_impossible'`` entry.
        dataframe: DataFrame whose first three columns are, in this order,
            the user id, the item id and the true rating.

    Returns:
        A ``(z, y)`` tuple: ``z`` is a float32 array with the estimated
        ratings and ``y`` an integer array with the matching true ratings.
        Rows whose prediction is flagged as impossible are left out of both.
    """
    z = []
    y = []
    # itertuples walks the rows once and keeps each column's own type; the
    # former per-row `.iloc[i][k]` lookups built a Series per access and
    # coerced the whole row to one common dtype
    rows = dataframe.iloc[:, :3].itertuples(index=False, name=None)
    for user, item, rating in rows:
        prediction = algorithm.predict(user, item, r_ui=rating, verbose=False)

        if not prediction.details['was_impossible']:
            z.append(prediction.est)
            y.append(rating)
    # `np.int` was a plain alias of the builtin `int` and no longer exists in
    # NumPy >= 1.24, so use the builtin directly
    return (np.array(z, dtype=np.float32), np.array(y, dtype=int))

In [15]:
# z = predicted scores, y = true binary labels for the validation split
validation_z, validation_y = predict(algorithm, validation_df)

In [16]:
# z = predicted scores, y = true binary labels for the test split
test_z, test_y = predict(algorithm, test_df)

## 7. Evaluation

AUC metric

In [12]:
def evaluate_auc(z, y):
    """Return the ROC AUC of the predicted scores ``z`` against the true labels ``y``."""
    area = metrics.roc_auc_score(y_true=y, y_score=z)
    return area

In [18]:
# report the AUC on both held-out splits
print("validation AUC: ", evaluate_auc(validation_z, validation_y))
print("test AUC: ", evaluate_auc(test_z, test_y))

validation AUC:  0.99830901379943
test AUC:  0.9967822284449699


LogLoss metric

In [13]:
def evaluate_logloss(z, y):
    """Return the binary log loss of the predicted scores against the labels.

    Args:
        z: 1-D array-like of predicted scores, read as the probability of
            the positive class (label 1).
        y: 1-D array-like of true labels (0 or 1).

    Returns:
        The log loss as a float.
    """
    positive = np.asarray(z, dtype=np.float64)
    # column 0 = P(label 0) = 1 - z, column 1 = P(label 1) = z
    probabilities = np.column_stack((1.0 - positive, positive))
    # naming both labels keeps the call valid even when `y` happens to
    # contain a single class (log_loss cannot infer the classes then)
    return metrics.log_loss(y, probabilities, labels=[0, 1])

In [20]:
# report the log loss on both held-out splits
print("validation LogLoss: ", evaluate_logloss(validation_z, validation_y))
print("test LogLoss: ", evaluate_logloss(test_z, test_y))

validation LogLoss:  0.05650602974289275
test LogLoss:  0.07532174375929619


NDCG metric

In [14]:
def evaluate_ndcg(z, y, k=5):
    """Return NDCG@k of the predicted scores ``z`` against the true labels ``y``.

    Both inputs get a leading axis of length one, so all rows are ranked
    together as one single list; the score is not computed per user.

    Args:
        z: 1-D array-like of predicted scores.
        y: 1-D array-like of true relevance labels.
        k: only the ``k`` highest-scored entries are considered (default 5).

    Returns:
        The NDCG@k value as a float.
    """
    true_relevance = np.expand_dims(y, axis=0)
    scores = np.expand_dims(z, axis=0)
    return metrics.ndcg_score(true_relevance, scores, k=k)

In [22]:
# report NDCG@5 on both held-out splits
print("validation NDCG@5: ", evaluate_ndcg(validation_z, validation_y))
print("test NDCG@5: ", evaluate_ndcg(test_z, test_y))

validation NDCG@5:  0.9999999999999999
test NDCG@5:  0.999892101855848


## Experiments

In [15]:
# Hyper-parameters for the experiments (same values as the Surprise defaults).
# train_latents reads BIASED, INIT_MEAN, INIT_STDDEV, LR_ALL and REG_ALL from
# here; it takes the latent size and epoch count from its own arguments.
LATENT_SIZE = 100 # number of latent factors (dimension of the user/item vectors), defaults to 100
EPOCH = 20 # number of training iterations, defaults to 20
BIASED = True # whether to use the user/item bias terms in the prediction, defaults to True
INIT_MEAN = 0 # mean of the normal distribution for factor vectors initialization, defaults to 0
INIT_STDDEV = 0.1 # standard deviation of the normal distribution for factor vectors initialization, defaults to 0.1
LR_ALL = 0.005 # learning rate for all parameters, defaults to 0.005
REG_ALL = 0.02 # regularization term for all parameters, defaults to 0.02

In [18]:
def train_latents(epoch=20, start=20, end=200, step=20):
    """Train one SVD model per latent size and collect its evaluation metrics.

    The latent sizes tried are ``start, start + step, ...`` up to and
    including ``end`` (when ``end`` is reachable); no size above ``end`` is
    ever trained. All other hyper-parameters come from the module-level
    constants ``BIASED``, ``INIT_MEAN``, ``INIT_STDDEV``, ``LR_ALL`` and
    ``REG_ALL``; the data comes from the module-level ``train_dataset``,
    ``validation_df`` and ``test_df``.

    Args:
        epoch: number of training epochs for every model.
        start: first latent size to try.
        end: largest latent size allowed.
        step: increment between consecutive latent sizes.

    Returns:
        A list with one dict per latent size, holding the keys ``factor``,
        ``val_auc``, ``test_auc``, ``val_logloss``, ``test_logloss``,
        ``val_ndcg`` and ``test_ndcg``.
    """
    history = []
    # `end + 1` keeps `end` inclusive without overshooting it when
    # (end - start) is not a multiple of `step`
    for latent_size in range(start, end + 1, step):

        print("Using Latent size: ", latent_size)
        algorithm = SVD(
            n_factors=latent_size,
            n_epochs=epoch,
            biased=BIASED,
            init_mean=INIT_MEAN,
            init_std_dev=INIT_STDDEV,
            lr_all=LR_ALL,
            reg_all=REG_ALL,
        )

        algorithm.fit(train_dataset)
        validation_z, validation_y = predict(algorithm, validation_df)
        test_z, test_y = predict(algorithm, test_df)

        history.append({
            'factor': latent_size,
            'val_auc': evaluate_auc(validation_z, validation_y),
            'test_auc': evaluate_auc(test_z, test_y),
            'val_logloss': evaluate_logloss(validation_z, validation_y),
            'test_logloss': evaluate_logloss(test_z, test_y),
            'val_ndcg': evaluate_ndcg(validation_z, validation_y),
            'test_ndcg': evaluate_ndcg(test_z, test_y),
        })
    return history

In [28]:
# sweep the small latent sizes 2, 6, 10, 14 and 18 with 20 epochs each
history = train_latents(epoch=20, start=2, end=18, step=4)

Using Latent size:  2
Using Latent size:  6
Using Latent size:  10
Using Latent size:  14
Using Latent size:  18


In [29]:
# Render the sweep results as a Markdown table (one row per latent size) so
# the output can be pasted straight into a Markdown cell.
print("| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |")
print("|:-- | -- | -- | -- | -- | -- | -- |")
for his in history:
    # the argument order below must match the column order of the header
    print("| factor_n={} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |".format(
        his['factor'],
        his['val_auc'],
        his['val_logloss'],
        his['val_ndcg'],
        his['test_auc'],
        his['test_logloss'],
        his['test_ndcg'],
    ))

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| factor_n=2 | 0.99937 | 0.09109 | 1.00000 | 0.99914 | 0.09920 | 0.99959 |
| factor_n=6 | 0.99939 | 0.09199 | 1.00000 | 0.99901 | 0.10006 | 0.99980 |
| factor_n=10 | 0.99925 | 0.09445 | 1.00000 | 0.99898 | 0.10414 | 0.99924 |
| factor_n=14 | 0.99920 | 0.09580 | 1.00000 | 0.99904 | 0.10371 | 0.99963 |
| factor_n=18 | 0.99927 | 0.09664 | 1.00000 | 0.99892 | 0.10711 | 0.99927 |


## Experiment Results

Fixed hyperparameters:

```python
EPOCH = 20
BIASED = True
INIT_MEAN = 0
INIT_STDDEV = 0.1
LR_ALL = 0.005
REG_ALL = 0.02
```

LON-A dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| **factor_n=2*** | 0.99841 | 0.05098 | 1.00000 | 0.99739 | 0.06513 | 0.99985 |
| factor_n=6 | 0.99820 | 0.05361 | 0.99962 | 0.99736 | 0.06490 | 1.00000 |
| factor_n=10 | 0.99774 | 0.05501 | 0.99964 | 0.99746 | 0.06523 | 1.00000 |
| factor_n=14 | 0.99815 | 0.05452 | 0.99965 | 0.99736 | 0.06574 | 0.99987 |
| factor_n=18 | 0.99800 | 0.05710 | 0.99931 | 0.99726 | 0.06809 | 0.99987 |
| factor_n=20 | 0.99809 | 0.05508 | 0.99966 | 0.99708 | 0.07104 | 0.99949 |
| factor_n=40 | 0.99790 | 0.05916 | 0.99937 | 0.99737 | 0.06787 | 1.00000 |
| factor_n=60 | 0.99773 | 0.06047 | 0.99939 | 0.99711 | 0.07147 | 0.99989 |
| factor_n=80 | 0.99784 | 0.05947 | 0.99939 | 0.99667 | 0.07446 | 0.99989 |
| factor_n=100 | 0.99803 | 0.05862 | 0.99970 | 0.99660 | 0.07951 | 0.99957 |

NYC-R dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| **factor_n=2*** | 0.99937 | 0.09109 | 1.00000 | 0.99914 | 0.09920 | 0.99959 |
| factor_n=6 | 0.99939 | 0.09199 | 1.00000 | 0.99901 | 0.10006 | 0.99980 |
| factor_n=10 | 0.99925 | 0.09445 | 1.00000 | 0.99898 | 0.10414 | 0.99924 |
| factor_n=14 | 0.99920 | 0.09580 | 1.00000 | 0.99904 | 0.10371 | 0.99963 |
| factor_n=18 | 0.99927 | 0.09664 | 1.00000 | 0.99892 | 0.10711 | 0.99927 |