# Factorization Machine

## 1. Set-up
Import the required packages and declare constants.

In [1]:
# package initialization

import pandas as pd
import numpy as np
from sklearn import metrics

from lightfm import LightFM
from lightfm.data import Dataset



In [31]:
# consts

# Path of the review file to load; it is read with sep='\t' in the next section.
DATASET = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
# DATASET = '../../data/extracted/NYC-R/New_York_City_Restaurant_Complete_Review.csv'
# Minimum number of rows an item / a user must have to be kept by data_preprocess().
OCCURENCE_THRESHOLD = 5

# Columns used as user-side and item-side features.
USER_FEATURES = ['ugender', 'ucity', 'ucountry', 'ulevel']
ITEM_FEATURES = ['irating', 'itag']
# ITEM_FEATURES = ['irating', 'itag', 'iprice']

## 2. Read Dataset
Read the dataset (a tab-separated file with a `.csv` extension) into a `pandas.DataFrame`.

In [32]:
# read dataset

# The file is tab-separated even though it carries a .csv extension.
df = pd.read_csv(DATASET, sep='\t')

In [33]:
# print dataset information

# Column names and (rows, columns) shape of the raw frame.
print("Columns: \n", list(df.columns))
print("\nShape: \n", df.shape)

Columns: 
 ['Unnamed: 0', 'Unnamed: 0.1', 'iid', 'rid', 'rimages', 'rquote', 'rrate', 'rtime', 'uprofile', 'uage', 'ucity', 'ucountry', 'ugender', 'uhometown', 'uid_index', 'ulevel', 'uname_y', 'usince', 'ustate', 'ustyle', 'iattribute', 'ilocality', 'iname', 'ipopularity', 'ipost', 'irating', 'iregion', 'istreet', 'itag']

Shape: 
 (136978, 29)


## 3. Data Preprocessing

* Retain only the users/items that have at least five ratings
* Data splitting
  - For each user, hold out the latest 20% of their interactions (by time) as the test set
  - Randomly split the remaining data into training (70% of the whole) and validation (10% of the whole) sets
* Transform the ratings into binary implicit feedback as ground truth, indicating whether the user has interacted with the specific item

In [34]:
def sort_by_time(df):
    """Return ``df`` reordered chronologically, oldest review first.

    The 'rid' column is the sort key: review ids are auto-incrementing,
    so ascending 'rid' stands in for ascending time. The input frame is
    left untouched; a new, sorted frame is returned.
    """
    chronological = df.sort_values('rid')
    return chronological

In [35]:
def filter_by_occurrence(df, column, threshold):
    """Keep only the rows whose ``column`` value occurs at least ``threshold`` times.

    Row order and index labels of the surviving rows are preserved.
    Rows whose ``column`` value is missing are dropped.
    """
    occurrences = df[column].map(df[column].value_counts())
    # Missing keys map to NaN, and NaN >= threshold is False, so they are dropped.
    frequent_enough = (occurrences >= threshold).to_numpy()
    return df[frequent_enough]

In [36]:
def convert_binary(df):
    """Return a copy of ``df`` whose 'rrate' column is binary implicit feedback.

    Every rating equal to the literal string "None" becomes 0.0; every other
    value becomes 1.0. The caller's frame is not modified: the conversion is
    done on a copy, which also avoids writing into a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame with an 'rrate' column.

    Returns
    -------
    pandas.DataFrame
        Same rows, columns and index as ``df`` with a float 'rrate' column.
    """
    converted = df.copy()
    converted['rrate'] = np.where(converted['rrate'] == "None", 0.0, 1.0)
    return converted

In [37]:
def data_preprocess(dataframe, random_state=None):
    """Filter, binarise and split the raw review frame.

    Steps
    -----
    1. Sort chronologically (``sort_by_time``).
    2. Keep 'uid_index', 'iid', 'rrate' plus USER_FEATURES and ITEM_FEATURES.
    3. Convert 'rrate' to binary feedback (``convert_binary``) and make it numeric.
    4. Keep items, then users, with at least OCCURENCE_THRESHOLD rows. Each
       filter runs once, so an item can fall below the threshold again after
       the user filter.
    5. Per user, the last 20% of rows (in time order) go to the test set.
    6. The remaining rows are shuffled; the first 87.5% become the training
       set and the rest the validation set (70% / 10% of the whole).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw reviews; must contain 'rid' and every column listed in step 2.
    random_state : int, optional
        Seed for the train/validation shuffle. When omitted, numpy's global
        random state is used, as before.

    Returns
    -------
    (train_df, validation_df, test_df, df)
        The three splits and the filtered frame they were cut from.
    """
    # sort by time (ascending order)
    df = sort_by_time(dataframe)

    # retrieve needed columns; copy so later writes never target a slice of the input
    df = df[['uid_index', 'iid', 'rrate'] + USER_FEATURES + ITEM_FEATURES].copy()

    # convert ratings into binary labels
    df = convert_binary(df)
    df['rrate'] = pd.to_numeric(df['rrate'])

    # retain users/items with at least OCCURENCE_THRESHOLD ratings only
    df = filter_by_occurrence(df, 'iid', OCCURENCE_THRESHOLD)
    df = filter_by_occurrence(df, 'uid_index', OCCURENCE_THRESHOLD)

    # for each user, the latest 20% of ratings form the test set;
    # collect the pieces and concatenate once (DataFrame.append is gone in pandas 2)
    train_validation_parts = []
    test_parts = []
    for _, user_rows in df.groupby('uid_index'):
        split_idx = int(len(user_rows) * 0.8)
        train_validation_parts.append(user_rows.iloc[:split_idx])
        test_parts.append(user_rows.iloc[split_idx:])

    empty = df.iloc[0:0]  # keeps the column layout when nothing survives the filters
    test_df = pd.concat(test_parts) if test_parts else empty
    train_validation_df = pd.concat(train_validation_parts) if train_validation_parts else empty

    # shuffle, then cut: 0.875 of the remaining 80% is 70% of the whole
    if random_state is None:
        shuffled_index = np.random.permutation(train_validation_df.index)
    else:
        shuffled_index = np.random.RandomState(random_state).permutation(train_validation_df.index)
    train_validation_df = train_validation_df.reindex(shuffled_index)

    cut = int(len(train_validation_df) * 0.875)
    train_df = train_validation_df.iloc[:cut]
    validation_df = train_validation_df.iloc[cut:]

    return (train_df, validation_df, test_df, df)

In [38]:
# dataset preprocessing

# whole_df is the filtered frame that the three splits were cut from.
train_df, validation_df, test_df, whole_df = data_preprocess(df)
print("training set size: ", train_df.shape)
print("validation set size: ", validation_df.shape)
print("test set size: ", test_df.shape)

training set size:  (90209, 9)
validation set size:  (12887, 9)
test set size:  (33178, 9)


## 4. Load into LightFM

In [39]:
def get_unique_uids(df):
    """Return the distinct user ids of ``df`` as strings, in order of first appearance."""
    uid_strings = df['uid_index'].astype(str)
    return uid_strings.unique()

def get_uids(df):
    """Return the user id of every row of ``df`` as a string (one entry per row)."""
    uid_strings = df['uid_index'].astype(str)
    return uid_strings.values

def get_unique_iids(df):
    """Return the distinct item ids of ``df`` as strings, in order of first appearance."""
    iid_strings = df['iid'].astype(str)
    return iid_strings.unique()

def get_iids(df):
    """Return the item id of every row of ``df`` as a string (one entry per row)."""
    iid_strings = df['iid'].astype(str)
    return iid_strings.values

def get_unique_user_features(df):
    """Return the sorted distinct values found in the USER_FEATURES columns.

    Values are compared as strings and pooled across all the columns, so the
    same string appearing in two different columns counts once.
    """
    return np.unique(df[USER_FEATURES].astype(str).values.ravel())

def get_user_features(df):
    """Return the USER_FEATURES values of every row as a 2-D array of strings."""
    return df.loc[:, USER_FEATURES].astype(str).values

def get_unique_item_features(df):
    """Return the sorted distinct values found in the ITEM_FEATURES columns.

    Values are compared as strings and pooled across all the columns, so the
    same string appearing in two different columns counts once.
    """
    return np.unique(df[ITEM_FEATURES].astype(str).values.ravel())

def get_item_features(df):
    """Return the ITEM_FEATURES values of every row as a 2-D array of strings."""
    return df.loc[:, ITEM_FEATURES].astype(str).values

def get_iteractions(df, rating=False):
    """Yield one tuple per row of ``df`` describing a user-item interaction.

    Parameters
    ----------
    df : pandas.DataFrame with 'uid_index' and 'iid' columns (and 'rrate'
        when ``rating`` is True). It is NOT modified: the string conversion
        is done on a new frame rather than written back into ``df``.
    rating : bool, default False
        When True each tuple is (user id, item id, rrate); otherwise it is
        (user id, item id).

    Returns
    -------
    generator of tuples
        Ids are strings. A generator can be consumed only once.
    """
    if rating:
        columns = ['uid_index', 'iid', 'rrate']
    else:
        columns = ['uid_index', 'iid']

    # astype() returns a new frame, so the caller's dtypes stay as they were
    as_text = df[columns].astype({'uid_index': str, 'iid': str})
    return (tuple(row) for row in as_text.values)

In [40]:
# Summary counts for the training split (ids and feature values are compared as strings).
print("# unique users:", len(get_unique_uids(train_df)))
print("# unique items:", len(get_unique_iids(train_df)))
print("# unique user features:", len(get_unique_user_features(train_df)))
print("# unique item features:", len(get_unique_item_features(train_df)))

# unique users: 16255
# unique items: 693
# unique user features: 3132
# unique item features: 628


create dataset

In [41]:
# Empty LightFM Dataset; ids and feature names are supplied by the dataset.fit(...) call below.
dataset = Dataset()

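Fit the dataset's id and feature mappings: tell it which user ids, item ids and feature values exist, which fixes the dimensions of the interaction and feature matrices.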

In [42]:
# Only ids and feature values that occur in the training split are passed in here.
# NOTE(review): ids that appear only in validation_df / test_df are therefore
# not registered with this dataset; confirm that is intended.
dataset.fit(
    get_unique_uids(train_df),
    get_unique_iids(train_df),
    user_features = get_unique_user_features(train_df),
    item_features = get_unique_item_features(train_df)
)

In [43]:
# Dimensions of the interaction matrix implied by the fitted dataset.
num_users, num_items = dataset.interactions_shape()
print('Interaction matrix shape -- Num users: {} x num_items {}.'.format(num_users, num_items))

Interaction matrix shape -- Num users: 16255 x num_items 693.


Build the interaction matrix, which stores the user–item ratings.

In [44]:
# rating=True makes every tuple (user id, item id, rrate);
# presumably `weights` then carries the rrate values. TODO confirm against the LightFM docs.
(interactions, weights) = dataset.build_interactions(get_iteractions(train_df, rating=True))

In [45]:
# Show type, shape and number of stored entries of both sparse matrices.
print(repr(interactions))
print(repr(weights))

<16255x693 sparse matrix of type '<class 'numpy.int32'>'
	with 90209 stored elements in COOrdinate format>
<16255x693 sparse matrix of type '<class 'numpy.float32'>'
	with 90209 stored elements in COOrdinate format>


build user feature matrix

In [46]:
# One (user id, [feature values]) pair per training row; a user with several
# rows is emitted several times. The generator is consumed by build_user_features.
uid_ufeats = (
    (user_id, list(feature_row))
    for user_id, feature_row in zip(get_uids(train_df), get_user_features(train_df))
)

user_features = dataset.build_user_features(uid_ufeats)
print(repr(user_features))

<16255x19387 sparse matrix of type '<class 'numpy.float32'>'
	with 76735 stored elements in Compressed Sparse Row format>


build item feature matrix

In [47]:
# One (item id, [feature values]) pair per training row; an item with several
# rows is emitted several times. The generator is consumed by build_item_features.
iid_ifeats = (
    (item_id, list(feature_row))
    for item_id, feature_row in zip(get_iids(train_df), get_item_features(train_df))
)

item_features = dataset.build_item_features(iid_ifeats)
print(repr(item_features))

<693x1321 sparse matrix of type '<class 'numpy.float32'>'
	with 2079 stored elements in Compressed Sparse Row format>


## 5. Model declaration & fitting

To tune the model parameters, see [lightfm.LightFM()](https://making.lyst.com/lightfm/docs/lightfm.html).

Parameters  
* no_components – the dimensionality of the feature latent embeddings. Defaults to `10`

In [None]:
# consts for model & training

# Loss name passed to LightFM(loss=...).
LOSS = 'logistic'
# Number of latent components passed to LightFM(no_components=...).
LATENT=10

# Number of epochs passed to model.fit(epochs=...).
EPOCH = 30

In [None]:
# Fit on `interactions`; the `weights` matrix built earlier is not passed to fit().
# NOTE(review): no random_state is given to LightFM; presumably this is why the
# recorded result tables differ from run to run. Confirm.
model = LightFM(no_components=LATENT, loss=LOSS)
model.fit(interactions, item_features=item_features, user_features=user_features, epochs=EPOCH)

## 6. Prediction

In [48]:
def predict(model, dataframe, dataset=None, user_features=None, item_features=None):
    """Score every (user, item) row of ``dataframe`` with ``model`` in one batch.

    Parameters
    ----------
    model : object exposing
        ``predict(user_ids, item_ids, item_features=..., user_features=...)``,
        e.g. a fitted ``lightfm.LightFM``.
    dataframe : pandas.DataFrame with 'uid_index', 'iid' and 'rrate' columns.
    dataset : optional object exposing ``mapping()`` that returns
        ``(user_id_map, user_feature_map, item_id_map, item_feature_map)``,
        e.g. the fitted ``lightfm.data.Dataset``. When given, dataframe ids are
        translated to the model's internal indices, looked up by their string
        form. Rows whose user or item is missing from the mapping cannot be
        scored and are dropped from BOTH returned arrays.
        When omitted, the ids are cast to int32 and passed through unchanged
        (the previous behaviour).
        NOTE(review): LightFM documents the ids of ``predict`` as internal
        indices, so pass ``dataset`` for LightFM models.
    user_features, item_features : optional feature matrices forwarded to
        ``model.predict``. LightFM documents that features used during
        training must also be supplied at prediction time.

    Returns
    -------
    (z, y) : tuple of numpy.ndarray
        ``z`` holds the float32 scores, ``y`` the integer labels taken from
        'rrate'; both have one entry per scored row, in dataframe order.
    """
    df = dataframe[['uid_index', 'iid', 'rrate']]
    # `np.int` no longer exists in current NumPy; the builtin int is the same dtype
    labels = np.array(df['rrate'].to_numpy(), dtype=int)

    if dataset is None:
        user_idx = np.array(df['uid_index'].to_numpy(), dtype=np.int32)
        item_idx = np.array(df['iid'].to_numpy(), dtype=np.int32)
    else:
        user_id_map, _, item_id_map, _ = dataset.mapping()
        mapped_users = df['uid_index'].astype(str).map(user_id_map)
        mapped_items = df['iid'].astype(str).map(item_id_map)
        # ids unseen when the dataset was fitted map to NaN
        known = (mapped_users.notna() & mapped_items.notna()).to_numpy()
        user_idx = mapped_users.to_numpy()[known].astype(np.int32)
        item_idx = mapped_items.to_numpy()[known].astype(np.int32)
        labels = labels[known]

    if len(labels) == 0:
        # nothing to score; do not call the model with empty id arrays
        return (np.array([], dtype=np.float32), labels)

    scores = model.predict(
        user_idx,
        item_idx,
        item_features=item_features,
        user_features=user_features,
    )
    return (np.asarray(scores, dtype=np.float32), labels)

In [None]:
# Inspect the training split.
train_df

In [None]:
validation_z, validation_y = predict(model, validation_df)
# Min-max rescale the raw scores into [0, 1]; the result is NaN if all scores are equal (np.ptp == 0).
validation_z = (validation_z - np.min(validation_z)) / np.ptp(validation_z)

In [None]:
test_z, test_y = predict(model, test_df)
# Min-max rescale the raw scores into [0, 1]; the result is NaN if all scores are equal (np.ptp == 0).
test_z = (test_z - np.min(test_z)) / np.ptp(test_z)

## 7. Evaluation

AUC metric

In [49]:
def evaluate_auc(z, y):
    """Return the ROC AUC of the scores ``z`` against the true labels ``y``."""
    auc = metrics.roc_auc_score(y_true=y, y_score=z)
    return auc

In [None]:
# AUC of the rescaled scores on the validation and test splits.
print("validation AUC: ", evaluate_auc(validation_z, validation_y))
print("test AUC: ", evaluate_auc(test_z, test_y))

LogLoss metric

In [50]:
# assume parameters z & y are ndarray
def evaluate_logloss(z, y):
    """Return the log loss of the scores ``z`` against the binary labels ``y``.

    ``z`` is treated as the probability of class 1, so the two-column
    probability matrix handed to scikit-learn is ``[1 - z, z]``.

    Parameters
    ----------
    z : 1-D ndarray of scores in [0, 1].
    y : 1-D ndarray of 0/1 labels.
    """
    z = np.asarray(z, dtype=np.float64)
    proba = np.column_stack((1.0 - z, z))
    # Naming the labels keeps column 0 = class 0 and column 1 = class 1 even
    # when `y` holds only one class, a case where log_loss would otherwise raise.
    return metrics.log_loss(y, proba, labels=[0, 1])

In [None]:
# Log loss of the rescaled scores on the validation and test splits.
print("validation LogLoss: ", evaluate_logloss(validation_z, validation_y))
print("test LogLoss: ", evaluate_logloss(test_z, test_y))

NDCG metric

In [51]:
# assume parameters z & y are ndarray
def evaluate_ndcg(z, y):
    """Return NDCG@5 with the whole input treated as one ranked list.

    Both arrays get a leading axis of length 1, so scikit-learn sees a
    single query whose candidates are all the rows.
    """
    true_relevance = np.asarray(y)[np.newaxis, ...]
    predicted_scores = np.asarray(z)[np.newaxis, ...]
    return metrics.ndcg_score(true_relevance, predicted_scores, k=5)

In [None]:
# NDCG@5 of the rescaled scores on the validation and test splits.
print("validation NDCG@5: ", evaluate_ndcg(validation_z, validation_y))
print("test NDCG@5: ", evaluate_ndcg(test_z, test_y))

## Experiments

In [52]:
# Same values as the constants cell in section 5, repeated so this section runs on its own.
LOSS = 'logistic'
LATENT=10

EPOCH = 30

In [53]:
def _min_max_scale(scores):
    """Rescale ``scores`` linearly into [0, 1]; a constant vector maps to all zeros."""
    spread = np.ptp(scores)
    if spread == 0:
        # avoid 0/0 -> NaN, which the metric functions cannot handle
        return np.zeros_like(scores)
    return (scores - np.min(scores)) / spread


def train_latents(epoch=EPOCH, start=1, end=10, step=1):
    """Train one model per latent size and collect validation/test metrics.

    Parameters
    ----------
    epoch : int
        Number of epochs for every fit.
    start, end, step : int
        Latent sizes tried are ``start, start + step, ...``, never going past
        ``end``. ``end`` itself is included when the step lands on it.

    Returns
    -------
    list of dict
        One entry per latent size with the keys 'epoch', 'latent', 'val_auc',
        'test_auc', 'val_logloss', 'test_logloss', 'val_ndcg' and 'test_ndcg'.
    """
    # inclusive upper bound in either direction, without overshooting `end`
    stop = end + 1 if step > 0 else end - 1

    history = []
    for latent_n in range(start, stop, step):

        print("Using latent size:", latent_n)
        model = LightFM(no_components=latent_n, loss=LOSS)
        model.fit(interactions, item_features=item_features, user_features=user_features, epochs=epoch)

        validation_z, validation_y = predict(model, validation_df)
        validation_z = _min_max_scale(validation_z)
        test_z, test_y = predict(model, test_df)
        test_z = _min_max_scale(test_z)

        history.append({
            'epoch': epoch,
            'latent': latent_n,
            'val_auc': evaluate_auc(validation_z, validation_y),
            'test_auc': evaluate_auc(test_z, test_y),
            'val_logloss': evaluate_logloss(validation_z, validation_y),
            'test_logloss': evaluate_logloss(test_z, test_y),
            'val_ndcg': evaluate_ndcg(validation_z, validation_y),
            'test_ndcg': evaluate_ndcg(test_z, test_y)
        })
    return history

In [54]:
# Sweep latent sizes 1 through 5, training each model for EPOCH epochs.
history = train_latents(epoch=EPOCH, start=1, end=5, step=1)

Using latent size: 1
Using latent size: 2
Using latent size: 3
Using latent size: 4
Using latent size: 5


In [55]:
# Render the sweep results as a markdown table: one header, one alignment row,
# then one row per latent size.
print("| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |")
print("|:-- | -- | -- | -- | -- | -- | -- |")
for his in history:
    print("| latent_n={} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} | {:.5f} |".format(
        *(his[key] for key in (
            'latent',
            'val_auc', 'val_logloss', 'val_ndcg',
            'test_auc', 'test_logloss', 'test_ndcg',
        ))
    ))

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| latent_n=1 | 0.50097 | 1.46479 | 1.00000 | 0.49738 | 1.97449 | 0.27727 |
| latent_n=2 | 0.50390 | 2.01784 | 1.00000 | 0.50333 | 3.12638 | 0.44685 |
| latent_n=3 | 0.50260 | 2.12201 | 1.00000 | 0.50200 | 3.41549 | 0.27727 |
| latent_n=4 | 0.50504 | 2.42503 | 1.00000 | 0.50610 | 3.90011 | 0.27727 |
| latent_n=5 | 0.49486 | 2.61812 | 1.00000 | 0.48516 | 3.99912 | 0.44685 |


## Experiment Results

LON-A dataset:  

| settings | validation AUC | validation LogLoss | validation NDCG@5 | testing AUC | testing LogLoss | testing NDCG@5 |
|:-- | -- | -- | -- | -- | -- | -- |
| **latent_n=1*** | 0.50749 | 2.67413 | 0.66084 | 0.49727 | 2.15446 | 0.66084 |
| latent_n=2 | 0.50484 | 2.79746 | 0.66084 | 0.49862 | 2.95496 | 0.44685 |
| latent_n=3 | 0.50840 | 3.45294 | 0.66084 | 0.50411 | 3.46178 | 0.27727 |
| latent_n=4 | 0.49764 | 3.70850 | 0.66084 | 0.49985 | 3.64026 | 0.27727 |
| latent_n=5 | 0.48261 | 3.67404 | 0.66084 | 0.48091 | 3.81011 | 0.27727 |
| latent_n=6 | 0.50135 | 4.22715 | 0.66084 | 0.49615 | 4.19156 | 0.27727 |
| latent_n=7 | 0.50788 | 4.42946 | 0.66084 | 0.50081 | 4.46515 | 0.27727 |
| latent_n=8 | 0.49772 | 4.40505 | 0.66084 | 0.48850 | 4.60153 | 0.27727 |
| latent_n=9 | 0.50088 | 4.71791 | 0.66084 | 0.49938 | 4.48787 | 0.27727 |
| latent_n=10 | 0.50002 | 4.66355 | 0.66084 | 0.49069 | 4.69170 | 0.27727 |