## Factorization Machine

using dataset LON-A

#### 1. Set-up
Import the required packages and declare the constants used throughout the notebook.

In [2]:
# package initialization

import pandas as pd
import numpy as np
from sklearn import metrics

In [56]:
# consts

# Path to the LON-A review export, relative to the notebook's working directory.
DATASET = '../../data/extracted/LON-A/London_Attractions_Complete_Review.csv'
# Minimum number of rows a user/item needs in order to be kept by data_preprocess.
# (The spelling "OCCURENCE" is kept because other cells reference this name.)
OCCURENCE_THRESHOLD = 5

# Side-information columns that are carried through preprocessing and later
# turned into LightFM user/item feature names.
USER_FEATURES = ['ugender', 'ucity', 'ucountry', 'ulevel']
ITEM_FEATURES = ['iattribute', 'irating', 'itag']

#### 2. Read Dataset
Read the dataset into a `pandas.DataFrame`. The file has a `.csv` extension but is tab-separated, so it is parsed with `sep='\t'`.

In [4]:
# read dataset

# The file carries a .csv extension but is parsed as tab-separated (sep='\t').
df = pd.read_csv(DATASET, sep='\t')

In [5]:
# summarize the loaded frame: column names first, then (n_rows, n_cols)

print("Columns: \n", df.columns.tolist())
print("\nShape: \n", df.shape)

Columns: 
 ['Unnamed: 0', 'Unnamed: 0.1', 'iid', 'rid', 'rimages', 'rquote', 'rrate', 'rtime', 'uprofile', 'uage', 'ucity', 'ucountry', 'ugender', 'uhometown', 'uid_index', 'ulevel', 'uname_y', 'usince', 'ustate', 'ustyle', 'iattribute', 'ilocality', 'iname', 'ipopularity', 'ipost', 'irating', 'iregion', 'istreet', 'itag']

Shape: 
 (136978, 29)


#### 3. Data Preprocessing

* Retain users/items with at least five ratings only
* Data splitting
  - Hold out the latest 20% of each user's interactions (by time) as the test set
  - Randomly split the remaining data into training (70%) and validation (10%) sets
* Transform the ratings into binary implicit feedback as ground truth, indicating whether the user has interacted with the specific item

In [6]:
def sort_by_time(df):
    """Return ``df`` ordered by ascending ``rid``.

    ``rid`` stands in for the review time: the notebook relies on review ids
    being auto-incrementing, so a larger id means a later review.
    The input frame is not modified; a sorted frame is returned.
    """
    time_key = ['rid']
    ordered = df.sort_values(by=time_key, ascending=True)
    return ordered

In [7]:
def filter_by_occurrence(df, column, threshold):
    """Keep only the rows whose ``column`` value occurs at least ``threshold`` times.

    Rows are grouped by ``column``; every group with fewer than ``threshold``
    rows is dropped as a whole. Surviving rows keep their original order.
    """
    def occurs_often_enough(group):
        return len(group) >= threshold

    grouped = df.groupby(column)
    return grouped.filter(occurs_often_enough)

In [8]:
def convert_binary(df):
    """Binarize the ``rrate`` column into implicit feedback.

    A row whose ``rrate`` equals the string ``"None"`` becomes 0.0; every
    other row becomes 1.0.

    The caller's frame is left untouched: the conversion is done on a copy,
    which avoids writing into a slice of another frame (and the
    SettingWithCopyWarning that goes with it). The returned ``rrate`` column
    is a float column rather than a mixed object column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``rrate`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``rrate`` replaced by 0.0 / 1.0.
    """
    binarized = df.copy()
    # Compute the mask once, from the original values, then overwrite the column.
    has_rating = binarized['rrate'] != "None"
    binarized['rrate'] = has_rating.astype(float)
    return binarized

In [58]:
def data_preprocess(dataframe, seed=None):
    """Clean the raw review frame and split it into train/validation/test sets.

    Steps:
      1. sort rows by time (``sort_by_time``);
      2. keep only the id, rating and USER_FEATURES / ITEM_FEATURES columns;
      3. binarize ``rrate`` (``convert_binary``) and make it numeric;
      4. drop items, then users, with fewer than OCCURENCE_THRESHOLD rows;
      5. per user, hold out the latest 20% of rows as the test set;
      6. shuffle the remaining rows and split them 87.5% / 12.5% into
         training and validation (70% / 10% of each user's rows overall).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw review frame; it is not modified.
    seed : int or None, optional
        Seed for the train/validation shuffle. ``None`` (default) keeps the
        previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, validation_df, test_df)``.
    """
    test_start_fraction = 0.8   # the first 80% of a user's rows are train+validation
    train_fraction = 0.875      # 0.875 * 0.8 = 0.7 of the data ends up in training

    # sort by time (ascending order)
    df = sort_by_time(dataframe)

    # retrieve needed columns; copy so the assignments below never write
    # into a slice of the caller's frame
    df = df[['uid_index', 'iid', 'rrate'] + USER_FEATURES + ITEM_FEATURES].copy()

    # convert ratings into binary implicit feedback
    df = convert_binary(df)
    df['rrate'] = pd.to_numeric(df['rrate'])

    # Retain users/items with at least OCCURENCE_THRESHOLD rows only.
    # Items are filtered first, so dropping sparse users afterwards can push
    # some items back under the threshold; this single pass is kept as-is.
    df = filter_by_occurrence(df, 'iid', OCCURENCE_THRESHOLD)
    df = filter_by_occurrence(df, 'uid_index', OCCURENCE_THRESHOLD)

    # For each user, the latest 20% of rows go to the test set. The pieces are
    # collected in lists and concatenated once: DataFrame.append was removed
    # in pandas 2.0 and growing a frame inside a loop is quadratic.
    test_parts = []
    train_validation_parts = []
    for _, user_rows in df.groupby('uid_index'):
        split_idx = int(len(user_rows) * test_start_fraction)
        train_validation_parts.append(user_rows.iloc[:split_idx])
        test_parts.append(user_rows.iloc[split_idx:])

    # pd.concat rejects an empty list, so fall back to an empty frame with
    # the right columns when no user survived the filtering.
    empty = df.iloc[0:0]
    test_df = pd.concat(test_parts) if test_parts else empty
    train_validation_df = pd.concat(train_validation_parts) if train_validation_parts else empty

    # Shuffle by position rather than by label so duplicate index labels
    # cannot break (or duplicate rows in) the shuffle.
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(len(train_validation_df))
    train_validation_df = train_validation_df.iloc[order]

    cut = int(len(train_validation_df) * train_fraction)
    train_df = train_validation_df.iloc[:cut]
    validation_df = train_validation_df.iloc[cut:]

    return (train_df, validation_df, test_df)

In [17]:
# dataset preprocessing

# Split the raw frame and report each split's (rows, columns).
# The train/validation split is shuffled inside data_preprocess, so the exact
# rows in each of those two splits can differ between runs.
train_df, validation_df, test_df = data_preprocess(df)
print("training set size: ", train_df.shape)
print("validation set size: ", validation_df.shape)
print("test set size: ", test_df.shape)

training set size:  (90209, 10)
validation set size:  (12887, 10)
test set size:  (33178, 10)


#### 4. load into LightFM

In [18]:
from lightfm import LightFM
from lightfm.data import Dataset



In [160]:
def get_unique_uids(df):
    """Return the distinct ``uid_index`` values of ``df`` as strings, in order of first appearance."""
    uid_strings = df['uid_index'].astype(str)
    return uid_strings.unique()

In [161]:
def get_uids(df):
    """Return the ``uid_index`` of every row of ``df`` as strings (one entry per row, duplicates kept)."""
    uid_strings = df['uid_index'].astype(str)
    return uid_strings.values

In [162]:
def get_iids(df):
    """Return the ``iid`` of every row of ``df`` as strings (one entry per row, duplicates kept)."""
    iid_strings = df['iid'].astype(str)
    return iid_strings.values

In [163]:
def get_unique_iids(df):
    """Return the distinct ``iid`` values of ``df`` as strings, in order of first appearance."""
    iid_strings = df['iid'].astype(str)
    return iid_strings.unique()

In [164]:
def get_unique_user_features(df):
    """Return the sorted distinct string values found in the USER_FEATURES columns.

    Values from all USER_FEATURES columns are pooled into one flat array, so
    the same string appearing in two different columns is returned once.
    """
    feature_strings = df[USER_FEATURES].astype(str).values
    return np.unique(feature_strings.ravel())

In [165]:
def get_user_features(df):
    """Return the USER_FEATURES columns of ``df`` as a 2-D array of strings, one row per row of ``df``."""
    return df[USER_FEATURES].astype(str).values

In [166]:
def get_unique_item_features(df):
    """Return the sorted distinct string values found in the ITEM_FEATURES columns.

    Values from all ITEM_FEATURES columns are pooled into one flat array, so
    the same string appearing in two different columns is returned once.
    """
    feature_strings = df[ITEM_FEATURES].astype(str).values
    return np.unique(feature_strings.ravel())

In [167]:
def get_item_features(df):
    """Return the ITEM_FEATURES columns of ``df`` as a 2-D array of strings, one row per row of ``df``."""
    return df[ITEM_FEATURES].astype(str).values

In [168]:
def get_iteractions(df, rating=False):
    """Yield one interaction tuple per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``uid_index`` and ``iid`` columns (and ``rrate`` when
        ``rating`` is true).
    rating : bool, optional
        When false (default) each tuple is ``(uid, iid)``. When true each
        tuple is ``(uid, iid, weight)`` with ``weight`` taken from ``rrate``.

    Returns
    -------
    iterator of tuple
        User and item ids are strings, matching the string ids produced by
        the other id helpers. The weight is kept as a float: casting it to a
        string (``'1.0'``) would make it unusable as a numeric weight.
    """
    uids = df['uid_index'].astype(str)
    iids = df['iid'].astype(str)

    if rating:
        weights = df['rrate'].astype(float)
        return zip(uids, iids, weights)

    return zip(uids, iids)

In [169]:
# Sanity check: vocabulary sizes of the training split. The two feature
# counts pool values across all USER_FEATURES / ITEM_FEATURES columns.
print("# unique users:", len(get_unique_uids(train_df)))
print("# unique items:", len(get_unique_iids(train_df)))
print("# unique user features:", len(get_unique_user_features(train_df)))
print("# unique item features:", len(get_unique_item_features(train_df)))

# unique users: 16258
# unique items: 691
# unique user features: 3133
# unique item features: 822


In [170]:
# LightFM helper that holds the id/feature mappings used by the cells below.
dataset = Dataset()

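Fit the dataset's mappings: register every user id, item id, and user/item feature name so that LightFM knows the dimensions of the interaction and feature matrices.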

In [171]:
# Register the ids and feature names found in the training split only.
# NOTE(review): ids or feature values that occur only in validation_df /
# test_df are not registered here, so building matrices for those splits
# will presumably fail on unknown ids -- confirm before evaluating.
dataset.fit(
    get_unique_uids(train_df),
    get_unique_iids(train_df),
    user_features = get_unique_user_features(train_df),
    item_features = get_unique_item_features(train_df)
)

In [172]:
# Report the (users, items) shape implied by the fitted mappings.
num_users, num_items = dataset.interactions_shape()
print('Interaction matrix shape -- Num users: {} x num_items {}.'.format(num_users, num_items))

Interaction matrix shape -- Num users: 16258 x num_items 691.


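Build the interaction matrix. With `rating=False` only (user, item) pairs are passed, so the matrix records which items each user interacted with (implicit feedback) rather than the rating values.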

In [173]:
# rating=False: only (uid, iid) pairs are passed, with no explicit weight per pair.
(interactions, weights) = dataset.build_interactions(get_iteractions(train_df, rating=False))

In [174]:
# Show shape, dtype and number of stored entries of both sparse matrices.
print(repr(interactions))
print(repr(weights))

<16258x691 sparse matrix of type '<class 'numpy.int32'>'
	with 90209 stored elements in COOrdinate format>
<16258x691 sparse matrix of type '<class 'numpy.float32'>'
	with 90209 stored elements in COOrdinate format>


build user feature matrix

In [175]:
# train_df has one row per interaction, so a user appears once per review.
# Emit one (uid, features) entry per user instead of one per row:
# NOTE(review): LightFM appears to accumulate repeated (user, feature) entries
# when it assembles the sparse matrix, which would weight a user's side
# features by their interaction count before row normalisation -- confirm
# against the installed LightFM version.
# drop_duplicates keeps the first row seen for each user.
user_rows = train_df.drop_duplicates(subset='uid_index')
uid_ufeats = ((uid, list(ufeats)) for uid, ufeats in zip(get_uids(user_rows), get_user_features(user_rows)) )

user_features = dataset.build_user_features(uid_ufeats)
print(repr(user_features))

<16258x19391 sparse matrix of type '<class 'numpy.float32'>'
	with 76750 stored elements in Compressed Sparse Row format>


build item feature matrix

In [178]:
# train_df has one row per interaction, so an item appears once per review.
# Emit one (iid, features) entry per item instead of one per row:
# NOTE(review): LightFM appears to accumulate repeated (item, feature) entries
# when it assembles the sparse matrix, which would weight an item's side
# features by its review count before row normalisation -- confirm against
# the installed LightFM version.
# drop_duplicates keeps the first row seen for each item.
item_rows = train_df.drop_duplicates(subset='iid')
iid_ifeats = ((iid, list(ifeats)) for iid, ifeats in zip(get_iids(item_rows), get_item_features(item_rows)) )

item_features = dataset.build_item_features(iid_ifeats)
print(repr(item_features))

<691x1513 sparse matrix of type '<class 'numpy.float32'>'
	with 2764 stored elements in Compressed Sparse Row format>


#### reference code

In [27]:
# Reference snippet, kept for comparison with the cells above.
# NOTE(review): get_ratings is not defined anywhere in the visible notebook,
# so this cell cannot run on a fresh kernel unless it is defined elsewhere.
# It also rebinds `dataset`, discarding the Dataset fitted on train_df above.
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((x['User-ID'] for x in get_ratings()),
            (x['ISBN'] for x in get_ratings()))

In [28]:
# Reference snippet: rebinds num_users / num_items from the cell that
# reported the LON-A interaction matrix shape.
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 105283, num_items 340553.


In [29]:
# Reference snippet: extends the already-fitted mappings with additional
# items and item feature names.
# NOTE(review): get_book_features is not defined in the visible notebook.
dataset.fit_partial(items=(x['ISBN'] for x in get_book_features()),
                    item_features=(x['Book-Author'] for x in get_book_features()))

In [30]:
# Reference snippet: rebinds `interactions` and `weights`, replacing the
# matrices built from train_df earlier in the notebook.
(interactions, weights) = dataset.build_interactions(((x['User-ID'], x['ISBN'])
                                                      for x in get_ratings()))

print(repr(interactions))

<105283x341762 sparse matrix of type '<class 'numpy.int32'>'
	with 1149780 stored elements in COOrdinate format>


In [31]:
# Reference snippet: rebinds `item_features`, replacing the matrix built from
# train_df earlier in the notebook. Each item gets a single feature, its author.
item_features = dataset.build_item_features(((x['ISBN'], [x['Book-Author']])
                                              for x in get_book_features()))
print(repr(item_features))

<341762x443805 sparse matrix of type '<class 'numpy.float32'>'
	with 613141 stored elements in Compressed Sparse Row format>


In [32]:
# Reference snippet: fits a BPR-loss LightFM model on whatever `interactions`
# and `item_features` currently hold (after the reference cells above, that
# is the reference data and not the LON-A matrices).
from lightfm import LightFM

model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)

<lightfm.lightfm.LightFM at 0x2940e3bd488>