In [2]:
import lightfm
import pandas as pd
import numpy as np

In [3]:
from lightfm.data import Dataset

In [4]:
# Load the review table from CSV; the file's first column is used as the index.
reviewframe = pd.read_csv(
    filepath_or_buffer='./data/all_reviews_no_dupes.csv',
    index_col=[0],
)

In [5]:
# Print (row count, column count), then show the first five rows as the cell output.
print((len(reviewframe.index), len(reviewframe.columns)))
reviewframe.head(5)

(2398712, 10)


Unnamed: 0,review_id,user_id,business_id,stars,date,name,city,state,categories,repeat_review_id
4786459,vRuKx3khNZZKM6nieI2nkw,r7-lUT62G15CXa9Wa6uvdA,hkNMo5kY8OhObwJF2qYIew,5,2019-12-13 15:51:19,Nirvana Center,phoenix,AZ,"Medical Centers, Shopping, Cannabis Dispensari...",r7-lUT62G15CXa9Wa6uvdAhkNMo5kY8OhObwJF2qYIew
4045013,HhRdgBVdTC3GzRszO79EIw,KJA805UOqiatClH8psYmuw,AE80fzfVnsyeHiiVhwSGMw,1,2019-12-13 15:50:11,Deluxburger Express,phoenix,AZ,"Burgers, Restaurants",KJA805UOqiatClH8psYmuwAE80fzfVnsyeHiiVhwSGMw
2648856,hhAhkWco5iarESCUpMQJJQ,h-aIa6eOkcwZYGQz8Ss02w,WHiND6UPxr6WzgGhz2E91Q,5,2019-12-13 15:45:53,Hash Kitchen,chandler,AZ,"Breakfast & Brunch, Restaurants",h-aIa6eOkcwZYGQz8Ss02wWHiND6UPxr6WzgGhz2E91Q
7042961,5m1rgmC6D0PXn1UGbo6gJw,Ia4iRpMZP_mGyozlUz6Q7g,XJIs0pWVNu60fhehIXgaUg,5,2019-12-13 15:45:49,Advanced Endodontics,mesa,AZ,"Health & Medical, Endodontists, Dentists",Ia4iRpMZP_mGyozlUz6Q7gXJIs0pWVNu60fhehIXgaUg
5950973,QBTdmDhRNcbUGU2WV06PiA,8yWDN0YsJC65quUz6LDuNA,9MVKjEMN5T59uzG1xoD2BQ,5,2019-12-13 15:43:16,Cocina Madrigal,phoenix,AZ,"Restaurants, Mexican",8yWDN0YsJC65quUz6LDuNA9MVKjEMN5T59uzG1xoD2BQ


In [6]:
# Binary target for ROC-AUC: 4-5 stars map to 1, 1-3 stars map to -1.
reviewframe['recommend'] = reviewframe['stars'].map(
    {star: (1 if star >= 4 else -1) for star in range(1, 6)}
)

In [7]:
# Work on an independent (deep) copy so the original review frame stays untouched.
reviewframe_with_dummy = reviewframe.copy(deep=True)

In [8]:
# Single-row frame describing a placeholder user who reviewed a placeholder
# business (3 stars, recommend = -1). Key order below fixes the column order.
newuser = pd.DataFrame(
    data=dict(
        review_id='fake_review',
        user_id='new_user',
        business_id='dummy_business',
        city='Nowhere',
        state='NW',
        stars=3,
        date=pd.to_datetime('01-01-2020'),
        name='dummy business',
        categories='',
        repeat_review_id='new_userdummy_business',
        recommend=-1,
        cat_list='',
    ),
    index=[0],
)

In [9]:
# Insert a new user - for equivalence with other tests.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same result (the dummy row is
# added last and the index is renumbered 0..n-1).
reviewframe_with_dummy = pd.concat([reviewframe_with_dummy, newuser], ignore_index=True)

In [10]:
# The LightFM Dataset object records how raw user/business ids map onto the
# indices of the sparse matrices it generates.
lightfm_dataset = Dataset()
lightfm_dataset.fit(
    users=reviewframe_with_dummy['user_id'],
    items=reviewframe_with_dummy['business_id'],
)

In [11]:
# Dimensions of the interaction matrix the fitted dataset will produce.
dataset_shape = lightfm_dataset.interactions_shape()
print(dataset_shape)

(606928, 60541)


In [12]:
# Distinct user and business counts in the original frame (without the dummy row).
for id_column in ('user_id', 'business_id'):
    print(reviewframe[id_column].nunique())

606927
60540


In [12]:
# Build interactions from (user_id, business_id, recommend) triples.
# The three columns are zipped directly instead of going through
# DataFrame.iterrows(), which constructs a Series per row and is very slow on
# a frame with millions of rows; the tuples yielded are the same.
# NOTE(review): per the LightFM docs the first returned matrix appears to mark
# only *that* an interaction exists, while the third tuple element is stored in
# `weights` - confirm which matrix later cells should consume for the
# recommend / not-recommend signal.
(lightfm_interactions, weights) = lightfm_dataset.build_interactions(
    zip(
        reviewframe_with_dummy['user_id'],
        reviewframe_with_dummy['business_id'],
        reviewframe_with_dummy['recommend'],
    )
)

In [13]:
from lightfm.cross_validation import random_train_test_split

# Fixed seed so the split (and the models trained later) are repeatable.
seed = 42

# Hold out 30% of the interactions as the test set.
train, test = random_train_test_split(
    lightfm_interactions,
    test_percentage=0.3,
    random_state=np.random.RandomState(seed),
)

print(f'{train.shape[0]} users, {train.shape[1]} businesses, {test.getnnz()} test interactions, {train.getnnz()} train interactions')

606928 users, 60541 businesses, 719614 test interactions, 1679099 train interactions


In [16]:
from lightfm import LightFM
from lightfm.evaluation import auc_score

# Sweep the number of latent components, recording the mean train and test
# AUC obtained for each setting.
num_threads = 4
num_components = [5, 10, 15, 20, 25, 30]
component_train_accuracy = []
component_test_accuracy = []

for n_components in num_components:
    model = LightFM(
        no_components=n_components,
        loss='logistic',
        random_state=seed,
    )
    model.fit(train, epochs=5, num_threads=num_threads)

    # Mean of the scores auc_score returns for the training interactions.
    train_auc = auc_score(model, train, num_threads=num_threads).mean()
    component_train_accuracy.append(train_auc)
    print('train AUC: %s' % train_auc)

    # Same metric on the held-out interactions.
    test_auc = auc_score(model, test, num_threads=num_threads).mean()
    component_test_accuracy.append(test_auc)
    print('test AUC: %s' % test_auc)


NameError: name 'NUM_THREADS' is not defined

In [None]:
# Sweep the learning rate with the number of components fixed at 20 and
# 5 training epochs, recording mean train/test AUC for each value.
lr_train_accuracy = []
lr_test_accuracy = []
# The previous grid (5..30) was copied from the component sweep; those values
# are orders of magnitude above LightFM's default learning rate of 0.05.
# Use a log-spaced grid around the default instead.
learning_rates = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
num_threads = 4

for val in learning_rates:

    model = LightFM(loss='logistic', random_state=seed,
                    no_components=20, learning_rate=val)
    model.fit(train, epochs=5, num_threads=num_threads)

    train_auc = auc_score(model, train, num_threads=num_threads).mean()
    print('train AUC: %s' % train_auc)
    lr_train_accuracy.append(train_auc)

    test_auc = auc_score(model, test, num_threads=num_threads).mean()
    print('test AUC: %s' % test_auc)
    lr_test_accuracy.append(test_auc)

In [None]:
# Sweep the number of training epochs with the number of components fixed at
# 20, recording mean train/test AUC for each epoch count.
epochs_train_accuracy = []
epochs_test_accuracy = []
num_epochs = [5, 10, 15, 20, 25, 30]
num_threads = 4

for val in num_epochs:

    # `val` is an epoch count here, so it must not be passed as learning_rate
    # (the original copy-paste did); the library's default learning rate is
    # used so that only the epoch count varies in this sweep.
    model = LightFM(loss='logistic', random_state=seed,
                    no_components=20)
    model.fit(train, epochs=val, num_threads=num_threads)

    # Results go into the epochs_* lists; the original appended to the
    # learning-rate lists, leaving these empty and polluting the other sweep.
    train_auc = auc_score(model, train, num_threads=num_threads).mean()
    print('train AUC: %s' % train_auc)
    epochs_train_accuracy.append(train_auc)

    test_auc = auc_score(model, test, num_threads=num_threads).mean()
    print('test AUC: %s' % test_auc)
    epochs_test_accuracy.append(test_auc)