In [1]:
import lightfm
import pandas as pd
import numpy as np

from random import sample

In [2]:
from lightfm.data import Dataset

In [3]:
# Load the de-duplicated review table; the CSV's first column is used as the row index.
reviews_csv_path = './data/all_reviews_no_dupes.csv'
reviewframe = pd.read_csv(reviews_csv_path, index_col=[0])

In [4]:
# Sanity check on the load: print (rows, columns), then preview the first rows.
print(reviewframe.shape)
reviewframe.head()

(2398712, 10)


Unnamed: 0,review_id,user_id,business_id,stars,date,name,city,state,categories,repeat_review_id
4786459,vRuKx3khNZZKM6nieI2nkw,r7-lUT62G15CXa9Wa6uvdA,hkNMo5kY8OhObwJF2qYIew,5,2019-12-13 15:51:19,Nirvana Center,phoenix,AZ,"Medical Centers, Shopping, Cannabis Dispensari...",r7-lUT62G15CXa9Wa6uvdAhkNMo5kY8OhObwJF2qYIew
4045013,HhRdgBVdTC3GzRszO79EIw,KJA805UOqiatClH8psYmuw,AE80fzfVnsyeHiiVhwSGMw,1,2019-12-13 15:50:11,Deluxburger Express,phoenix,AZ,"Burgers, Restaurants",KJA805UOqiatClH8psYmuwAE80fzfVnsyeHiiVhwSGMw
2648856,hhAhkWco5iarESCUpMQJJQ,h-aIa6eOkcwZYGQz8Ss02w,WHiND6UPxr6WzgGhz2E91Q,5,2019-12-13 15:45:53,Hash Kitchen,chandler,AZ,"Breakfast & Brunch, Restaurants",h-aIa6eOkcwZYGQz8Ss02wWHiND6UPxr6WzgGhz2E91Q
7042961,5m1rgmC6D0PXn1UGbo6gJw,Ia4iRpMZP_mGyozlUz6Q7g,XJIs0pWVNu60fhehIXgaUg,5,2019-12-13 15:45:49,Advanced Endodontics,mesa,AZ,"Health & Medical, Endodontists, Dentists",Ia4iRpMZP_mGyozlUz6Q7gXJIs0pWVNu60fhehIXgaUg
5950973,QBTdmDhRNcbUGU2WV06PiA,8yWDN0YsJC65quUz6LDuNA,9MVKjEMN5T59uzG1xoD2BQ,5,2019-12-13 15:43:16,Cocina Madrigal,phoenix,AZ,"Restaurants, Mexican",8yWDN0YsJC65quUz6LDuNA9MVKjEMN5T59uzG1xoD2BQ


In [5]:
# Collapse star ratings into a binary label for ROC-AUC:
# 4-5 stars -> 1 (recommend), 1-3 stars -> -1 (do not recommend).
# Series.map leaves NaN for any star value that is not a key of the mapping.
star_to_recommend = {1: -1, 2: -1, 3: -1, 4: 1, 5: 1}
reviewframe['recommend'] = reviewframe['stars'].map(star_to_recommend)

In [6]:
# Candidate users for the fine-tuning experiment: everyone with exactly 10 reviews.
usercounts = reviewframe['user_id'].value_counts()
has_exactly_ten_reviews = usercounts == 10
test_users = usercounts[has_exactly_ten_reviews].index.tolist()

In [7]:
# Number of users that have exactly 10 reviews.
len(test_users)

5632

In [8]:
# Independent copy of the reviews, so the dummy row added afterwards never touches reviewframe.
reviewframe_with_dummy = reviewframe.copy(deep=True)

In [9]:
# One-row frame describing a placeholder user and business; it gives the
# fine-tuning step an identity to attribute brand-new reviews to.
# NOTE(review): 'cat_list' is not among the columns shown in the reviewframe
# previews, so combining this row with the reviews adds an extra column — confirm
# that is intended.
dummy_review = {
    'review_id': 'fake_review',
    'user_id': 'new_user',
    'business_id': 'dummy_business',
    'city': 'Nowhere',
    'state': 'NW',
    'stars': 3,
    'date': pd.to_datetime('01-01-2020'),
    'name': 'dummy business',
    'categories': '',
    'repeat_review_id': 'new_userdummy_business',
    'recommend': -1,
    'cat_list': '',
}
newuser = pd.DataFrame(dummy_review, index=[0])

In [10]:
# Add the dummy user row to the review table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed
# in pandas 2.0. Building from `reviewframe` (rather than from reviewframe_with_dummy
# itself) keeps this cell idempotent: re-running it never stacks extra dummy rows.
# ignore_index=True renumbers the rows 0..n-1, as the original append call did.
reviewframe_with_dummy = pd.concat([reviewframe, newuser], ignore_index=True)

In [11]:
# Two LightFM datasets with identical id mappings (every user and business, dummy
# included): one backs the initial training matrix, the other the fine-tuning matrix.
all_user_ids = reviewframe_with_dummy['user_id']
all_business_ids = reviewframe_with_dummy['business_id']

pretrain_dataset = Dataset()
pretrain_dataset.fit(users=all_user_ids, items=all_business_ids)

retrain_dataset = Dataset()
retrain_dataset.fit(users=all_user_ids, items=all_business_ids)

In [12]:
# Both datasets should report the same (users, businesses) shape.
for fitted_dataset in (pretrain_dataset, retrain_dataset):
    print(fitted_dataset.interactions_shape())

(606928, 60541)
(606928, 60541)


In [13]:
# Distinct users and businesses in the real reviews. The LightFM datasets hold one
# more of each: the dummy user and dummy business that new reviews are attributed to.
for id_column in ('user_id', 'business_id'):
    print(reviewframe[id_column].nunique())

606927
60540


In [14]:
# Hold out one user from the test_users list as a worked example.
user_to_exclude = test_users[57]
is_held_out_user = reviewframe['user_id'] == user_to_exclude

# The held-out user's own reviews; the printed shape shows how many they wrote.
included_user_frame = reviewframe[is_held_out_user].copy()
print(included_user_frame.shape)

# Every review in the main dataframe written by anyone else.
excluded_user_frame = reviewframe[~is_held_out_user].copy()
print(excluded_user_frame.shape)
excluded_user_frame.head()

(10, 11)
(2398702, 11)


Unnamed: 0,review_id,user_id,business_id,stars,date,name,city,state,categories,repeat_review_id,recommend
4786459,vRuKx3khNZZKM6nieI2nkw,r7-lUT62G15CXa9Wa6uvdA,hkNMo5kY8OhObwJF2qYIew,5,2019-12-13 15:51:19,Nirvana Center,phoenix,AZ,"Medical Centers, Shopping, Cannabis Dispensari...",r7-lUT62G15CXa9Wa6uvdAhkNMo5kY8OhObwJF2qYIew,1
4045013,HhRdgBVdTC3GzRszO79EIw,KJA805UOqiatClH8psYmuw,AE80fzfVnsyeHiiVhwSGMw,1,2019-12-13 15:50:11,Deluxburger Express,phoenix,AZ,"Burgers, Restaurants",KJA805UOqiatClH8psYmuwAE80fzfVnsyeHiiVhwSGMw,-1
2648856,hhAhkWco5iarESCUpMQJJQ,h-aIa6eOkcwZYGQz8Ss02w,WHiND6UPxr6WzgGhz2E91Q,5,2019-12-13 15:45:53,Hash Kitchen,chandler,AZ,"Breakfast & Brunch, Restaurants",h-aIa6eOkcwZYGQz8Ss02wWHiND6UPxr6WzgGhz2E91Q,1
7042961,5m1rgmC6D0PXn1UGbo6gJw,Ia4iRpMZP_mGyozlUz6Q7g,XJIs0pWVNu60fhehIXgaUg,5,2019-12-13 15:45:49,Advanced Endodontics,mesa,AZ,"Health & Medical, Endodontists, Dentists",Ia4iRpMZP_mGyozlUz6Q7gXJIs0pWVNu60fhehIXgaUg,1
5950973,QBTdmDhRNcbUGU2WV06PiA,8yWDN0YsJC65quUz6LDuNA,9MVKjEMN5T59uzG1xoD2BQ,5,2019-12-13 15:43:16,Cocina Madrigal,phoenix,AZ,"Restaurants, Mexican",8yWDN0YsJC65quUz6LDuNA9MVKjEMN5T59uzG1xoD2BQ,1


In [15]:
# Show the id of the user being held out.
user_to_exclude

'h8Piwvy3-jj3PEYkggqqpg'

In [16]:
# Build the pretraining interaction matrix from every review except the held-out user's.
# The three columns are zipped directly instead of going through DataFrame.iterrows(),
# which materialises a Series per row and is very slow on a frame of ~2.4M rows; the
# (user_id, business_id, recommend) triples handed to LightFM are the same.
# NOTE(review): only the first returned matrix is used for training later on. Per the
# LightFM docs the third tuple element (here `recommend`) is stored in the second
# matrix (`weights`), not in the interactions matrix — confirm the -1 labels actually
# reach the logistic loss.
(pretrain_interactions, weights) = pretrain_dataset.build_interactions(
    zip(excluded_user_frame['user_id'],
        excluded_user_frame['business_id'],
        excluded_user_frame['recommend'])
)

In [17]:
# Build the fine-tuning interaction matrix from the held-out user's reviews only.
# Columns are zipped directly rather than iterated with DataFrame.iterrows(); the
# (user_id, business_id, recommend) triples are the same, without per-row Series overhead.
# NOTE(review): this rebinds `weights`, replacing whatever an earlier cell stored under
# that name, and only the first returned matrix is used afterwards. Per the LightFM docs
# the `recommend` values are stored in `weights` — confirm the -1 labels are not being
# dropped.
(retrain_interactions, weights) = retrain_dataset.build_interactions(
    zip(included_user_frame['user_id'],
        included_user_frame['business_id'],
        included_user_frame['recommend'])
)

In [24]:
from lightfm.cross_validation import random_train_test_split

# Fixed seed so the 70/30 split of the held-out user's interactions is repeatable.
seed = 42
split_rng = np.random.RandomState(seed)
train, test = random_train_test_split(
    retrain_interactions,
    test_percentage=0.3,
    random_state=split_rng,
)

print(f'{train.shape[0]} users, {train.shape[1]} businesses, {test.getnnz()} test interactions, {train.getnnz()} train interactions')

606928 users, 60541 businesses, 3 test interactions, 7 train interactions


In [21]:
from lightfm import LightFM

# Hyperparameters for the single-user walkthrough.
NUM_THREADS = 4        # passed as num_threads to fitting and scoring calls
NUM_COMPONENTS = 20    # passed as no_components to LightFM
NUM_EPOCHS = 20        # pretraining epochs
learning_rate = 0.05

# Logistic-loss model, seeded with the same seed as the train/test split.
model = LightFM(
    no_components=NUM_COMPONENTS,
    learning_rate=learning_rate,
    loss='logistic',
    random_state=seed,
)

In [22]:
# Pretrain on the interactions of everyone except the held-out user; %time reports how long the call takes.
%time model.fit_partial(pretrain_interactions,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)

Wall time: 5min 50s


<lightfm.lightfm.LightFM at 0x25c1b4a8d08>

In [26]:
# auc_score was only imported by a later cell, so this cell raised NameError on a
# fresh Restart & Run All; import it here, where it is first used.
from lightfm.evaluation import auc_score

# Score the pretrained model on the held-out user's splits, before any fine-tuning.
# auc_score returns per-user values; .mean() reduces them to a single number.
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('train AUC: %s' % train_auc)

test_auc = auc_score(model, test, num_threads=NUM_THREADS).mean()
print('test AUC: %s' % test_auc)

train AUC: 0.9021612
test AUC: 0.8370886


In [25]:
# Fine-tune the already-pretrained model for one epoch on the held-out user's training split.
%time model.fit_partial(train,epochs=1,num_threads=NUM_THREADS)

Wall time: 78.4 ms


<lightfm.lightfm.LightFM at 0x25c1b4a8d08>

In [28]:

from lightfm.evaluation import auc_score

# Re-score both splits now that the model has been fine-tuned.
# Both means are computed first, then reported in the same order as before.
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
test_auc = auc_score(model, test, num_threads=NUM_THREADS).mean()

print('train AUC: %s' % train_auc)
print('test AUC: %s' % test_auc)

train AUC: 0.83291
test AUC: 0.7223072
Train precision: 0.0000
Test precision: 0.0000


In [56]:
# Fine-tune testing loop.
#
# For each sampled user: pretrain a fresh model on everyone else's reviews, score it
# on that user's train/test split, then fine-tune on the user's training split in
# 5-epoch steps, recording train and test AUC after every step.
#
# Pretraining must be redone from scratch for each excluded user, so this takes a
# while. Sampling down is absolutely necessary without more horsepower.
#
# NOTE(review): only the first matrix returned by build_interactions is used for
# training. Per the LightFM docs the `recommend` values are stored in the second
# matrix (`weights`) — confirm the -1 labels actually reach the logistic loss.
from random import Random

from lightfm.cross_validation import random_train_test_split

# Configuration is loop-invariant, so it is set once here instead of on every iteration.
seed = 42
NUM_THREADS = 4
NUM_COMPONENTS = 20
NUM_EPOCHS = 60               # pretraining epochs per held-out user
learning_rate = 0.05
k = 5                         # not used in this cell; kept in case later cells read it
NUM_SAMPLED_USERS = 20
FINETUNE_EPOCHS_PER_STEP = 5  # epochs of fine-tuning between two AUC measurements
FINETUNE_EPOCH_STOP = 100     # exclusive bound, so the last measurement is at epoch 95


def _interaction_triples(frame):
    """Return an iterator of (user_id, business_id, recommend) triples for `frame`.

    Zips the three columns directly; this yields the same triples as looping over
    DataFrame.iterrows() without building a Series for every row.
    """
    return zip(frame['user_id'], frame['business_id'], frame['recommend'])


# Seeded generator so the same users are drawn on every run of the notebook.
sampled_users = Random(seed).sample(test_users, NUM_SAMPLED_USERS)

# The id mappings do not depend on which user is held out, so the datasets are
# fitted once up front rather than refitted identically on every iteration.
pretrain_dataset = Dataset()
pretrain_dataset.fit(reviewframe_with_dummy.user_id, reviewframe_with_dummy.business_id)

retrain_dataset = Dataset()
retrain_dataset.fit(reviewframe_with_dummy.user_id, reviewframe_with_dummy.business_id)

train_auc_dict = {}
test_auc_dict = {}

for user in sampled_users:

    print(user)

    # split the reviews into this user's own and everybody else's
    user_to_exclude = user
    is_held_out_user = reviewframe['user_id'] == user_to_exclude
    included_user_frame = reviewframe[is_held_out_user]
    excluded_user_frame = reviewframe[~is_held_out_user]

    # build interactions for pretrain (everyone else) and retrain (this user only)
    (pretrain_interactions, weights) = pretrain_dataset.build_interactions(
        _interaction_triples(excluded_user_frame))
    (retrain_interactions, weights) = retrain_dataset.build_interactions(
        _interaction_triples(included_user_frame))

    # train test split of the held-out user's interactions (same seed for every user)
    train, test = random_train_test_split(retrain_interactions,
                                          test_percentage=0.3,
                                          random_state=np.random.RandomState(seed))

    # fresh model per user, so nothing learned from a previous user carries over
    model = LightFM(loss='logistic', random_state=seed,
                    no_components=NUM_COMPONENTS,
                    learning_rate=learning_rate)

    train_auc_scores = []
    test_auc_scores = []

    # pretrain
    model.fit_partial(pretrain_interactions, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

    # index 0 of each score list is the pretrained model, before any fine-tuning
    print("pretrain scores")
    train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
    print('train AUC: %s' % train_auc)

    test_auc = auc_score(model, test, num_threads=NUM_THREADS).mean()
    print('test AUC: %s' % test_auc)

    train_auc_scores.append(train_auc)
    test_auc_scores.append(test_auc)

    # retrain: ii is the cumulative number of fine-tuning epochs after this step
    for ii in range(FINETUNE_EPOCHS_PER_STEP, FINETUNE_EPOCH_STOP, FINETUNE_EPOCHS_PER_STEP):
        print("retraining, epoch "+str(ii))
        model.fit_partial(train, epochs=FINETUNE_EPOCHS_PER_STEP, num_threads=NUM_THREADS)

        train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
        print("retrain scores, epoch "+str(ii))
        print('train AUC: %s' % train_auc)

        test_auc = auc_score(model, test, num_threads=NUM_THREADS).mean()
        print('test AUC: %s' % test_auc)

        train_auc_scores.append(train_auc)
        test_auc_scores.append(test_auc)

    # add user data to dicts
    train_auc_dict[user] = train_auc_scores
    test_auc_dict[user] = test_auc_scores




0.81511337
retraining, epoch 55
retrain scores, epoch 55
train AUC: 0.8263503
test AUC: 0.8151409
retraining, epoch 60
retrain scores, epoch 60
train AUC: 0.8268718
test AUC: 0.815185
retraining, epoch 65
retrain scores, epoch 65
train AUC: 0.8274075
test AUC: 0.81523997
retraining, epoch 70
retrain scores, epoch 70
train AUC: 0.8279644
test AUC: 0.8153005
retraining, epoch 75
retrain scores, epoch 75
train AUC: 0.82854736
test AUC: 0.81533915
retraining, epoch 80
retrain scores, epoch 80
train AUC: 0.8290476
test AUC: 0.8154052
retraining, epoch 85
retrain scores, epoch 85
train AUC: 0.82954794
test AUC: 0.8154052
retraining, epoch 90
retrain scores, epoch 90
train AUC: 0.83001053
test AUC: 0.81544924
retraining, epoch 95
retrain scores, epoch 95
train AUC: 0.83050144
test AUC: 0.8154988
uvIBi2jdvOUh2hSLdo6gyg
pretrain scores
train AUC: 0.8290571
test AUC: 0.9493926
retraining, epoch 5
retrain scores, epoch 5
train AUC: 0.8301969
test AUC: 0.9494422
retraining, epoch 10
retrain scores

In [57]:
# One column per sampled user, one row per recorded test-AUC measurement.
auc_frame = pd.DataFrame.from_dict(test_auc_dict, orient='columns')

In [58]:
# Preview the first recorded test-AUC measurements for each sampled user.
auc_frame.head()

Unnamed: 0,owkB0tTcymYOV3hpBOIBOg,jDQbBGKsmLpfFJdhpTYJfQ,1l1QdJ_wF4UhKixWfNaHxg,gP3jtJLckJdvPcq_VzgI1Q,uiZMpQSqJ4GA9PGv2iJpAg,KB9bFua_MDbal_UR86efIw,Z4ilg26RHQjGuslq9dcbGA,zFljuKhYpeA1WWU8rjCUiA,PL3h11VXhMwadl79Ndf6OA,uvIBi2jdvOUh2hSLdo6gyg,f_8GbUU7AcBDMgQVEtvFiA,zvcbwNPGYxRxD1I7GHNxCg,sfO3k8V_ASRI4XcTdmHeyA,YBouLEgk10yX5sem_yzlog,cf7VRIvOFEToSrNTPtQS6g,Na4ZSBgUmj77SLhUTH-LIw,OADwxJqXxuI30aB8HvI9QA,jsBrvT_u2Ufbgjv_1nKg5Q,9dnRlqJM3wp6IDrRjD4vZQ,Lo6o639Hw86HJCnAs-HRKQ
0,0.794933,0.989803,0.79702,0.97242,0.824331,0.937356,0.747184,0.5155,0.814276,0.949393,0.871954,0.965801,0.916691,0.957707,0.910425,0.962453,0.850199,0.869146,0.775546,0.994158
1,0.794911,0.989803,0.79707,0.972414,0.82421,0.937356,0.747189,0.514971,0.814364,0.949442,0.872135,0.965763,0.916702,0.957712,0.910469,0.962464,0.850199,0.86909,0.775634,0.994158
2,0.7949,0.989803,0.797086,0.972414,0.824143,0.937351,0.747228,0.514459,0.81448,0.949453,0.872218,0.96573,0.916769,0.957696,0.910536,0.96247,0.850204,0.869041,0.775656,0.994152
3,0.794906,0.989797,0.797136,0.972398,0.824099,0.937373,0.74731,0.514189,0.814596,0.949503,0.8724,0.965697,0.916796,0.957696,0.910602,0.96247,0.850226,0.869002,0.775711,0.994147
4,0.794889,0.989797,0.797141,0.972392,0.824044,0.937406,0.747327,0.513837,0.814706,0.949536,0.87251,0.96568,0.916796,0.957696,0.910624,0.96247,0.850232,0.868936,0.775761,0.994152


In [59]:
# Row-wise mean across the columns: the average test AUC at each measurement.
row_mean_auc = auc_frame.mean(axis='columns')
auc_frame['roc_avg'] = row_mean_auc

In [60]:
# Highest average test AUC across all recorded measurements.
auc_frame['roc_avg'].max()

0.8708147466182709

In [61]:
# Average test ROC-AUC per measurement: row 0 is the pretrained model before any
# fine-tuning, and each later row follows five more epochs of fine-tuning.
auc_frame['roc_avg']

0     0.870815
1     0.870803
2     0.870789
3     0.870803
4     0.870797
5     0.870788
6     0.870787
7     0.870772
8     0.870769
9     0.870753
10    0.870741
11    0.870732
12    0.870718
13    0.870715
14    0.870701
15    0.870682
16    0.870670
17    0.870654
18    0.870636
19    0.870626
Name: roc_avg, dtype: float64