In [1]:
# Read in data
import pandas as pd
import numpy as np

## Read in Data

In [41]:
philly_bus = pd.read_feather('FilteredData/business_philly.feather')
philly_reviews = pd.read_feather('FilteredData/review_philly.feather')
philly_users = pd.read_feather('FilteredData/user_philly.feather')

In [47]:
philly_bus.head()

Unnamed: 0,index,_id,business_id,name,address,city,state,postal_code,latitude,longitude,...,review_count,is_open,attributes,categories,hours,positive_%,bucketed_average_stars_received,bucketed_sentiment_scores_received,bucketed_review_count,categories_2
0,31,631ea3b2a5cde8cc0d6eec47,-0M0b-XhtFagyLmsBtOe8w,Paris Wine Bar,2303 Fairmount Ave,Philadelphia,PA,19130,39.967439,-75.175452,...,18,0,"{'Alcohol': ""u'full_bar'"", 'OutdoorSeating': '...","Bars, Nightlife, Restaurants, French, Wine Bars","{'Thursday': '17:0-0:0', 'Friday': '17:0-0:0',...",0.722222,more_than_3_up_to_4,more_than_60_up_to_80_percent,more_than_10_up_to_25,"[B, a, r, s, ,, , N, i, g, h, t, l, i, f, e, ,..."
1,32,631ea3b0a5cde8cc0d6dfa60,-0PN_KFPtbnLQZEeb23XiA,Mr Wong's Chinese Restaurant,1849 Wolf St,Philadelphia,PA,19145,39.923048,-75.178078,...,9,0,"{'OutdoorSeating': 'False', 'BusinessAcceptsCr...","Restaurants, Chinese","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",0.636364,more_than_3_up_to_4,more_than_60_up_to_80_percent,more_than_1_up_to_10,"[R, e, s, t, a, u, r, a, n, t, s, ,, , C, h, i..."
2,33,631ea3aea5cde8cc0d6d5a50,-0TffRSXXIlBYVbb5AwfTg,IndeBlue Modern Indian Food & Spirits,205 South 13th St,Philadelphia,PA,19107,39.948508,-75.161969,...,1097,1,"{'RestaurantsReservations': 'True', 'NoiseLeve...","Cocktail Bars, Food Delivery Services, Nightli...","{'Monday': '0:0-0:0', 'Tuesday': '16:0-22:0', ...",0.87478,more_than_4,more_than_80_percent,more_than_100,"[C, o, c, k, t, a, i, l, , B, a, r, s, ,, , F,..."
3,37,631ea3b0a5cde8cc0d6e2ef1,-0eUa8TsXFFy0FCxHYmrjg,Waterfront Gourmet Cafe & Deli,3131 Walnut St,Philadelphia,PA,19104,39.952446,-75.187321,...,26,0,"{'BikeParking': 'True', 'RestaurantsGoodForGro...","Caterers, Sandwiches, Delis, Restaurants, Cafe...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",0.821429,more_than_3_up_to_4,more_than_80_percent,more_than_25_up_to_50,"[C, a, t, e, r, e, r, s, ,, , S, a, n, d, w, i..."
4,40,631ea3ada5cde8cc0d6d26c0,-0fvhILrC9UsQ6gLNpZlTQ,David's Southern Fried Pies,8601 Frankford Ave,Philadelphia,PA,19136,40.046191,-75.01509,...,18,0,"{'BusinessAcceptsBitcoin': 'False', 'Caters': ...","Desserts, Food","{'Monday': '0:0-0:0', 'Tuesday': '12:0-19:0', ...",0.894737,more_than_4,more_than_80_percent,more_than_10_up_to_25,"[D, e, s, s, e, r, t, s, ,, , F, o, o, d]"


In [48]:
# Import important lightfm stuff
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score,reciprocal_rank
from lightfm.data import Dataset
from lightfm import LightFM, cross_validation

In [59]:
# Turn categories_2 into a list of lists
philly_bus['categories_2'] = philly_bus['categories'].str.split(', ')
philly_bus['categories_2'] = philly_bus['categories_2'].fillna('')
# philly_bus['categories_2'] = philly_bus['categories'].apply(lambda x: [i.strip() for i in x])
# # Turn categories into a set of binary columns
philly_bus_categories = philly_bus.join(philly_bus['categories_2'].str.join('|').str.get_dummies())

In [60]:
philly_bus.iloc[0]['categories_2'] # Check that it worked

['Bars', 'Nightlife', 'Restaurants', 'French', 'Wine Bars']

In [61]:
# Get number of columns in philly_bus2
len(philly_bus_categories.columns) # 820

821

In [62]:
# Get number of unique categories
len(philly_bus['categories_2'].explode().unique()) 

800

In [65]:
# Create lightfm dataset
dataset = Dataset()
dataset.fit(
    philly_reviews['user_id'].unique(),
    philly_reviews['business_id'].unique(),
    item_features=philly_bus_categories.iloc[:, 20:] # Only use the binary columns
)
(interactions, weights) = dataset.build_interactions(
    (row['user_id'], row['business_id']) for index, row in philly_reviews.iterrows()
)

In [68]:
train, test = cross_validation.random_train_test_split(interactions, test_percentage=0.25, random_state=np.random.RandomState(42))
model = LightFM()
model.fit(train, epochs=30, num_threads=4) # Train the model

<lightfm.lightfm.LightFM at 0x2329138b130>

In [69]:
from lightfm.evaluation import auc_score
test_auc = auc_score(
    model,
    test,
).mean() # Calculate AUC score
print('AUC: %s' % test_auc) # Print AUC score

AUC: 0.84577495


In [70]:
# Tweak the model to improve AUC score
model = LightFM(loss='warp') # Weighted Approximate-Rank Pairwise
model.fit(train, epochs=30, num_threads=4) # Train the model
test_auc = auc_score(
    model,
    test,
).mean() # Calculate AUC score
print('AUC: %s' % test_auc) # Print AUC score

AUC: 0.8411865


In [71]:
# Change number of components to see if it improves AUC score
component_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # List of number of components to try
for i in component_list:
    model = LightFM(loss='warp', no_components=i)
    model.fit(train, epochs=30, num_threads=4) # Train the model
    test_auc = auc_score(
        model,
        test,
    ).mean() # Calculate AUC score
    print('AUC: %s' % test_auc) # Print AUC score
    print('Number of components: %s' % i) # Print number of components
    print('') # Print blank line

AUC: 0.840174
Number of components: 10

AUC: 0.84201723
Number of components: 20

AUC: 0.84220904
Number of components: 30

AUC: 0.84134126
Number of components: 40

AUC: 0.8427962
Number of components: 50

AUC: 0.84230494
Number of components: 60

AUC: 0.8430825
Number of components: 70



KeyboardInterrupt: 

In [73]:
# Try different loss functions: logistic, bpr, warp-kos, warp
loss_list = ['logistic', 'bpr', 'warp-kos', 'warp'] # List of loss functions to try
for i in loss_list:
    model = LightFM(loss=i, no_components=30)
    model.fit(train, epochs=30, num_threads=4) # Train the model
    test_auc = auc_score(
        model,
        test,
    ).mean() # Calculate AUC score
    print('AUC: %s' % test_auc) # Print AUC score
    print('Loss function: %s' % i) # Print loss function
    print('') # Print blank line

AUC: 0.84573644
Loss function: logistic

AUC: 0.54337627
Loss function: bpr

AUC: 0.8216907
Loss function: warp-kos

AUC: 0.841579
Loss function: warp



In [75]:
# Try differeknt values for k: 1, 5, 10, 15, 20
k_list = [1, 5, 10, 15, 20] # List of k values to try
for i in k_list:
    model = LightFM(loss='warp', no_components=30, k=i)
    model.fit(train, epochs=30, num_threads=4) # Train the model
    test_auc = auc_score(
        model,
        test
    ).mean() # Calculate AUC score
    print('AUC: %s' % test_auc) # Print AUC score
    print('k: %s' % i) # Print k
    print('') # Print blank line

AUC: 0.84202635
k: 1

AUC: 0.84176177
k: 5

AUC: 0.8410602
k: 10

AUC: 0.8407556
k: 15

AUC: 0.84142286
k: 20

