In [6]:
import lightfm
import scipy as sp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightfm.data import Dataset
from sklearn.model_selection import train_test_split
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from scipy.sparse import csr_matrix

In [9]:
import os
os.listdir("FilteredData")

['ReadInCleanedData.ipynb',
 'review_philly.feather',
 'DataCleaning.ipynb',
 'business_philly.feather',
 'user_philly.feather']

In [15]:
business = pd.read_csv("FilteredData/business_philly.csv")
review = pd.read_csv("FilteredData/review_philly.csv")
user = pd.read_csv("FilteredData/user_philly.csv")
# business = pd.read_feather('FilteredData/business_philly.feather')
# review = pd.read_feather('FilteredData/review_philly.feather')
# user = pd.read_feather('FilteredData/user_philly.feather')

Prepare the data

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:6a40739a6c532446494250258998c7a7d3c...
1,size 648216229


In [24]:
# distilbert = pd.read_csv('Sentiment Analysis/distilbert/sentiment_results_distilbert.csv')
# distilbert.head()
# sns.boxplot(x = 'stars',y='score',data = distilbert)
# plt.show()
# sns.violinplot(x = 'stars',y='score',data = distilbert)
# plt.show()

### Experiment 1: AUC of full philly dataset

In [25]:
dataset = Dataset()
dataset.fit(
    (row['user_id'] for _, row in review.iterrows()),
    (row['business_id'] for _, row in review.iterrows()))

In [26]:
# Add interactions
train, test = train_test_split(review, test_size=0.2)
# Build test interactions
test_interactions, test_weights = dataset.build_interactions(
    ((row['user_id'], row['business_id'], row['stars']) for _, row in test.iterrows()))
# Build train interactions
train_interactions, train_weights = dataset.build_interactions(
    ((row['user_id'], row['business_id'], row['stars']) for _, row in train.iterrows()))

In [27]:
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# Instantiate and train the model
model = lightfm.LightFM()
model.fit(train_interactions, epochs=30, num_threads=4)

<lightfm.lightfm.LightFM at 0x7f7ba6213700>

In [28]:
%%time
train_auc = auc_score(model, test_interactions).mean()
print(f'Train AUC: {train_auc:.2f}')

Train AUC: 0.85
CPU times: user 58 s, sys: 35.4 ms, total: 58 s
Wall time: 58.1 s


## Include Categories

In [29]:
# Read dilbert data
##time
distilbert = pd.read_csv('Sentiment Analysis/distilbert/sentiment_results_distilbert.csv')

In [30]:
business.head()

Unnamed: 0,index,_id,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,31,631ea3b2a5cde8cc0d6eec47,-0M0b-XhtFagyLmsBtOe8w,Paris Wine Bar,2303 Fairmount Ave,Philadelphia,PA,19130.0,39.967439,-75.175452,3.5,18,0,"{'Alcohol': ""u'full_bar'"", 'OutdoorSeating': '...","Bars, Nightlife, Restaurants, French, Wine Bars","{'Thursday': '17:0-0:0', 'Friday': '17:0-0:0',..."
1,32,631ea3b0a5cde8cc0d6dfa60,-0PN_KFPtbnLQZEeb23XiA,Mr Wong's Chinese Restaurant,1849 Wolf St,Philadelphia,PA,19145.0,39.923048,-75.178078,3.5,9,0,"{'OutdoorSeating': 'False', 'BusinessAcceptsCr...","Restaurants, Chinese","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
2,33,631ea3aea5cde8cc0d6d5a50,-0TffRSXXIlBYVbb5AwfTg,IndeBlue Modern Indian Food & Spirits,205 South 13th St,Philadelphia,PA,19107.0,39.948508,-75.161969,4.5,1097,1,"{'RestaurantsReservations': 'True', 'NoiseLeve...","Cocktail Bars, Food Delivery Services, Nightli...","{'Monday': '0:0-0:0', 'Tuesday': '16:0-22:0', ..."
3,37,631ea3b0a5cde8cc0d6e2ef1,-0eUa8TsXFFy0FCxHYmrjg,Waterfront Gourmet Cafe & Deli,3131 Walnut St,Philadelphia,PA,19104.0,39.952446,-75.187321,4.0,26,0,"{'BikeParking': 'True', 'RestaurantsGoodForGro...","Caterers, Sandwiches, Delis, Restaurants, Cafe...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,40,631ea3ada5cde8cc0d6d26c0,-0fvhILrC9UsQ6gLNpZlTQ,David's Southern Fried Pies,8601 Frankford Ave,Philadelphia,PA,19136.0,40.046191,-75.01509,4.5,18,0,"{'BusinessAcceptsBitcoin': 'False', 'Caters': ...","Desserts, Food","{'Monday': '0:0-0:0', 'Tuesday': '12:0-19:0', ..."


In [35]:
# Create set of all categories
all_cats = set()
cat_list = business.categories.apply(lambda x: x.split(", "))
for row in cat_list:
    for cat in row:
        all_cats.add(cat)
print(f'Found {len(all_cats)} unique categories')

Found 800 unique categories


In [36]:
# Create a list of dictionaries
## Following: https://github.com/kapadias/mediumposts/blob/master/recommender/published_notebooks/recommendation_python_lightfm.ipynb
business_category_list = []

for i in range(len(cat_list)):
    this_biz = {}
    for cat in all_cats:
        contains = cat in cat_list[i]
        this_biz[cat] = contains
    business_category_list.append(this_biz)
business_cat_df = pd.DataFrame(business_category_list)
business_cat_df.head()

Unnamed: 0,Burgers,Event Planning & Services,Carpeting,Pilates,Home Staging,Fertility,Preschools,Home Theatre Installation,Urologists,French,...,Cards & Stationery,Venezuelan,Poke,Pool Halls,Makeup Artists,Safety Equipment,Honey,Video Game Stores,Accessories,Community Service/Non-Profit
0,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [37]:
# Add bookid and back to original then turn to csr matrix
business_cat_df['business_id'] = business['business_id']
business_cat_df = business_cat_df[['business_id'] + list(all_cats)]
business_cat_df.head()

Unnamed: 0,business_id,Burgers,Event Planning & Services,Carpeting,Pilates,Home Staging,Fertility,Preschools,Home Theatre Installation,Urologists,...,Cards & Stationery,Venezuelan,Poke,Pool Halls,Makeup Artists,Safety Equipment,Honey,Video Game Stores,Accessories,Community Service/Non-Profit
0,-0M0b-XhtFagyLmsBtOe8w,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,-0PN_KFPtbnLQZEeb23XiA,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,-0TffRSXXIlBYVbb5AwfTg,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,-0eUa8TsXFFy0FCxHYmrjg,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,-0fvhILrC9UsQ6gLNpZlTQ,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [38]:
business_cat_df.set_index('business_id', inplace=True)
business_cat_matrix = csr_matrix(business_cat_df)
business_cat_matrix

<10391x800 sparse matrix of type '<class 'numpy.bool_'>'
	with 47792 stored elements in Compressed Sparse Row format>

In [39]:
# Now use the business category matrix to create a new model
# With business categories as features of items
model2 = lightfm.LightFM()
model2.fit(train_interactions, item_features=business_cat_matrix, epochs=30, num_threads=4)

<lightfm.lightfm.LightFM at 0x7f7baf4a2490>

In [40]:
# Get new AUC
train_auc = auc_score(model2, test_interactions, item_features=business_cat_matrix).mean()
print(f'Train AUC with Features: {train_auc:.2f}')

Train AUC with Features: 0.49
