In [1]:
# Code from Carolina's notebook 
import pandas as pd
import numpy as np
import tarfile
import json
# Read the three feather tables under ../FilteredData, in the order
# business -> review -> user, and bind each one to its own frame.
philly_bus, philly_reviews, philly_users = (
    pd.read_feather(feather_path)
    for feather_path in (
        '../FilteredData/business_philly.feather',
        '../FilteredData/review_philly.feather',
        '../FilteredData/user_philly.feather',
    )
)
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score,reciprocal_rank
from lightfm.data import Dataset
from lightfm import LightFM, cross_validation
# Mean star rating per user_id, computed over the rows of philly_reviews.
df = philly_reviews.groupby('user_id')['stars'].mean()

# Left join: every row of philly_users is kept; a user with no matching
# review ends up with a missing `stars` value.
users = philly_users.merge(df, how='left', on=['user_id'])

# The inner edges sit just below each integer. pd.cut's intervals are closed
# on the right by default, so a mean of exactly k (k >= 1) falls into the bin
# labelled str(k).
bins = [0, 0.9999999, 1.9999999, 2.9999999, 3.9999999, 4.9999999, 5]
labels = [str(star) for star in range(6)]
users["star_bin"] = pd.cut(users['stars'], bins=bins, labels=labels)

# Only the three columns needed to build (user, item, weight) interactions.
reviews_only = philly_reviews[["user_id", "business_id", "stars"]]
# Build the "<column>:<value>" feature names that are handed to Dataset.fit.

# --- user features: one name per distinct star_bin value found in `users` ---
user_unique_list = list(users['star_bin'].unique())
user_col = ['star_bin'] * len(user_unique_list)
# Wider user feature set kept from the original notebook (currently disabled):
# col = ['review_count']*len(users['review_count'].unique()) + ['useful']*len(users['useful'].unique()) + ['funny']*len(users['funny'].unique()) + ['cool']*len(users['cool'].unique())
# unique_list = list(users['review_count'].unique()) + list(users['useful'].unique()) + list(users['funny'].unique()) + list(users['cool'].unique())
user_f = [
    ":".join((str(column), str(value)))
    for column, value in zip(user_col, user_unique_list)
]

# --- item features: distinct business `stars` values, then distinct postal codes ---
item_unique_list = (
    list(philly_bus['stars'].unique())
    + list(philly_bus['postal_code'].unique())
)
# nunique(dropna=False) counts the same distinct values that unique() returns,
# so each column label lines up with its block of values above.
item_col = (
    ['stars'] * philly_bus['stars'].nunique(dropna=False)
    + ['postal_code'] * philly_bus['postal_code'].nunique(dropna=False)
)
item_f = [
    ":".join((str(column), str(value)))
    for column, value in zip(item_col, item_unique_list)
]
# Register the id and feature vocabularies with a LightFM Dataset:
# every user_id / business_id that appears in the reviews, plus the
# "<column>:<value>" feature names built earlier.
dataset1 = Dataset()
dataset1.fit(
    philly_reviews['user_id'].unique(),
    philly_reviews['business_id'].unique(),
    user_features=user_f,
    item_features=item_f,
)

# One (user_id, business_id, stars) triple per review row; reviews_only holds
# exactly these three columns, in this order.
(interactions, weights) = dataset1.build_interactions(
    tuple(row) for row in reviews_only.values
)

# Seeded 75/25 random split of the interaction matrix.
train, test = cross_validation.random_train_test_split(
    interactions,
    random_state=np.random.RandomState(42),
    test_percentage=0.25,
)
# Baseline model: LightFM with its default hyperparameters, fitted on `train`.
# The model is seeded with the same value (42) used for the train/test split
# and for the tuning grid, so its random initialisation is the same on every
# Restart & Run All instead of changing from run to run.
# NOTE(review): training runs with num_threads=4; multi-threaded fitting may
# still introduce small run-to-run differences -- confirm if exact
# reproducibility matters.
model = LightFM(random_state=42)
model.fit(train,
      epochs=30,
      num_threads=4)
from lightfm.evaluation import auc_score



In [2]:
# Re-create the seeded 75/25 split, then carve a 10% sample out of the
# training part. The "test" side of the second split is that 10% sample; the
# remaining 90% is discarded.
train_small, test_small = cross_validation.random_train_test_split(
    interactions,
    random_state=np.random.RandomState(42),
    test_percentage=0.25,
)
_, train_tiny = cross_validation.random_train_test_split(
    train_small,
    random_state=np.random.RandomState(42),
    test_percentage=0.1,
)
# Disabled sanity check: fit on the sample and score on the held-out split.
# model = LightFM()
# model.fit(train_tiny,
#         epochs=30,
#         num_threads=4)
# test_auc = auc_score(model,
#                         test_small,
#                           ).mean()
# print('Hybrid training set AUC: %s' % test_auc)

In [3]:
# Hyperparameter search space: each list holds the candidate values for one
# setting of the model / training run.
epochs = list(range(10, 31, 10))          # number of training epochs: 10, 20, 30
no_components = [5, 10, 20]               # dimensionality of the latent feature vectors
loss_func = ['warp', 'bpr', 'logistic']   # loss function
learning_rate = [0.001, 0.01, 0.1]        # Adagrad learning rate
random_state = [42]                       # random seed

# The same candidate lists, keyed by parameter name.
params = dict(
    epochs=epochs,
    no_components=no_components,
    loss=loss_func,
    learning_rate=learning_rate,
    random_state=random_state,
)

In [4]:
# Function to create, train, and evaluate a model
def create_model(train, test, epochs, no_components, loss_func, learning_rate, random_state):
    """Fit one LightFM model, score it with AUC, and log the outcome to a text file.

    Parameters
    ----------
    train
        Interaction matrix passed to ``LightFM.fit``.
    test
        Interaction matrix passed to ``auc_score``.
    epochs
        Number of training epochs.
    no_components, loss_func, learning_rate, random_state
        Forwarded to the ``LightFM`` constructor (``loss_func`` as ``loss``).

    Returns
    -------
    The mean of the scores returned by ``auc_score`` for ``test``.

    Side effects
    ------------
    Writes a one-line summary (parameters and AUC) to a ``.txt`` file in the
    current working directory. The file name encodes the parameter values, so
    repeating a combination overwrites its earlier file.
    """
    fitted = LightFM(
        no_components=no_components,
        loss=loss_func,
        learning_rate=learning_rate,
        random_state=random_state,
    )
    fitted.fit(train, epochs=epochs, num_threads=4)
    test_auc = auc_score(fitted, test).mean()

    # Persist the result so it survives outside the calling process.
    file_name = f'Epochs_{epochs}_Components_{no_components}_Loss_{loss_func}_LearningRate_{learning_rate}_RandomState_{random_state}.txt'
    report = f'Epochs: {epochs}, Components: {no_components}, Loss: {loss_func}, Learning Rate: {learning_rate}, Random State: {random_state}, AUC: {test_auc}'
    with open(file_name, 'w') as out:
        out.write(report)
    return test_auc

In [5]:
# Grid search: score hyperparameter combinations in parallel worker processes.
from itertools import product
from multiprocessing import Pool

# Full Cartesian product of the candidate values, in the argument order
# expected by create_model (epochs, no_components, loss, learning_rate, seed).
grid = list(product(epochs, no_components, loss_func, learning_rate, random_state))

# Fit on the down-sampled training set but score on the held-out split.
# Previously train_tiny was passed as both arguments, so the reported
# "test" AUC was measured on the very data the model had been fitted to.
# NOTE(review): only the first two combinations are run -- this looks like a
# smoke-test limit; widen the slice (or iterate over `grid`) for the full search.
jobs = [(train_tiny, test_small, *combo) for combo in grid[0:2]]

# starmap returns the AUCs as a list, in the same order as `jobs`.
# Each worker's fit itself uses num_threads=4 (see create_model).
with Pool(16) as p:
    results = p.starmap(create_model, jobs)