In [1]:
# # Read in data
import pandas as pd
import numpy as np
# Import important lightfm stuff
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score,reciprocal_rank
from lightfm.data import Dataset
from lightfm import LightFM, cross_validation



In [2]:
philly_bus = pd.read_feather('FilteredData/business_philly.feather')
philly_reviews = pd.read_feather('FilteredData/review_philly.feather')
philly_users = pd.read_feather('FilteredData/user_philly.feather')

In [3]:
# Sample data down to less reviews
# philly_reviews = philly_reviews_orig.sample(n=100000, random_state=42)
# print(f'Total Reviews: {len(philly_reviews):,}')

In [4]:
# Create lightfm dataset
dataset = Dataset()
dataset.fit(
    philly_reviews['user_id'].unique(),
    philly_reviews['business_id'].unique()
)
(interactions, weights) = dataset.build_interactions(
    (row['user_id'], row['business_id']) for index, row in philly_reviews.iterrows()
)
train, test = cross_validation.random_train_test_split(
    interactions, test_percentage=0.25, random_state=np.random.RandomState(42))
print(f'Number of training interactions: {train.nnz:,}')
print(f'Number of test interactions: {test.nnz:,}')

Number of training interactions: 641,310
Number of test interactions: 213,770


In [5]:
# Create subset for training
train_subset, test_subset = cross_validation.random_train_test_split(
    train, test_percentage=0.1, random_state=np.random.RandomState(42)
    )
print(f'Number of training subset interactions: {train_subset.nnz:,}')
print(f'Number of test subset interactions: {test_subset.nnz:,}')

Number of training subset interactions: 577,179
Number of test subset interactions: 64,131


In [6]:
# # No luck getting this to work

# # Set up differential evolution
# from scipy.optimize import differential_evolution
# from lightfm.evaluation import precision_at_k
# from lightfm.evaluation import auc_score

# def hyperparameter_tuning(hyperparameters):
#     print(f'Creating new model with hyperparameters: {hyperparameters}')
#     model = LightFM(
#         no_components=int(hyperparameters[0]),
#         loss='warp',
#         learning_rate=hyperparameters[1],
#         random_state=42
#     )
#     print('Fitting model')
#     model.fit(train, epochs=10)
#     test_auc = auc_score(model, test).mean()
#     return (1- test_auc)

# bounds = [(5, 100), # no_components
#             (1e-5, 1e-1), # learning_rate
# ]

In [7]:
# Set up grid search
from sklearn.model_selection import ParameterGrid
from lightfm.evaluation import auc_score
# Create grid for hyperparameter search
param_grid = {
    'no_components': [5, 10, 20, 50, 100],
    'learning_rate': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
}
# Create grid search
grid = ParameterGrid(param_grid)
# Create empty list to store results
results = []

In [8]:
# Function to create and fit model
# And return AUC score
# So that we can multithread this
def create_and_fit_model(params):
    model = LightFM(
        no_components=params['no_components'],
        loss='warp',
        learning_rate=params['learning_rate'],
        random_state=42
    )
    model.fit(train_subset, epochs=10)
    test_auc = auc_score(model, test_subset).mean()
    return (test_auc, params)

In [9]:
# Test function
create_and_fit_model({'no_components': 5, 'learning_rate': 1e-5})

(0.53296894, {'no_components': 5, 'learning_rate': 1e-05})

In [11]:
%%time
# Train over grid
from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()
results = Parallel(n_jobs=num_cores)(delayed(create_and_fit_model)(params) for params in grid)


In [None]:
# Unnest params in results
results = [(auc, params['no_components'], params['learning_rate']) for auc, params in results]

ValueError: too many values to unpack (expected 2)

In [None]:
pd.DataFrame(results, columns = ['auc_score', 'no_components','learning_rate']).sort_values('auc_score', ascending=False)

Unnamed: 0,auc_score,no_components,learning_rate
19,0.799098,100,0.01
18,0.797359,50,0.01
14,0.797255,100,0.001
9,0.797188,100,0.0001
15,0.797017,5,0.01
16,0.796832,10,0.01
13,0.796424,50,0.001
17,0.796266,20,0.01
12,0.795909,20,0.001
11,0.795351,10,0.001


In [None]:
# Use genetic algorithm to find best hyperparameters
from tpot import TPOTClassifier
# Use same train and test data as before
# Sample train down to .1 of original size
train = train.tocsr()
train = train[:int(len(train)/10)]
train = train.tocoo()
# Use Genetic Algorithm to find best hyperparameters
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    

Test AUC: 0.7991
