In [1]:
import random
import pandas as pd
import numpy as np
import sklearn
from collections import defaultdict
from string import digits
import string
from nltk.stem.porter import *
from sklearn.metrics import mean_squared_error
import tensorflow as tf

In [2]:
data = pd.read_csv('~/data/yelp/yelp_reviews.csv')
data = data.sample(n=100000, replace=False) 

In [3]:
data = ((data[['user_id', 'business_id', 'city', 'stars_x', 'text', 'date', 'name_x', 'name_y', 'stars_y', 'review_count_y', 'is_open', 'attributes', 'categories', 'hours']])
        .rename(columns={'stars_x': 'review_rating', 'name_x':'user_name', 'name_y': 'restaurant_name', 'review_count_y':'num_reviews', 'stars_y':'restaurant_rating'}))

In [33]:
data['popularity'] = data['num_reviews'] /  data['restaurant_rating']

In [5]:
all_categories = defaultdict(int)
def clean_text(text):
    punct = string.punctuation
    text = text.replace("\n", ' ').replace("\t", ' ').lower().strip()
    text = [c for c in text if not (c in punct)]
    text = ''.join(text)
    text = text.strip().replace("  ",' ')
    for i in text.split():
        all_categories[i] += 1
    return text

In [6]:
data['categories'] = data['categories'].astype(str)
data['categories'] = data['categories'].apply(clean_text)

In [7]:
class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))

def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u,i = random.choice(interactions) # positive sample
            j = random.choice(items) # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [8]:
#catDf = pd.DataFrame(np.zeros([len(data), len(all_categories)]), columns=all_categories)

In [9]:
## LONG ~5 MIN ##
# for i in range(len(data)):
#     for j in data.iloc[i]['categories'].split():
#         catDf.iloc[i][j] = 1

In [10]:
# subData = pd.concat([data[['restaurant_rating', 'num_reviews', 'is_open', 'popularity', 'review_rating']], catDf], axis=1).astype(np.float32)

In [11]:
trainData, testData = sklearn.model_selection.train_test_split(data)

In [12]:
X_train = (trainData.drop(columns=['review_rating']))
y_train = (trainData['review_rating'])

X_test = (testData.drop(columns=['review_rating']))
y_test = (testData['review_rating'])


In [13]:

interactions = []

userIDs = {}
itemIDs = {}
for row in range(len(data)):
    u = data.iloc[row]['user_id']
    i = data.iloc[row]['business_id']

    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    
    
    interactions.append((u, i))

In [14]:
# userIDs = set(X_train['user_id'].unique())
# itemIDs = set(X_train['business_id'].unique())

In [15]:
items = list(itemIDs.keys())

In [16]:
optimizer = tf.keras.optimizers.Adam(0.1)
modelBPR = BPRbatch(10, 0.00001)

In [17]:
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)

for row in range(len(X_train)):
    u = X_train.iloc[row]['user_id']
    i = X_train.iloc[row]['business_id']
    
    usersPerItem[i].append(u)
    itemsPerUser[u].append(i)

In [18]:
for i in range(100):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.62193567
iteration 20, objective = 0.6075486
iteration 30, objective = 0.6085353
iteration 40, objective = 0.6105618
iteration 50, objective = 0.61801547
iteration 60, objective = 0.62089276
iteration 70, objective = 0.6269419
iteration 80, objective = 0.6263723
iteration 90, objective = 0.62747115
iteration 100, objective = 0.6275623


In [19]:
interactionsTestPerUser = defaultdict(set)
itemSet = set()
for row in range(len(X_test)):
    u = X_test.iloc[row]['user_id']
    i = X_test.iloc[row]['business_id']

    interactionsTestPerUser[u].add(i)
    itemSet.add(i)

In [21]:
itemsPerUser[data['user_id'].iloc[0]]

['hnMdGajgg2083rWrxDb50g', 'rkAxZDtaYtUIvllAub0P6w']

In [25]:
# userCity = defaultdict(str)
# tracker = 0
# for user in data['user_id'].unique():
#     businesses = []
#     for item in itemsPerUser[user]:
#         business = data.loc[data['business_id']==item]['city'].unique()
#         businesses.append(business)
#     if len(businesses) != 0:
#         userCity[user] = pd.Series(businesses).value_counts().idxmax().item()
#     else:
#         userCity[user] = np.random.choice(data['city'].unique())
    
#     if tracker % 10000 == 0:
#         print(i)
#     tracker += 1


OgJ0KxwJcJ9R5bUK0ixCbg


KeyboardInterrupt: 

In [68]:
def recommend(model, user, N):
    outputs = []

    if user not in data['user_id'].unique():
        outputs = data.sort_values('popularity', ascending=False)['restaurant_name'].unique()[:N]
        print('cold start')
    
    else:
        preds = []
        for i in data['business_id'].unique():
            pred = model.predict(userIDs[user], itemIDs[i]).numpy()
            preds.append((i, pred))

        preds.sort(key=lambda x: x[1])
        preds.reverse()

        
        for i, j in preds[:N]:
            output = data[data['business_id'] == i]['restaurant_name'].unique()
            outputs.append(output.item())
        print('warm start')

    return outputs
    

In [21]:
def AUCu(model, u, N):
    win = 0
    if N > len(interactionsTestPerUser[u]):
        N = len(interactionsTestPerUser[u])
    positive = random.sample(interactionsTestPerUser[u],N)
    negative = random.sample(itemSet.difference(interactionsTestPerUser[u]),N)
    for i,j in zip(positive,negative):
        si = model.predict(userIDs[u], itemIDs[i]).numpy()
        sj = model.predict(userIDs[u], itemIDs[j]).numpy()
        if si > sj:
            win += 1
    return win/N

In [22]:
def AUC(model):
    av = []
    for u in interactionsTestPerUser:
        av.append(AUCu(model, u, 10))
    return sum(av) / len(av)

In [23]:
## TOOK 16 MINUTES ##
AUC(modelBPR)

0.701535561160443

In [81]:
preds = []
for i in X_test['business_id'].unique():
    pred = modelBPR.predict(userIDs['tWevXZppIcLLuF62pLDb-A'], itemIDs[i]).numpy()
    preds.append((i, pred))

In [82]:
preds.sort(key=lambda x: x[1])
preds.reverse()

In [26]:
preds

NameError: name 'preds' is not defined

In [93]:
X_test[X_test['business_id'] == '2weQS-RnoOBhb1KsHKyoSQ']['categories'].iloc[0]

'food buffets event planning services restaurants beauty spas hotels casinos arts entertainment hotels travel day spas breakfast brunch'

In [86]:
X_test[X_test['user_id'] == 'tWevXZppIcLLuF62pLDb-A']['categories']

466415    restaurants sushi bars asian fusion japanese
Name: categories, dtype: object