# Similarity-Based Link Prediction on Yelp Recommender

In [15]:
from random import choices
from itertools import combinations

import numpy as np
import pandas as pd

import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Data Import

In [3]:
busi_df = pd.read_feather("data/yelp_business_cleaned.feather")
print(f"There are {busi_df.shape[0]} business records with {busi_df.shape[1]} total features.")
display(busi_df.head(3))
busi_df[['stars', 'review_count']].describe()

There are 68054 business records with 84 total features.


Unnamed: 0,original_index,business_id,name,address,city,state,postal_code,latitude,longitude,stars,...,music_live,music_video,music_karaoke,hours_Monday,hours_Tuesday,hours_Wednesday,hours_Thursday,hours_Friday,hours_Saturday,hours_Sunday
0,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,...,,,,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-21:0
1,4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,...,,,,closed,closed,14:0-22:0,16:0-22:0,12:0-22:0,12:0-22:0,12:0-18:0
2,5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,...,,,,0:0-0:0,6:0-22:0,6:0-22:0,6:0-22:0,9:0-0:0,9:0-22:0,8:0-22:0


Unnamed: 0,stars,review_count
count,68054.0,68054.0
mean,3.563949,74.619861
std,0.851437,169.900997
min,1.0,5.0
25%,3.0,11.0
50%,3.5,27.0
75%,4.0,73.0
max,5.0,7568.0


In [5]:
review_df = pd.read_feather("data/yelp_review.feather")
review_df = review_df.loc[review_df['business_id'].isin(busi_df['business_id'])].copy()
print(f"There are {review_df.shape[0]} review records with {review_df.shape[1]} total features.")
display(review_df.head(3))
review_df[['stars', 'useful','funny','cool']].describe()

There are 5257329 review records with 9 total features.


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03


Unnamed: 0,stars,useful,funny,cool
count,5257329.0,5257329.0,5257329.0,5257329.0
mean,3.804201,1.021837,0.318659,0.5057937
std,1.393488,2.792205,1.728878,2.204489
min,1.0,-1.0,-1.0,-1.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,420.0,792.0,404.0


## Edge determination

For any business $i$ and $j$, find the number of users who have been to these $2$ places based on the review data.

First, for each user, find all business ids that they have reviewed on

In [8]:
visit_by_user = review_df.groupby('user_id')['business_id'].apply(list).to_frame()
print(f"There're {len(visit_by_user)} users")
visit_by_user

There're 1532233 users


Unnamed: 0_level_0,business_id
user_id,Unnamed: 1_level_1
---2PmXbF47D870stH1jqA,"[hKameFsaXh9g8WQbv593UA, hKameFsaXh9g8WQbv593U..."
---UgP94gokyCDuB5zUssA,"[hKr-RKMVpj3gRkSWcjg3Zw, GBTPC53ZrG1ZBY3DT8Mbc..."
---r61b7EpVPkb4UVme5tA,"[fGYnHzFr1z2kv7bPRW6VMA, 5UN1B7XqZohGuULLNlWL1..."
---zemaUC8WeJeWKqS6p9Q,[eX7o_-s5TmDT-DMfTV4cmw]
--0DrQkM0FT-yCQRWw82uQ,[h-lRhCfPhnVTkcndGcyuVA]
...,...
zzz-M4QvkEpUWWPL9RTzLA,"[hn3Rg2JrhQoDJBEhrpuwWg, Y989Hsw30AYUMcyNjBz9i..."
zzzCg-_lpyYE82TlJCdwdw,[LttC5xNMFcgOg3bt_MlXTg]
zzzGgfvrSJ4AQeKtcgocIw,[XyYsl2OyoMi6OMvzsdcVoQ]
zzzMBVS73g3ZJ7qL8JyhiA,[3FKIev7ZB_KE6XHL9sUJCg]


Delete users that have reviewed only on one business (since they don't contribute to the existence of an edge).

In [19]:
visit_by_user = visit_by_user[visit_by_user['business_id'].apply(len) > 1].reset_index()
print(f"There're {len(visit_by_user)} users left after the filter")
visit_by_user

There're 652478 users left after the filter


Unnamed: 0,user_id,business_id
0,---2PmXbF47D870stH1jqA,"[hKameFsaXh9g8WQbv593UA, hKameFsaXh9g8WQbv593U..."
1,---UgP94gokyCDuB5zUssA,"[hKr-RKMVpj3gRkSWcjg3Zw, GBTPC53ZrG1ZBY3DT8Mbc..."
2,---r61b7EpVPkb4UVme5tA,"[fGYnHzFr1z2kv7bPRW6VMA, 5UN1B7XqZohGuULLNlWL1..."
3,--0kuuLmuYBe3Rmu0Iycww,"[qcguEeAMP0XwFLYqhwX2hg, ldiSAMJER1BzfWa-j_Wts..."
4,--0nrvOSAIuhL1Tk4qTrvQ,"[mJ_THREGbgGRWshvtQ-AVQ, zCELc_Vqc0EbnTwbkpUIEA]"
...,...,...
652473,zzwYLnmIvj8C7wJmRjtkRA,"[j-qtdD55OLfSqfsWuQTDJg, pXRrRf8fDv6yU3xp1E25hA]"
652474,zzxUT-fFNXpp5i5xJHmvpA,"[A0zZI5AnaeK4-vKoc9ABmg, D5iuqgLt-M6INWydvMKUxw]"
652475,zzxZW6U5lCCQQeVfLLU6gw,"[OHauRWEh34imGRMuE3o5VA, aO7F2PmXkV4RoS3XHwtL_..."
652476,zzyTsajskrf_Ha8gU4HmTA,"[QAMJIJQ7SMO-C5xUN37iIw, NDzQFHW671hbB-aArOy12..."


In [21]:
# def visit_both(x, i, j):
#     return (i in x) and (j in x)

# num_users_visiting = visit_by_user['business_id'].apply(visit_both, args = ("A0zZI5AnaeK4-vKoc9ABmg", "D5iuqgLt-M6INWydvMKUxw"))
# sum(num_users_visiting)

1

In [22]:
busi_ids = busi_df['business_id'].tolist()

def visit_both(x, i, j):
    return (i in x) and (j in x)

edge_list = []
for b1, b2 in combinations(busi_ids, 2):
    num_users_visiting = sum(visit_by_user['business_id'].apply(visit_both, args = (b1, b2)))
    edge_list.append((b1, b2, num_users_visiting))

KeyboardInterrupt: 

In [27]:
len(edge_list)

7282

In [26]:
len([i for i in edge_list if i[2] > 0])

387

In [25]:
[i for i in edge_list if i[2] > 0]

[('MTSW4McQd7CbVtyjqoe9mw', 'MUTTqe8uqyMdBl186RmNeA', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'QdN72BWoyFypdGJhhI5r7g', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'Si6LXSR9gvAbmgO5DzV2cw', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'qfWJmJ0g96eM_fWma3ja0g', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', '-4dYswJy7SPcbcERvitmIg', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'ppFCk9aQkM338Rgwpl2F5A', 2),
 ('MTSW4McQd7CbVtyjqoe9mw', 'p184f-Zvf4ToPwLba0VS4A', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'dChRGpit9fM_kZK5pafNyA', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'od6skmfXz9twktEAuJHEmw', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'QWqKTWQ2OiDgo3dzNkpung', 7),
 ('MTSW4McQd7CbVtyjqoe9mw', 'eJ77e9lGxY3ArzaoDbHhYw', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', '516L9-ZWyqVgzqjP5uW7Jg', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', '7mpYTDb24SywNMRn3yeakQ', 3),
 ('MTSW4McQd7CbVtyjqoe9mw', 'JPm93BlP-UHYPqFgK66JUw', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', '5iuo1kvv0XZMS0bUOoLz2Q', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', 'ngvE1G9IckUO-ljZBFo23w', 1),
 ('MTSW4McQd7CbVtyjqoe9mw', '46_2e1hnQ4csLrYnhn6bkw', 1),
 ('MTSW4McQd7C

In [None]:
df = pd.DataFrame(edge_list, columns =['b1', 'b2', 'num_users'])
df

## Construct the network graph

Vertices = businesses

Edge between $i$ and $j$ indicates whether there's any user who has been to these two places

Note: See https://networkx.org/documentation/latest/reference/introduction.html#networkx-basics for `nx` package documentation

In [None]:
G = nx.Graph()
G.add_weighted_edges_from(edge_list) # or use G.add_edges_from(edge_list) for unweifhted graph

## Split train and test

## Compute similarity scores using different approaches