# Similarity-Based Link Prediction on Yelp Recommender

In [1]:
from random import choices
from itertools import combinations
from collections import defaultdict 

import numpy as np
import pandas as pd

import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Data Import

In [2]:
busi_df = pd.read_feather("data/yelp_business_cleaned.feather")
print(f"There are {busi_df.shape[0]} business records with {busi_df.shape[1]} total features.")
display(busi_df.head(3))
busi_df[['stars', 'review_count']].describe()

There are 68054 business records with 84 total features.


Unnamed: 0,original_index,business_id,name,address,city,state,postal_code,latitude,longitude,stars,...,music_live,music_video,music_karaoke,hours_Monday,hours_Tuesday,hours_Wednesday,hours_Thursday,hours_Friday,hours_Saturday,hours_Sunday
0,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,...,,,,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-21:0
1,4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,...,,,,closed,closed,14:0-22:0,16:0-22:0,12:0-22:0,12:0-22:0,12:0-18:0
2,5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,...,,,,0:0-0:0,6:0-22:0,6:0-22:0,6:0-22:0,9:0-0:0,9:0-22:0,8:0-22:0


Unnamed: 0,stars,review_count
count,68054.0,68054.0
mean,3.563949,74.619861
std,0.851437,169.900997
min,1.0,5.0
25%,3.0,11.0
50%,3.5,27.0
75%,4.0,73.0
max,5.0,7568.0


In [3]:
busi_df.groupby('state')['original_index'].count()

state
AB     3048
AZ     3579
CA     1695
DE     1171
FL    11476
ID     1798
IL     1198
IN     5374
LA     4888
MO     5366
NJ     4216
NV     2462
PA    16170
TN     5613
Name: original_index, dtype: int64

### Filter only businesses in PA and FL for computation due to resource contraints

In [4]:
busi_df = busi_df[busi_df['state'].isin(['PA', 'FL'])]

### Get review data

In [5]:
review_df = pd.read_feather("data/yelp_review.feather")
review_df = review_df.loc[review_df['business_id'].isin(busi_df['business_id'])].copy()
print(f"There are {review_df.shape[0]} review records with {review_df.shape[1]} total features.")
display(review_df.head(3))
# review_df[['stars', 'useful','funny','cool']].describe()

There are 2087772 review records with 9 total features.


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31


Unnamed: 0,stars,useful,funny,cool
count,2087772.0,2087772.0,2087772.0,2087772.0
mean,3.787049,1.046603,0.3262353,0.504763
std,1.392529,2.722631,1.45271,2.130276
min,1.0,-1.0,-1.0,-1.0
25%,3.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0
75%,5.0,1.0,0.0,0.0
max,5.0,227.0,227.0,207.0


## Edge determination

For any business $i$ and $j$, find the number of users who have been to these $2$ places based on the review data.

First, for each user, find all business ids that they have reviewed on

In [6]:
visit_by_user = review_df.groupby('user_id')['business_id'].apply(list).to_frame()
print(f"There're {len(visit_by_user)} users")
visit_by_user

There're 588266 users


Unnamed: 0_level_0,business_id
user_id,Unnamed: 1_level_1
---2PmXbF47D870stH1jqA,"[hKameFsaXh9g8WQbv593UA, hKameFsaXh9g8WQbv593U..."
---r61b7EpVPkb4UVme5tA,"[fGYnHzFr1z2kv7bPRW6VMA, 5UN1B7XqZohGuULLNlWL1..."
--0kuuLmuYBe3Rmu0Iycww,"[qcguEeAMP0XwFLYqhwX2hg, Y5S_AUSW8EjswVf9JAi-0w]"
--13zE3NaRvLSrmfTVnFJA,[U2y7fsqDgxAXskoJNVxbwg]
--1oZcRo9-QKOtTqREKB6g,"[p_qSQwShIgQnNxGcajI4-w, QHWYlmVbLC3K6eglWoHVvA]"
...,...
zzwYLnmIvj8C7wJmRjtkRA,"[j-qtdD55OLfSqfsWuQTDJg, pXRrRf8fDv6yU3xp1E25hA]"
zzx7J3zheFF3zf5YYfDAMg,[pm1bStJuol5XmxE_atZhCQ]
zzxCh58BAynQseL1rUlJqg,[tOPDno-cu5NQO56FeOBg-g]
zzzCg-_lpyYE82TlJCdwdw,[LttC5xNMFcgOg3bt_MlXTg]


Delete users that have reviewed only on one business (since they don't contribute to the existence of an edge).

In [7]:
visit_by_user = visit_by_user[visit_by_user['business_id'].apply(len) > 1].reset_index()
print(f"There're {len(visit_by_user)} users left after the filter")
visit_by_user

There're 252118 users left after the filter


Unnamed: 0,user_id,business_id
0,---2PmXbF47D870stH1jqA,"[hKameFsaXh9g8WQbv593UA, hKameFsaXh9g8WQbv593U..."
1,---r61b7EpVPkb4UVme5tA,"[fGYnHzFr1z2kv7bPRW6VMA, 5UN1B7XqZohGuULLNlWL1..."
2,--0kuuLmuYBe3Rmu0Iycww,"[qcguEeAMP0XwFLYqhwX2hg, Y5S_AUSW8EjswVf9JAi-0w]"
3,--1oZcRo9-QKOtTqREKB6g,"[p_qSQwShIgQnNxGcajI4-w, QHWYlmVbLC3K6eglWoHVvA]"
4,--2bpE5vyR-2hAP7sZZ4lA,"[BjBDHqHhMXSxgyVipccznQ, TV81bpCQ6p6o4Hau5hk-z..."
...,...,...
252113,zzrhWsiCwAKQzbgMZIOtgg,"[vpLMV6pHa1oI71jYaCVFzA, zvixukO8M4PCmMMaZOldPg]"
252114,zzu2hGJ6O9mP5yg6fjtvzg,"[4bvQThX0cJxlx67PU9nmLQ, TwnzM8mJn_nT2PJf1x-9k..."
252115,zzvCl_egPyWpxO7EvWc2IA,"[pW5DXTpKnw3y0fopipbJVg, t9P1At2Cw8PO5NdjIZjrK..."
252116,zzw0Z6-_VDp9ShIRSKIsQw,"[9kjcWWo0pZ5qr1ZpAXBFnQ, gpTC5qka3HCQqnSyXloph..."


In [8]:
edge_count = {}
for index, row in visit_by_user.iterrows():
    for b1, b2 in combinations(sorted(row['business_id']), 2):
        if (b1, b2) not in edge_count:
            edge_count[b1, b2] = 1
        else:
            edge_count[b1, b2] += 1
    if index % 10000 == 0:
        print(f"...{index}")

...0
...10000
...20000
...30000
...40000
...50000
...60000
...70000
...80000
...90000
...100000
...110000
...120000
...130000
...140000
...150000
...160000
...170000
...180000
...190000
...200000
...210000
...220000
...230000
...240000
...250000


In [9]:
print(f"Total number of business pairs: {len(edge_count)}")

Total number of business pairs: 20653315


In [10]:
busi_pairs = edge_count.keys()
num_users = edge_count.values()
busi1 = [b[0] for b in busi_pairs]
busi2 = [b[1] for b in busi_pairs]
edge_df = pd.DataFrame({'b1': busi1,
                        'b2': busi2,
                        'num_users': num_users})
edge_df

Unnamed: 0,b1,b2,num_users
0,0ZsqqzHu1HHkDdIKoivi5g,1An4DxtMmvvSe0HX4viRCA,4
1,0ZsqqzHu1HHkDdIKoivi5g,3YqUe2FTCQr0pPVK8oCv6Q,105
2,0ZsqqzHu1HHkDdIKoivi5g,3gXgILE2YWVidJDvVWBT6Q,6
3,0ZsqqzHu1HHkDdIKoivi5g,HpWi2CRJlxVCYKd8kS0X-A,4
4,0ZsqqzHu1HHkDdIKoivi5g,KP5OncF2jhT7_J1phHPPww,69
...,...,...,...
20653310,KzSL2VzyT7bdHd0bhMyanw,sQhh7JCGpqNgf0hHWc4m8g,1
20653311,KzSL2VzyT7bdHd0bhMyanw,t9P1At2Cw8PO5NdjIZjrKA,1
20653312,pW5DXTpKnw3y0fopipbJVg,t9P1At2Cw8PO5NdjIZjrKA,1
20653313,9kjcWWo0pZ5qr1ZpAXBFnQ,d9gvn2Nu_0qFzQLhAP9XFQ,1


In [11]:
edge_df.sort_values('num_users', ascending = False)

Unnamed: 0,b1,b2,num_users
1760,IkY2ticzHEn4QFn8hQLSWg,PP3BBaVxZLcJU54uP_wL6Q,1446
47393,PP3BBaVxZLcJU54uP_wL6Q,ytynqOUb3hjKeJfRj5Tshw,566
46874,6ajnOk0GcY9xbb5Ocaw8Gw,ctHjyadbDQAtUFfkcAFEHw,532
102,L5LLN0RafiV1Z9cddzvuCw,QHWYlmVbLC3K6eglWoHVvA,525
25493,8pqdJjefYq-a9IBSJJmKwA,j-qtdD55OLfSqfsWuQTDJg,496
...,...,...,...
10141437,ZslqrrUvqh7gzSPSShgmDA,vaS67igoAR80797pwyqpVA,1
10141439,ZslqrrUvqh7gzSPSShgmDA,vyUbMQDHGaCOH9BdLScxtg,1
10141441,ZslqrrUvqh7gzSPSShgmDA,wDsyh_29ycKVC3pefiqO6w,1
10141442,ZslqrrUvqh7gzSPSShgmDA,wNVrrq0KP2mTS3IPnzMezg,1


In [12]:
# edge_df.to_csv("data/business_edge_user_count.csv")
edge_df.to_feather("data/business_edge_user_count.feather")

## Construct the network graph

Vertices = businesses

Edge between $i$ and $j$ indicates whether there's any user who has been to these two places

Note: See https://networkx.org/documentation/latest/reference/introduction.html#networkx-basics for `nx` package documentation

In [13]:
G = nx.Graph()
G.add_weighted_edges_from(edge_list) # or use G.add_edges_from(edge_list) for unweifhted graph

NameError: name 'edge_list' is not defined

## Split train and test

## Compute similarity scores using different approaches