# Connections Development

### Libraries

In [1]:
# import libraries

import pandas as pd
from utils import load_filtered_data

# pd.set_option('display.max_rows', None)

In [2]:
# major variables

city = 'Springfield'
data = load_filtered_data(city)

### Data preprocessing

In [3]:
# data preprocessing

data['business'] = data['business'].drop(['city', 'state'], axis=1)
data['business']['categories'] = data['business']['categories'].fillna('Other')

data['user']['friends'] = data['user']['friends'].fillna('')

### Feature engineering

In [4]:
# dataframe for user reviews

user_reviews_df = data['review'].groupby('user_id')['business_id'].agg(list).reset_index()
user_reviews_df.columns = ['user_id', 'business_ids']

user_reviews_df

Unnamed: 0,user_id,business_ids
0,-0NMQ69U_7lMxmUV-wqK8g,[4om4mR09Py3y-tklTb9laA]
1,-0RLwaN6UtUEd_oMoEmOdQ,"[LcfXZV38PcTUEnZoM2vzXQ, YHpoMRXJatNmnTklfGwUVA]"
2,-18_TmfN3l667ukOYRGPzw,[u2my0Zc-phifXpwXHK1fLg]
3,-19L3RUWzF5r2-Q08OEZaw,[Dk0Yw7R8gCnkW78-n_Gv0A]
4,-1Nnx04n8wXbr9kofA932w,[AUialivOAMMuHBt0043Gtw]
...,...,...
7345,zvyiQc2Wsb7kqR61ioXwdQ,[BV_pCw2X5ErMMbGMAo4HtA]
7346,zw50xgudf7xwM1_YWkuRKw,[l3FeusUZE2jsJ5uR-aNR7A]
7347,zwP7nyqroYJ_XweRkrRjng,[iWpS5QvlOqAthz-XIp09jA]
7348,zwcrm4WMUq0Duz6P5rr0sw,[m4Vxvy566FTJ-2GlTn3Obg]


In [5]:
# dataframe for user tips

user_tips_df = data['tip'].groupby('user_id')['business_id'].agg(list).reset_index()
user_tips_df.columns = ['user_id', 'business_ids']

user_tips_df

Unnamed: 0,user_id,business_ids
0,-4nU1RaOzfSAmQityGJAfw,[fuXrwJhfEsvMlrBGbigLJA]
1,-6y3e3PSQZvbVzz0KinimA,[gu9H6XoryCn07BfPsSdeSw]
2,-AYFJI20Mq8WZOJ0fMs6FA,[m4Vxvy566FTJ-2GlTn3Obg]
3,-NwtV6xvM4zSLrSioHUOMQ,"[wz_igrtwHBADxa9xEabmmw, iRu_gQgT9DLBVkELbJmYMw]"
4,-WUsD4lhcoVb6Q0rlWqvLg,"[eCs_kHmwPDzxrZL9Sm1aTg, QWia-PVZBnIyIXFYtt0Orw]"
...,...,...
902,zVw-VwVxuUNkEAAHhMZTaA,"[9datL-qMsIdhMHb3apzqeA, _-8TAMmIbDGkZCXAqSkFMg]"
903,zZ3hd15XkMMGxRFU2KAsxQ,[V7TaLZ5EB94hqRIZ1_xjGQ]
904,zi1b3m3KW0hGXGm6FhSSmA,[AN9alHIrRElARPBZ-aLUGw]
905,zjopT29NTSG4uUmVElQDIA,[JOKReNU6x3qMnIcI_kF4nA]


In [6]:
# dataframe for user categories (reviewed and tipped)

review_business_data = pd.merge(data['review'], data['business'][['business_id', 'categories']], on='business_id', how='left')
tip_business_data = pd.merge(data['tip'], data['business'][['business_id', 'categories']], on='business_id', how='left')
merged_data = pd.concat([review_business_data, tip_business_data])

user_categories_df = merged_data.groupby('user_id')['categories'].agg(lambda x: list(set(x))).reset_index()
user_categories_df.columns = ['user_id', 'categories']

user_categories_df

Unnamed: 0,user_id,categories
0,-0NMQ69U_7lMxmUV-wqK8g,"[Tacos, Restaurants, Juice Bars & Smoothies, S..."
1,-0RLwaN6UtUEd_oMoEmOdQ,"[Automotive, Car Dealers, Auto Repair, Body Sh..."
2,-18_TmfN3l667ukOYRGPzw,"[Accessories, Fashion, Women's Clothing, Shopp..."
3,-19L3RUWzF5r2-Q08OEZaw,"[Hotels, Event Planning & Services, Hotels & T..."
4,-1Nnx04n8wXbr9kofA932w,"[Restaurants, Chicken Wings, Salad, American (..."
...,...,...
7702,zvyiQc2Wsb7kqR61ioXwdQ,"[Burgers, Food, Restaurants, Fast Food, Ice Cr..."
7703,zw50xgudf7xwM1_YWkuRKw,"[Financial Services, Automotive, Auto Parts & ..."
7704,zwP7nyqroYJ_XweRkrRjng,"[American (Traditional), Caterers, Barbeque, E..."
7705,zwcrm4WMUq0Duz6P5rr0sw,"[Restaurants, American (Traditional), Barbeque]"


In [7]:
# dataframe for user friends

user_friends_df = data['user'][['user_id', 'friends']]
user_friends_df.loc[:, 'friends'] = user_friends_df['friends'].str.split(', ')

user_friends_df

Unnamed: 0,user_id,friends
0,IpLRJY4CP3fXtlEd8Y4GFQ,"[hdwDo7CLh9aN_9PckAos4Q, ci-mepWGgsgGT3sFI2mhM..."
1,RgDVC3ZUBqpEe6Y1kPhIpw,"[sHozd2pcOKwHtPr8VlZJfg, 7mL9cvICl8fuCQTM89a-S..."
2,om5ZiponkpRqUNa3pVPiRg,"[bQiST5YdWLSH_yk56ccApA, r7Zi52GqnJa6Tb0dqTKlk..."
3,KrIL3TIOJI-tjvU6BlcA-g,"[eUlLKiRbX3k4VS3Ko_WxTA, nmiOEav_m5eT08zz8BIi4..."
4,1sGYXSkJHPhJ6wQtc-RbZw,"[9w4vI3LRTA4JdJyzOxeEMw, fNSGyM1NJ91CTO_VfJ5as..."
...,...,...
7702,yEGClWugJyBMcOIIm_d0dA,[]
7703,q4mOU6YL850_OsrrE1_ScA,[]
7704,2brjqRYgw2CYcoDwSKTrog,[]
7705,f9Lm5TrNfVX1JmVXIbtCOg,[]


### Creation of nodes and edges

In [8]:
# user nodes

nodes = data['user'][['user_id', 'name']].rename(columns={'user_id': 'id'})

nodes

Unnamed: 0,id,name
0,IpLRJY4CP3fXtlEd8Y4GFQ,Robyn
1,RgDVC3ZUBqpEe6Y1kPhIpw,Monica
2,om5ZiponkpRqUNa3pVPiRg,Andrea
3,KrIL3TIOJI-tjvU6BlcA-g,Elisabeth
4,1sGYXSkJHPhJ6wQtc-RbZw,Kimberly
...,...,...
7702,yEGClWugJyBMcOIIm_d0dA,Lady
7703,q4mOU6YL850_OsrrE1_ScA,Megan
7704,2brjqRYgw2CYcoDwSKTrog,Katherine
7705,f9Lm5TrNfVX1JmVXIbtCOg,Bernae


#### Friendship edges

In [9]:
# edges representing user friendships

friendship_edges = []

for idx, row in user_friends_df.iterrows():
    user_id = row['user_id']
    friends_list = row['friends'] 

    for friend in friends_list:
        if friend in nodes['id'].values:
            friendship_edges.append((user_id, friend, 1, 'friendship'))

friendship_edges = pd.DataFrame(friendship_edges, columns=['from', 'to', 'weight', 'type'])
friendship_edges = friendship_edges.drop_duplicates()

friendship_edges

Unnamed: 0,from,to,weight,type
0,IpLRJY4CP3fXtlEd8Y4GFQ,_BcWyKQL16ndpBdggh2kNA,1,friendship
1,IpLRJY4CP3fXtlEd8Y4GFQ,lRRuTimITgwzoXLIM3g9qw,1,friendship
2,IpLRJY4CP3fXtlEd8Y4GFQ,z02XKjGqJMONH59_lUHlTQ,1,friendship
3,IpLRJY4CP3fXtlEd8Y4GFQ,ovq6MrgSLj8eUYlUY0dUiw,1,friendship
4,IpLRJY4CP3fXtlEd8Y4GFQ,RgDVC3ZUBqpEe6Y1kPhIpw,1,friendship
...,...,...,...,...
12635,_etEcDqw-wyFuF0g2LllTg,5qyowKWXdaH37CqRbzWo6A,1,friendship
12636,fhEyJ8lYziTjbKKpwlkRDw,2sZejr0qKi45-3X8huIgPA,1,friendship
12637,W2QcFD80uk93D3v8YXdd0g,FdItCTIYBCwm-g_VOqoIDQ,1,friendship
12638,WM00BFz-_R2v0yMipH4_yA,Kr0-RU8FBirkX9Zl9khxZg,1,friendship


In [10]:
# save nodes and friendship edges

connection = 'friendships'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
friendship_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Compliment edges

In [11]:
# edges representing user compliments

temp_edges_struct = {'from': [], 'to': [], 'weight': [], 'type': []}
users_vec_compliments = {}
for index, row in data['user'].iterrows():
    users_vec_compliments[row['user_id']] = [
        row['compliment_hot'], 
        row['compliment_more'], 
        row['compliment_profile'], 
        row['compliment_cute'], 
        row['compliment_list'], 
        row['compliment_note'], 
        row['compliment_plain'], 
        row['compliment_cool'], 
        row['compliment_funny'], 
        row['compliment_writer'], 
        row['compliment_photos']
    ]

k = 5
for user_id in users_vec_compliments:
    user_vec = users_vec_compliments[user_id]
    if sum(user_vec) == 0:
        continue
    similarities = {}
    for user_id2 in users_vec_compliments:
        if user_id2 != user_id:
            user_vec2 = users_vec_compliments[user_id2]
            similarity = sum([a*b for a,b in zip(user_vec, user_vec2)])
            similarities[user_id2] = similarity
    similarities = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True))
    similarities = dict(list(similarities.items())[:k])
    for user_id2 in similarities:
        temp_edges_struct['from'].append(user_id)
        temp_edges_struct['to'].append(user_id2)
        temp_edges_struct['weight'].append(similarities[user_id2])
        temp_edges_struct['type'].append('compliments')

compliment_edges = pd.DataFrame(temp_edges_struct)
compliment_edges = compliment_edges.sort_values('weight', ascending=False).drop_duplicates(subset=['from', 'to'], keep='first').sort_index()

compliment_edges

Unnamed: 0,from,to,weight,type
0,IpLRJY4CP3fXtlEd8Y4GFQ,hWDybu_KvYLSdEFzGrniTw,718222,compliments
1,IpLRJY4CP3fXtlEd8Y4GFQ,6s-g2vFu12OemhiK3FJuOQ,488224,compliments
2,IpLRJY4CP3fXtlEd8Y4GFQ,GHoG4X4FY8D8L563zzPX5w,340984,compliments
3,IpLRJY4CP3fXtlEd8Y4GFQ,OgYi5x02MBukfxPNFe1ePg,224124,compliments
4,IpLRJY4CP3fXtlEd8Y4GFQ,om5ZiponkpRqUNa3pVPiRg,206676,compliments
...,...,...,...,...
15330,vL0jIpZO771Q6girrQSXoA,hWDybu_KvYLSdEFzGrniTw,2466,compliments
15331,vL0jIpZO771Q6girrQSXoA,GHoG4X4FY8D8L563zzPX5w,1676,compliments
15332,vL0jIpZO771Q6girrQSXoA,6s-g2vFu12OemhiK3FJuOQ,1590,compliments
15333,vL0jIpZO771Q6girrQSXoA,om5ZiponkpRqUNa3pVPiRg,1370,compliments


In [12]:
# save nodes and compliment edges

connection = 'compliments'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
compliment_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Review edges

In [13]:
# edges representing reviews on same businesses

review_edges = []

user_reviews_exploded = user_reviews_df.explode('business_ids')

review_business_pairs = pd.merge(user_reviews_exploded, user_reviews_exploded, on='business_ids')
review_business_pairs = review_business_pairs[review_business_pairs['user_id_x'] != review_business_pairs['user_id_y']]

common_businesses_count = review_business_pairs.groupby(['user_id_x', 'user_id_y'])['business_ids'].count().reset_index()

review_edges = common_businesses_count.rename(columns={'user_id_x': 'from', 'user_id_y': 'to', 'business_ids': 'weight'})
review_edges['type'] = 'reviewed_same_business'
review_edges = review_edges.drop_duplicates()

review_edges

Unnamed: 0,from,to,weight,type
0,-0NMQ69U_7lMxmUV-wqK8g,CmYLbKdKk6A3qN0RbMdClQ,1,reviewed_same_business
1,-0NMQ69U_7lMxmUV-wqK8g,DDXXwbriQdORFhGHEp-2bA,1,reviewed_same_business
2,-0NMQ69U_7lMxmUV-wqK8g,GZp8BBQ2sKHytOxDgKNXow,1,reviewed_same_business
3,-0NMQ69U_7lMxmUV-wqK8g,HMCaN4POajKGzWHNLwCAJQ,1,reviewed_same_business
4,-0NMQ69U_7lMxmUV-wqK8g,W2i38F065BmFEUd4ZDU8nQ,1,reviewed_same_business
...,...,...,...,...
903455,zzeRWIiPtuJNRBUcxe0Upw,zQ11HdLDyKY_9k7Up8WQ8g,1,reviewed_same_business
903456,zzeRWIiPtuJNRBUcxe0Upw,zQI5yZb366XrcXZK7AvgkA,1,reviewed_same_business
903457,zzeRWIiPtuJNRBUcxe0Upw,zTFdRBjXWwtziZwGjcxm4A,1,reviewed_same_business
903458,zzeRWIiPtuJNRBUcxe0Upw,zVw-VwVxuUNkEAAHhMZTaA,1,reviewed_same_business


In [14]:
# save nodes and review edges

connection = 'business_reviews'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
review_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Tip edges

In [15]:
# edges representing tips on same businesses

tip_edges = []

tips_exploded = user_tips_df.explode('business_ids')

tip_business_pairs = pd.merge(tips_exploded, tips_exploded, on='business_ids')
tip_business_pairs = tip_business_pairs[tip_business_pairs['user_id_x'] != tip_business_pairs['user_id_y']]

common_businesses_count = tip_business_pairs.groupby(['user_id_x', 'user_id_y'])['business_ids'].count().reset_index()

tip_edges = common_businesses_count.rename(columns={'user_id_x': 'from', 'user_id_y': 'to', 'business_ids': 'weight'})
tip_edges['type'] = 'tipped_same_business'
tip_edges = tip_edges.drop_duplicates()

tip_edges

Unnamed: 0,from,to,weight,type
0,-4nU1RaOzfSAmQityGJAfw,2cIlse4JbQjVB7qJowyguA,1,tipped_same_business
1,-4nU1RaOzfSAmQityGJAfw,6MHOoXqAhHEKI8oX6FW3Hg,2,tipped_same_business
2,-4nU1RaOzfSAmQityGJAfw,6skU7psOfkA6MulD5PzA2g,1,tipped_same_business
3,-4nU1RaOzfSAmQityGJAfw,B848ziwrE_Kn5MI9Pw9bpA,1,tipped_same_business
4,-4nU1RaOzfSAmQityGJAfw,EXJj7S_nsfqgraUUaYM3BQ,1,tipped_same_business
...,...,...,...,...
15289,zjopT29NTSG4uUmVElQDIA,23JH7GzkZfI49R3uFjwiBg,1,tipped_same_business
15290,zjopT29NTSG4uUmVElQDIA,Xvq7cNYh4sBAK7gGhRLZGw,1,tipped_same_business
15291,zjopT29NTSG4uUmVElQDIA,xMAuZIz1iGh6zw-nezVNYw,1,tipped_same_business
15292,zzeRWIiPtuJNRBUcxe0Upw,lnBEeWOkeVZapLa4FXMGXQ,1,tipped_same_business


In [16]:
# save nodes and tip edges

connection = 'business_tips'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
tip_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Category edges

In [17]:
# edges representing reviews or tips on common categories

category_edges = []

for i in range(len(user_categories_df)):
    for j in range(i+1, len(user_categories_df)):
        user1_categories = set(user_categories_df.loc[i, 'categories'])
        user2_categories = set(user_categories_df.loc[j, 'categories'])
        common_categories = user1_categories.intersection(user2_categories)

        if common_categories:
            edge = {
                'from': user_categories_df.loc[i, 'user_id'],
                'to': user_categories_df.loc[j, 'user_id'],
                'weight': len(common_categories),
                'type': 'common_categories'
            }
            category_edges.append(edge)

category_edges = pd.DataFrame(category_edges)
category_edges = category_edges.drop_duplicates()

category_edges

Unnamed: 0,from,to,weight,type
0,-0NMQ69U_7lMxmUV-wqK8g,CmYLbKdKk6A3qN0RbMdClQ,1,common_categories
1,-0NMQ69U_7lMxmUV-wqK8g,DDXXwbriQdORFhGHEp-2bA,1,common_categories
2,-0NMQ69U_7lMxmUV-wqK8g,GZp8BBQ2sKHytOxDgKNXow,1,common_categories
3,-0NMQ69U_7lMxmUV-wqK8g,HMCaN4POajKGzWHNLwCAJQ,1,common_categories
4,-0NMQ69U_7lMxmUV-wqK8g,W2i38F065BmFEUd4ZDU8nQ,1,common_categories
...,...,...,...,...
555980,zmLUS4Tqn-qzkg3ec6U9eg,zp-XjxYQPY1w8Le6GzI25Q,1,common_categories
555981,zoqZzpNvz06kEqtzcqHk1A,zqG8J7eo7IldyIyobwsNfw,1,common_categories
555982,zq1YDjAeoJjOibgll_RZrg,zwcrm4WMUq0Duz6P5rr0sw,1,common_categories
555983,zs-DHE3qf05M-YH2pw-WTA,ztDSO7DSGNtPN_XW5VFjvg,1,common_categories


In [18]:
# save nodes and category edges

connection = 'categories'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
category_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

In [19]:
# edges representing reviews or tips on common categories (with threshold)

threshold_category_edges = []
threshold = 1

for i in range(len(user_categories_df)):
    for j in range(i+1, len(user_categories_df)):
        user1_categories = set(user_categories_df.loc[i, 'categories'])
        user2_categories = set(user_categories_df.loc[j, 'categories'])
        common_categories = user1_categories.intersection(user2_categories)

        if len(common_categories) >= threshold:
            edge = {
                'from': user_categories_df.loc[i, 'user_id'],
                'to': user_categories_df.loc[j, 'user_id'],
                'weight': 1,
                'type': 'common_categories'
            }
            threshold_category_edges.append(edge)

threshold_category_edges = pd.DataFrame(threshold_category_edges)
threshold_category_edges = threshold_category_edges.drop_duplicates()

threshold_category_edges

Unnamed: 0,from,to,weight,type
0,-0NMQ69U_7lMxmUV-wqK8g,CmYLbKdKk6A3qN0RbMdClQ,1,common_categories
1,-0NMQ69U_7lMxmUV-wqK8g,DDXXwbriQdORFhGHEp-2bA,1,common_categories
2,-0NMQ69U_7lMxmUV-wqK8g,GZp8BBQ2sKHytOxDgKNXow,1,common_categories
3,-0NMQ69U_7lMxmUV-wqK8g,HMCaN4POajKGzWHNLwCAJQ,1,common_categories
4,-0NMQ69U_7lMxmUV-wqK8g,W2i38F065BmFEUd4ZDU8nQ,1,common_categories
...,...,...,...,...
555980,zmLUS4Tqn-qzkg3ec6U9eg,zp-XjxYQPY1w8Le6GzI25Q,1,common_categories
555981,zoqZzpNvz06kEqtzcqHk1A,zqG8J7eo7IldyIyobwsNfw,1,common_categories
555982,zq1YDjAeoJjOibgll_RZrg,zwcrm4WMUq0Duz6P5rr0sw,1,common_categories
555983,zs-DHE3qf05M-YH2pw-WTA,ztDSO7DSGNtPN_XW5VFjvg,1,common_categories


In [20]:
# save nodes and category edges (with threshold)

connection = 'threshold_categories'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
threshold_category_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Combined edges

In [21]:
# edges representing several interactions mixed

combined_edges = []

category_edges_copy = category_edges.copy()
review_edges_copy = review_edges.copy()
tip_edges_copy = tip_edges.copy()
friendship_edges_copy = friendship_edges.copy()

category_edges_weight = 1
review_edges_weight = 2
tip_edges_weight = 3
friendship_edges_weight = 4

category_edges_copy['weight'] *= category_edges_weight
review_edges_copy['weight'] *= review_edges_weight
tip_edges_copy['weight'] *= tip_edges_weight
friendship_edges_copy['weight'] *= friendship_edges_weight

combined_edges = pd.concat([category_edges_copy, review_edges_copy, tip_edges_copy, friendship_edges_copy])
combined_edges = combined_edges.groupby(['from', 'to', 'type']).agg({'weight': 'sum'}).reset_index()
combined_edges['type'] = 'mixed_interactions'

combined_edges = combined_edges.drop_duplicates()

combined_edges

Unnamed: 0,from,to,type,weight
0,-0NMQ69U_7lMxmUV-wqK8g,CmYLbKdKk6A3qN0RbMdClQ,mixed_interactions,1
1,-0NMQ69U_7lMxmUV-wqK8g,CmYLbKdKk6A3qN0RbMdClQ,mixed_interactions,2
2,-0NMQ69U_7lMxmUV-wqK8g,DDXXwbriQdORFhGHEp-2bA,mixed_interactions,1
3,-0NMQ69U_7lMxmUV-wqK8g,DDXXwbriQdORFhGHEp-2bA,mixed_interactions,2
4,-0NMQ69U_7lMxmUV-wqK8g,GZp8BBQ2sKHytOxDgKNXow,mixed_interactions,1
...,...,...,...,...
1487374,zzeRWIiPtuJNRBUcxe0Upw,zQ11HdLDyKY_9k7Up8WQ8g,mixed_interactions,2
1487375,zzeRWIiPtuJNRBUcxe0Upw,zQI5yZb366XrcXZK7AvgkA,mixed_interactions,2
1487376,zzeRWIiPtuJNRBUcxe0Upw,zTFdRBjXWwtziZwGjcxm4A,mixed_interactions,2
1487377,zzeRWIiPtuJNRBUcxe0Upw,zVw-VwVxuUNkEAAHhMZTaA,mixed_interactions,2


In [22]:
# save nodes and combined edges

connection = 'combined'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
combined_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

In [23]:
# edges representing categories and business reviews mixed

category_review_edges = []

category_edges_weight = 1
review_edges_weight = 2

category_edges_copy = category_edges.copy()
review_edges_copy = review_edges.copy()

category_edges_copy['weight'] *= category_edges_weight
review_edges_copy['weight'] *= review_edges_weight

category_review_edges = pd.concat([category_edges_copy, review_edges_copy])
category_review_edges = category_review_edges.groupby(['from', 'to']).agg({'weight': 'sum'}).reset_index()

category_review_edges['type'] = 'common_categories_and_reviews'

category_review_edges = category_review_edges.drop_duplicates()

category_review_edges


Unnamed: 0,from,to,weight,type
0,-0NMQ69U_7lMxmUV-wqK8g,CmYLbKdKk6A3qN0RbMdClQ,3,common_categories_and_reviews
1,-0NMQ69U_7lMxmUV-wqK8g,DDXXwbriQdORFhGHEp-2bA,3,common_categories_and_reviews
2,-0NMQ69U_7lMxmUV-wqK8g,GZp8BBQ2sKHytOxDgKNXow,3,common_categories_and_reviews
3,-0NMQ69U_7lMxmUV-wqK8g,HMCaN4POajKGzWHNLwCAJQ,3,common_categories_and_reviews
4,-0NMQ69U_7lMxmUV-wqK8g,W2i38F065BmFEUd4ZDU8nQ,3,common_categories_and_reviews
...,...,...,...,...
1007710,zzeRWIiPtuJNRBUcxe0Upw,zQ11HdLDyKY_9k7Up8WQ8g,2,common_categories_and_reviews
1007711,zzeRWIiPtuJNRBUcxe0Upw,zQI5yZb366XrcXZK7AvgkA,2,common_categories_and_reviews
1007712,zzeRWIiPtuJNRBUcxe0Upw,zTFdRBjXWwtziZwGjcxm4A,2,common_categories_and_reviews
1007713,zzeRWIiPtuJNRBUcxe0Upw,zVw-VwVxuUNkEAAHhMZTaA,2,common_categories_and_reviews


In [24]:
# save nodes and category review edges

connection = 'categories_and_reviews'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
category_review_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Priority Combined edges

In [27]:
# list of preferences in order of priority
list_of_preferences = ['friendship', 'reviewed_same_business', 'tipped_same_business', 'common_categories']

priority_combined_edges = pd.DataFrame(columns=['from', 'to', 'weight', 'type'])

def get_alone_users(nodes, edges):
    alone_users = set(nodes['id'].values) - set(edges['from'].values) - set(edges['to'].values)
    return pd.DataFrame(list(alone_users), columns=['id'])


def clean_edges(edges, users_to_keep):
    edges = edges[edges['from'].isin(users_to_keep)]
    edges = edges[edges['to'].isin(users_to_keep)]
    return edges


for preference in list_of_preferences:
    if preference == 'friendship':
        priority_combined_edges = pd.concat([priority_combined_edges, clean_edges(friendship_edges, get_alone_users(nodes, priority_combined_edges)['id'])])
    elif preference == 'reviewed_same_business':
        priority_combined_edges = pd.concat([priority_combined_edges, clean_edges(review_edges, get_alone_users(nodes, priority_combined_edges)['id'])])
    elif preference == 'tipped_same_business':
        priority_combined_edges = pd.concat([priority_combined_edges, clean_edges(tip_edges, get_alone_users(nodes, priority_combined_edges)['id'])])
    elif preference == 'common_categories':
        priority_combined_edges = pd.concat([priority_combined_edges, clean_edges(category_edges, get_alone_users(nodes, priority_combined_edges)['id'])])


# join the rest of the alone people with 0 weight
alone_users = get_alone_users(nodes, priority_combined_edges)
alone_users['weight'] = 0
alone_users['type'] = 'alone'

priority_combined_edges = pd.concat([priority_combined_edges, alone_users])
priority_combined_edges = priority_combined_edges.drop(columns=['id'], errors='ignore')
priority_combined_edges = priority_combined_edges.drop_duplicates()

priority_combined_edges

Unnamed: 0,from,to,weight,type
0,IpLRJY4CP3fXtlEd8Y4GFQ,_BcWyKQL16ndpBdggh2kNA,1,friendship
1,IpLRJY4CP3fXtlEd8Y4GFQ,lRRuTimITgwzoXLIM3g9qw,1,friendship
2,IpLRJY4CP3fXtlEd8Y4GFQ,z02XKjGqJMONH59_lUHlTQ,1,friendship
3,IpLRJY4CP3fXtlEd8Y4GFQ,ovq6MrgSLj8eUYlUY0dUiw,1,friendship
4,IpLRJY4CP3fXtlEd8Y4GFQ,RgDVC3ZUBqpEe6Y1kPhIpw,1,friendship
...,...,...,...,...
15279,zZ3hd15XkMMGxRFU2KAsxQ,YsSXh_mXj11SFGUF2dBPSQ,1,tipped_same_business
15280,zZ3hd15XkMMGxRFU2KAsxQ,d30bMRm0_zcSjvoKqTIphA,1,tipped_same_business
15285,zZ3hd15XkMMGxRFU2KAsxQ,rjw8dV3GxaMs6PECISJSTA,1,tipped_same_business
15287,zZ3hd15XkMMGxRFU2KAsxQ,v3DRj88Rr9DbldPzu-pA-g,1,tipped_same_business


In [28]:
# save nodes and priority combined edges

connection = 'priority_combined'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
priority_combined_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)