# Connections Development

### Libraries

In [1]:
# import libraries

import pandas as pd
from utils import load_filtered_data

# pd.set_option('display.max_rows', None)

In [2]:
# major variables

city = 'Sicklerville'
data = load_filtered_data(city)

### Data preprocessing

In [3]:
# data preprocessing

data['business'] = data['business'].drop(['city', 'state'], axis=1)
data['business']['categories'] = data['business']['categories'].fillna('Other')

data['user']['friends'] = data['user']['friends'].fillna('')

### Feature engineering

In [4]:
# dataframe for user reviews

user_reviews_df = data['review'].groupby('user_id')['business_id'].agg(list).reset_index()
user_reviews_df.columns = ['user_id', 'business_ids']

user_reviews_df

Unnamed: 0,user_id,business_ids
0,-0MIp6WKJ8QvGnYZQ5ETyg,[2jy1GXVEHFIZZDV9rNBzcQ]
1,-23fvrDipXmZf9edeiF2yA,[QiQl1_ErNV_wcQS0Bgfg8A]
2,-27LxQq2L2oXHPYXzspBNA,[FqHuK8GngSTqD-64mIdBrw]
3,-3CNoR9qVyD_P_K1Hvujbw,[nVRPnQOHahdw3N5PFwoYwA]
4,-6MrOJCN-eIe37-LT-uPoQ,"[hHTo8EhV8nLPLcO4SHmQgQ, NbajxjmlNjYfUNr1osXpY..."
...,...,...
2885,ztj0Iln1_ahMyDu9WXQNeA,"[W4EklwiIeQNH6QhkcGllQQ, kEsHQJADJRgsPMjio88mYg]"
2886,zvQmf9h7UpKiqt3XU7hppQ,[3a4WbB1do5QXwtVEwDiZcw]
2887,zvYSqlpOr2Nhi6ct_91vYw,"[NbajxjmlNjYfUNr1osXpYw, j1Gkb3wSeo3klOEAhL75c..."
2888,zxFEGpkzzdAb7qR6e6qWwQ,[lQZDL1J6xDjL7PG-5CgkGA]


In [5]:
# dataframe for user tips

user_tips_df = data['tip'].groupby('user_id')['business_id'].agg(list).reset_index()
user_tips_df.columns = ['user_id', 'business_ids']

user_tips_df

Unnamed: 0,user_id,business_ids
0,-7fV-5KlUUMatLOiIOBT-g,[TOVprh8VkxDRMwU-MlRgJg]
1,-FpdhQ3h2YQgKMtF7NX85w,[BreAoaPXLkmLRU0xeSzQPw]
2,-G7Zkl1wIWBBmD0KRy_sCw,[0pL-eq0ufX5jMDgVAvoHHw]
3,-XZOz3ViFET3IZFRGFoOpQ,"[QCZvkvipsfyZ_5uBy3PkpQ, QCZvkvipsfyZ_5uBy3PkpQ]"
4,-kAQ-PSKx3i0st2B4aPpTg,[0p7kYtoqwJScvJjMjXYr7w]
...,...,...
431,z_79z8n_uFOKbluPlZrtFg,[hHTo8EhV8nLPLcO4SHmQgQ]
432,zb1dJhjlW3c-IAMr0OjZaQ,[Bw2X8gqOouIIznH2x_bLeQ]
433,zhx48x_XOjJQBA0DMnPG2Q,[oGDLaTRLPLz-R_KyQ3TzPA]
434,zkamNMEjihh3zN7lC7_WVw,"[NbajxjmlNjYfUNr1osXpYw, i39XLqAolwphb5zkZVId_A]"


In [6]:
# dataframe for user categories (reviewed and tipped)

review_business_data = pd.merge(data['review'], data['business'][['business_id', 'categories']], on='business_id', how='left')
tip_business_data = pd.merge(data['tip'], data['business'][['business_id', 'categories']], on='business_id', how='left')
merged_data = pd.concat([review_business_data, tip_business_data])

user_categories_df = merged_data.groupby('user_id')['categories'].agg(lambda x: list(set(x))).reset_index()
user_categories_df.columns = ['user_id', 'categories']

user_categories_df

Unnamed: 0,user_id,categories
0,-0MIp6WKJ8QvGnYZQ5ETyg,"[Fast Food, Salad, Restaurants, Chicken Shop, ..."
1,-23fvrDipXmZf9edeiF2yA,"[Appliances & Repair, Local Services]"
2,-27LxQq2L2oXHPYXzspBNA,"[Restaurants, Caribbean, Soul Food]"
3,-3CNoR9qVyD_P_K1Hvujbw,"[Delis, Restaurants]"
4,-6MrOJCN-eIe37-LT-uPoQ,"[Coffee & Tea, Food, Mexican, Restaurants, Lat..."
...,...,...
3073,ztj0Iln1_ahMyDu9WXQNeA,"[Food, Restaurants, Mexican, Fast Food, Salad,..."
3074,zvQmf9h7UpKiqt3XU7hppQ,"[Skin Care, Health & Medical, Massage Therapy,..."
3075,zvYSqlpOr2Nhi6ct_91vYw,"[American (Traditional), Chicken Wings, Bars, ..."
3076,zxFEGpkzzdAb7qR6e6qWwQ,"[Restaurants, Asian Fusion]"


In [7]:
# dataframe for user friends

user_friends_df = data['user'][['user_id', 'friends']]
user_friends_df.loc[:, 'friends'] = user_friends_df['friends'].str.split(', ')

user_friends_df

Unnamed: 0,user_id,friends
0,RgDVC3ZUBqpEe6Y1kPhIpw,"[sHozd2pcOKwHtPr8VlZJfg, 7mL9cvICl8fuCQTM89a-S..."
1,zkamNMEjihh3zN7lC7_WVw,"[YLzb3jjc3p-XAOBRggcrgA, 9lcq01JUhRY2yW8AxU3TI..."
2,gVFxZMcuG_Tal2_TnpmUPg,"[uUh_q6rVMhKxwt0eaiphRg, jRqWx2z7jdSzpvqqswqNn..."
3,g0_x4kVvJAYuk96oCcbOmw,"[_BcWyKQL16ndpBdggh2kNA, mx50KP6rXgutH36fhnL6V..."
4,-NbeVN5tnwdyYAvdNkKMjw,"[9IRuYmy5YmhtNQ6ei1p-uQ, 6NvkQATK0J2nmEWsuzhHi..."
...,...,...
3073,CUE13op55l9WZp6fBTUWnA,[]
3074,cACn6Qc1owO7hV_ZhcP74Q,[]
3075,SpTYEhfInvALG1le1uu21w,[]
3076,sGOCaCP7_SMlaRKPNO__CQ,[]


### Creation of nodes and edges

In [8]:
# user nodes

nodes = data['user'][['user_id', 'name']].rename(columns={'user_id': 'id'})

nodes

Unnamed: 0,id,name
0,RgDVC3ZUBqpEe6Y1kPhIpw,Monica
1,zkamNMEjihh3zN7lC7_WVw,Matthew
2,gVFxZMcuG_Tal2_TnpmUPg,Stephanie
3,g0_x4kVvJAYuk96oCcbOmw,Michael
4,-NbeVN5tnwdyYAvdNkKMjw,Dominic
...,...,...
3073,CUE13op55l9WZp6fBTUWnA,Nicole
3074,cACn6Qc1owO7hV_ZhcP74Q,Allison
3075,SpTYEhfInvALG1le1uu21w,Ray
3076,sGOCaCP7_SMlaRKPNO__CQ,Tahjae


#### Friendship edges

In [9]:
# edges representing user friendships

friendship_edges = []

for idx, row in user_friends_df.iterrows():
    user_id = row['user_id']
    friends_list = row['friends'] 

    for friend in friends_list:
        if friend in nodes['id'].values:
            friendship_edges.append((user_id, friend, 1, 'friendship'))

friendship_edges = pd.DataFrame(friendship_edges, columns=['from', 'to', 'weight', 'type'])
friendship_edges = friendship_edges.drop_duplicates()

friendship_edges

Unnamed: 0,from,to,weight,type
0,RgDVC3ZUBqpEe6Y1kPhIpw,ET8n-r7glWYqZhuR6GcdNw,1,friendship
1,RgDVC3ZUBqpEe6Y1kPhIpw,A_BF2dDDUTKGVXrqxO9mag,1,friendship
2,RgDVC3ZUBqpEe6Y1kPhIpw,r3ov7FgibBx41_W74I1KiA,1,friendship
3,RgDVC3ZUBqpEe6Y1kPhIpw,ApJ9YgYU-AhS4a-F5oTGHw,1,friendship
4,RgDVC3ZUBqpEe6Y1kPhIpw,cXtE-ZTrNx-hE7usp_f_xg,1,friendship
...,...,...,...,...
3553,XK-y2bmfTHkvYcXhZdUsUw,V_GUiV9qmwiHf2mpoe6Z5A,1,friendship
3554,YNVgWbJpVqi16hd_TRTmSA,ePxjx8tee06mn5h87X-Sxg,1,friendship
3555,g2wm59vqikKev1UxWtPwpw,7PgEfFn3tX__MeZANY3tYQ,1,friendship
3556,No6DEuOu_In4ncAUg0MRtA,-u5CsCUHOvP5OHYaNGteZw,1,friendship


In [10]:
# save nodes and friendship edges

connection = 'friendships'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
friendship_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Review edges

In [11]:
# edges representing reviews on same businesses

review_edges = []

user_reviews_exploded = user_reviews_df.explode('business_ids')

review_business_pairs = pd.merge(user_reviews_exploded, user_reviews_exploded, on='business_ids')
review_business_pairs = review_business_pairs[review_business_pairs['user_id_x'] != review_business_pairs['user_id_y']]

common_businesses_count = review_business_pairs.groupby(['user_id_x', 'user_id_y'])['business_ids'].count().reset_index()

review_edges = common_businesses_count.rename(columns={'user_id_x': 'from', 'user_id_y': 'to', 'business_ids': 'weight'})
review_edges['type'] = 'reviewed_same_business'
review_edges = review_edges.drop_duplicates()

review_edges

Unnamed: 0,from,to,weight,type
0,-0MIp6WKJ8QvGnYZQ5ETyg,-ZCf13dTuzGKMwKFRxTUwQ,1,reviewed_same_business
1,-0MIp6WKJ8QvGnYZQ5ETyg,-u5CsCUHOvP5OHYaNGteZw,1,reviewed_same_business
2,-0MIp6WKJ8QvGnYZQ5ETyg,0cLvdT2ANqM7CL93gccVjA,1,reviewed_same_business
3,-0MIp6WKJ8QvGnYZQ5ETyg,2K6e4rSuEYKJpmU37Kk-oA,1,reviewed_same_business
4,-0MIp6WKJ8QvGnYZQ5ETyg,2RCnyrzyOQ1oA9jSgL7XPQ,1,reviewed_same_business
...,...,...,...,...
273707,zyTCJLh4oUsuRar4kxRfgg,vnSNGuRxa9p3Xrvlj3t03g,1,reviewed_same_business
273708,zyTCJLh4oUsuRar4kxRfgg,w7QCKFrjpaBHiPSOo2O3MQ,1,reviewed_same_business
273709,zyTCJLh4oUsuRar4kxRfgg,xvdkeWY0421VZicMbYmz2w,1,reviewed_same_business
273710,zyTCJLh4oUsuRar4kxRfgg,y2PUG5elRIz6xkQP30DScA,1,reviewed_same_business


In [12]:
# save nodes and review edges

connection = 'business_reviews'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
review_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Tip edges

In [13]:
# edges representing tips on same businesses

tip_edges = []

tips_exploded = user_tips_df.explode('business_ids')

tip_business_pairs = pd.merge(tips_exploded, tips_exploded, on='business_ids')
tip_business_pairs = tip_business_pairs[tip_business_pairs['user_id_x'] != tip_business_pairs['user_id_y']]

common_businesses_count = tip_business_pairs.groupby(['user_id_x', 'user_id_y'])['business_ids'].count().reset_index()

tip_edges = common_businesses_count.rename(columns={'user_id_x': 'from', 'user_id_y': 'to', 'business_ids': 'weight'})
tip_edges['type'] = 'tipped_same_business'
tip_edges = tip_edges.drop_duplicates()

tip_edges

Unnamed: 0,from,to,weight,type
0,-7fV-5KlUUMatLOiIOBT-g,19db0FoXZ3Z42YlxJNc3yA,1,tipped_same_business
1,-7fV-5KlUUMatLOiIOBT-g,EqxTRmQVFYOKDZAzg_jPUA,1,tipped_same_business
2,-7fV-5KlUUMatLOiIOBT-g,FDF8cb38lGF61RYXuKxF-A,1,tipped_same_business
3,-7fV-5KlUUMatLOiIOBT-g,_HVLI1r7CFgq7UgKJJduLw,1,tipped_same_business
4,-7fV-5KlUUMatLOiIOBT-g,dhLizr4a2oydrv15y56r0A,1,tipped_same_business
...,...,...,...,...
6081,zvYSqlpOr2Nhi6ct_91vYw,dLUJcC7l6fOM8oTc63SQSA,1,tipped_same_business
6082,zvYSqlpOr2Nhi6ct_91vYw,e3w7nYEWShsYUjbJSE577g,1,tipped_same_business
6083,zvYSqlpOr2Nhi6ct_91vYw,kgXtOag6mfRoIHa1LTjoug,1,tipped_same_business
6084,zvYSqlpOr2Nhi6ct_91vYw,vZsH7VYNIa42wo4-f1svEQ,1,tipped_same_business


In [14]:
# save nodes and tip edges

connection = 'business_tips'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
tip_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Category edges

In [15]:
# edges representing reviews or tips on common categories

category_edges = []

for i in range(len(user_categories_df)):
    for j in range(i+1, len(user_categories_df)):
        user1_categories = set(user_categories_df.loc[i, 'categories'])
        user2_categories = set(user_categories_df.loc[j, 'categories'])
        common_categories = user1_categories.intersection(user2_categories)

        if common_categories:
            edge = {
                'from': user_categories_df.loc[i, 'user_id'],
                'to': user_categories_df.loc[j, 'user_id'],
                'weight': len(common_categories),
                'type': 'common_categories'
            }
            category_edges.append(edge)

category_edges = pd.DataFrame(category_edges)
category_edges = category_edges.drop_duplicates()

category_edges

Unnamed: 0,from,to,weight,type
0,-0MIp6WKJ8QvGnYZQ5ETyg,-ZCf13dTuzGKMwKFRxTUwQ,1,common_categories
1,-0MIp6WKJ8QvGnYZQ5ETyg,-u5CsCUHOvP5OHYaNGteZw,1,common_categories
2,-0MIp6WKJ8QvGnYZQ5ETyg,0cLvdT2ANqM7CL93gccVjA,1,common_categories
3,-0MIp6WKJ8QvGnYZQ5ETyg,2K6e4rSuEYKJpmU37Kk-oA,1,common_categories
4,-0MIp6WKJ8QvGnYZQ5ETyg,2RCnyrzyOQ1oA9jSgL7XPQ,1,common_categories
...,...,...,...,...
173850,zg7fd5Lqylh6bEVzZksRAA,zxFEGpkzzdAb7qR6e6qWwQ,1,common_categories
173851,ziqmpeDagwM5_IOhQQL1-Q,zxFEGpkzzdAb7qR6e6qWwQ,1,common_categories
173852,zkUag7nMlqepogNJIi4EoA,zvYSqlpOr2Nhi6ct_91vYw,1,common_categories
173853,zkamNMEjihh3zN7lC7_WVw,zmS0zKxOejPVYVv-ivTBGg,1,common_categories


In [16]:
# save nodes and category edges

connection = 'categories'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
category_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)

#### Combined edges

In [17]:
# edges representing several interactions mixed

combined_edges = []

category_edges_copy = category_edges.copy()
review_edges_copy = review_edges.copy()
tip_edges_copy = tip_edges.copy()
friendship_edges_copy = friendship_edges.copy()

category_edges_weight = 1
review_edges_weight = 2
tip_edges_weight = 3
friendship_edges_weight = 4

category_edges_copy['weight'] *= category_edges_weight
review_edges_copy['weight'] *= review_edges_weight
tip_edges_copy['weight'] *= tip_edges_weight
friendship_edges_copy['weight'] *= friendship_edges_weight

combined_edges = pd.concat([category_edges_copy, review_edges_copy, tip_edges_copy, friendship_edges_copy])
combined_edges = combined_edges.groupby(['from', 'to', 'type']).agg({'weight': 'sum'}).reset_index()

combined_edges = combined_edges.drop_duplicates()

combined_edges

Unnamed: 0,from,to,type,weight
0,-0MIp6WKJ8QvGnYZQ5ETyg,-ZCf13dTuzGKMwKFRxTUwQ,common_categories,1
1,-0MIp6WKJ8QvGnYZQ5ETyg,-ZCf13dTuzGKMwKFRxTUwQ,reviewed_same_business,2
2,-0MIp6WKJ8QvGnYZQ5ETyg,-u5CsCUHOvP5OHYaNGteZw,common_categories,1
3,-0MIp6WKJ8QvGnYZQ5ETyg,-u5CsCUHOvP5OHYaNGteZw,reviewed_same_business,2
4,-0MIp6WKJ8QvGnYZQ5ETyg,0cLvdT2ANqM7CL93gccVjA,common_categories,1
...,...,...,...,...
457206,zyTCJLh4oUsuRar4kxRfgg,vnSNGuRxa9p3Xrvlj3t03g,reviewed_same_business,2
457207,zyTCJLh4oUsuRar4kxRfgg,w7QCKFrjpaBHiPSOo2O3MQ,reviewed_same_business,2
457208,zyTCJLh4oUsuRar4kxRfgg,xvdkeWY0421VZicMbYmz2w,reviewed_same_business,2
457209,zyTCJLh4oUsuRar4kxRfgg,y2PUG5elRIz6xkQP30DScA,reviewed_same_business,2


In [None]:
# save nodes and combined edges

connection = 'combined'

nodes.to_csv(f'nodes_and_edges/{city}_{connection}_nodes.csv', index=False)
combined_edges.to_csv(f'nodes_and_edges/{city}_{connection}_edges.csv', index=False)