In [4]:
import os
import pandas as pd
import networkx as nx
import json
from collections import Counter
from itertools import combinations
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain
from fa2_modified import ForceAtlas2
import re
import numpy as np

In [None]:
# Disabled preprocessing step, kept as a bare string literal so it is not executed.
# It counts reviews per user, keeps the reviews of selected users, sorts them by
# business_id and writes them to 'sorted_review_file.json', one JSON object per line.
# NOTE(review): the "Step 2" comment inside says users with fewer than 5 reviews are
# removed, but the filter keeps only users whose review count is exactly 20 — confirm
# which is intended before re-enabling.
# NOTE(review): this writes 'sorted_review_file.json', while the notebook later reads
# 'data/sorted_review_data.json' — confirm how the two files relate.
"""
# Step 1: Count the number of reviews per user
with open('yelp_academic_dataset_review.json', 'r') as f:
    user_counts = Counter(json.loads(line)['user_id'] for line in f)

# Step 2: Filter and sort the lines, removing users with fewer than 5 reviews
with open('yelp_academic_dataset_review.json', 'r') as f, open('sorted_review_file.json', 'w') as out_f:
    lines = [json.loads(line) for line in f if user_counts[json.loads(line)['user_id']] == 20]
    lines.sort(key=lambda x: (x['business_id']))

    # Write sorted and filtered data to the new file
    for line in lines:
        out_f.write(json.dumps(line) + "\n")
        print(line) 
"""

In [None]:
# Disabled sanity check, kept as a bare string literal so it is not executed.
# It counts how many users have exactly `number_of_reviews` reviews.
# NOTE(review): it reads `user_counts`, which in the visible notebook is only
# defined inside the disabled preprocessing cell above — re-enable both together.
"""number_of_reviews = 20
count_users_reviews = sum(count == number_of_reviews for count in user_counts.values())
print(f'Number of users that has reviewed {number_of_reviews} times: {count_users_reviews}')"""

In [5]:
# Load the filtered review data (JSON Lines: one review object per line).
# The path is a plain string; the original f-string prefix had no placeholders.
df = pd.read_json('data/sorted_review_data.json', lines=True)
# Independent working copy, so changes to `chunk` never touch `df`.
chunk = df.copy()
# Optional CSV export of the working copy (disabled):
#chunk.to_csv(f'Files/yelp_academic_dataset_review_small.csv', index=False)

In [6]:
# Preview the first rows of the working copy to check that the review columns loaded.
chunk.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,T5442QeVY13MWJuKbvtgnQ,6SoUQtbIltsun0IIGqWTqA,---kPU91CF4Lq2-WlRu9Lw,4,0,0,0,Excellent food and service. Portions are a lit...,2021-11-28 16:40:02
1,c0PMuZecG8uKZpzlYEs36Q,QM3cNh-u31Z5alYC3h1RnA,--FcbSxK1AoEtEAxOgBaCw,1,1,1,0,I used to come here quite frequently but never...,2018-05-13 13:12:18
2,S8-rmUixe0CANmCMblYl2A,qi6OJ4D1e_T_AUQGHtadvA,--MbOh2O1pATkXa7xbU6LA,4,1,0,0,"For what it is, they make a solid product. Li...",2013-04-14 22:31:08
3,dAoF6rmPQY72tejRqEwDKA,qgLwRvE6JkK_KnbuT5AKmA,--O3ip9NpXTKD4oBS1pY2A,4,7,2,3,There are only a few Kid Friendly places in Sa...,2017-02-11 18:55:08
4,fqWMiA-8VpG-eTuF1YzZGQ,4lng77gRdpRb24o91sSihA,--S43ruInmIsGrnnkmavRw,3,3,1,1,While I liked the layout and some of the selec...,2015-03-22 23:56:41


In [15]:
# Distinct reviewer and business identifiers present in the working copy
users = chunk['user_id'].unique()
businesses = chunk['business_id'].unique()

# Summary: total reviews, then how many distinct users and businesses they cover
print("Number of reviews = ", chunk.shape[0])
print(f'Number of users: {len(users)}')
print(f'Number of businesses: {len(businesses)}')

Number of reviews =  60480
Number of users: 3024
Number of businesses: 31533


In [8]:
# Build the reviewer network: nodes are users, and an edge joins two users who
# reviewed a common business. The edge weight is the number of distinct
# businesses both users reviewed.

# Keep one row per (business, user) pair before grouping. A user who reviewed
# the same business more than once would otherwise appear several times in that
# business's list, and combinations() would emit the same user pair repeatedly,
# inflating the weight for a single shared business.
business_groups = (
    df.drop_duplicates(subset=['business_id', 'user_id'])
      .groupby('business_id')['user_id']
      .apply(list)
)

# Initialize an empty graph
G = nx.Graph()

# The loop variable is `reviewers` (not `users`) so it does not overwrite the
# notebook-level `users` array computed in the summary cell.
for reviewers in business_groups:
    # Each list holds distinct users, so no pair can be a self-loop.
    for user1, user2 in combinations(reviewers, 2):
        if G.has_edge(user1, user2):
            # One more business shared by this pair
            G[user1][user2]['weight'] += 1
        else:
            # First business shared by this pair
            G.add_edge(user1, user2, weight=1)

# Store every review id written by a user as the node attribute 'reviews'.
# Nodes are only created through add_edge above, so users who share no business
# with anyone are not in G.
user_reviews = df.groupby('user_id')['review_id'].apply(list).to_dict()
nx.set_node_attributes(G, user_reviews, 'reviews')

This project works with the giant connected component (GCC) of the reviewer network, i.e. its largest connected component, as the foundation for the subsequent analysis. The GCC is the largest subset of nodes in which every pair of users is linked by a path of shared-business-review edges. The expected advantages of centering the analysis on this component are described below.

Firstly, the GCC encapsulates the core structure of the network, where the majority of interactions and relationships occur. This provides a clearer lens into the most relevant and active portions of the graph, avoiding noise from isolated nodes or small, disconnected components. Additionally, working with the GCC simplifies computational complexity by eliminating disconnected subgraphs, which can hinder the performance of algorithmic approaches applied in later stages of the project. The GCC allows for meaningful analysis of global network properties like degree distributions and clustering, which rely on connectivity to produce interpretable results. Focusing on this component also facilitates cleaner and more effective visualizations, highlighting large-scale patterns and dominant community structures without being cluttered by disconnected nodes.

While this approach may exclude smaller subgraphs, these disconnected components often represent niche interactions that can be analyzed separately for insights into outliers or unique behaviors. The GCC, however, remains the best representation of the network's main dynamics and interconnectedness.

In [9]:
# get the greatest component and save to file
Gcc = sorted(nx.connected_components(G), key=len, reverse=True)
G = G.subgraph(Gcc[0])
nx.write_gml(G, 'graphs/init_graph.gml')

In [None]:
# plot the graph
nx.draw(G, with_labels=False, node_size=10)
# print number of nodes and edges
print(f'Number of nodes: {G.number_of_nodes()}')
print(f'Number of edges: {G.number_of_edges()}')

array(['---kPU91CF4Lq2-WlRu9Lw', '--FcbSxK1AoEtEAxOgBaCw',
       '--MbOh2O1pATkXa7xbU6LA', ..., 'zzbZtgPYZS8sTIWQH6DwEw',
       'zzfj1-iPfw0cwnOjY0yUgA', 'zzyx5x0Z7xXWWvWnZFuxlQ'], dtype=object)

In [19]:
# Save the business records for the businesses that appear in the review data.

# Read the JSON Lines business file with an explicit UTF-8 encoding so decoding
# does not depend on the platform default; blank lines are skipped because
# json.loads would fail on them.
with open('data/yelp_academic_dataset_business.json', 'r', encoding='utf-8') as f:
    business_data = pd.DataFrame([json.loads(line) for line in f if line.strip()])

# Keep only businesses whose id is in `businesses` (the distinct business ids
# computed in the summary cell), then write them to CSV without the index.
business_data = business_data[business_data['business_id'].isin(businesses)]
business_data.to_csv('data/businesses_reviewed.csv', index=False)