#### Importing libraries and directory settings

In [1]:
import numpy as np
import random
from collections import Counter 
import json
from tqdm import tqdm
from gensim.models import Word2Vec, KeyedVectors
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Directory holding the Yelp dataset files. Paths below are built by plain string
# concatenation (data_dir + file name), so the trailing slash is required.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_dir = '/home/deependra/project/23-hetero-smote/HeteroG/data/yelp_kaggle/'

# File names of the three Yelp dumps loaded below (read as one JSON object per line).
business_file = 'yelp_academic_dataset_business.json'
review_file = 'yelp_academic_dataset_review.json'
user_file = 'yelp_academic_dataset_user.json'

### Preparing the subset of yelp dataset for the analysis

#### Loading json files

In [2]:
# Load business data: the file holds one JSON object per line.
# Every record is stored positionally as list(record.values()), and
# `business_header` maps a column name to its position in those lists.
# This relies on all records listing the same keys in the same order.

business_data = []
business_record = None  # most recently parsed record; its keys define the header
with open(data_dir + business_file, encoding='utf-8') as f:
    lines = f.readlines()
    for line in tqdm(lines):
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        business_record = json.loads(line)
        business_data.append(list(business_record.values()))

if business_record is None:
    raise ValueError(f'No records found in {data_dir + business_file}')

# Built from the already-parsed last record instead of re-parsing the leaked
# loop variable `line` several times.
business_header = {key: position for position, key in enumerate(business_record)}

print('Total number of business data: ', len(business_data))
print('business data columns: ', len(business_header))

100%|██████████| 150346/150346 [00:02<00:00, 57084.83it/s]

Total number of business data:  150346
business data columns:  14





In [3]:
# Load review data: the file holds one JSON object per line.
# Every record is stored positionally as list(record.values()), and
# `review_header` maps a column name to its position in those lists.
# This relies on all records listing the same keys in the same order.
review_data = []
review_record = None  # most recently parsed record; its keys define the header
with open(data_dir + review_file, encoding='utf-8') as f:
    lines = f.readlines()
    for line in tqdm(lines):
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        review_record = json.loads(line)
        review_data.append(list(review_record.values()))

if review_record is None:
    raise ValueError(f'No records found in {data_dir + review_file}')

# Built from the already-parsed last record instead of re-parsing the leaked
# loop variable `line` several times.
review_header = {key: position for position, key in enumerate(review_record)}

print('Total number of review data: ', len(review_data))
print('review data columns: ', len(review_header))

100%|██████████| 6990280/6990280 [00:43<00:00, 159091.84it/s]

Total number of review data:  6990280
review data columns:  9





In [4]:
# Load user data: the file holds one JSON object per line.
# Every record is stored positionally as list(record.values()), and
# `user_header` maps a column name to its position in those lists.
# This relies on all records listing the same keys in the same order.
user_data = []
user_record = None  # most recently parsed record; its keys define the header
with open(data_dir + user_file, encoding='utf-8') as f:
    lines = f.readlines()
    for line in tqdm(lines):
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at EOF)
        user_record = json.loads(line)
        user_data.append(list(user_record.values()))

if user_record is None:
    raise ValueError(f'No records found in {data_dir + user_file}')

# Built from the already-parsed last record instead of re-parsing the leaked
# loop variable `line` several times.
user_header = {key: position for position, key in enumerate(user_record)}

print('Total number of user data: ', len(user_data))
print('user data columns: ', len(user_header))

# Drop the raw text lines; only the parsed records are needed from here on.
del lines, line

100%|██████████| 1987897/1987897 [00:17<00:00, 114726.31it/s]

Total number of user data:  1987897
user data columns:  22





In [5]:
# Show the column-name -> position mapping of each loaded table.
print(
    f'business header: \n{business_header}\n',
    f'user header: \n{user_header}\n',
    f'review header: \n{review_header}\n',
    sep='\n',
)

business header: 
{'business_id': 0, 'name': 1, 'address': 2, 'city': 3, 'state': 4, 'postal_code': 5, 'latitude': 6, 'longitude': 7, 'stars': 8, 'review_count': 9, 'is_open': 10, 'attributes': 11, 'categories': 12, 'hours': 13}

user header: 
{'user_id': 0, 'name': 1, 'review_count': 2, 'yelping_since': 3, 'useful': 4, 'funny': 5, 'cool': 6, 'elite': 7, 'friends': 8, 'fans': 9, 'average_stars': 10, 'compliment_hot': 11, 'compliment_more': 12, 'compliment_profile': 13, 'compliment_cute': 14, 'compliment_list': 15, 'compliment_note': 16, 'compliment_plain': 17, 'compliment_cool': 18, 'compliment_funny': 19, 'compliment_writer': 20, 'compliment_photos': 21}

review header: 
{'review_id': 0, 'user_id': 1, 'business_id': 2, 'stars': 3, 'useful': 4, 'funny': 5, 'cool': 6, 'text': 7, 'date': 8}



#### Checking and adding businesses, reviews, and users

In [6]:
# Flatten every business's ', '-separated category string into one list of labels
# (businesses without categories are skipped) and show the 100 most frequent labels.
categories_col = business_header['categories']
b_classes = [
    label
    for row in business_data
    if row[categories_col] is not None
    for label in row[categories_col].split(', ')
]

print(Counter(b_classes).most_common(100))


[('Restaurants', 52268), ('Food', 27781), ('Shopping', 24395), ('Home Services', 14356), ('Beauty & Spas', 14292), ('Nightlife', 12281), ('Health & Medical', 11890), ('Local Services', 11198), ('Bars', 11065), ('Automotive', 10773), ('Event Planning & Services', 9895), ('Sandwiches', 8366), ('American (Traditional)', 8139), ('Active Life', 7687), ('Pizza', 7093), ('Coffee & Tea', 6703), ('Fast Food', 6472), ('Breakfast & Brunch', 6239), ('American (New)', 6097), ('Hotels & Travel', 5857), ('Home & Garden', 5799), ('Fashion', 5739), ('Burgers', 5636), ('Arts & Entertainment', 5434), ('Auto Repair', 5433), ('Hair Salons', 5046), ('Nail Salons', 4621), ('Mexican', 4600), ('Italian', 4573), ('Specialty Food', 4233), ('Doctors', 3763), ('Pets', 3758), ('Real Estate', 3577), ('Seafood', 3539), ('Fitness & Instruction', 3293), ('Professional Services', 3270), ('Hair Removal', 3239), ('Desserts', 3186), ('Chinese', 3169), ('Bakeries', 3150), ('Grocery', 3139), ('Salad', 3064), ('Hotels', 2977)

In [10]:
# Keep the businesses whose category list matches exactly one of the wanted
# categories, then give each kept business_id a consecutive integer index.
categories = ['Hotels']
wanted = set(categories)
cat_col = business_header['categories']
id_col = business_header["business_id"]

selected = []
for row in tqdm(business_data):
    raw_categories = row[cat_col]
    if not raw_categories:
        continue  # business has no category string
    matched = wanted.intersection(raw_categories.split(', '))
    if len(matched) == 1:
        selected.append(row[id_col])

business_ids = {business: index for index, business in enumerate(selected)}
print(f'Number of hotel businesses: {len(business_ids)}')

100%|██████████| 150346/150346 [00:00<00:00, 406624.96it/s]

Number of hotel businesses: 2977





In [11]:
# Select the reviews written for the chosen businesses that have non-empty text
# and a 1-, 3- or 5-star rating; remember their review, user and business ids.
review_ids = []
user_ids = []
business_ids2 = []
stars = [0, 0, 0, 0, 0]  # stars 1 to 5 counts

# set(...) accepts the dict built by the selection cell as well as the list this
# cell assigns at the end, so re-running the cell no longer fails on `.keys()`.
selected_businesses = set(business_ids)

# Column positions looked up once instead of on every one of the loop iterations.
review_id_col = review_header['review_id']
review_user_col = review_header['user_id']
review_business_col = review_header['business_id']
review_text_col = review_header['text']
review_stars_col = review_header['stars']

print('Collecting all review ids and user ids...')
for row in tqdm(review_data):
    if (row[review_business_col] in selected_businesses
            and row[review_text_col]
            and row[review_stars_col] in (1, 3, 5)):
        review_ids.append(row[review_id_col])
        user_ids.append(row[review_user_col])
        business_ids2.append(row[review_business_col])
        stars[int(row[review_stars_col]) - 1] += 1

# setting up review ids
print(f'Number of reviews: {len(review_ids)}')
print(f'review counts: 1stars: {stars[0]}, 2stars: {stars[1]}, 3stars: {stars[2]}, 4stars: {stars[3]}, 5stars: {stars[4]}')

# setting up user ids
# sorted() instead of list(): the order of a set of strings changes between
# interpreter runs (string hash randomization), sorted order does not.
user_ids = sorted(set(user_ids))
print(f'Current Number of users: {len(user_ids)}')

# Only businesses that actually received a kept review survive; the integer
# index given to each business later follows this (now reproducible) order.
print(f'previous number of businesses: {len(business_ids)}')
business_ids = sorted(set(business_ids2))
print(f'Current number of businesses: {len(business_ids)}')

Collecting all review ids and user ids...


  0%|          | 0/6990280 [00:00<?, ?it/s]

100%|██████████| 6990280/6990280 [00:03<00:00, 2175508.08it/s]

Number of reviews: 126018
review counts: 1stars: 45400, 2stars: 0, 3stars: 24954, 4stars: 0, 5stars: 55664
Current Number of users: 109573
previous number of businesses: 2977
Current number of businesses: 2974





In [12]:
# checking for unuseable user ids
# (user ids referenced by the kept reviews that have no row in the user table)

print('Collecting all user ids...')
user_table_id_col = user_header['user_id']
all_user_ids = [row[user_table_id_col] for row in tqdm(user_data)]

print('Checking for unuseable user ids...')
bad_user_ids = set(user_ids) - set(all_user_ids)
print(f'Number of bad user ids: {len(bad_user_ids)}')

# Set copy of the kept review ids: constant-time membership tests inside the
# loop instead of scanning a list for every review of a bad user.
kept_review_ids = set(review_ids)
review_user_col = review_header['user_id']
review_id_col = review_header['review_id']

bad_review_ids = [] # from bad user ids
for row in tqdm(review_data):
    if row[review_user_col] in bad_user_ids and row[review_id_col] in kept_review_ids:
        bad_review_ids.append(row[review_id_col])

print(f'Number of bad review ids (same as bad user ids): {len(bad_review_ids)}')

# removing bad user ids and bad review ids
# sorted() makes the integer indices assigned below reproducible: the iteration
# order of a set of strings differs between interpreter runs.
user_ids = sorted(set(user_ids) - bad_user_ids)
review_ids = sorted(kept_review_ids - set(bad_review_ids))
business_ids = sorted(business_ids)

# id string -> consecutive integer index
user_ids = dict(zip(user_ids, range(len(user_ids)))) # nodes
review_ids = dict(zip(review_ids, range(len(review_ids)))) # edges
business_ids = dict(zip(business_ids, range(len(business_ids)))) #nodes

print(f'Final Number of users: {len(user_ids)}')
print(f'Final Number of reviews: {len(review_ids)}')
print(f'Final Number of businesses: {len(business_ids)}')

Collecting all user ids...


100%|██████████| 1987897/1987897 [00:01<00:00, 1850623.68it/s]


Checking for unuseable user ids...
Number of bad user ids: 0


100%|██████████| 6990280/6990280 [00:01<00:00, 3657599.24it/s]


Number of bad review ids (same as bad user ids): 0
Final Number of users: 109573
Final Number of reviews: 126018
Final Number of businesses: 2974


#### Mapping each review to a user-business edge and a review class

In [13]:
# For every kept review build
#   * the edge [user index, business index], and
#   * the pair [review index, class] with class = stars // 2.
user_id_index = review_header['user_id']
business_id_index = review_header['business_id']
review_id_index = review_header['review_id']
stars_index = review_header['stars']

ub_edges, r_classes = [], []
for row in tqdm(review_data):
    review_key = row[review_id_index]
    if review_key not in review_ids:
        continue  # review was filtered out earlier
    ub_edges.append([user_ids[row[user_id_index]], business_ids[row[business_id_index]]])
    r_classes.append([review_ids[review_key], int(row[stars_index] // 2)])

100%|██████████| 6990280/6990280 [00:03<00:00, 2151836.94it/s]

Number of user-business edges: 126018
Number of review samples with classes: 126018





#### Save the business_id, review_id, and user_id to txt files



In [21]:
# Persist the id mappings, the user-business edges and the review classes as
# tab-separated text files inside data_dir.
# The id files hold "<integer index>\t<original id>" per line.
print('Saving business ids...')
with open(data_dir + 'business_ids.txt', 'w') as f:
    for business, index in business_ids.items():
        f.write(f'{index}\t{business}\n')
print('Saving user ids...')
with open(data_dir + 'user_ids.txt', 'w') as f:
    for user, index in user_ids.items():
        f.write(f'{index}\t{user}\n')
print('Saving review ids...')
with open(data_dir + 'review_ids.txt', 'w') as f:
    for review, index in review_ids.items():
        f.write(f'{index}\t{review}\n')
print('Saving user  business edges...')
with open(data_dir + 'ub_edges.txt', 'w') as f:
    for user_index, business_index in ub_edges:
        f.write(f'{user_index}\t{business_index}\n')
print('Saving review edge classes...')
with open(data_dir + 'review_edgeclasses.txt', 'w') as f:
    # r_classes holds the [review index, class] pairs. Iterating b_classes (the
    # list of category names) here wrote the first two characters of each name.
    for review_index, review_class in r_classes:
        f.write(f'{review_index}\t{review_class}\n')

Saving business ids...
Saving user ids...
Saving review ids...
Saving user review edges...
Saving review business edges...
Saving business classes...


#### Random walk with restart

In [None]:
# Optionally reload the saved mappings, edges and review classes from data_dir.
load_data = True
if load_data:
    business_ids, user_ids, review_ids = {}, {}, {}
    ub_edges, r_class = [], []

    # "<index>\t<key>" files -> {key: index}
    for file_name, mapping in (
        ('business_ids.txt', business_ids),
        ('user_ids.txt', user_ids),
        ('review_ids.txt', review_ids),
    ):
        with open(data_dir + file_name, 'r') as f:
            for line in tqdm(f.readlines()):
                fields = line.strip().split('\t')
                mapping[fields[1]] = int(fields[0])

    # "<int>\t<int>" files -> list of [int, int]
    for file_name, pairs in (
        ('ub_edges.txt', ub_edges),
        ('review_edgeclasses.txt', r_class),
    ):
        with open(data_dir + file_name, 'r') as f:
            for line in tqdm(f.readlines()):
                fields = line.strip().split('\t')
                pairs.append([int(fields[0]), int(fields[1])])

    print(f'TOtal number of business ids: {len(business_ids)}')
    print(f'TOtal number of user ids: {len(user_ids)}')
    print(f'TOtal number of review ids: {len(review_ids)}')
    print(f'TOtal number of ub edges: {len(ub_edges)}')
    print(f'TOtal number of r samples with classes: {len(r_class)}')

In [None]:
# finding all neighbors of each node
# Nodes are named by a type prefix plus the integer index:
# 'b<index>' for businesses and 'u<index>' for users.

print('Creating all neighbors dict for each node...')
all_neighbors = {f'b{index}': [] for index in business_ids.values()}
all_neighbors.update({f'u{index}': [] for index in user_ids.values()})
print('nodes created: ', len(all_neighbors))
print('Finding all neighbors of each node...')
for user_index, business_index in tqdm(ub_edges):
    user_node = f'u{user_index}'
    business_node = f'b{business_index}'
    all_neighbors[user_node].append(business_node)
    # The reverse edge has to name the user node. The former 'r' prefix produced
    # names that are not keys of all_neighbors, so walking to them fails.
    all_neighbors[business_node].append(user_node)

print('Total number of nodes: ', len(all_neighbors))

# Isolated nodes cannot be walked from; report how many there are.
count = sum(1 for neighbors in all_neighbors.values() if not neighbors)
print(f'Number of nodes with no neighbors: {count}')

In [None]:
# step1: random walk with a restart (HAN paper method)
# NOTE(review): random walk with restart followed by per-type top-k selection looks
# like the neighbor sampling described for HetGNN rather than HAN -- confirm citation.
#
# For every start node, collect `length` visited nodes: at each step either jump
# back to the start node (probability prob_restart) or move to a random neighbor
# of the current node. A visited node is recorded only while the quota for its
# type (first character of its name) is not exhausted.

length = 100
prob_restart = 0.5
max_samples = {'b' : 25, 'u': 75} # should add up to length

# The while loop below only ends once `length` nodes were recorded, and each type
# contributes at most max_samples[type]; smaller quotas would never terminate.
if sum(max_samples.values()) < length:
    raise ValueError(
        f'max_samples {max_samples} allow fewer than length={length} samples; '
        'the random walk would never finish'
    )

random_walks = {}
for node in tqdm(list(all_neighbors.keys())):
    walk = []
    sampled = dict.fromkeys(max_samples, 0)  # accepted samples per node type
    curr_node = node
    while len(walk) < length:
        if random.random() < prob_restart:
            curr_node = node  # restart at the start node
            continue
        # random.choice raises IndexError if curr_node has no neighbors.
        curr_node = random.choice(all_neighbors[curr_node])
        node_type = curr_node[0]
        if node_type in max_samples and sampled[node_type] < max_samples[node_type]:
            walk.append(curr_node)
            sampled[node_type] += 1
    random_walks[node] = walk

print(f"number of random walks: {len(random_walks)}")
print(f"singular random walk, 'u0': {random_walks['u0']}")

In [None]:
# creating random walks file: one line per start node, "<node>:<comma-separated walk>"
with open(data_dir + 'random_walks.txt', 'w') as f:
    f.writelines(
        f"{key}:{','.join(value)}\n"
        for key, value in tqdm(random_walks.items())
    )

#### Grouping and finding top neighbors of each node type for all nodes

In [None]:
# step2: Grouping different types of neighbors based on frequency(HAN paper method)
# For each node, split its random walk by neighbor type and keep the top_k[type]
# most frequently visited neighbors of every type.

top_k = {'b' : 5, 'u' : 15} # top k neighbors to be considered for each type of node
                                     # preferred to be less than the sample size in random walk for the optional last step

top_neighbors = {}
for node in tqdm(list(all_neighbors.keys())):

    # split the walk by node type (first character of the node name)
    by_type = {'b' : [], 'u' : []}
    for neigh in random_walks[node]:
        if neigh[0] in by_type:
            by_type[neigh[0]].append(neigh)

    top_neighbors[node] = {}
    for node_type in ('b', 'u'):
        # most frequent neighbors of this type, most visited first
        ranked = Counter(by_type[node_type]).most_common(top_k[node_type])
        chosen = [name for name, _freq in ranked]

        # fewer distinct neighbors than requested: fill up with random picks
        # drawn from this type's walk entries
        missing = top_k[node_type] - len(ranked)
        if missing > 0:
            chosen.extend(random.sample(by_type[node_type], missing))

        top_neighbors[node][node_type] = chosen

del by_type, ranked, chosen, missing

In [None]:
# creating top neighbors file
# One line per node: "<node>:<top business neighbors>;<top user neighbors>",
# with the neighbors of each group separated by commas.
with open(data_dir + 'node_neighbors_top.txt', 'w') as f:
    for key, value in tqdm(top_neighbors.items()):
        # top_neighbors entries only contain the 'b' and 'u' groups; the former
        # third field read value['r'] and raised KeyError on the first node.
        f.write(f"{key}:{','.join(value['b'])};{','.join(value['u'])}\n")

## PyG HeteroData object creation

In [None]:
# Hard-coded node/edge counts, preceded by the log they appear to be taken from.
# NOTE(review): these figures do not match the counts printed by the extraction
# cells of this notebook (2974 businesses, 109573 users, 126018 reviews), and the
# log mentions 'ur'/'rb' edges while this notebook builds 'ub' edges. They look
# copied from a different run -- confirm, or derive them from len(business_ids),
# len(user_ids) and len(review_ids).
# TOtal number of business ids: 52244
# TOtal number of user ids: 1243707
# TOtal number of review ids: 3189729
# TOtal number of ur edges: 3189729
# TOtal number of rb edges: 3189729
# TOtal number of r samples with classes: 3189729


Nb = 52244 # business nodes
Nu = 1243707 # user nodes
Nr = 3189729 # review edges