In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

In [83]:
# One-time setup: run this if the NLTK resources have not been downloaded yet.
# 'stopwords' backs stopwords.words('english') and 'punkt' backs word_tokenize,
# both of which are used by the review-text preprocessing cell below.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aoyanliang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aoyanliang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# <font color='red'>FEATURE PROCESSING</font>

In [2]:
# 1. Load the raw JSON data into DataFrames.
# lines=True tells pandas to read each file as newline-delimited JSON
# (one record per line) rather than a single JSON document.
user_df = pd.read_json('data/user.json', lines=True)
business_df = pd.read_json('data/business.json', lines=True)
review_df = pd.read_json('data/review_train.json', lines=True)

In [3]:
# Tag every ID with a node-type suffix ('-u' for users, '-b' for businesses).
# NOTE(review): this mutates the frames in place and is not idempotent - running
# the cell a second time produces '...-u-u' / '...-b-b'.
# NOTE(review): the edge-building cells further down append the same suffixes to
# IDs taken from user_df / business_df, so on a top-to-bottom run those IDs look
# like they end up double-suffixed - TODO confirm which cell should own this step.
user_df['user_id'] = user_df['user_id'].astype(str) + '-u'
business_df['business_id'] = business_df['business_id'].astype(str) + '-b'

In [4]:
# 2. Inspect the DataFrames.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line after
# each report.
user_df.info()
business_df.info()
review_df.info()
# 3. Look at the first few rows
# print(df.head())

# 4. Statistical summary
#print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1518169 entries, 0 to 1518168
Data columns (total 22 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   user_id             1518169 non-null  object 
 1   name                1518169 non-null  object 
 2   review_count        1518169 non-null  int64  
 3   yelping_since       1518169 non-null  object 
 4   friends             1518169 non-null  object 
 5   useful              1518169 non-null  int64  
 6   funny               1518169 non-null  int64  
 7   cool                1518169 non-null  int64  
 8   fans                1518169 non-null  int64  
 9   elite               1518169 non-null  object 
 10  average_stars       1518169 non-null  float64
 11  compliment_hot      1518169 non-null  int64  
 12  compliment_more     1518169 non-null  int64  
 13  compliment_profile  1518169 non-null  int64  
 14  compliment_cute     1518169 non-null  int64  
 15  compliment_list

In [24]:
# Normalize the values of the 'neighborhood' column by stripping leading/trailing
# spaces and converting to lowercase, then collect the distinct values.
# NOTE(review): the variable names (and the original comment) say "cities", but the
# column read here is 'neighborhood' - confirm which was intended; the
# business-city edge cell further down reads the 'city' column instead.
normalized_cities = business_df['neighborhood'].str.strip().str.lower().unique()

# Count the distinct normalized values.
# NOTE(review): presumably missing and empty-string entries each count as one value - verify.
num_normalized_unique_cities = len(normalized_cities)

print(num_normalized_unique_cities)

388


## Review text preprocessing

In [305]:
# English stopwords as a set, built once so the membership test inside
# clean_text stays cheap for every token.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Turn a raw review string into a list of lowercase, stopword-free tokens.

    Steps, in order: lowercase the text, delete every character that is neither
    a word character nor whitespace, tokenize with word_tokenize, and discard
    tokens that appear in the module-level ``stop_words`` set.

    Args:
        text: the review text as a ``str``.

    Returns:
        list of token strings (possibly empty).
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    tokens = word_tokenize(without_punctuation)
    return [token for token in tokens if token not in stop_words]

# One token list per review; this is the input for Word2Vec training and lookup.
review_df['cleaned_text'] = review_df['text'].apply(clean_text)

In [253]:
# Train Word2Vec Model

# Word2Vec expects a list of token lists: one list per review, as produced by clean_text.
sentences = review_df['cleaned_text'].tolist()

# Train the Word2Vec model and persist it; the next cell reloads it from the same path.
# NOTE(review): `size=` is the keyword used by older gensim releases (newer ones
# call it `vector_size`), and no seed is passed - confirm the pinned gensim version
# and whether run-to-run reproducibility matters. The ./models directory is
# assumed to exist already.
model_w2v = Word2Vec(sentences, size=10, window=5, min_count=1, workers=4)
model_w2v.save("./models/word2vec-10.model")

In [306]:
# Load the Word2Vec model from the path the training cell saved it to.
model_w2v = Word2Vec.load("./models/word2vec-10.model")

def get_vector(word_list, model):
    """Average the word vectors of the tokens in ``word_list``.

    Args:
        word_list: iterable of token strings (e.g. one row of ``cleaned_text``).
        model: trained Word2Vec-style model. It must expose ``model.wv.vocab``
            (membership test), ``model.wv[word]`` (vector lookup) and
            ``model.wv.vector_size``.

    Returns:
        1-D numpy array of length ``model.wv.vector_size`` holding the
        element-wise mean of the vectors of all in-vocabulary tokens. When no
        token is in the vocabulary (including an empty ``word_list``) a zero
        vector of that length is returned, so every row gets a vector of the
        same shape instead of a scalar NaN plus a numpy RuntimeWarning.
    """
    # Look words up in the model that was passed in. The previous body read the
    # global ``model_w2v`` and silently ignored the ``model`` argument.
    keyed_vectors = model.wv
    vector_list = [keyed_vectors[word] for word in word_list if word in keyed_vectors.vocab]

    if not vector_list:
        # np.mean([]) would yield NaN; fall back to a neutral, correctly shaped vector.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

    return np.mean(vector_list, axis=0)

# Embed every review: get_vector is called once per token list, with the loaded
# model forwarded as its `model` keyword argument.
review_df['text_vector'] = review_df['cleaned_text'].apply(get_vector, model=model_w2v)

  out=out, **kwargs)


In [307]:
# Aggregation helper: a group of vectors is reduced to its element-wise mean.
def vector_mean(vectors):
    """Return the element-wise mean of ``vectors`` as a plain Python list.

    Args:
        vectors: a collection of equal-length numeric vectors (for example the
            ``text_vector`` values of one groupby group).

    Returns:
        list of floats, one entry per vector component.
    """
    averaged = np.mean(vectors, axis=0)
    return averaged.tolist()

# One row per user: the mean of the text vectors of all reviews that user wrote.
# Only the embedding column is aggregated; the per-review 'stars', 'useful',
# 'funny' and 'cool' means were disabled in the original cell and stay out.
user_review_aggregations = (
    review_df
    .groupby('user_id')
    .agg({'text_vector': vector_mean})
    .reset_index()
)

# Same aggregation keyed by business: the mean text vector of all reviews it received.
business_review_aggregations = (
    review_df
    .groupby('business_id')
    .agg({'text_vector': vector_mean})
    .reset_index()
)

In [318]:
# Save the per-user and per-business text features as CSV (index column omitted).
# NOTE(review): text_vector holds Python lists (vector_mean returns .tolist()), so
# each vector is presumably written as its string representation and will need
# parsing when the file is read back - verify against the loading code.
# The ./embeded_features directory is assumed to exist already.
user_review_aggregations.to_csv('./embeded_features/user_review_aggregations.csv', index=False)
business_review_aggregations.to_csv('./embeded_features/business_review_aggregations.csv', index=False)

## Graph embedding

### Create User-User Edges DataFrame

In [6]:
# Assuming 'None' is used when there are no friends listed
user_df['friends'] = user_df['friends'].fillna('None')

# Create user-user edges: one (user_id, friend_id) row per listed friendship.
# Only the two needed columns are selected *before* explode(); exploding the full
# user frame would replicate every other column across tens of millions of rows
# just to throw them away in the next step.
user_user_edges = (
    user_df.loc[user_df['friends'] != 'None', ['user_id', 'friends']]
    .assign(friends=lambda df: df['friends'].str.split(', '))  # 'a, b' -> ['a', 'b']
    .explode('friends')  # one row per friend
    .dropna()  # Drop any rows that have NaN values after explode
    .rename(columns={'friends': 'friend_id'})
    .reset_index(drop=True)  # Reset the index, dropping the old one
)

# Append '-u' to each user ID
# NOTE(review): if the earlier cell that suffixes user_df['user_id'] has already
# run in this kernel, user_id receives '-u' a second time here while friend_id
# receives it once - confirm the intended execution order.
user_user_edges['user_id'] = user_user_edges['user_id'].astype(str) + '-u'
user_user_edges['friend_id'] = user_user_edges['friend_id'].astype(str) + '-u'

# user_user_edges now has exactly two columns: 'user_id' and 'friend_id'

user_user_edges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64886928 entries, 0 to 64886927
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   user_id    object
 1   friend_id  object
dtypes: object(2)
memory usage: 990.1+ MB


In [8]:
# Add an edge type column: the same relation label for every friendship row.
user_user_edges['edge_type'] = 'friendship'

# Save to TSV: tab-separated, no index and no header row; columns are written in
# frame order (user_id, friend_id, edge_type).
user_user_edges.to_csv('graph_edges/user_user_edges_relation.tsv', sep='\t', index=False, header=False)

### Create User-Business Edges DataFrame

In [3]:
# Create user-business edges: one row per review, linking the author to the
# reviewed business. IDs get their node-type suffix ('-u' / '-b') on the way.
user_business_edges = review_df[['user_id', 'business_id']].assign(
    user_id=lambda df: df['user_id'].astype(str) + '-u',
    business_id=lambda df: df['business_id'].astype(str) + '-b',
)

user_business_edges

Unnamed: 0,user_id,business_id
0,YHWsLBS8jzZiPjKHMFOaAA-u,iKMLsX1Je7P3wAOEc9scDg-b
1,YHWsLBS8jzZiPjKHMFOaAA-u,qhJ4GDULYbdb_sctDgbZgw-b
2,YHWsLBS8jzZiPjKHMFOaAA-u,gl1zQmiA8MUHmLL2wsCdVA-b
3,YHWsLBS8jzZiPjKHMFOaAA-u,OR6iRk0vrMzE-1gLg-WYrw-b
4,YHWsLBS8jzZiPjKHMFOaAA-u,HOGdBz2w9VZbw2yyM-WA3g-b
...,...,...
455850,3LC2sKfvz_nWuJquUDsNnw-u,t67GMPZ0cv_ItKlID-JFiQ-b
455851,3LC2sKfvz_nWuJquUDsNnw-u,wsmVIHJEi9J_38dXx2qLKA-b
455852,3LC2sKfvz_nWuJquUDsNnw-u,j1i7s55PmOFzJC3l6O8PiA-b
455853,3LC2sKfvz_nWuJquUDsNnw-u,zjwdU1OdlbKTGjm-IfD4TQ-b


In [4]:
# Add an edge type column: the same relation label for every review row.
user_business_edges['edge_type'] = 'reviewed'

# Save to TSV: tab-separated, no index and no header row; columns are written in
# frame order (user_id, business_id, edge_type).
user_business_edges.to_csv('graph_edges/user_business_edges_relation_directed.tsv', sep='\t', index=False, header=False)

### Create Business-Category Edges DataFrame

In [6]:
# Create business-category edges: one (source, target) row per category that a
# business lists. Businesses without categories are skipped.
# NOTE(review): if the earlier cell that suffixes business_df['business_id'] has
# already run in this kernel, 'source' receives '-b' twice - confirm the intended
# execution order.
business_category_edges = (
    business_df[['business_id', 'categories']]
    .dropna(subset=['categories'])
    .assign(categories=lambda df: df['categories'].str.split(', '))  # 'A, B' -> ['A', 'B']
    .explode('categories')  # one row per category
    .dropna()
    .rename(columns={'business_id': 'source', 'categories': 'target'})
    .reset_index(drop=True)
    .assign(
        # node-type suffixes: '-b' for businesses, '-c' for categories
        source=lambda df: df['source'].astype(str) + '-b',
        # spaces inside a category name become underscores
        target=lambda df: df['target'].str.replace(' ', '_').astype(str) + '-c',
    )
)

business_category_edges

Unnamed: 0,source,target
0,Apn5Q_b6Nz61Tq4XzPdf9A-b,Tours-c
1,Apn5Q_b6Nz61Tq4XzPdf9A-b,Breweries-c
2,Apn5Q_b6Nz61Tq4XzPdf9A-b,Pizza-c
3,Apn5Q_b6Nz61Tq4XzPdf9A-b,Restaurants-c
4,Apn5Q_b6Nz61Tq4XzPdf9A-b,Food-c
...,...,...
739017,NkOvIueadjFUxeCyq_uQEw-b,Shopping-c
739018,NkOvIueadjFUxeCyq_uQEw-b,Hair_Salons-c
739019,NkOvIueadjFUxeCyq_uQEw-b,Fashion-c
739020,NkOvIueadjFUxeCyq_uQEw-b,Hair_Stylists-c


In [7]:
# Add an edge type column: the same relation label for every business-category row.
business_category_edges['edge_type'] = 'belongs'

# Save to TSV: tab-separated, no index and no header row; columns are written in
# frame order (source, target, edge_type).
business_category_edges.to_csv('graph_edges/business_category_edges_relation_directed.tsv', sep='\t', index=False, header=False)

### Create Business-City Edges DataFrame

In [3]:
# Copy relevant columns to create business-city edges DataFrame
business_city_edges = business_df[['business_id', 'city']].copy()

# Clean the raw city name BEFORE the suffix is attached: trim leading/trailing
# spaces, lowercase, and turn inner spaces into underscores. Doing this after
# appending '-ct' (as before) left trailing whitespace in place, so 'Las Vegas '
# became 'las_vegas_-ct' instead of 'las_vegas-ct'.
# Missing cities are treated as empty so they are filtered out below rather than
# turning into a shared 'nan-ct' node.
city_clean = (
    business_city_edges['city']
    .fillna('')
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

# Append suffixes to identifiers in the new DataFrame
# NOTE(review): if the earlier cell that suffixes business_df['business_id'] has
# already run in this kernel, business_id receives '-b' twice - confirm the
# intended execution order.
business_city_edges['business_id'] = business_city_edges['business_id'].astype(str) + '-b'
business_city_edges['city'] = city_clean + '-ct'

# Filter out rows whose city is empty. The previous filter compared against
# '-cy', which never matches the '-ct' suffix actually used, so nothing was removed.
business_city_edges = business_city_edges[city_clean != '']
business_city_edges = business_city_edges.reset_index(drop=True)
business_city_edges

Unnamed: 0,business_id,city
0,Apn5Q_b6Nz61Tq4XzPdf9A-b,calgary-ct
1,AjEbIBw6ZFfln7ePHha9PA-b,henderson-ct
2,O8S5hYJ1SMc8fA4QBtVujA-b,montréal-ct
3,bFzdJJ3wp3PZssNEsyU23g-b,phoenix-ct
4,8USyCYqpScwiNEb58Bt6CA-b,calgary-ct
...,...,...
188588,sMQAZ3DkfrURFoJAyOhjEw-b,pittsburgh-ct
188589,6hvuCibNS4uECetHb9MCQQ-b,pittsburgh-ct
188590,KleCXFYOmdACcQUvf6_XEg-b,concord-ct
188591,3_fIsSxN2RBovQ_6EFtLzA-b,concord-ct


In [4]:
# Add an edge type column: the same relation label for every business-city row.
business_city_edges['edge_type'] = 'in'

# Save to TSV: tab-separated, no index and no header row; columns are written in
# frame order (business_id, city, edge_type).
business_city_edges.to_csv('graph_edges/business_city_edges_relation_directed.tsv', sep='\t', index=False, header=False)

### Separate embedding files from PyTorch-BigGraph

In [3]:
# finished PBG embedding on CARC using 1 node with 64 core in ~30 mins for 10 epoch
# 1 hour 30 mins for 100 epoch
# 2 hour 30 mins for 200 epoch

# Path to the large embedding file (all node types mixed in one TSV)
embedding_file_path = './embeded_features/PBG/epoch200/ubcct_embeddings-epoch200.tsv'

# Paths to the output files for each node type.
# The keys ('user', 'business', 'category', 'city') are the ones that
# separate_embeddings looks up, so they must not be renamed independently.
output_files = {
    'user': './embeded_features/PBG/epoch200/user_embeddings.tsv',
    'business': './embeded_features/PBG/epoch200/business_embeddings.tsv',
    'category': './embeded_features/PBG/epoch200/category_embeddings.tsv',
    'city': './embeded_features/PBG/epoch200/city_embeddings.tsv'
}

# Function to separate the embeddings based on the node type suffixes
def separate_embeddings(embedding_file, output_files, progress_every=100000):
    """Split one mixed embedding TSV into one file per node type.

    The node ID is the text before the first tab on each line; its suffix decides
    which output file receives the line: '-u' -> user, '-b' -> business,
    '-c' -> category, '-ct' -> city. Lines are copied unchanged.

    Args:
        embedding_file: path of the combined embedding TSV to read.
        output_files: dict with the keys 'user', 'business', 'category' and
            'city', each mapping to the path of the file to (over)write.
        progress_every: print a progress message every this many input lines;
            0 or None disables progress messages. Defaults to 100000.

    Returns:
        None. Results are written to the paths in ``output_files``; a line total
        (and, if any, the number of unmatched lines) is printed at the end.

    Raises:
        KeyError: if ``output_files`` lacks one of the four expected keys.
        OSError: if a file cannot be opened.
    """
    # The input is opened first so that a wrong input path fails before any
    # existing output file is truncated. UTF-8 is explicit because node IDs can
    # contain non-ASCII characters (e.g. accented city names) and the platform
    # default encoding is not guaranteed to handle them.
    with open(embedding_file, 'r', encoding='utf-8') as source, \
         open(output_files['user'], 'w', encoding='utf-8') as user_file, \
         open(output_files['business'], 'w', encoding='utf-8') as business_file, \
         open(output_files['category'], 'w', encoding='utf-8') as category_file, \
         open(output_files['city'], 'w', encoding='utf-8') as city_file:

        # Suffix -> destination. No suffix here is a tail of another one
        # ('...-ct' does not end with '-c'), so at most one entry matches.
        routes = (
            ('-u', user_file),
            ('-b', business_file),
            ('-c', category_file),
            ('-ct', city_file),
        )

        line_count = 0
        unmatched_count = 0
        for line in source:
            line_count += 1
            # Only the first field is needed; stop splitting after the first tab.
            node_id = line.split('\t', 1)[0]

            for suffix, destination in routes:
                if node_id.endswith(suffix):
                    destination.write(line)
                    break
            else:
                # Unknown node type: the line is skipped (as before) but counted
                # so the omission is visible.
                unmatched_count += 1

            if progress_every and line_count % progress_every == 0:
                print(f"Processed {line_count} lines")

        print(f"Total: {line_count} lines")
        if unmatched_count:
            print(f"Skipped {unmatched_count} lines with no recognised node-type suffix")

# Run the split: reads embedding_file_path once and (over)writes the four
# per-node-type files listed in output_files.
separate_embeddings(embedding_file_path, output_files)


Processed 100000 lines
Processed 200000 lines
Processed 300000 lines
Processed 400000 lines
Processed 500000 lines
Processed 600000 lines
Processed 700000 lines
Processed 800000 lines
Processed 900000 lines
Processed 1000000 lines
Processed 1100000 lines
Processed 1200000 lines
Processed 1300000 lines
Processed 1400000 lines
Processed 1500000 lines
Processed 1600000 lines
Processed 1700000 lines
Processed 1800000 lines
Processed 1900000 lines
Processed 2000000 lines
Processed 2100000 lines
Processed 2200000 lines
Processed 2300000 lines
Processed 2400000 lines
Processed 2500000 lines
Processed 2600000 lines
Processed 2700000 lines
Processed 2800000 lines
Processed 2900000 lines
Processed 3000000 lines
Processed 3100000 lines
Processed 3200000 lines
Processed 3300000 lines
Processed 3400000 lines
Processed 3500000 lines
Processed 3600000 lines
Processed 3700000 lines
Processed 3800000 lines
Processed 3900000 lines
Processed 4000000 lines
Processed 4100000 lines
Processed 4200000 lines
P