In [7]:
import json
import re
import string
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd

In [8]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by this notebook; the log below shows that an
# already-installed package is reported as up to date.
nltk.download('punkt')          # tokenizer models — presumably for word_tokenize; TODO confirm it is still needed
nltk.download('stopwords')      # word lists read through stopwords.words('english')
nltk.download('vader_lexicon')  # lexicon behind SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/xiaohanzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xiaohanzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/xiaohanzhang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [9]:
# Function used to convert a JSON-lines file to a DataFrame
def read_json_to_dataframe(file_path):
    """Load a JSON-lines source into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path or file-like
        Handed unchanged to ``pd.read_json`` with ``lines=True`` (one JSON
        object per line).

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when reading fails. Failures are not
        raised: the error is printed and ``None`` is returned instead.
    """
    try:
        frame = pd.read_json(file_path, lines=True)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller see None
        print(f"Error reading file: {err}")
        return None
    else:
        return frame

# Function used to remove extra apostrophes
def remove_apostrophes(series):
    """Strip all apostrophes from tokens that contain two or more of them.

    Tokens with zero or one apostrophe (e.g. ``don't``) are left untouched.

    Parameters
    ----------
    series : pandas.Series
        Each element is an iterable of string tokens.

    Returns
    -------
    pandas.Series
        A new Series whose elements are freshly built token lists; the input
        lists are not modified.
    """
    def _clean(tokens):
        cleaned = []
        for token in tokens:
            if token.count("'") >= 2:
                token = re.sub(r"'+", '', token)
            cleaned.append(token)
        return cleaned

    return series.apply(_clean)

# Function used to normalise text with regular expressions and split it into tokens
def Regular_and_split(df, column_name, new_column_name, separator):
    """Normalise a text column and store its tokens in a new column.

    The text is lower-cased, every character other than a letter, apostrophe
    or space is turned into a space, whitespace runs are collapsed to a single
    space, the ends are trimmed, and the result is split on ``separator``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    column_name : str
        Name of the string column to process.
    new_column_name : str
        Name of the column that receives the token lists.
    separator : str
        Delimiter passed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``new_column_name``.
    """
    text = df[column_name].str.lower()
    # Anything that is not a letter, apostrophe or space becomes a space
    text = text.str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
    # Collapse whitespace runs, then trim leading/trailing blanks
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()
    df[new_column_name] = text.str.split(separator)
    return df

# Function used to calculate the weighted sum of votes
def sum_votes(vote_dict, weights=None):
    """Return the weighted total of the vote counts in ``vote_dict``.

    Parameters
    ----------
    vote_dict : dict
        Maps a vote type (e.g. ``'funny'``) to its count. Vote types missing
        from the dict count as 0; vote types absent from ``weights`` are ignored.
    weights : dict, optional
        Maps a vote type to its multiplier. Defaults to
        ``{'funny': 1, 'useful': 2, 'cool': 1}``, so the function can still be
        used directly as ``Series.apply(sum_votes)``.

    Returns
    -------
    int or float
        Sum of ``count * weight`` over every vote type in ``weights``.
    """
    if weights is None:
        weights = {'funny': 1, 'useful': 2, 'cool': 1}
    return sum(vote_dict.get(key, 0) * weight for key, weight in weights.items())

# Shared sentiment scorer (NLTK's SentimentIntensityAnalyzer); its polarity_scores()
# is what evaluative_words() below uses to decide whether a word carries sentiment
sia = SentimentIntensityAnalyzer()
# Stop words are common words used in text processing (such as 'the', 'is', 'in', etc.);
# stored as a set so the membership test in evaluative_words() is a hash lookup
stop_words = set(stopwords.words('english'))

# Function used to extract the evaluative words
@lru_cache(maxsize=None)
def _has_sentiment(word):
    """Return True when the analyzer gives ``word`` a non-zero compound score.

    The verdict is cached per distinct word: the same tokens recur across
    reviews, and scoring a word is far more expensive than a cache lookup.
    """
    return sia.polarity_scores(word)['compound'] != 0


def evaluative_words(words):
    """Keep the tokens that are not stop words and carry sentiment.

    Parameters
    ----------
    words : iterable of str
        Tokens of one review, in order.

    Returns
    -------
    list of str
        The tokens (order and duplicates preserved) that are absent from
        ``stop_words`` and whose compound polarity score is non-zero.
    """
    # The cheap stop-word test runs first so stop words are never scored
    return [word for word in words
            if word not in stop_words and _has_sentiment(word)]


In [10]:
# Convert each JSON-lines training file into its own DataFrame
# (a frame is None when its file could not be read)
df_business, df_checkin, df_review, df_user = map(
    read_json_to_dataframe,
    (
        "yelp-dataset/yelp_training_set/yelp_training_set_business.json",
        "yelp-dataset/yelp_training_set/yelp_training_set_checkin.json",
        "yelp-dataset/yelp_training_set/yelp_training_set_review.json",
        "yelp-dataset/yelp_training_set/yelp_training_set_user.json",
    ),
)

In [11]:
# Process the review dataset: tokenise the text, then derive vote and length features
df_review = Regular_and_split(
    df_review, column_name='text', new_column_name='split_text', separator=' '
)
df_review['split_text'] = df_review['split_text'].pipe(remove_apostrophes)
# Weighted vote total per review
df_review['votes_weight'] = df_review['votes'].map(sum_votes)
# Length of the raw review text, in characters
df_review['text_length'] = df_review['text'].map(len)

In [12]:
# Keep the sentiment-bearing tokens of each review, then rank them by overall frequency
df_review['evaluative_words'] = df_review['split_text'].map(evaluative_words)
df_words_counts = df_review["evaluative_words"].explode().value_counts()

# Extract the top 100 common evaluative words
top_words = set(df_words_counts.index[:100])


def _top_word_hits(words):
    """Count the tokens of one review that are in ``top_words`` (0 for non-list cells)."""
    if not isinstance(words, list):
        return 0
    return sum(1 for word in words if word in top_words)


df_review['top_words_count'] = df_review['evaluative_words'].map(_top_word_hits)

In [13]:
# Inspect the processed review table; the rendered output below is truncated to the first and last rows
df_review

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id,split_text,votes_weight,text_length,evaluative_words,top_words_count
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg,"[my, wife, took, me, here, on, my, birthday, f...",12,889,"[excellent, perfect, pleasure, excellent, like...",15
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow,"[i, have, no, idea, why, some, people, give, b...",0,1345,"[bad, please, fault, like, friend, pretty, ple...",20
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA,"[love, the, gyro, plate, rice, is, so, good, a...",2,76,"[love, good]",2
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg,"[rosie, dakota, and, i, love, chaparral, dog, ...",5,419,"[love, wonderful, clean, huge, play]",5
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw,"[general, manager, scott, petello, is, a, good...",0,469,"[good, assure, treat, respect, surprised, sati...",5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
229902,"{'funny': 0, 'useful': 0, 'cool': 0}",6e7pZofhDuIlD_rX2oYirQ,f9JaiNg_FMoPNWxt7MlbZQ,2,2011-04-14,I really wanted to like this place because it'...,review,vnffHkFJbmd-J3OaBbK2Eg,"[i, really, wanted, to, like, this, place, bec...",0,939,"[like, honestly, bad, impressed, nice, relaxin...",10
229903,"{'funny': 0, 'useful': 2, 'cool': 0}",dDNfSFT0VApxPmURclX6_g,QDWRP1pW5r0huIBAoGmFyg,1,2011-01-23,My husband I stayed here for two nights. Of c...,review,l5oUrgQ190l8CcN8uzd_pA,"[my, husband, i, stayed, here, for, two, night...",4,831,"[ready, horrible, complain, like, stop, good, ...",6
229904,"{'funny': 0, 'useful': 0, 'cool': 0}",M5wHt6Odh1k5v0tIjqd8DQ,JmR3yk7JlS1LVVxtIc3xBQ,4,2010-10-11,Cool atmosphere. A lot of beers on tap and goo...,review,-EctXOb3B7T177jGYUhjVA,"[cool, atmosphere, a, lot, of, beers, on, tap,...",0,124,"[cool, good, great]",3
229905,"{'funny': 1, 'useful': 2, 'cool': 0}",jopndPrv-H5KW2CfScnw9A,z5b2p5TbCg0uaIiIe8n62w,3,2011-01-18,I have to take a star off for the spotty servi...,review,YQvg0JCGRFUkb6reMMf3Iw,"[i, have, to, take, a, star, off, for, the, sp...",5,420,"[irritated, like, disappoint]",1


In [14]:
# Convert the `open` flag to a numeric representation
df_business['isOpen'] = df_business['open'].astype(int)

# Total check-ins per row: sum of the values stored in each `checkin_info` dict
df_checkin['checkin_nums'] = df_checkin['checkin_info'].map(
    lambda info: sum(info.values())
)

In [15]:
# Calculate the total votes
df_user['votes_total'] = df_user['votes'].apply(lambda x: sum(x.values()))

# Calculate the ratio of votes to review_count
df_user['votes_per_review'] = df_user['votes_total'] / df_user['review_count']

# Division by zero produces +/-inf (n / 0) or NaN (0 / 0) in a float column, never NaT,
# so those are the values that have to be mapped to 0
df_user['votes_per_review'] = (
    df_user['votes_per_review'].replace([np.inf, -np.inf], np.nan).fillna(0)
)

In [16]:
# Keep only the columns each source contributes to the modelling table
temp_review = df_review.loc[
    :, ['user_id', 'business_id', 'stars', 'votes_weight', 'text_length', 'top_words_count']
].copy()
temp_business = df_business.loc[:, ['business_id', 'review_count', 'isOpen']].copy()
temp_checkin = df_checkin.loc[:, ['business_id', 'checkin_nums']].copy()
temp_user = df_user.loc[:, ['user_id', 'votes_per_review']].copy()

# Obtain the dataset used for machine learning by merging the four frames above.
# All joins are inner joins, so a review is kept only when its business, its user
# and a check-in record for that business are all present.
merged_df = (
    temp_review
    .merge(temp_business, on='business_id', how='inner')
    .merge(temp_user, on='user_id', how='inner')
    .merge(temp_checkin, on='business_id', how='inner')
)
#merged_df.fillna(0, inplace=True)

In [17]:
# Function used to determine votes weight distribution
def divide_series(series, n_groups):
    """Greedily cut a frequency table into ``n_groups`` consecutive buckets.

    Entries are visited in the order ``series`` yields them. Each entry goes
    into the current bucket; once that bucket's running total reaches
    ``series.sum() / n_groups`` the next entry starts a new bucket. The last
    bucket absorbs everything that remains, and buckets that are never
    reached stay empty.

    Parameters
    ----------
    series : pandas.Series
        Index holds the values being grouped, data holds their counts.
    n_groups : int
        Number of buckets to produce.

    Returns
    -------
    dict
        ``{"Group k": {"Numbers": [...], "Sum": total}}`` for k = 1..n_groups.
    """
    quota = series.sum() / n_groups

    buckets = {idx: {"Numbers": [], "Sum": 0} for idx in range(n_groups)}
    active = 0
    last = n_groups - 1

    for value, frequency in series.items():
        bucket = buckets[active]
        bucket["Numbers"].append(value)
        bucket["Sum"] += frequency
        # Move on once the quota is met, but never past the final bucket
        if active < last and bucket["Sum"] >= quota:
            active += 1

    return {f"Group {idx + 1}": buckets[idx] for idx in range(n_groups)}

# Frequency of each distinct votes_weight in the merged data, ordered by weight
series = merged_df['votes_weight'].value_counts().sort_index()
groups = divide_series(series, 4)

for label, details in groups.items():
    print(f"{label}: Numbers: {details['Numbers']}, Sum = {details['Sum']}")

# Map every votes_weight value to the 1-based position of the group that holds it
number_to_group = {}
for position, details in enumerate(groups.values(), start=1):
    for num in details["Numbers"]:
        number_to_group[num] = position

merged_df['votes_category'] = merged_df['votes_weight'].apply(
    lambda weight: number_to_group.get(weight)
)


Group 1: Numbers: [0], Sum = 72038
Group 2: Numbers: [1, 2, 3], Sum = 54846
Group 3: Numbers: [4, 5, 6, 7, 8, 9, 10], Sum = 52942
Group 4: Numbers: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 97, 99, 100, 101, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 115, 116, 117, 118, 119, 120, 123, 125, 126, 129, 131, 133, 135, 141, 150, 154, 164, 171, 175, 187, 196, 198, 200, 202, 203, 241], Sum = 20647
