Mounted at /content/drive


In [2]:
import json
import string
import re
import pandas as pd
import numpy as np
from collections import Counter

In [3]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
def read_json_to_dataframe(file_path):
    try:
        df = pd.read_json(file_path, lines=True)
        return df

    except Exception as e:
        print(f"Error reading file: {e}")
        return None

def remove_apostrophes(series):
    return series.apply(lambda lst: [re.sub(r"'+", '', word) \
                                         if word.count("'") >= 2 else word for word in lst])

def De_symbolize_and_split(df, column_name, new_column_name, separator):
    df[new_column_name] = (df[column_name].str.lower()
                           .str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
                           .str.replace(r'\s+', ' ', regex=True)
                           .str.strip()
                           .str.split(separator))
    return df

def sum_votes(vote_dict):
    weights = {'funny': 1, 'useful': 2, 'cool': 1}
    return sum(vote_dict.get(key, 0) * weight for key, weight in weights.items())

sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

def evaluative_words(words):
    return [word for word in words if word not in stop_words \
            and sia.polarity_scores(word)['compound'] != 0]

In [5]:
df_business = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_business.json")
df_checkin = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_checkin.json")
df_review = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_review.json")
df_user = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_user.json")

In [6]:
df_review = De_symbolize_and_split(df_review, 'text', 'split_text',' ')
df_review['split_text'] = remove_apostrophes(df_review['split_text'])
df_review['votes_weight'] = df_review['votes'].apply(sum_votes)
df_review['text_length'] = df_review['text'].apply(len)

In [7]:
df_review['evaluative_words'] = df_review['split_text'].apply(evaluative_words)
df_words_counts = df_review["evaluative_words"].explode().value_counts()
top_words = set(df_words_counts.head(100).index.tolist())
df_review['top_words_count'] = df_review['evaluative_words'] \
    .apply(lambda words: sum(word in top_words for word in words) if isinstance(words, list) else 0)

In [8]:
df_review

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id,split_text,votes_weight,text_length,evaluative_words,top_words_count
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg,"[my, wife, took, me, here, on, my, birthday, f...",12,889,"[excellent, perfect, pleasure, excellent, like...",15
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow,"[i, have, no, idea, why, some, people, give, b...",0,1345,"[bad, please, fault, like, friend, pretty, ple...",20
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA,"[love, the, gyro, plate, rice, is, so, good, a...",2,76,"[love, good]",2
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg,"[rosie, dakota, and, i, love, chaparral, dog, ...",5,419,"[love, wonderful, clean, huge, play]",5
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw,"[general, manager, scott, petello, is, a, good...",0,469,"[good, assure, treat, respect, surprised, sati...",5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
229902,"{'funny': 0, 'useful': 0, 'cool': 0}",6e7pZofhDuIlD_rX2oYirQ,f9JaiNg_FMoPNWxt7MlbZQ,2,2011-04-14,I really wanted to like this place because it'...,review,vnffHkFJbmd-J3OaBbK2Eg,"[i, really, wanted, to, like, this, place, bec...",0,939,"[like, honestly, bad, impressed, nice, relaxin...",10
229903,"{'funny': 0, 'useful': 2, 'cool': 0}",dDNfSFT0VApxPmURclX6_g,QDWRP1pW5r0huIBAoGmFyg,1,2011-01-23,My husband I stayed here for two nights. Of c...,review,l5oUrgQ190l8CcN8uzd_pA,"[my, husband, i, stayed, here, for, two, night...",4,831,"[ready, horrible, complain, like, stop, good, ...",6
229904,"{'funny': 0, 'useful': 0, 'cool': 0}",M5wHt6Odh1k5v0tIjqd8DQ,JmR3yk7JlS1LVVxtIc3xBQ,4,2010-10-11,Cool atmosphere. A lot of beers on tap and goo...,review,-EctXOb3B7T177jGYUhjVA,"[cool, atmosphere, a, lot, of, beers, on, tap,...",0,124,"[cool, good, great]",3
229905,"{'funny': 1, 'useful': 2, 'cool': 0}",jopndPrv-H5KW2CfScnw9A,z5b2p5TbCg0uaIiIe8n62w,3,2011-01-18,I have to take a star off for the spotty servi...,review,YQvg0JCGRFUkb6reMMf3Iw,"[i, have, to, take, a, star, off, for, the, sp...",5,420,"[irritated, like, disappoint]",1


In [9]:
#Convert to numeric representation
df_business['isOpen'] = df_business['open'].astype(int)

#Count the total number of checkin days
df_checkin['checkin_nums'] = df_checkin['checkin_info'].apply(lambda x: sum(x.values()))

In [10]:
# Calculate the total votes
df_user['votes_total'] = df_user['votes'].apply(lambda x: sum(x.values()))

# Calculate the ratio of votes to review_count
df_user['votes_per_review'] = df_user['votes_total'] / df_user['review_count']

# If division by 0
df_user['votes_per_review'] = df_user['votes_per_review'].replace([pd.NaT, pd.NaT], 0)

In [11]:
temp_review = df_review[['user_id', 'business_id', 'stars', 'votes_weight', 'text_length', 'top_words_count']].copy()
temp_business = df_business[['business_id', 'review_count', 'isOpen']].copy()
temp_checkin = df_checkin[['business_id', 'checkin_nums']].copy()
temp_user = df_user[['user_id', 'votes_per_review']].copy()

merged_df = pd.merge(temp_review, temp_business, on = 'business_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_user, on = 'user_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_checkin, on = 'business_id', how = 'inner')

In [12]:
merged_df

Unnamed: 0,user_id,business_id,stars,votes_weight,text_length,top_words_count,review_count,isOpen,votes_per_review,checkin_nums
0,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,12,889,15,116,1,4.486702,114
1,gXmtPKLWPZJeJX_KPw54HA,9yKzy9PApeiPPOUJEtnvkg,2,4,1116,9,116,1,1.767442,114
2,BvjSQAFcROLp27QjaRcyoA,9yKzy9PApeiPPOUJEtnvkg,2,3,669,8,116,1,1.105263,114
3,5qa1hx5GVHehlBQx0b5gFw,9yKzy9PApeiPPOUJEtnvkg,4,0,94,1,116,1,0.454545,114
4,nprSBcvBhvzyIbacEwzDLQ,9yKzy9PApeiPPOUJEtnvkg,5,0,427,10,116,1,1.038462,114
...,...,...,...,...,...,...,...,...,...,...
200468,3tYkSvQGRKFCZoARMtl68A,9xjzoqsGZ01kowWqBD7SNg,5,0,1132,9,3,1,1.200000,9
200469,2venYWARRoBdnfd3E32AFw,9xjzoqsGZ01kowWqBD7SNg,1,0,911,4,3,1,1.000000,9
200470,Mcdt7tUA7jqiqCtyxXgq4g,Gl0gUozT5jNi9Ar5LQAkFQ,2,0,1187,9,3,1,0.000000,7
200471,_lqsH9JNiAU3aZvKeMMnaw,Gl0gUozT5jNi9Ar5LQAkFQ,5,0,146,3,3,1,0.000000,7


In [13]:
# split the data
from sklearn.model_selection import train_test_split

# Assuming your DataFrame is named 'df'
train_data, test_data = train_test_split(merged_df, test_size=0.2, random_state=42)

In [14]:
# select features and target
# Features (X)
features = train_data[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']]
# Target (y)
target = train_data['votes_weight']

In [15]:
# choose machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [16]:
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(features, target)

# Random Forest
rf_model = RandomForestRegressor()
rf_model.fit(features, target)

# Gradient Boosting
gb_model = GradientBoostingRegressor()
gb_model.fit(features, target)


In [18]:
# evaluate model performance
from sklearn.metrics import mean_squared_error

# Example with Linear Regression
linear_predictions = linear_model.predict(test_data[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']])
linear_mse = mean_squared_error(test_data['votes_weight'], linear_predictions)
print(f"Linear Regression Mean Squared Error: {linear_mse}")

Linear Regression Mean Squared Error: 27.616509588576577


In [20]:
# Random Forest
rf_predictions = rf_model.predict(test_data[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']])
rf_mse = mean_squared_error(test_data['votes_weight'], rf_predictions)
print(f"Random Forest Mean Squared Error: {rf_mse}")

Random Forest Mean Squared Error: 27.680450911584984


In [19]:
from sklearn.svm import SVR
svm_model = SVR()
svm_model.fit(features, target)
svm_predictions = svm_model.predict(test_data[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']])
svm_mse = mean_squared_error(test_data['votes_weight'], svm_predictions)
print(f"SVM Mean Squared Error: {svm_mse}")

SVM Mean Squared Error: 53.81878446225522
