In [None]:
# Mount Google Drive at /content/drive; the data files read later in this notebook
# are addressed under /content/drive/MyDrive. This import only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import re
import string
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: 'stopwords' backs
# stopwords.words('english') and 'vader_lexicon' backs SentimentIntensityAnalyzer.
# NOTE(review): 'punkt' supports word_tokenize, which is imported but not called in
# the cells visible here — confirm whether it is still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
def read_json_to_dataframe(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : path of the file handed to ``pd.read_json(..., lines=True)``.

    Returns
    -------
    The parsed DataFrame, or ``None`` when reading raises any exception
    (the error message is printed instead of being propagated).
    """
    try:
        frame = pd.read_json(file_path, lines=True)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller see None.
        print(f"Error reading file: {e}")
        return None
    else:
        return frame

def remove_apostrophes(series):
    """Strip apostrophes from tokens that contain two or more of them.

    ``series`` holds one list of string tokens per row. Tokens with fewer than
    two apostrophes (e.g. ``don't``) are kept unchanged; tokens with two or
    more (e.g. ``'quoted'``) have every apostrophe removed. A new Series of
    lists is returned.
    """
    def _clean(word):
        if word.count("'") >= 2:
            return re.sub(r"'+", '', word)
        return word

    def _clean_all(lst):
        return [_clean(word) for word in lst]

    return series.apply(_clean_all)

def De_symbolize_and_split(df, column_name, new_column_name, separator):
    """Normalise a text column and store its tokens in a new column.

    The text in ``df[column_name]`` is lower-cased, every character other than
    a letter, apostrophe or space is turned into a space, whitespace runs are
    collapsed to a single space, the ends are trimmed, and the result is split
    on ``separator``. The token lists are written to ``df[new_column_name]``;
    ``df`` is modified in place and also returned.
    """
    text = df[column_name].str.lower()
    # Anything that is not a letter, apostrophe or space becomes a space.
    text = text.str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
    # Collapse the whitespace runs left behind, then trim both ends.
    text = text.str.replace(r'\s+', ' ', regex=True)
    text = text.str.strip()
    df[new_column_name] = text.str.split(separator)
    return df

def sum_votes(vote_dict):
    """Return the weighted vote total for one review.

    'useful' votes count double, 'funny' and 'cool' votes count once; a key
    that is absent from ``vote_dict`` contributes 0.
    """
    total = 0
    for vote_type, weight in (('funny', 1), ('useful', 2), ('cool', 1)):
        total += vote_dict.get(vote_type, 0) * weight
    return total

# Shared VADER analyzer and English stop-word set used by the helpers below.
sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))


@lru_cache(maxsize=None)
def _is_evaluative(word):
    """Return True when ``word`` is not a stop word and has a non-zero VADER compound score.

    The answer depends only on the word itself, so it is memoised: the VADER
    scorer then runs once per distinct word instead of once per token
    occurrence across the whole corpus.
    """
    return word not in stop_words and sia.polarity_scores(word)['compound'] != 0


def evaluative_words(words):
    """Keep the sentiment-bearing tokens of one review.

    Parameters
    ----------
    words : iterable of string tokens.

    Returns
    -------
    list of the tokens, in original order and with repeats preserved, that are
    not English stop words and whose single-word VADER compound score is non-zero.
    """
    return [word for word in words if _is_evaluative(word)]

In [None]:
# Load the four Yelp training-set tables (business, checkin, review, user) from Drive.
# read_json_to_dataframe prints the error and returns None when a file cannot be
# read, so a failed load shows up later as an error on None rather than in this cell.
df_business = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_business.json")
df_checkin = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_checkin.json")
df_review = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_review.json")
df_user = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_user.json")

In [None]:
# Tokenise each review: lower-cased, symbols stripped, split on single spaces.
df_review = De_symbolize_and_split(df_review, 'text', 'split_text',' ')
# Remove the apostrophes from tokens that contain two or more of them.
df_review['split_text'] = remove_apostrophes(df_review['split_text'])
# Regression target: weighted vote total (useful x2, funny x1, cool x1).
df_review['votes_weight'] = df_review['votes'].apply(sum_votes)
# Length of the raw review text in characters.
df_review['text_length'] = df_review['text'].apply(len)

In [None]:
# Sentiment-bearing, non-stop-word tokens of every review.
df_review['evaluative_words'] = df_review['split_text'].apply(evaluative_words)
# Corpus-wide frequency of each evaluative word (one row per token occurrence after explode).
df_words_counts = df_review["evaluative_words"].explode().value_counts()
# The 100 most frequent evaluative words across all reviews.
top_words = set(df_words_counts.head(100).index.tolist())
# Per review: how many of its evaluative tokens (repeats included) are in that top-100 set.
# Non-list values count as 0.
df_review['top_words_count'] = df_review['evaluative_words'] \
    .apply(lambda words: sum(word in top_words for word in words) if isinstance(words, list) else 0)

In [None]:
df_review

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id,split_text,votes_weight,text_length,evaluative_words,top_words_count
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg,"[my, wife, took, me, here, on, my, birthday, f...",12,889,"[excellent, perfect, pleasure, excellent, like...",15
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow,"[i, have, no, idea, why, some, people, give, b...",0,1345,"[bad, please, fault, like, friend, pretty, ple...",20
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA,"[love, the, gyro, plate, rice, is, so, good, a...",2,76,"[love, good]",2
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg,"[rosie, dakota, and, i, love, chaparral, dog, ...",5,419,"[love, wonderful, clean, huge, play]",5
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw,"[general, manager, scott, petello, is, a, good...",0,469,"[good, assure, treat, respect, surprised, sati...",5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
229902,"{'funny': 0, 'useful': 0, 'cool': 0}",6e7pZofhDuIlD_rX2oYirQ,f9JaiNg_FMoPNWxt7MlbZQ,2,2011-04-14,I really wanted to like this place because it'...,review,vnffHkFJbmd-J3OaBbK2Eg,"[i, really, wanted, to, like, this, place, bec...",0,939,"[like, honestly, bad, impressed, nice, relaxin...",10
229903,"{'funny': 0, 'useful': 2, 'cool': 0}",dDNfSFT0VApxPmURclX6_g,QDWRP1pW5r0huIBAoGmFyg,1,2011-01-23,My husband I stayed here for two nights. Of c...,review,l5oUrgQ190l8CcN8uzd_pA,"[my, husband, i, stayed, here, for, two, night...",4,831,"[ready, horrible, complain, like, stop, good, ...",6
229904,"{'funny': 0, 'useful': 0, 'cool': 0}",M5wHt6Odh1k5v0tIjqd8DQ,JmR3yk7JlS1LVVxtIc3xBQ,4,2010-10-11,Cool atmosphere. A lot of beers on tap and goo...,review,-EctXOb3B7T177jGYUhjVA,"[cool, atmosphere, a, lot, of, beers, on, tap,...",0,124,"[cool, good, great]",3
229905,"{'funny': 1, 'useful': 2, 'cool': 0}",jopndPrv-H5KW2CfScnw9A,z5b2p5TbCg0uaIiIe8n62w,3,2011-01-18,I have to take a star off for the spotty servi...,review,YQvg0JCGRFUkb6reMMf3Iw,"[i, have, to, take, a, star, off, for, the, sp...",5,420,"[irritated, like, disappoint]",1


In [None]:
# Convert the 'open' column to integers (presumably a boolean flag -> 0/1; TODO confirm)
df_business['isOpen'] = df_business['open'].astype(int)

# Total check-ins per business: the sum of all values in the checkin_info dict.
# NOTE(review): this adds up the stored counts, it does not count distinct days/slots.
df_checkin['checkin_nums'] = df_checkin['checkin_info'].apply(lambda x: sum(x.values()))

In [None]:
# Total votes received by each user: the sum of every value in the per-user 'votes' dict
df_user['votes_total'] = df_user['votes'].apply(lambda x: sum(x.values()))

# Average number of votes per review written
df_user['votes_per_review'] = df_user['votes_total'] / df_user['review_count']

# Division-by-zero guard. A review_count of 0 makes the float division above yield
# +/-inf (n/0) or NaN (0/0) — never NaT, which is what the old replace() looked for —
# so set exactly those rows to 0.
df_user['votes_per_review'] = df_user['votes_per_review'].where(df_user['review_count'] != 0, 0)

In [None]:
# Keep only the join keys and the model columns from each table (copies, so the
# source frames are left untouched).
temp_review = df_review[['user_id', 'business_id', 'stars', 'votes_weight', 'text_length', 'top_words_count']].copy()
temp_business = df_business[['business_id', 'review_count', 'isOpen']].copy()
temp_checkin = df_checkin[['business_id', 'checkin_nums']].copy()
temp_user = df_user[['user_id', 'votes_per_review']].copy()

# Inner joins: a review survives only if its business, its user AND a check-in record
# for its business all exist. In the displayed outputs the row index ends at 229905
# before the merges and at 200471 after them.
merged_df = pd.merge(temp_review, temp_business, on = 'business_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_user, on = 'user_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_checkin, on = 'business_id', how = 'inner')

In [None]:
merged_df

Unnamed: 0,user_id,business_id,stars,votes_weight,text_length,top_words_count,review_count,isOpen,votes_per_review,checkin_nums
0,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,12,889,15,116,1,4.486702,114
1,gXmtPKLWPZJeJX_KPw54HA,9yKzy9PApeiPPOUJEtnvkg,2,4,1116,9,116,1,1.767442,114
2,BvjSQAFcROLp27QjaRcyoA,9yKzy9PApeiPPOUJEtnvkg,2,3,669,8,116,1,1.105263,114
3,5qa1hx5GVHehlBQx0b5gFw,9yKzy9PApeiPPOUJEtnvkg,4,0,94,1,116,1,0.454545,114
4,nprSBcvBhvzyIbacEwzDLQ,9yKzy9PApeiPPOUJEtnvkg,5,0,427,10,116,1,1.038462,114
...,...,...,...,...,...,...,...,...,...,...
200468,3tYkSvQGRKFCZoARMtl68A,9xjzoqsGZ01kowWqBD7SNg,5,0,1132,9,3,1,1.200000,9
200469,2venYWARRoBdnfd3E32AFw,9xjzoqsGZ01kowWqBD7SNg,1,0,911,4,3,1,1.000000,9
200470,Mcdt7tUA7jqiqCtyxXgq4g,Gl0gUozT5jNi9Ar5LQAkFQ,2,0,1187,9,3,1,0.000000,7
200471,_lqsH9JNiAU3aZvKeMMnaw,Gl0gUozT5jNi9Ar5LQAkFQ,5,0,146,3,3,1,0.000000,7


In [None]:
# Split the merged table into train and test rows
from sklearn.model_selection import train_test_split

# 80/20 split of merged_df; random_state fixes the shuffle so the split is repeatable
train_data, test_data = train_test_split(merged_df, test_size=0.2, random_state=42)

In [None]:
# Select features and target from the training rows
# Features (X): review-, business- and user-level predictors
features = train_data[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']]
# Target (y): weighted vote total of the review
target = train_data['votes_weight']

In [None]:
# choose machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [None]:
# Linear Regression (closed-form fit, no randomness involved)
linear_model = LinearRegression()
linear_model.fit(features, target)

# Random Forest — seeded so the bootstrap samples and feature draws, and therefore
# the reported error, are repeatable across runs
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(features, target)

# Gradient Boosting — seeded for the same reason
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(features, target)


In [None]:
# Evaluate model performance on the held-out rows
from sklearn.metrics import mean_squared_error

# Linear Regression. The test columns are taken from `features.columns` so they
# always match the columns the model was fitted on, instead of repeating the list.
linear_predictions = linear_model.predict(test_data[features.columns])
linear_mse = mean_squared_error(test_data['votes_weight'], linear_predictions)
print(f"Linear Regression Mean Squared Error: {linear_mse}")

Linear Regression Mean Squared Error: 27.616509588576577


In [None]:
# Random Forest — same columns as used for fitting
rf_predictions = rf_model.predict(test_data[features.columns])
rf_mse = mean_squared_error(test_data['votes_weight'], rf_predictions)
print(f"Random Forest Mean Squared Error: {rf_mse}")

# Gradient Boosting — gb_model was fitted with the other models but was never scored
gb_predictions = gb_model.predict(test_data[features.columns])
gb_mse = mean_squared_error(test_data['votes_weight'], gb_predictions)
print(f"Gradient Boosting Mean Squared Error: {gb_mse}")

Random Forest Mean Squared Error: 27.680450911584984


In [None]:
from sklearn.svm import SVR

# Support Vector Regression with default hyper-parameters.
# NOTE(review): the features are passed unscaled here, and SVR is sensitive to
# feature scale — consider standardising before drawing conclusions from this score.
svm_model = SVR()
svm_model.fit(features, target)
# Score on the held-out rows, using the same columns the model was fitted on.
svm_predictions = svm_model.predict(test_data[features.columns])
svm_mse = mean_squared_error(test_data['votes_weight'], svm_predictions)
print(f"SVM Mean Squared Error: {svm_mse}")

SVM Mean Squared Error: 53.81878446225522


In [None]:
import pandas as pd

# Read the merged table back from a CSV file on Drive (replaces the in-memory merged_df).
# NOTE(review): no cell visible here writes this file, and its name is spelled
# 'mergerd_inner.csv' — confirm that this matches the file that exists on Drive.
merged_df = pd.read_csv('/content/drive/MyDrive/ee562/mergerd_inner.csv')

In [None]:
# Drop the leftover index column that the CSV round-trip adds. Reassigning instead of
# using inplace=True, together with errors='ignore', keeps this cell safe to re-run:
# a second execution (or a frame without the column) no longer raises KeyError.
merged_df = merged_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
def divide_series(series, n_groups):
    """Greedily partition the entries of ``series`` into ``n_groups`` consecutive groups.

    ``series`` maps a number (index) to its count (value). Entries are visited
    in the series' own order and appended to the current group; once that
    group's running count reaches ``series.sum() / n_groups`` the next entry
    starts a new group. The last group absorbs everything that remains.

    Returns
    -------
    dict keyed ``"Group 1"`` .. ``"Group n"``, each value being
    ``{"Numbers": [index labels], "Sum": total count}``.
    """
    quota = series.sum() / n_groups
    last_slot = n_groups - 1

    members = {slot: [] for slot in range(n_groups)}
    totals = [0 for _ in range(n_groups)]
    slot = 0

    for label, freq in series.items():
        members[slot].append(label)
        totals[slot] += freq
        # Advance only after the quota is met, and never past the final group.
        if slot < last_slot and totals[slot] >= quota:
            slot += 1

    summary = {}
    for position in range(n_groups):
        summary[f"Group {position + 1}"] = {"Numbers": members[position], "Sum": totals[position]}
    return summary

# How many reviews have each distinct votes_weight value, ordered by the value itself
series = merged_df['votes_weight'].value_counts().sort_index()
# Bucket the values into 4 consecutive groups of roughly equal review count
groups = divide_series(series, 4)

for group_name in groups:
    group_info = groups[group_name]
    print(f"{group_name}: Numbers: {group_info['Numbers']}, Sum = {group_info['Sum']}")

# Map each votes_weight value to the 1-based number of the group that contains it
number_to_group = {}
for i, info in enumerate(groups.values()):
    for num in info["Numbers"]:
        number_to_group[num] = i + 1
merged_df['votes_category'] = merged_df['votes_weight'].apply(lambda x: number_to_group.get(x))

Group 1: Numbers: [0], Sum = 72038
Group 2: Numbers: [1, 2, 3], Sum = 54846
Group 3: Numbers: [4, 5, 6, 7, 8, 9, 10], Sum = 52942
Group 4: Numbers: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 97, 99, 100, 101, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 115, 116, 117, 118, 119, 120, 123, 125, 126, 129, 131, 133, 135, 141, 150, 154, 164, 171, 175, 187, 196, 198, 200, 202, 203, 241], Sum = 20647


In [None]:
merged_df

Unnamed: 0,user_id,business_id,stars,votes_weight,text_length,top_words_count,review_count,isOpen,votes_per_review,checkin_nums,votes_category
0,rLtl8ZkDX5vH5nAx9C3q5Q,9yKzy9PApeiPPOUJEtnvkg,5,12,889,15,116,1,4.486702,114,4
1,gXmtPKLWPZJeJX_KPw54HA,9yKzy9PApeiPPOUJEtnvkg,2,4,1116,9,116,1,1.767442,114,3
2,BvjSQAFcROLp27QjaRcyoA,9yKzy9PApeiPPOUJEtnvkg,2,3,669,8,116,1,1.105263,114,2
3,5qa1hx5GVHehlBQx0b5gFw,9yKzy9PApeiPPOUJEtnvkg,4,0,94,1,116,1,0.454545,114,1
4,nprSBcvBhvzyIbacEwzDLQ,9yKzy9PApeiPPOUJEtnvkg,5,0,427,10,116,1,1.038462,114,1
...,...,...,...,...,...,...,...,...,...,...,...
200468,3tYkSvQGRKFCZoARMtl68A,9xjzoqsGZ01kowWqBD7SNg,5,0,1132,9,3,1,1.200000,9,1
200469,2venYWARRoBdnfd3E32AFw,9xjzoqsGZ01kowWqBD7SNg,1,0,911,4,3,1,1.000000,9,1
200470,Mcdt7tUA7jqiqCtyxXgq4g,Gl0gUozT5jNi9Ar5LQAkFQ,2,0,1187,9,3,1,0.000000,7,1
200471,_lqsH9JNiAU3aZvKeMMnaw,Gl0gUozT5jNi9Ar5LQAkFQ,5,0,146,3,3,1,0.000000,7,1


In [None]:
!pip install xgboost




In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and target taken from the full merged table
X = merged_df[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']]  # Features
y = merged_df['votes_weight']  # Target variable


# Split the data into training and testing sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
import xgboost as xgb

# Create XGBoost DMatrix holding the training features and labels
dtrain = xgb.DMatrix(X_train, label=y_train)

# Set XGBoost parameters
# NOTE(review): no `evals=` list is passed to xgb.train below, so eval_metric is
# presumably never reported during training — confirm whether it is needed.
params = {
    'objective': 'reg:squarederror',  # For regression tasks
    'eval_metric': 'rmsle',  # Root Mean Squared Logarithmic Error
    # Add other parameters as needed
}

# Train the XGBoost model for 100 boosting rounds
model = xgb.train(params, dtrain, num_boost_round=100)


In [None]:
# Create XGBoost DMatrix for test set (features only, no labels)
dtest = xgb.DMatrix(X_test)

# Make predictions with the baseline booster trained above
predictions = model.predict(dtest)


In [None]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE (Root Mean Squared Error) of the baseline XGBoost model.
# The square root is taken explicitly: the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas sqrt(MSE) works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Root Mean Squared Error: {rmse}")


Root Mean Squared Error: 5.25815316469066


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid: 3 learning rates x 3 depths = 9 candidates
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
    # Add other parameters as needed
}

# Perform GridSearchCV for hyperparameter tuning (3-fold CV, so 27 fits in total),
# ranking candidates by negative mean squared error
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror'),
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=3,
                           verbose=1)
grid_search.fit(X_train, y_train)

# Get the best parameters
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so in that
# run this assignment was never reached — re-run the search to completion before
# relying on best_params.
best_params = grid_search.best_params_


Fitting 3 folds for each of 9 candidates, totalling 27 fits


KeyboardInterrupt: ignored

In [None]:
# Retrain an XGBoost booster on dtrain using the grid-search winners.
# NOTE(review): best_params comes from a grid over learning_rate and max_depth only,
# so the 'objective' / 'eval_metric' entries of `params` are not carried over — confirm
# that falling back to the library defaults is intended.
best_model = xgb.train(best_params, dtrain, num_boost_round=100)


In [None]:
# Test-set predictions of the tuned booster (kept separate from the baseline `predictions`)
predictions_by_best_model = best_model.predict(dtest)

In [None]:
# RMSE of the tuned model. This must score `predictions_by_best_model`; scoring the
# baseline `predictions` (as before) merely reproduced the untuned model's RMSE.
# sqrt(MSE) is used because `squared=False` was removed in scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(y_test, predictions_by_best_model))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 5.25815316469066


In [None]:
# Try the neural network method
!pip install tensorflow




In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X1 = merged_df[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']]  # Features
y1 = merged_df['votes_weight']  # Target variable


# Split the data into training and testing sets (80/20, fixed seed for a repeatable split)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling —
# and therefore the reported error — are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Standardize the data (optional but recommended for neural networks).
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train1)
X_test_scaled = scaler.transform(X_test1)

# Build the neural network model: two hidden ReLU layers and a single linear output.
# NOTE: this rebinds `model`, which previously held the XGBoost booster.
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train1.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)  # Output layer for regression task
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the neural network model (20% of the training rows are held out for validation)
model.fit(X_train_scaled, y_train1, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set.
# sqrt(MSE) replaces `squared=False`, which was removed in scikit-learn 1.6.
predictions = model.predict(X_test_scaled)
nn_rmse = np.sqrt(mean_squared_error(y_test1, predictions))
# The value is a plain RMSE, not an RMSLE; the old name is kept only as an alias so
# any later code that reads `rmsle` keeps working.
rmsle = nn_rmse
print(f"Root Mean Squared Error: {nn_rmse}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Root Mean Squared Error: 5.184895865902915


In [None]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor using the 5 nearest training points
knn_model = KNeighborsRegressor(n_neighbors=5)

In [None]:
# Fit on the standardised training features (X_train_scaled comes from the scaler fitted earlier)
knn_model.fit(X_train_scaled, y_train1)

In [None]:
# Predict on the standardised test features; this rebinds `predictions` to the KNN output
predictions = knn_model.predict(X_test_scaled)

In [None]:
# Evaluate the KNN model on the test rows.
# RMSE is derived from the MSE directly: `squared=False` was removed in
# scikit-learn 1.6, and this also avoids computing the error twice.
mse = mean_squared_error(y_test1, predictions)
rmse = np.sqrt(mse)

In [None]:
# Report both error measures for the KNN model
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

Mean Squared Error: 30.976315251278212
Root Mean Squared Error: 5.56563700319004
