In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import string
import re
import pandas as pd
import numpy as np
from collections import Counter

In [3]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
def read_json_to_dataframe(file_path):
    try:
        df = pd.read_json(file_path, lines=True)
        return df

    except Exception as e:
        print(f"Error reading file: {e}")
        return None

def remove_apostrophes(series):
    return series.apply(lambda lst: [re.sub(r"'+", '', word) \
                                         if word.count("'") >= 2 else word for word in lst])

def De_symbolize_and_split(df, column_name, new_column_name, separator):
    df[new_column_name] = (df[column_name].str.lower()
                           .str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
                           .str.replace(r'\s+', ' ', regex=True)
                           .str.strip()
                           .str.split(separator))
    return df

def sum_votes(vote_dict):
    weights = {'funny': 1, 'useful': 2, 'cool': 1}
    return sum(vote_dict.get(key, 0) * weight for key, weight in weights.items())

sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

def evaluative_words(words):
    return [word for word in words if word not in stop_words \
            and sia.polarity_scores(word)['compound'] != 0]

In [5]:
df_business = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_business.json")
df_checkin = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_checkin.json")
df_review = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_review.json")
df_user = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_training_set_user.json")

In [6]:
df_review = De_symbolize_and_split(df_review, 'text', 'split_text',' ')
df_review['split_text'] = remove_apostrophes(df_review['split_text'])
df_review['votes_weight'] = df_review['votes'].apply(sum_votes)
df_review['text_length'] = df_review['text'].apply(len)

In [7]:
df_review['evaluative_words'] = df_review['split_text'].apply(evaluative_words)
df_words_counts = df_review["evaluative_words"].explode().value_counts()
top_words = set(df_words_counts.head(100).index.tolist())
df_review['top_words_count'] = df_review['evaluative_words'] \
    .apply(lambda words: sum(word in top_words for word in words) if isinstance(words, list) else 0)

In [8]:
df_review

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id,split_text,votes_weight,text_length,evaluative_words,top_words_count
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg,"[my, wife, took, me, here, on, my, birthday, f...",12,889,"[excellent, perfect, pleasure, excellent, like...",15
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow,"[i, have, no, idea, why, some, people, give, b...",0,1345,"[bad, please, fault, like, friend, pretty, ple...",20
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA,"[love, the, gyro, plate, rice, is, so, good, a...",2,76,"[love, good]",2
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg,"[rosie, dakota, and, i, love, chaparral, dog, ...",5,419,"[love, wonderful, clean, huge, play]",5
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw,"[general, manager, scott, petello, is, a, good...",0,469,"[good, assure, treat, respect, surprised, sati...",5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
229902,"{'funny': 0, 'useful': 0, 'cool': 0}",6e7pZofhDuIlD_rX2oYirQ,f9JaiNg_FMoPNWxt7MlbZQ,2,2011-04-14,I really wanted to like this place because it'...,review,vnffHkFJbmd-J3OaBbK2Eg,"[i, really, wanted, to, like, this, place, bec...",0,939,"[like, honestly, bad, impressed, nice, relaxin...",10
229903,"{'funny': 0, 'useful': 2, 'cool': 0}",dDNfSFT0VApxPmURclX6_g,QDWRP1pW5r0huIBAoGmFyg,1,2011-01-23,My husband I stayed here for two nights. Of c...,review,l5oUrgQ190l8CcN8uzd_pA,"[my, husband, i, stayed, here, for, two, night...",4,831,"[ready, horrible, complain, like, stop, good, ...",6
229904,"{'funny': 0, 'useful': 0, 'cool': 0}",M5wHt6Odh1k5v0tIjqd8DQ,JmR3yk7JlS1LVVxtIc3xBQ,4,2010-10-11,Cool atmosphere. A lot of beers on tap and goo...,review,-EctXOb3B7T177jGYUhjVA,"[cool, atmosphere, a, lot, of, beers, on, tap,...",0,124,"[cool, good, great]",3
229905,"{'funny': 1, 'useful': 2, 'cool': 0}",jopndPrv-H5KW2CfScnw9A,z5b2p5TbCg0uaIiIe8n62w,3,2011-01-18,I have to take a star off for the spotty servi...,review,YQvg0JCGRFUkb6reMMf3Iw,"[i, have, to, take, a, star, off, for, the, sp...",5,420,"[irritated, like, disappoint]",1


In [9]:
#Convert to numeric representation
df_business['isOpen'] = df_business['open'].astype(int)

#Count the total number of checkin days
df_checkin['checkin_nums'] = df_checkin['checkin_info'].apply(lambda x: sum(x.values()))

In [10]:
# Calculate the total votes
df_user['votes_total'] = df_user['votes'].apply(lambda x: sum(x.values()))

# Calculate the ratio of votes to review_count
df_user['votes_per_review'] = df_user['votes_total'] / df_user['review_count']

# If division by 0
df_user['votes_per_review'] = df_user['votes_per_review'].replace([pd.NaT, pd.NaT], 0)

In [11]:
df_review

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id,split_text,votes_weight,text_length,evaluative_words,top_words_count
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg,"[my, wife, took, me, here, on, my, birthday, f...",12,889,"[excellent, perfect, pleasure, excellent, like...",15
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow,"[i, have, no, idea, why, some, people, give, b...",0,1345,"[bad, please, fault, like, friend, pretty, ple...",20
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA,"[love, the, gyro, plate, rice, is, so, good, a...",2,76,"[love, good]",2
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg,"[rosie, dakota, and, i, love, chaparral, dog, ...",5,419,"[love, wonderful, clean, huge, play]",5
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw,"[general, manager, scott, petello, is, a, good...",0,469,"[good, assure, treat, respect, surprised, sati...",5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
229902,"{'funny': 0, 'useful': 0, 'cool': 0}",6e7pZofhDuIlD_rX2oYirQ,f9JaiNg_FMoPNWxt7MlbZQ,2,2011-04-14,I really wanted to like this place because it'...,review,vnffHkFJbmd-J3OaBbK2Eg,"[i, really, wanted, to, like, this, place, bec...",0,939,"[like, honestly, bad, impressed, nice, relaxin...",10
229903,"{'funny': 0, 'useful': 2, 'cool': 0}",dDNfSFT0VApxPmURclX6_g,QDWRP1pW5r0huIBAoGmFyg,1,2011-01-23,My husband I stayed here for two nights. Of c...,review,l5oUrgQ190l8CcN8uzd_pA,"[my, husband, i, stayed, here, for, two, night...",4,831,"[ready, horrible, complain, like, stop, good, ...",6
229904,"{'funny': 0, 'useful': 0, 'cool': 0}",M5wHt6Odh1k5v0tIjqd8DQ,JmR3yk7JlS1LVVxtIc3xBQ,4,2010-10-11,Cool atmosphere. A lot of beers on tap and goo...,review,-EctXOb3B7T177jGYUhjVA,"[cool, atmosphere, a, lot, of, beers, on, tap,...",0,124,"[cool, good, great]",3
229905,"{'funny': 1, 'useful': 2, 'cool': 0}",jopndPrv-H5KW2CfScnw9A,z5b2p5TbCg0uaIiIe8n62w,3,2011-01-18,I have to take a star off for the spotty servi...,review,YQvg0JCGRFUkb6reMMf3Iw,"[i, have, to, take, a, star, off, for, the, sp...",5,420,"[irritated, like, disappoint]",1


In [12]:
temp_review = df_review[['user_id', 'business_id', 'stars', 'votes_weight', 'text_length', 'top_words_count']].copy()
temp_business = df_business[['business_id', 'review_count', 'isOpen']].copy()
temp_checkin = df_checkin[['business_id', 'checkin_nums']].copy()
temp_user = df_user[['user_id', 'votes_per_review']].copy()

merged_df = pd.merge(temp_review, temp_business, on = 'business_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_user, on = 'user_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_checkin, on = 'business_id', how = 'inner')

In [37]:
merged_df

NameError: ignored

In [4]:
import pandas as pd

In [42]:
merged_df.to_csv('/content/drive/MyDrive/ee562/merged_df_yuhua_train.csv', index=False)

NameError: ignored

In [43]:
merged_df_train = pd.read_csv('/content/drive/MyDrive/ee562/merged_df_yuhua_train.csv')

In [44]:
# split the data
from sklearn.model_selection import train_test_split

# Assuming your DataFrame is named 'df'
train_data, test_data = train_test_split(merged_df_train, test_size=0.2, random_state=42)

In [45]:
# select features and target
# Features (X)
features = train_data[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'checkin_nums']]
# Target (y)
target = train_data['votes_weight']

#here is the training session for different models
##Model 1: xgboost

In [46]:
!pip install xgboost




In [47]:
from sklearn.model_selection import train_test_split

X = merged_df_train[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'votes_per_review', 'checkin_nums']]  # Features
y = merged_df_train['votes_weight']  # Target variable


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
import xgboost as xgb

# Create XGBoost DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)

# Set XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # For regression tasks
    'eval_metric': 'rmsle',  # Root Mean Squared Logarithmic Error
    # Add other parameters as needed
}

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=100)

In [49]:
# Create XGBoost DMatrix for test set
dtest = xgb.DMatrix(X_test)

# Make predictions
predictions = model.predict(dtest)

In [50]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE (Root Mean Squared Error)
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"Root Mean Squared Error: {rmse}")


Root Mean Squared Error: 5.25815316469066


##The result of XGboost Model is accurate, the error is 5.26, a good result for regression problem in traditional machine learning models

## Model 2: Neural Network Model

In [51]:
# try the nerual network method
!pip install tensorflow



In [53]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

X1 = merged_df_train[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen',  'checkin_nums']]  # Features
y1 = merged_df_train['votes_weight']  # Target variable


# Split the data into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)


# Standardize the data (optional but recommended for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train1)
X_test_scaled = scaler.transform(X_test1)

# Build the neural network model
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train1.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)  # Output layer for regression task
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the neural network model
model.fit(X_train_scaled, y_train1, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
predictions = model.predict(X_test_scaled)
rmsle = mean_squared_error(y_test1, predictions, squared=False)
print(f"Root Mean Squared Error: {rmsle}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Root Mean Squared Error: 7.326054053521442


## result for nn model is slightly better than xgboost
The neural network has an input layer with as many neurons as there are features in the input data.
It contains one hidden layer with 64 neurons and another hidden layer with 32 neurons, both using the ReLU activation function.
The output layer has a single neuron for regression output, and it does not use any activation function.

# Model 3
## knn model

In [54]:
from sklearn.neighbors import KNeighborsRegressor
knn_model = KNeighborsRegressor(n_neighbors=5)

In [55]:
knn_model.fit(X_train_scaled, y_train1)

In [56]:
predictions = knn_model.predict(X_test_scaled)

In [57]:
# Evaluate the model
mse = mean_squared_error(y_test1, predictions)
rmse = mean_squared_error(y_test1, predictions, squared=False)

In [58]:
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

Mean Squared Error: 63.84254994388328
Root Mean Squared Error: 7.990153311663255


## Knn model is slightly worse than the former models

# Test the models on test set

## first load the test set

In [59]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
import json
import string
import re
import pandas as pd
import numpy as np
from collections import Counter

In [61]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [62]:
def read_json_to_dataframe(file_path):
    try:
        df = pd.read_json(file_path, lines=True)
        return df

    except Exception as e:
        print(f"Error reading file: {e}")
        return None

def remove_apostrophes(series):
    return series.apply(lambda lst: [re.sub(r"'+", '', word) \
                                         if word.count("'") >= 2 else word for word in lst])

def De_symbolize_and_split(df, column_name, new_column_name, separator):
    df[new_column_name] = (df[column_name].str.lower()
                           .str.replace(r"[^a-zA-Z' ]", ' ', regex=True)
                           .str.replace(r'\s+', ' ', regex=True)
                           .str.strip()
                           .str.split(separator))
    return df

def sum_votes(vote_dict):
    weights = {'funny': 1, 'useful': 2, 'cool': 1}
    return sum(vote_dict.get(key, 0) * weight for key, weight in weights.items())

sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

def evaluative_words(words):
    return [word for word in words if word not in stop_words \
            and sia.polarity_scores(word)['compound'] != 0]

In [63]:
df_business = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_test_set_business.json")
df_checkin = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_test_set_checkin.json")
df_review = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_test_set_review.json")
df_user = read_json_to_dataframe("/content/drive/MyDrive/ee562/yelp_test_set_user.json")

In [64]:
df_review

Unnamed: 0,user_id,review_id,text,business_id,stars,date,type
0,2WkM3pYfx7bt46tv7u4hHA,1Lq3ghX6yzJ7MsWXDkgIGQ,"Nice place, big patio. Now offering LIVE sket...",AuMz7XGkjLcIUurp_AD51w,5,2010-11-15,review
1,eHWbF0k5QOBLgQXhGdeHmg,zulCqAo_XY9wAefJ58H9Fw,Friendly staff. Make sure you order the gyro p...,8i5hB_dmf33NVbWE5SwoMQ,5,2012-09-07,review
2,HrjjHfDGTafXyKpQKNrYHg,ptm1X6ReMEYg1Y203KXkxQ,LOVE LOVE LOVE this place for breakfast. They ...,nvaAUTTl7oqiJDhuimNG6A,5,2012-12-29,review
3,DrWLhrK8WMZf7Jb-Oqc7ww,JCCe3m0LK7dGGxO4ntoGLg,Disgusting sandwich. I should have known bette...,QwaoxP5Mgm3PJuZo_4bFsw,1,2013-02-22,review
4,jDCONTPR6nyc3J7iimwzkQ,CeOpuUly_s75QSuHKtfxng,Always a fan of Cafe Zupas and their very frie...,0lEp4vISRmOXa8Xz2pWhbw,4,2013-02-20,review
...,...,...,...,...,...,...,...
22951,UaIq_zJxnVxU_HfvC16i_A,xxt23yRZ-h95mNiu5SqNpA,For a wine bar I would have expected more by-t...,s1dex3Z3QoqiK7V-zXUgAw,4,2013-02-19,review
22952,XA_lIArLNis1tzrqSywzHQ,iuv2QctuvIPIXJwZoOgNMg,"This place has AWESOME kimchi Chigae, kalbi is...",PX3e3qtBx_5VC3vqq47jpQ,4,2012-09-24,review
22953,TVL1e0NHhxAE6PpeXGfTOA,CT9navwsc9EP2HeimvRpUg,LOVE this place! I recently did a search for ...,SyXz4OwSNxfbszTHP-d2aA,5,2013-02-05,review
22954,v7pgiW6IS8jSkQ2B4jeXxg,6en_lbX5obNzwIo06PnarA,Excellent! Inexpensive and hot fresh food. Wo...,soH8ekCER45AjfMaM6cJ2A,4,2011-06-28,review


In [65]:
df_review = De_symbolize_and_split(df_review, 'text', 'split_text',' ')
df_review['split_text'] = remove_apostrophes(df_review['split_text'])
df_review['text_length'] = df_review['text'].apply(len)

In [66]:
df_review['evaluative_words'] = df_review['split_text'].apply(evaluative_words)
df_words_counts = df_review["evaluative_words"].explode().value_counts()
top_words = set(df_words_counts.head(100).index.tolist())
df_review['top_words_count'] = df_review['evaluative_words'] \
    .apply(lambda words: sum(word in top_words for word in words) if isinstance(words, list) else 0)

In [67]:
#Convert to numeric representation
df_business['isOpen'] = df_business['open'].astype(int)

#Count the total number of checkin days
df_checkin['checkin_nums'] = df_checkin['checkin_info'].apply(lambda x: sum(x.values()))

In [68]:
df_review

Unnamed: 0,user_id,review_id,text,business_id,stars,date,type,split_text,text_length,evaluative_words,top_words_count
0,2WkM3pYfx7bt46tv7u4hHA,1Lq3ghX6yzJ7MsWXDkgIGQ,"Nice place, big patio. Now offering LIVE sket...",AuMz7XGkjLcIUurp_AD51w,5,2010-11-15,review,"[nice, place, big, patio, now, offering, live,...",382,"[nice, comedy, holiday, holiday, easily, offen...",2
1,eHWbF0k5QOBLgQXhGdeHmg,zulCqAo_XY9wAefJ58H9Fw,Friendly staff. Make sure you order the gyro p...,8i5hB_dmf33NVbWE5SwoMQ,5,2012-09-07,review,"[friendly, staff, make, sure, you, order, the,...",75,"[friendly, sure]",2
2,HrjjHfDGTafXyKpQKNrYHg,ptm1X6ReMEYg1Y203KXkxQ,LOVE LOVE LOVE this place for breakfast. They ...,nvaAUTTl7oqiJDhuimNG6A,5,2012-12-29,review,"[love, love, love, this, place, for, breakfast...",164,"[love, love, love, yummy]",4
3,DrWLhrK8WMZf7Jb-Oqc7ww,JCCe3m0LK7dGGxO4ntoGLg,Disgusting sandwich. I should have known bette...,QwaoxP5Mgm3PJuZo_4bFsw,1,2013-02-22,review,"[disgusting, sandwich, i, should, have, known,...",1853,"[disgusting, better, like, cutting, loyal, fan...",18
4,jDCONTPR6nyc3J7iimwzkQ,CeOpuUly_s75QSuHKtfxng,Always a fan of Cafe Zupas and their very frie...,0lEp4vISRmOXa8Xz2pWhbw,4,2013-02-20,review,"[always, a, fan, of, cafe, zupas, and, their, ...",459,"[fan, friendly, amazing, great, delicious, wan...",8
...,...,...,...,...,...,...,...,...,...,...,...
22951,UaIq_zJxnVxU_HfvC16i_A,xxt23yRZ-h95mNiu5SqNpA,For a wine bar I would have expected more by-t...,s1dex3Z3QoqiK7V-zXUgAw,4,2013-02-19,review,"[for, a, wine, bar, i, would, have, expected, ...",279,"[great, special, hesitation, good, trusts, sure]",4
22952,XA_lIArLNis1tzrqSywzHQ,iuv2QctuvIPIXJwZoOgNMg,"This place has AWESOME kimchi Chigae, kalbi is...",PX3e3qtBx_5VC3vqq47jpQ,4,2012-09-24,review,"[this, place, has, awesome, kimchi, chigae, ka...",167,"[awesome, ok, miss, best]",4
22953,TVL1e0NHhxAE6PpeXGfTOA,CT9navwsc9EP2HeimvRpUg,LOVE this place! I recently did a search for ...,SyXz4OwSNxfbszTHP-d2aA,5,2013-02-05,review,"[love, this, place, i, recently, did, a, searc...",657,"[love, excellent, encouraged, glad, good, inti...",7
22954,v7pgiW6IS8jSkQ2B4jeXxg,6en_lbX5obNzwIo06PnarA,Excellent! Inexpensive and hot fresh food. Wo...,soH8ekCER45AjfMaM6cJ2A,4,2011-06-28,review,"[excellent, inexpensive, and, hot, fresh, food...",67,"[excellent, fresh, worth]",3


In [69]:
temp_review = df_review[['user_id', 'business_id', 'stars', 'text_length', 'top_words_count']].copy()
temp_business = df_business[['business_id', 'review_count', 'isOpen']].copy()
temp_checkin = df_checkin[['business_id', 'checkin_nums']].copy()
temp_user = df_user[['user_id']].copy()

merged_df = pd.merge(temp_review, temp_business, on = 'business_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_user, on = 'user_id', how = 'inner')
merged_df = pd.merge(merged_df, temp_checkin, on = 'business_id', how = 'inner')

In [70]:
merged_df

Unnamed: 0,user_id,business_id,stars,text_length,top_words_count,review_count,isOpen,checkin_nums
0,eHWbF0k5QOBLgQXhGdeHmg,8i5hB_dmf33NVbWE5SwoMQ,5,75,2,21,1,51
1,ROJ2hXmcQc_r5ZEek1hU6w,8i5hB_dmf33NVbWE5SwoMQ,5,525,8,21,1,51
2,T-9NA7VJmZoNhY8pTDUdHg,8i5hB_dmf33NVbWE5SwoMQ,4,156,5,21,1,51
3,FmW2nQXyzmrz3EJzQ6v7lg,8i5hB_dmf33NVbWE5SwoMQ,3,1721,12,21,1,51
4,1zIIIiVOZCWkI_DJ6Pz45Q,b-t3nNmRw6alKw4aSqAgPg,1,192,3,37,1,126
...,...,...,...,...,...,...,...,...
1648,WdpjLmfyOOZ8Az11WBPqkA,n8xZIU1PZz3Um0QhJnCfcw,2,357,1,3,1,33
1649,WiEK-GiAAvqD_UYy0aa7Sg,ARsdp6eSkfPnKlPaHcv-Lg,2,1877,8,3,1,17
1650,ushY4-xrl8uRyhrYKZabPQ,UiJajPGYV0CMI_2rV0WSRA,1,456,5,3,1,8
1651,j-sI9L9W8tnNfdDMxAWjBg,hYooRvt98mjOun91EyKPNg,4,45,0,3,1,14


In [71]:
# split the data
from sklearn.model_selection import train_test_split

# Assuming your DataFrame is named 'df'
train_data, test_data = train_test_split(merged_df, test_size=0.2, random_state=42)

In [73]:
X_test_scaled = scaler.transform(merged_df[['stars', 'text_length', 'top_words_count', 'review_count', 'isOpen', 'checkin_nums']])


In [74]:
predictions = model.predict(X_test_scaled)




In [77]:
print(predictions)

[[1.3642898]
 [3.7080479]
 [1.4447943]
 ...
 [3.2315583]
 [1.7014813]
 [3.1379821]]
