In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from keras import layers

In [3]:
# Load the data and preprocess
businesses = pd.read_json('trainingdata\yelp_business.json', lines=True)
reviews = pd.read_json('trainingdata\yelp_review.json', lines=True)
users = pd.read_json('trainingdata\yelp_user.json', lines=True)
checkins = pd.read_json('trainingdata\yelp_checkin.json', lines=True)
tips = pd.read_json('trainingdata\yelp_tip.json', lines=True)
photos = pd.read_json('trainingdata\yelp_photo.json', lines=True)

In [4]:
# Merge the data
df = pd.merge(businesses, reviews, how='left', on='business_id')
df = pd.merge(df, users, how='left', on='business_id')
df = pd.merge(df, checkins, how='left', on='business_id')
df = pd.merge(df, tips, how='left', on='business_id')
df = pd.merge(df, photos, how='left', on='business_id')

# Remove unnecessary features
features_to_remove = ['address', 'attributes', 'business_id', 'categories', 'city', 'hours', 'is_open', 'latitude', 'longitude', 'name', 'neighborhood', 'postal_code', 'state', 'time']
df.drop(features_to_remove, axis=1, inplace=True)

In [5]:
df.fillna({
    'weekday_checkins': 0,
    'weekend_checkins': 0,
    'average_tip_length': 0,
    'number_tips': 0,
    'average_caption_length': 0,
    'number_pics': 0
}, inplace=True)

# Split the data into features and labels
X = df.drop('stars', axis=1)  # Features
y = df['stars']  # Labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
model = keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.4),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(1, activation='linear')
])

# Compile and train the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print('Mean Squared Error:', loss[0])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Squared Error: 0.28876805305480957


In [14]:
''' Input array should contain: 
'alcohol?' (0 or 1),), 
'has_bike_parking' (0 or 1),, 
'takes_credit_cards' (0 or 1), 
'good_for_kids' (0 or 1),, 
'take_reservations' (0 or 1),, 
'has_wifi' (0 or 1), 
'review_count' (int),, 
'price_range' (int),, 
'average_caption_length' (float), 
'number_pics' (int), 
'average_review_age' (float), 
'average_review_length' (float), 
'average_review_sentiment' (float), 
'number_funny_votes' (int),
'number_cool_votes' (int),
'number_useful_votes' (int),
'average_tip_length' (float),
'number_tips' (int),
'average_number_friends' (float), 
'average_days_on_yelp' (float),
'average_number_fans' (float),
'average_review_count' (float),
'average_number_years_elite' (float),
'weekday_checkins' (int),
'weekend_checkins' (int)
'''
input_data = np.array([0, 1, 1, 1, 1, 1, 10, 2, 3, 10, 10, 1200, 0.9, 2, 17, 20, 50, 3, 100, 1800, 15, 200, 2, 1, 1])
input_data = input_data.reshape(1, -1)
prediction = model.predict(input_data)
print(f'Predicted rating from', round(prediction[0][0],2), "to", round(prediction[0][0],0))

Predicted rating from 2.14 to 2.0


In [15]:
import pickle 
outfile = open("model_with_10epochs.pickle", "wb")
pickle.dump(model,outfile)
outfile.close()

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dropout
......vars
...layers\dropout_1
......vars
...metrics\mean
......vars
.........0
.........1
...metrics\mean_metric_wrapper
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-07-06 14:14:51         2123
metadata.json                                  2023-07-06 14:14:51           64
variables.h5                                   2023-07-06 14:14:51       165696
