### Imports

In [22]:
import json
import random

import numpy as np
import pandas as pd
from surprise import SVDpp, Dataset, Reader
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

### Read and filter dataset

In [28]:
def load_json_lines(path):
    """
    Reads a JSON Lines file (one JSON document per line) into a list
    Args:
        path (str): Path to a UTF-8 encoded JSON Lines file

    Returns:
        list: One decoded object per non-blank line, in file order
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Skip blank lines so a stray empty line does not raise JSONDecodeError
        return [json.loads(line) for line in f if line.strip()]


# Load review data
review_data = load_json_lines('yelp_academic_dataset_review.json')
full_review_df = pd.DataFrame(review_data)

# Load business data
business_data = load_json_lines('yelp_academic_dataset_business.json')
full_business_df = pd.DataFrame(business_data)

In [41]:
def filter_reviews(review_df, business_df,
                   cols: list = None,
                   num_samples: int = 100000):

    """
    Filters review data to Philadelphia businesses and selects a subset of columns

    Columns that exist under the same name in both inputs (other than the
    'business_id' join key) are renamed by the merge with '_review' /
    '_business' suffixes, e.g. a shared 'stars' column becomes 'stars_review'.

    Args:
        review_df (pd.DataFrame): DataFrame containing review data
        business_df (pd.DataFrame): DataFrame containing business data
        cols (list, optional): Columns to keep in output DataFrame. Defaults to ['user_id', 'business_id', 'stars_review']
        num_samples (int, optional): Maximum number of random samples to return (drawn with a fixed seed of 42).
            If fewer filtered reviews exist, all of them are returned in shuffled order.
            If None, returns all filtered reviews in merge order

    Returns:
        pd.DataFrame: Filtered DataFrame containing only Philadelphia business reviews with specified columns
    """

    if cols is None:
        # Built per call instead of sharing one mutable default list across calls
        cols = ['user_id', 'business_id', 'stars_review']

    # First filter businesses to only Philadelphia (exact match on the 'city' value)
    phil_businesses = business_df[business_df['city'] == 'Philadelphia']

    # Inner merge keeps only reviews whose business survived the city filter
    filtered_reviews = pd.merge(review_df, phil_businesses, on='business_id', how='inner', suffixes=('_review', '_business'))
    selected = filtered_reviews[cols]

    if num_samples is None:
        return selected

    # sample() raises ValueError when n exceeds the number of rows, so cap the
    # request at what is actually available
    sample_size = min(num_samples, len(selected))
    return selected.sample(n=sample_size, random_state=42)
# Build the working sample: 100,000 reviews of Philadelphia businesses
review_df = filter_reviews(
    review_df=full_review_df,
    business_df=full_business_df,
    num_samples=100000,
)

In [43]:
# Sanity-check the sample: print its (rows, columns) shape, then end the cell
# on head() so the notebook displays the first rows.
print(f"Shape of the dataset: {review_df.shape}")
review_df.head()

Shape of the dataset: (100000, 3)


Unnamed: 0,user_id,business_id,stars_review
941796,XrWAdRK4CUUK85Ak3x-HDw,h_rcMQtglIiAs-Oc5d8Ozw,4.0
406568,8nQXLLoCSzUvr5vrZOBWOw,QbJXsdhbZ8HQqjCpb76_FQ,1.0
423,JWeXuv2B9lRhiXBcIzsi2Q,7CXSQYrIep0jdvoYPUIGdQ,4.0
851186,_KIlU6g4QcmJY9Dr3dINeQ,iksVwRfpWymIUUFqw0tXpw,5.0
48162,xIfJQ8KT0xhsEco0IzHrSw,HMlqFPIHqiDV7qE1vREQXg,5.0


# SVD++

In [44]:
# Wrap the sampled ratings in a Surprise Dataset. The columns are passed in
# user, item, rating order and the ratings are declared to lie on a 1-5 scale.
rating_columns = ['user_id', 'business_id', 'stars_review']
reader = Reader(line_format='user item rating', rating_scale=(1, 5))
data = Dataset.load_from_df(review_df[rating_columns], reader=reader)

### Hyperparameter tuning

In [47]:
# Seed both RNGs before any stochastic step: the cross-validation folds are
# shuffled and SVD++ initialises its factors randomly, so without a seed the
# best score and parameters change from run to run.
random.seed(42)
np.random.seed(42)

# Define the hyperparameters to try (3 values each -> 81 combinations)
param_grid = {
    'n_factors': [1, 5, 25],
    'n_epochs': [20, 30, 40],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.5]
}

# Grid search for hyperparameter tuning: each combination is scored by MSE
# with 5-fold cross-validation over the whole dataset
grid_search = GridSearchCV(SVDpp, param_grid, measures=['mse'], cv=5)
grid_search.fit(data)

# Evaluation of grid search
print(f"Best MSE score: {grid_search.best_score['mse']}")
print(f"Best parameters: {grid_search.best_params['mse']}")

Best MSE score: 1.6369837968636904
Best parameters: {'n_factors': 1, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.1}


### Training and Evaluation

In [51]:
# Split into train and test. The split is seeded so the same 10% of ratings is
# held out on every run.
# NOTE(review): the hyperparameters were chosen by cross-validating on all of
# `data`, which includes the rows held out here, so these test metrics are
# likely optimistic. Splitting before the grid search would avoid that.
trainset, testset = train_test_split(data, test_size=0.1, random_state=42)

# Train the SVD++ model on the training data, using the best grid-search parameters
model = grid_search.best_estimator['mse']
model.fit(trainset)

# Evaluate model on the test set
predictions = model.test(testset)

# verbose=False: accuracy.rmse / accuracy.mae print the metric themselves by
# default, which duplicated the full-precision prints below
rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')

RMSE: 1.2750
MAE:  1.0291
RMSE: 1.2749777599264966
MAE: 1.0291085285446566
