In [4]:
import pandas as pd
import json

# Parse the newline-delimited Yelp review dump: every line is one JSON object.
with open('yelp_academic_dataset_review.json', 'r', encoding='utf-8') as review_file:
    review_data = list(map(json.loads, review_file))

# Tabulate the parsed records, one row per review.
full_review_df = pd.DataFrame(review_data)

# Preview the first rows as a sanity check on the load.
print(full_review_df.head())


                review_id                 user_id             business_id  \
0  KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1  BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2  saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3  AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4  Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   

   stars  useful  funny  cool  \
0    3.0       0      0     0   
1    5.0       1      0     1   
2    3.0       0      0     0   
3    5.0       1      0     1   
4    4.0       1      0     1   

                                                text                 date  
0  If you decide to eat here, just be aware it is...  2018-07-07 22:09:11  
1  I've taken a lot of spin classes over the year...  2012-01-03 15:28:18  
2  Family diner. Had the buffet. Eclectic assortm...  2014-02-05 20:30:30  
3  Wow!  Yummy, different,  delici

In [13]:
from surprise import KNNBasic
import numpy as np

class JaccardKNN(KNNBasic):
    """KNNBasic variant whose similarity matrix is Jaccard overlap.

    Two users (or two items, when ``sim_options['user_based']`` is False) are
    considered similar in proportion to how much the sets of things they rated
    overlap. The rating values themselves play no part in the similarity.
    """

    def compute_similarities(self):
        """Override default similarity computation with Jaccard Similarity.

        Returns:
            numpy.ndarray: Square, symmetric matrix where entry ``[i][j]`` is
            ``|R_i & R_j| / |R_i | R_j|`` for the rated-id sets ``R_i`` and
            ``R_j`` of inner ids ``i`` and ``j``. The entry is 0 when the
            union is empty.
        """
        trainset = self.trainset

        # Honour the user_based option instead of always building a user-user
        # matrix: item-based mode needs an n_items x n_items matrix built
        # from the per-item rating lists.
        if self.sim_options.get('user_based', True):
            n_x, ratings_by_x = trainset.n_users, trainset.ur
        else:
            n_x, ratings_by_x = trainset.n_items, trainset.ir

        # Build each rated-id set exactly once rather than inside the pair loop.
        rated = [{other_id for (other_id, _) in ratings_by_x[x]} for x in range(n_x)]

        sim = np.zeros((n_x, n_x))
        for i in range(n_x):
            rated_i = rated[i]
            if not rated_i:
                # An empty set overlaps with nothing; the whole row stays 0.
                continue
            sim[i, i] = 1.0
            # Jaccard is symmetric: fill the upper triangle and mirror it.
            for j in range(i + 1, n_x):
                rated_j = rated[j]
                shared = len(rated_i & rated_j)
                if shared:
                    # |A | B| = |A| + |B| - |A & B| avoids building the union set.
                    jaccard = shared / (len(rated_i) + len(rated_j) - shared)
                    sim[i, j] = jaccard
                    sim[j, i] = jaccard
        return sim

In [5]:
# Parse the newline-delimited Yelp business dump: every line is one JSON object.
with open('yelp_academic_dataset_business.json', 'r', encoding='utf-8') as business_file:
    business_data = list(map(json.loads, business_file))

# Tabulate the parsed records, one row per business.
full_business_df = pd.DataFrame(business_data)

# Preview the first rows as a sanity check on the load.
print(full_business_df.head())


              business_id                      name  \
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ   
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code  \
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101   
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open  \
0  34.426679 -119.711197    5.0             7        0   
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5            22        0   
3  39.9555

In [None]:
def filter_reviews(review_df, business_df, 
                   cols: list = ['user_id', 'business_id', 'stars_review'],
                   num_samples: int = 100000):
    """
    Filters review data to Philadelphia businesses and selects a subset of columns.

    Reviews are inner-joined to the Philadelphia rows of ``business_df`` on
    ``business_id``. Column names present in both frames (other than the join
    key) get the suffixes ``_review`` / ``_business`` in the merged frame, so
    ``cols`` must use the suffixed names for such columns.

    Args:
        review_df (pd.DataFrame): DataFrame containing review data
        business_df (pd.DataFrame): DataFrame containing business data; must have
            'business_id' and 'city' columns
        cols (list, optional): Columns to keep in output DataFrame. Defaults to
            ['user_id', 'business_id', 'stars_review']
        num_samples (int, optional): Number of random samples to return. If None,
            returns all filtered reviews. If larger than the number of filtered
            reviews, all of them are returned (in sampled order) instead of raising.

    Returns:
        pd.DataFrame: Filtered DataFrame containing only Philadelphia business
        reviews with specified columns
    """

    # First filter businesses to only Philadelphia
    phil_businesses = business_df[business_df['city'] == 'Philadelphia']

    # Merge with reviews to get only Philadelphia reviews
    filtered_reviews = pd.merge(
        review_df,
        phil_businesses,
        on='business_id',
        how='inner',
        suffixes=('_review', '_business'),
    )

    # Project first so sampling does not copy the wide merged frame.
    selected = filtered_reviews[cols]

    if num_samples is None:
        return selected

    # Sampling without replacement cannot draw more rows than exist; cap the
    # request so a small filtered set yields everything rather than a ValueError.
    sample_size = min(num_samples, len(selected))
    return selected.sample(n=sample_size, random_state=42)
# Draw the 100,000-review Philadelphia working sample.
review_df = filter_reviews(
    review_df=full_review_df,
    business_df=full_business_df,
    num_samples=100000,
)

In [None]:
from sklearn.model_selection import train_test_split

# review_df is expected to carry the 'user_id', 'business_id' and 'stars_review'
# columns; hold out 10% of its rows for evaluation with a fixed seed.
train_data, test_data = train_test_split(
    review_df,
    test_size=0.1,
    random_state=42,
)

# Report how many rows landed on each side of the split.
print(f"Training data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")

from surprise import Dataset, Reader, KNNBasic
from surprise import accuracy

# Prepare data for Surprise: ratings are on a 1-5 star scale.
reader = Reader(rating_scale=(1, 5))

# Surprise expects (user, item, rating) in exactly this order. Use one column
# list for both splits so the test tuples cannot drift from the training layout
# if review_df ever carries extra or reordered columns.
rating_columns = ['user_id', 'business_id', 'stars_review']

# Load training data into Surprise format
trainset = Dataset.load_from_df(train_data[rating_columns], reader).build_full_trainset()

# Prepare the test set as a list of plain (user_id, business_id, stars_review) tuples
testset = list(test_data[rating_columns].itertuples(index=False, name=None))


Training data size: 90000
Test data size: 10000


In [14]:
# Fit the user-based KNN whose neighbours are ranked by Jaccard overlap,
# with k=20 neighbours.
jaccard_knn = JaccardKNN(
    k=20,
    sim_options={'user_based': True},
)
jaccard_knn.fit(trainset)

# Score every held-out (user, business, rating) triple.
predictions = jaccard_knn.test(testset)

# Report error metrics on the held-out predictions.
print("Baseline Model Performance:")
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)


Baseline Model Performance:
RMSE: 1.4172
MAE:  1.1703


In [18]:
# Re-draw a smaller, 20,000-review Philadelphia sample for a quicker experiment.
review_df = filter_reviews(
    review_df=full_review_df,
    business_df=full_business_df,
    num_samples=20000,
)

# review_df is expected to carry the 'user_id', 'business_id' and 'stars_review'
# columns; hold out 10% of its rows for evaluation with a fixed seed.
train_data, test_data = train_test_split(
    review_df,
    test_size=0.1,
    random_state=42,
)

# Report how many rows landed on each side of the split.
print(f"Training data size: {len(train_data)}")
print(f"Test data size: {len(test_data)}")

from surprise import Dataset, Reader, KNNBasic
from surprise import accuracy

# Prepare data for Surprise: ratings are on a 1-5 star scale.
reader = Reader(rating_scale=(1, 5))

# Surprise expects (user, item, rating) in exactly this order. Use one column
# list for both splits so the test tuples cannot drift from the training layout
# if review_df ever carries extra or reordered columns.
rating_columns = ['user_id', 'business_id', 'stars_review']

# Load training data into Surprise format
trainset = Dataset.load_from_df(train_data[rating_columns], reader).build_full_trainset()

# Prepare the test set as a list of plain (user_id, business_id, stars_review) tuples
testset = list(test_data[rating_columns].itertuples(index=False, name=None))

# Fit the user-based KNN whose neighbours are ranked by Jaccard overlap,
# this time with k=10 neighbours.
jaccard_knn = JaccardKNN(
    k=10,
    sim_options={'user_based': True},
)
jaccard_knn.fit(trainset)

# Score every held-out (user, business, rating) triple.
predictions = jaccard_knn.test(testset)

# Report error metrics on the held-out predictions.
print("Baseline Model Performance:")
mse = accuracy.mse(predictions)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

Training data size: 18000
Test data size: 2000
Baseline Model Performance:
MSE: 1.8271
RMSE: 1.3517
MAE:  1.1231
