In [4]:
import pandas as pd
import json

directory = "../data/"


def _read_json_lines(path):
    """Parse a JSON Lines file into a list with one decoded object per line.

    Args:
        path (str): Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded objects, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # because json.loads raises on an empty string.
        return [json.loads(line) for line in f if line.strip()]


# Load User data
user_data = _read_json_lines(directory+'yelp_academic_dataset_user.json')
user_df = pd.DataFrame(user_data)

# Load Tip data
tip_data = _read_json_lines(directory+'yelp_academic_dataset_tip.json')
tip_df = pd.DataFrame(tip_data)

# Load business data
business_data = _read_json_lines(directory+'yelp_academic_dataset_business.json')
business_df = pd.DataFrame(business_data)

# Load review data
review_data = _read_json_lines(directory+'yelp_academic_dataset_review.json')
review_df = pd.DataFrame(review_data)

# Load check-in data
checkin_data = _read_json_lines(directory+'yelp_academic_dataset_checkin.json')
checkin_df = pd.DataFrame(checkin_data)

In [5]:
def filter_reviews(review_df, business_df, 
                   cols: list = ['user_id', 'business_id', 'stars_review'],
                   num_samples: int = 100000):
    
    """
    Filters review data to Philadelphia businesses and optionally samples it.

    Reviews are inner-joined to the businesses whose ``city`` equals
    'Philadelphia'. Business columns that already exist in ``review_df`` are
    left out of the join (apart from the ``business_id`` key), so the review
    values win for any shared column name.

    Args:
        review_df (pd.DataFrame): DataFrame containing review data; must have a 'business_id' column
        business_df (pd.DataFrame): DataFrame containing business data; must have 'city' and 'business_id' columns
        cols (list, optional): Columns to keep in the output. Only applied when ``num_samples`` is None.
            Defaults to ['user_id', 'business_id', 'stars_review']
        num_samples (int, optional): Number of random rows to return (fixed seed 42). Capped at the
            number of matching reviews. If None, returns all filtered reviews restricted to ``cols``

    Returns:
        pd.DataFrame: When ``num_samples`` is None, every Philadelphia review restricted to ``cols``.
        Otherwise a random sample of at most ``num_samples`` Philadelphia reviews carrying ALL
        review and business columns.
    """
    # NOTE(review): the sampled path intentionally keeps every column, as the
    # original code did; later cells read columns outside ``cols``.

    # First filter businesses to only Philadelphia
    phil_businesses = business_df[business_df['city'] == 'Philadelphia']

    # Bring over only the business columns the reviews do not already have,
    # plus the join key itself.
    cols_to_use = phil_businesses.columns.difference(review_df.columns).tolist()
    if 'business_id' not in cols_to_use:
        cols_to_use.append('business_id')

    # Merge with reviews to get only Philadelphia reviews
    filtered_reviews = pd.merge(review_df, phil_businesses[cols_to_use], on='business_id', how='inner')

    if num_samples is None:
        return filtered_reviews[cols]

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so never request more than are available.
    sample_size = min(num_samples, len(filtered_reviews))
    return filtered_reviews.sample(n=sample_size, random_state=42)

# Keep a 100000-row sample of reviews for Philadelphia businesses.
# This rebinds `review_df`: the unfiltered reviews are no longer reachable
# under that name once this cell has run.
review_df = filter_reviews(review_df=review_df, business_df=business_df, num_samples=100000)

# Second pass over the already-filtered frame with the default sample size.
phil_reviews_df = filter_reviews(review_df=review_df, business_df=business_df)

# Show complete cell contents instead of truncating long strings.
pd.options.display.max_colwidth = None
phil_reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,address,...,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,state
75721,C6QNE-9zjH8pfF1F_SxX6Q,eqR13jEqW9bqVOl1mAKN5Q,BaSwNEingTmrBw4shffK5w,3.0,2,0,0,"The coffee was fine but the environment struck me as more pretentious than it was worth. Maybe I'm just not appreciative of the subtleties of coffee as I should be, but I'll stick with black and brew.",2017-04-02 20:40:50,1001 S 10th St,...,"Bakeries, Restaurants, Breakfast & Brunch, Desserts, Food, Coffee Roasteries, Coffee & Tea",Philadelphia,"{'Monday': '9:0-16:0', 'Tuesday': '7:30-16:0', 'Wednesday': '7:30-16:0', 'Thursday': '7:30-16:0', 'Friday': '7:30-16:0', 'Saturday': '7:30-16:0', 'Sunday': '8:0-16:0'}",1,39.937838,-75.159481,Function Coffee Labs,19147,141,PA
80184,kFgq0pQmwFocl4JhhU-Isg,rWSHfLxj_Zd3eu9Fgiotag,F8yozE3NWnImNApHO347gQ,4.0,5,0,1,"Marrakesh is a unique dining experience hidden away in a South Philadelphia alleyway. \n\nAmbiance:\nMarrakesh is all about the experience. You start by walking down an alley off of South Street to ring the doorbell. You are greeted by one of the waiters and instantly walk into a dimly lit Moroccan evening. The tables are low and rapped by benches instead of chairs. The walls and floors are draped in rugs and a belly dancer is performing on one of the many levels at any given time. The dining room is organized into several different rooms across the levels so, while tightly packed, every dinner feels intimate. Marrakesh is best enjoyed as a group, as tables seem to be made for 4-15 diners rather than one or two. \n\nFood:\nThe food here is good. Every diner enjoys a tasting menu that starts with an appetizer of different types of moroccan salad, is followed by a chicken course, and then a choice between beef and lamb. The meal finishes with a couscous course, a fruit course, and mint tea accompanied by Baklava. The staff is able to make the main courses spicy or non-spicy and vegetarian options are available - even if its just one person out of the group. All of the dishes were good and definitely had the punch of flavor I'd expect from Moroccan food. \n\nDrink:\nThis restaurant is BYOB. They seem to sell some drinks from some of the other reviews, however its not well advertised and I was unclear of what they offered so I brought my own. $3 per person uncorking fee. \n\nService:\nThe staff definitely tries to accomodate everyone and provide a great experience. . My biggest complaint is that they don't offer silverware! Save for the couscous, you are forced to eat every dish with your hands or pita bread - no wonder they give you a bath towel instead of a napkin! When we asked for a fork and knife for our chicken, they informed us that they did not have any. 
Marrakesh should at least keep some on hand for those diners that are less adventurous about eating with their hands. \n\nValue:\nYou get a lot of bang for your buck at Marrakesh. For only $35 a person (gratuity included), you leave having tasted a wide range of Moroccan food and feeling stuffed. Absolutely recommend you go here at least once for the experience - but come hungry!",2018-01-18 18:03:14,517 S Leithgow St,...,"Restaurants, Mediterranean, Moroccan",Philadelphia,"{'Monday': '17:30-21:0', 'Tuesday': '17:30-21:0', 'Wednesday': '17:30-21:0', 'Thursday': '17:30-21:0', 'Friday': '17:30-21:0', 'Saturday': '17:30-21:0', 'Sunday': '17:30-21:0'}",1,39.942044,-75.14951,Marrakesh,19147,659,PA
19864,genBlG-tO-92_MAdxCXTWg,PHE_aNFaSywCBYBF4bLTpQ,CFlW9bfl4N63fZpQsZIkRQ,5.0,0,0,0,"Our go to place to stay whenever we go to the city. Staff is friendly, there is great service. The food is really good as well. The location of it is great, and I definitely recommend staying here soon.",2019-08-16 03:28:15,1 Logan Sq,...,"Hotels, Event Planning & Services, Nightlife, Hotels & Travel, Bars, Venues & Event Spaces",Philadelphia,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}",1,39.956932,-75.170271,"The Logan Philadelphia, Curio Collection by Hilton",19103,211,PA
76699,mYVO7CidCIVXww6g8H91vg,rDpysBqhldE0poFgF_DX8w,y1Z9tymuBGVDZnYZoLk-2Q,5.0,0,0,1,"I'm going to keep this shirt and sweet. I had the $22 hamburger, and it's totally worth $23. Off the hook... prepared, assembled and presented with nothing but great taste. Fries match to perfection. Don't hesitate.",2021-10-17 13:37:29,306 Market St,...,"Burgers, American (New), Restaurants, Breakfast & Brunch",Philadelphia,"{'Wednesday': '11:0-20:30', 'Thursday': '11:0-20:30', 'Friday': '11:0-21:30', 'Saturday': '11:0-21:30', 'Sunday': '11:0-20:30'}",1,39.950001,-75.146124,Fork,19106,640,PA
92991,AIM0PhaDoQgXcBbK_8wMGA,udV_Z9K_GN2sl-ivPT2cow,x5GkG0oI_S107wLlgdBxVA,5.0,0,0,0,"Amazing food, and super cool vibe. I came here with a friend who was visiting from out of state, and she loved it! Definitely will be visiting again!",2017-05-24 18:50:00,3602 Chestnut St,...,"Nightlife, Beer Bar, Breakfast & Brunch, Bagels, Restaurants, Bars, Coffee & Tea, Food",Philadelphia,"{'Monday': '7:0-15:0', 'Tuesday': '7:0-15:0', 'Wednesday': '7:0-15:0', 'Thursday': '7:0-15:0', 'Friday': '7:0-15:0', 'Saturday': '7:0-15:0', 'Sunday': '7:0-15:0'}",1,39.954537,-75.194396,Spread Bagelry,19104,203,PA


In [6]:
# The frame is serialized with pickle, so the file gets a .pkl extension
# rather than .csv; read it back with pd.read_pickle, not pd.read_csv.
phil_reviews_df.to_pickle('../data/phil_reviews.pkl')

In [5]:
# List the column names of the filtered frame as a NumPy array.
phil_reviews_df.columns.to_numpy()

array(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date', 'address', 'attributes', 'categories',
       'city', 'hours', 'is_open', 'latitude', 'longitude', 'name',
       'postal_code', 'review_count', 'state'], dtype=object)

In [9]:
# Preprocess the Philadelphia reviews dataset
def preprocess_data(reviews_df, top_n_categories=10):
    """Build a numeric feature matrix and a target vector from merged review data.

    The input frame is not modified; all work happens on a copy.

    Args:
        reviews_df (pd.DataFrame): Must contain the columns 'user_id',
            'business_id', 'review_count', 'useful', 'funny', 'cool',
            'categories' (comma-separated string or NaN) and 'stars'.
        top_n_categories (int, optional): How many of the most frequent
            categories get a 0/1 indicator column. Defaults to 10.

    Returns:
        tuple: ``(features, target)`` where ``features`` is a DataFrame with
        'user_id_enc', 'business_id_enc', the four standardized numeric
        columns and one 'category_<name>' column per selected category, and
        ``target`` is the 'stars' Series.
    """
    from collections import Counter

    # Work on a copy so the caller's frame is left untouched (this also avoids
    # pandas' SettingWithCopyWarning when the input is a slice of another frame).
    df = reviews_df.copy()

    # Map user_id and business_id to unique indexes, numbered by first appearance
    for id_col in ('user_id', 'business_id'):
        mapping = {key: idx for idx, key in enumerate(df[id_col].unique())}
        df[f'{id_col}_enc'] = df[id_col].map(mapping)

    # Normalize numerical features (z-score), treating NaN as 0
    for col in ['review_count', 'useful', 'funny', 'cool']:
        values = df[col].fillna(0)
        std = values.std()
        if pd.isna(std) or std == 0:
            # A constant (or single-row) column has no spread; dividing by its
            # standard deviation would fill the column with NaN.
            df[col] = 0.0
        else:
            df[col] = (values - values.mean()) / std

    # Split the comma-separated categories; empty pieces are dropped so rows
    # without categories do not create a phantom '' category.
    categories_list = df['categories'].fillna('').apply(
        lambda text: [cat.strip() for cat in text.split(',') if cat.strip()]
    )

    # Get the top N unique categories by frequency (ties keep first-seen order)
    category_counts = Counter(cat for cats in categories_list for cat in cats)
    top_categories = sorted(category_counts, key=category_counts.get, reverse=True)[:top_n_categories]

    # One-hot encode top N categories
    for category in top_categories:
        df[f"category_{category}"] = categories_list.apply(
            lambda cats, wanted=category: 1 if wanted in cats else 0
        )

    # Create the final feature set
    category_features = [f"category_{cat}" for cat in top_categories]
    features = ['user_id_enc', 'business_id_enc', 'review_count', 'useful', 'funny', 'cool'] + category_features
    target = 'stars'

    return df[features], df[target]

# Build the model inputs from the filtered reviews, then echo both results.
features, target = preprocess_data(reviews_df=phil_reviews_df)
(features, target)

  reviews_df['categories_list'] = reviews_df['categories'].apply(lambda x: [cat.strip() for cat in x.split(',')])


(       user_id_enc  business_id_enc  review_count    useful     funny  \
 75721            0                0     -0.445125  0.235642 -0.239951   
 80184            1                1      0.225609  1.235441 -0.239951   
 19864            2                2     -0.354486 -0.430890 -0.239951   
 76699            3                3      0.201007 -0.430890 -0.239951   
 92991            4                4     -0.364844 -0.430890 -0.239951   
 ...            ...              ...           ...       ...       ...   
 6265         17620              404     -0.258666 -0.097624 -0.239951   
 54886        59753             5170     -0.340242 -0.430890 -0.239951   
 76820        59754             6195     -0.594034 -0.430890 -0.239951   
 860          25821             2202      0.050804 -0.430890 -0.239951   
 15795        27755             2287     -0.239244 -0.097624 -0.239951   
 
            cool  category_Restaurants  category_Food  category_Nightlife  \
 75721 -0.266116                 

In [10]:
# Convert data to libSVM format
def to_libsvm_format(features, target, filename):
    """Write features and labels to a text file in ``label idx:value ...`` form.

    One line is written per row of ``features``. Feature indices are the
    0-based column positions, and entries whose value equals 0 are omitted.

    Args:
        features (pd.DataFrame): Feature matrix, one row per sample.
        target (pd.Series): Labels, positionally aligned with ``features``.
        filename (str): Path of the file to create or overwrite.

    Raises:
        IndexError: If ``target`` has fewer entries than ``features`` has rows.
    """
    n_rows = features.shape[0]
    # Check up front so a short target cannot leave a half-written file behind.
    if len(target) < n_rows:
        raise IndexError(
            f"target has {len(target)} entries but features has {n_rows} rows"
        )

    # Convert once instead of building a Series per row with .iloc; both routes
    # coerce each row to the frame's common dtype, so the written text matches.
    rows = features.to_numpy().tolist()
    labels = target.to_numpy()

    with open(filename, 'w') as f:
        for label, row in zip(labels, rows):
            # Create `feature_id:value` pairs
            features_str = " ".join(f"{idx}:{value}" for idx, value in enumerate(row) if value != 0)
            f.write(f"{label} {features_str}\n")

# Write the feature matrix and labels to disk as a libSVM-style text file.
to_libsvm_format(features=features, target=target, filename='philadelphia_reviews.libsvm')

In [None]:
from fastFM.als import FMRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

XLearnLibraryNotFound: Cannot find xlearn Library in the candidate path