In [2]:
import tarfile
import pandas as pd
import json
import matplotlib.pyplot as plt


In [2]:
# The review file is line-delimited JSON: decode each line into its own dict
with open("../data/yelp_dataset/yelp_academic_dataset_review.json", "r", encoding="utf-8") as file:
    data = list(map(json.loads, file))

# Build a DataFrame with one row per decoded review record
df = pd.DataFrame(data)

df

In [3]:
# Display the reviews DataFrame (the notebook renders a truncated view)
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
6990275,H0RIamZu0B0Ei0P4aeh3sQ,qskILQ3k0I_qcCMI-k6_QQ,jals67o91gcrD4DC81Vk6w,5.0,1,2,1,Latest addition to services from ICCU is Apple...,2014-12-17 21:45:20
6990276,shTPgbgdwTHSuU67mGCmZQ,Zo0th2m8Ez4gLSbHftiQvg,2vLksaMmSEcGbjI5gywpZA,5.0,2,1,2,"This spot offers a great, affordable east week...",2021-03-31 16:55:10
6990277,YNfNhgZlaaCO5Q_YJR4rEw,mm6E4FbCMwJmb7kPDZ5v2Q,R1khUUxidqfaJmcpmGd4aw,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
6990278,i-I4ZOhoX70Nw5H0FwrQUA,YwAMC-jvZ1fvEUum6QkEkw,Rr9kKArrMhSLVE9a53q-aA,5.0,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [4]:

# Data preprocessing function. Takes the original reviews dataset, the minimum number of reviews per user (n), and the business category to keep (categories) as arguments.
def prepping_data(df, n, categories, data_dir="../data"):
    """Build and save the California review dataset for one business category.

    Steps:
      1. Keep reviews written by users with at least ``n`` reviews in ``df``.
      2. Load the Yelp business file, keep businesses whose ``state`` is 'CA'
         and whose ``categories`` string matches ``categories``.
      3. Inner-join those businesses with the kept reviews on ``business_id``.
      4. Drop users left with fewer than ``n`` reviews after the join.
      5. Write the result to ``{data_dir}/prepped_data_n_{n}_r.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Review records; must contain ``user_id`` and ``business_id`` columns.
    n : int
        Minimum number of reviews per user (applied before and after the
        location/category filter).
    categories : str
        Pattern looked up in the business ``categories`` column with
        ``Series.str.contains``, which interprets it as a regular expression
        by default. Businesses with a missing ``categories`` value never match.
    data_dir : str or path-like, default "../data"
        Directory that contains ``yelp_dataset/yelp_academic_dataset_business.json``
        and that receives the output CSV. The CSV is written here (rather than
        to the working directory) because the rest of the notebook reads
        ``prepped_data_n_{n}_r.csv`` from ``../data``.

    Returns
    -------
    None
        The result is only written to disk. The file name always ends in
        ``_r`` regardless of ``categories``.
    """
    # Users with at least n reviews in the full dataset
    reviews_per_user = df.groupby('user_id').size()
    users_n_plus = reviews_per_user[reviews_per_user >= n].index

    # Keep only the reviews written by those users
    filtered_reviews = df[df['user_id'].isin(users_n_plus)]

    # The business file (line-delimited JSON) supplies location and categories
    with open(f"{data_dir}/yelp_dataset/yelp_academic_dataset_business.json", "r", encoding="utf-8") as file:
        data_business = [json.loads(line) for line in file]

    # Drop the business-level star rating: the reviews already carry a
    # per-review 'stars' column and the two would collide in the merge
    df_business = pd.DataFrame(data_business).drop(columns='stars')

    # Filter businesses BEFORE merging. An inner merge keeps the left rows in
    # order, so this yields the same rows as filtering the merged frame while
    # avoiding a join against every business in the dataset.
    in_california = df_business['state'] == 'CA'
    in_category = df_business['categories'].str.contains(categories, na=False)
    df_business_selected = df_business[in_california & in_category]

    # Attach business details to each remaining review
    df_resturaunts_reviews = pd.merge(df_business_selected, filtered_reviews, how='inner', on='business_id')

    # Re-apply the threshold: a user may have n+ reviews overall but fewer
    # than n once only CA businesses of the requested category remain
    post_counts = df_resturaunts_reviews.groupby('user_id').size()
    users_below_threshold = post_counts[post_counts < n].index
    df_resturaunts_reviews_filtered = df_resturaunts_reviews[
        ~df_resturaunts_reviews['user_id'].isin(users_below_threshold)
    ]

    # Write to csv file. Note--file was too big to upload to github
    df_resturaunts_reviews_filtered.to_csv(f'{data_dir}/prepped_data_n_{n}_r.csv', index=False)

    return

In [5]:
# Create a prepped dataset from the original reviews dataset, with a minimum of 5 reviews per user, for restaurants only

prepping_data(df, 5, 'Restaurants')


## Note: prepped_data_n_5_r.csv was too big to upload to GitHub, so users will have to generate this file themselves

In [8]:
# Load the prepped reviews CSV for n = 5 from ../data
df_resturaunts_reviews = pd.read_csv('../data/prepped_data_n_5_r.csv')

# Optional check: number of distinct users in the prepped data
#print(df_resturaunts_reviews['user_id'].nunique())

# Preview the first five rows
df_resturaunts_reviews.head(5)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,review_count,is_open,...,categories,hours,review_id,user_id,stars,useful,funny,cool,text,date
0,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,CA,93101,34.414445,-119.690672,389,1,...,"Food, Restaurants, Salad, Coffee & Tea, Breakf...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-14:0', '...",_OmwsZRl7Qrz7S2T8mnlRg,OVLf6NVTi7noMP1qCKr76w,5.0,0,1,1,Are you kidding me? This is without a doubt so...,2018-01-09 20:44:47
1,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,CA,93101,34.414445,-119.690672,389,1,...,"Food, Restaurants, Salad, Coffee & Tea, Breakf...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-14:0', '...",xinTYNBQflqAoKYBAztv2A,2ykh0_q0uMLFtypLDVSqlg,2.0,3,0,0,"Super slow service, ""salads"" are not very good...",2017-02-09 19:09:31
2,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,CA,93101,34.414445,-119.690672,389,1,...,"Food, Restaurants, Salad, Coffee & Tea, Breakf...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-14:0', '...",53exw7KNmp1bBpj40fBtmQ,cjpWGe6u6Wg8X7JmfGLHVw,2.0,0,0,0,I wanted wood-fired eggs from sister venue Luc...,2018-03-24 19:35:06
3,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,CA,93101,34.414445,-119.690672,389,1,...,"Food, Restaurants, Salad, Coffee & Tea, Breakf...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-14:0', '...",NN2WGLjM1o5EP8wtGtDtxg,HujQufkU2Y9lpZmeGkk5AQ,5.0,1,0,0,I've had a wide sampling of what HAB has to of...,2016-09-20 17:11:29
4,IDtLPgUrqorrpqSLdfMhZQ,Helena Avenue Bakery,"131 Anacapa St, Ste C",Santa Barbara,CA,93101,34.414445,-119.690672,389,1,...,"Food, Restaurants, Salad, Coffee & Tea, Breakf...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-14:0', '...",mMXZC5EGbEiO33hXx5tHgg,cjpWGe6u6Wg8X7JmfGLHVw,2.0,1,0,0,"I want to like the Helena Bakery, but they don...",2017-05-29 15:47:31


In [8]:
#creating function to generate user item matrix

def get_user_item_matrix(n, data_dir="../data"):
    """Build, save and return the user-item rating matrix for threshold ``n``.

    Reads ``{data_dir}/prepped_data_n_{n}_r.csv``, collapses repeated reviews
    of the same business by the same user into their mean star rating, drops
    users with fewer than ``n`` distinct rated businesses, and pivots the
    result so rows are users and columns are businesses.

    Parameters
    ----------
    n : int
        Minimum number of distinct businesses a user must have rated. Also
        selects which prepped CSV is read.
    data_dir : str or path-like, default "../data"
        Directory holding the prepped CSV; the matrix is written to the same
        directory as ``UI_matrix_n_{n}_r.csv`` (the notebook reloads it from
        ``../data``).

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by ``user_id`` with one column per ``business_id``;
        cells hold the (mean) star rating, NaN where the user has no review.
    """
    # Only three columns are needed; skipping the rest avoids loading review text
    df_restaurants = pd.read_csv(
        f'{data_dir}/prepped_data_n_{n}_r.csv',
        usecols=['business_id', 'user_id', 'stars'],
    )

    # HANDLING DUPLICATES: average the ratings of users who reviewed the same
    # business more than once. Pairs that occur once keep their single rating
    # (the mean of one value), so one groupby covers both cases.
    final_df = df_restaurants.groupby(['user_id', 'business_id'], as_index=False)['stars'].mean()

    # Count distinct rated businesses per user after de-duplication
    post_counts = final_df.groupby('user_id').size()

    # Users who fall below n once duplicates have been collapsed
    users_below_threshold = post_counts[post_counts < n].index

    # Keep only users with n or more rated businesses
    final_df_filtered = final_df[~final_df['user_id'].isin(users_below_threshold)]

    # Pivot to the UI matrix: rows are users, columns are businesses
    user_item_matrix = final_df_filtered.pivot(index='user_id', columns='business_id', values='stars')

    # Save next to the prepped data so later cells can reload it
    user_item_matrix.to_csv(f'{data_dir}/UI_matrix_n_{n}_r.csv')

    return user_item_matrix


# Build the user-item matrix for n = 5 (the function also saves it as a CSV)
UI_matrix_n5 = get_user_item_matrix(5)

# Confirming each user left 5+ reviews in UI matrix

In [6]:
# Reload the saved matrix. No index_col is given, so 'user_id' comes back as
# an ordinary column rather than as the index.
UI_matrix_n5 = pd.read_csv('../data/UI_matrix_n_5_r.csv')


# Count the number of non-NaN interactions/items for each user. The 'user_id'
# column is excluded first: it is never NaN, so counting it would inflate
# every user's total by one and weaken the check below.
interactions_per_user = UI_matrix_n5.drop(columns='user_id').count(axis=1)

# Check how many users have fewer than 5 interactions (expected: 0)
users_below_threshold = interactions_per_user[interactions_per_user < 5].count()

print(users_below_threshold)

0


In [11]:
# Display the user-item matrix as reloaded from CSV ('user_id' is a regular column here)
UI_matrix_n5

Unnamed: 0,user_id,--onnLZrsCazmcy2P_7fcw,-3AooxIkg38UyUdlz5oXdw,-8iATYRnN46Km0_-ldx6cg,-9r8nAzWyRSLxBWt8uQOdA,-ALqLSTzkGDMscHdxA1NgA,-BdYhP-12elmFV7oB1iv4A,-FM4CxOg4XXmX_Ebky_SiQ,-FRHYI6doGCjIsXiKHkUqw,-Fka99c-tJ-epWYNIobqyQ,...,zVxDd79gdZcfz-O14QBCrA,zWwGjQlUEiLS7hCXDkBGzg,zYTQOpgMSMaFNkcCAvJR3Q,zZrDoiQIUmiVkifJx0h_KA,zbrIMldF_O1ZQ0vpUaaa8A,zeAfmYy9b1gfUJLRBHm6vQ,ziXR7sUrbKRCNeDqfjibUg,zpuFEeAhrNzXPkOlBf5Kog,zu4p6IZLSVn2Noto-vcwzw,zxW8zECvT_SqejieMMjb5A
0,-0-TtVhV4PIUoDpUCOC0uQ,,,,,,,,,,...,,,,,,,,,,
1,-0EcgtUXe1rzrkmdih_tYg,,,,,,,,,,...,,,,,,,,,,
2,-1-ECBsGpG4Iw5s-ecnfqw,,,,,,,5.0,,,...,,,,,,,,,,
3,-14MA777BbjUQLw0zndvfA,,,,,,,,,,...,,,,,,,,,,
4,-1WbN1Qd-opw8u3uEqs2Kg,,,,,,,,,4.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6504,zuPjUmHmYz1TIiamDjgJgg,,,,,,,,,,...,,,,,,,,,,
6505,zv6CI7HvLGRHgu6yqX-Asw,,,,,,,,,,...,,,,,,,,,,
6506,zvs6DYpACuh0cpsPDO48dw,,,,,,,,,,...,,,,,,,,,,
6507,zxAM0cdIbZZYN-6gVQuntQ,,,,,,,,,,...,,,,,,,,,,
