# Yelp dataset

In [None]:
import ast
import os.path

import holidays
import numpy as np
import pandas as pd
from tqdm import tqdm

**User:**
- name
- friend list

**Tip:** nothing (not used)

**Review:**
- user id
- business id
- stars
- date

**Checkin:** nothing (not used)

**Business:**
- categories
- attributes

## Merge datasets

In [None]:
def merge_datasets():
    """Build the merged Toronto-restaurant review CSV unless it already exists.

    Reads the Yelp business file, keeps the Toronto businesses whose
    categories match ``Restaurant.*``, inner-joins them on ``business_id``
    with the review file (read in chunks of 500000 lines) and writes the
    result to ``Datasets/yelp dataset/yelp_dataset_merged.csv``.

    Returns:
        None. The outputs are the CSV file, or a console message when the
        file is already present and the merge is skipped.
    """
    merged_path = 'Datasets/yelp dataset/yelp_dataset_merged.csv'
    if os.path.exists(merged_path):
        print("merged dataset already exists, skipping merge...")
        return

    businesses = pd.read_json('Datasets/yelp dataset/yelp_academic_dataset_business.json', lines=True)
    businesses = businesses[['business_id', 'city', 'categories', 'attributes']]
    # Only Toronto is kept (per the original author's note: the city with the most ratings).
    toronto = businesses[businesses.city == 'Toronto']
    # `== True` turns the NaN produced for missing categories into False.
    is_restaurant = toronto['categories'].str.contains('Restaurant.*') == True  # noqa: E712
    restaurants = toronto[is_restaurant].reset_index(drop=True)

    # The review file is read in chunks; each chunk is reduced to the needed
    # columns and joined with the restaurant table before being collected.
    review_chunks = pd.read_json(
        'Datasets/yelp dataset/yelp_academic_dataset_review.json', lines=True, chunksize=500000
    )
    merged_chunks = [
        pd.merge(chunk[['user_id', 'business_id', 'stars', 'date']], restaurants, on='business_id')
        for chunk in tqdm(review_chunks)
    ]
    pd.concat(merged_chunks).to_csv(merged_path, index=False)
    
# Create the merged CSV on the first run (the call returns early when the file
# already exists), then load it as the working frame for the rest of the notebook.
merge_datasets()
df = pd.read_csv('Datasets/yelp dataset/yelp_dataset_merged.csv')

In [None]:
# Switch to the generic recommender column names.
df = df.rename(columns={'user_id': 'user', 'business_id': 'item', 'stars': 'rating'})

# Keep only the reviews of users that wrote more than 20 of them.
df = df[df.groupby('user')['user'].transform('size') > 20]

# Binary target: 1 when the star rating is above 3, 0 otherwise.
df['rating'] = (df['rating'] > 3).astype('int64')

# Every remaining row is a Toronto business, so the city column carries no
# information; rows with any missing value are discarded as well.
df = df.drop(columns='city').dropna().reset_index(drop=True)

print(f'row: {len(df)} \t user: {df.user.nunique()} \t item:{df.item.nunique()}')

# Replace the Yelp string ids with consecutive integer codes starting at 0.
df['user'] = pd.factorize(df['user'])[0]
df['item'] = pd.factorize(df['item'])[0]


print(f' rating: {df.rating.value_counts()}')

## Context features

In [None]:
def season_from_date(date):
    """Map a date to a season code in 1..4 based on its calendar quarter.

    The month is bucketed in groups of three: January-March -> 1,
    April-June -> 2, July-September -> 3, October-December -> 4. The
    notebook labels these 1 = winter, 2 = spring, 3 = summer, 4 = autumn;
    note that the buckets are calendar quarters, so December maps to 4.

    Args:
        date: Any object exposing a ``month`` attribute in 1..12
            (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).

    Returns:
        int: The season code, from 1 to 4.
    """
    # Integer division replaces building and searching a 4x3 month table on
    # every call; the mapping is unchanged.
    return (date.month - 1) // 3 + 1

def holiday_from_date(date):
    """Return whether *date* is a holiday in the ``holidays.Canada()`` calendar.

    The calendar object is created on the first call and stored on the
    function, so applying this function to a whole column does not rebuild
    the calendar for every row.

    Args:
        date: Value that can be looked up with ``in`` against a ``holidays``
            calendar (the notebook passes pandas timestamps).

    Returns:
        The result of ``date in calendar``.
    """
    calendar = getattr(holiday_from_date, '_calendar', None)
    if calendar is None:
        # Built with the library defaults, exactly as the per-call version did.
        calendar = holidays.Canada()
        holiday_from_date._calendar = calendar
    return date in calendar

In [None]:
# Parse the review timestamps so the calendar-based features can be derived.
df['date'] = pd.to_datetime(df['date'])

df['season'] = df['date'].apply(season_from_date)    # season code from the month
df['weekday'] = df['date'].dt.weekday                # 0 to 6
df['weekend'] = df['weekday'].isin([5, 6])           # weekday codes 5 and 6 count as weekend
df['holiday'] = df['date'].apply(holiday_from_date)  # holiday in the Canadian calendar

# The raw date is no longer needed once the features above exist.
df = df.drop(columns=['date'])

# Names of the context columns, reused by later cells.
context = ['season', 'weekday', 'weekend', 'holiday']

## Item Features

In [None]:
# The CSV round-trip stored every attributes dict as its string repr.
# ast.literal_eval parses Python literals only, so unlike eval() it cannot
# execute code embedded in the data file.
df["attributes"] = df["attributes"].apply(lambda x: dict(ast.literal_eval(x)))  # convert to dict

# Expand the dictionaries into one column per attribute. json_normalize
# numbers its rows 0..n-1, so the result is re-labelled with df's own index
# to make the join line up row by row whatever that index looks like.
df = df.join(
    pd.json_normalize(df["attributes"].tolist()).set_index(df.index)
).drop('attributes', axis=1)

In [None]:
# Disabled debugging aid: the string below is never executed. Paste its
# contents into a cell to print the value counts of every column.
"""
for c in df.columns:
    print('-'*10 + c + '-'*10)
    print(df[c].value_counts())
"""

In [None]:
# Item attributes kept from the expanded Yelp "attributes" dictionaries.
attributes = [
    'Caters', 'RestaurantsAttire', 'RestaurantsPriceRange2', 'HasTV', 'NoiseLevel',
    'RestaurantsDelivery', 'RestaurantsReservations', 'GoodForKids', 'RestaurantsTakeOut',
    'Alcohol', 'OutdoorSeating', 'RestaurantsGoodForGroups', 'GoodForMeal', 'Ambience',
]

# Restrict the frame to ids, target, categories, context and the chosen attributes.
df = df[['user', 'item', 'rating', 'categories', *context, *attributes]]

# Cells holding the literal string 'None' count as missing; rows with any
# missing value are then removed.
df = df.replace('None', np.nan).dropna()

In [None]:
for col in 'GoodForMeal Ambience'.split():  # these attributes are themselves stringified dicts
    # literal_eval parses Python literals only, so data-file text is never executed.
    nested = [dict(ast.literal_eval(x)) for x in df[col]]
    # json_normalize numbers its rows 0..n-1, while df's index has gaps after
    # the preceding dropna(). Re-label the expanded frame with df's index so
    # the join attaches each row's values to the correct review.
    df = df.drop(columns=col).join(pd.json_normalize(nested).set_index(df.index))

In [None]:
# The literal string 'None' and missing values are both replaced with False.
df = df.replace('None', False).fillna(False)
df = df.dropna()  # nothing is left to drop after fillna; kept as a safety net
df

In [205]:
# Display the column layout after the nested attributes have been expanded.
df.columns

Index(['user', 'item', 'rating', 'categories', 'season', 'weekday', 'weekend',
       'holiday', 'Caters', 'RestaurantsAttire', 'RestaurantsPriceRange2',
       'HasTV', 'NoiseLevel', 'RestaurantsDelivery', 'RestaurantsReservations',
       'GoodForKids', 'RestaurantsTakeOut', 'Alcohol', 'OutdoorSeating',
       'RestaurantsGoodForGroups', 'dessert', 'latenight', 'lunch', 'dinner',
       'brunch', 'breakfast', 'romantic', 'intimate', 'classy', 'hipster',
       'divey', 'touristy', 'trendy', 'upscale', 'casual'],
      dtype='object')

In [221]:
def get_user_feature_by_rating(df, column):
    """Find, for every user, the value of *column* with the highest rating sum.

    Ratings are summed per (user, value of *column*) pair and the value with
    the largest sum is kept for each user. When several values tie, the one
    that sorts last among them is kept.

    User ids are treated as labels, not as array positions, so they do not
    need to be contiguous or start at 0. The table is built with a single
    groupby instead of one scan of the frame per user.

    Args:
        df: DataFrame with at least the columns ``user``, *column* and
            ``rating``.
        column: Name of the item attribute to summarise per user.

    Returns:
        DataFrame with one row per user, ordered by user id, and the columns
        ``user`` and ``'user_' + column``. Users whose values of *column* are
        all missing do not appear, because groupby drops missing keys.
    """
    totals = (
        df.groupby(['user', column])['rating']
        .sum()
        .reset_index()
        # A stable sort keeps the (user, value) order among equal sums, which
        # makes the tie-break deterministic.
        .sort_values('rating', kind='mergesort')
    )
    favourites = (
        totals.drop_duplicates(subset='user', keep='last')  # last row = highest sum
        .sort_values('user', kind='mergesort')
        .reset_index(drop=True)
    )
    return favourites[['user', column]].rename(columns={column: 'user_' + column})

# Attach one "favourite value" column per user for each of these item
# attributes, in this order (later cells rely on them being the last six columns).
for feature in (
    'RestaurantsPriceRange2',
    'Alcohol',
    'RestaurantsDelivery',
    'RestaurantsReservations',
    'GoodForKids',
    'RestaurantsGoodForGroups',
):
    df = df.merge(get_user_feature_by_rating(df, feature), on=['user'])

In [226]:
# Positional split of the columns: everything from position 8 up to the last
# six columns is an item attribute; the last six are the user features merged
# in the previous cell.
attributes = list(df.columns[8:-6])
user_features = list(df.columns[-6:])

['user_RestaurantsPriceRange2',
 'user_Alcohol',
 'user_RestaurantsDelivery',
 'user_RestaurantsReservations',
 'user_GoodForKids',
 'user_RestaurantsGoodForGroups']

## Encoding

In [227]:
# convert categorical data to one-hot encoding
for col in context + attributes + user_features:
  df = pd.get_dummies(df, columns=[col], prefix = [col])

## Categories

In [228]:
# One indicator column per category. Whitespace around the commas is removed
# before splitting: splitting "A, B" on ',' alone yields 'A' and ' B', so the
# same category would get two different columns depending on its position.
# NOTE(review): the Yelp business file appears to separate categories with
# ', ' - confirm against the raw data. The normalisation is harmless if not.
df_categories = (
    df['categories']
    .str.replace(r'\s*,\s*', ',', regex=True)
    .str.strip()
    .str.get_dummies(',')
)
df = pd.concat([df, df_categories], axis=1)
df = df.dropna()
df = df.drop(columns=['categories'])

In [229]:
# Renumber rows, users and items so all ids are consecutive again.
df = df.reset_index(drop=True)
df['user'] = pd.factorize(df['user'])[0]
df['item'] = pd.factorize(df['item'])[0]

# Full feature table, written before the duplicates are removed.
df.to_csv('Datasets/yelp dataset/yelp_final.csv', index=False)

# Matrix factorization needs a single rating per (user, item) pair and only
# the three id/target columns.
df = (
    df.drop_duplicates(subset=['user', 'item'])[['user', 'item', 'rating']]
    .reset_index(drop=True)
)
df.to_csv('Datasets/yelp dataset/yelp_matrix_factorization.csv', index=False)