# Yelp dataset

In [None]:
import pandas as pd
import os.path
import numpy as np
from tqdm import tqdm
import holidays

**User:**
- name
- friend list
**Tip:** nothing (not used)

**Review:**
- user id
- business id
- stars
- date

**Checkin:** nothing (not used)

**Business:**
- categories
- attributes

## Merge datasets

In [86]:
def merge_datasets():
    """Create the merged business/review CSV unless it already exists.

    Loads the Yelp business file, keeps the rows whose city is 'Toronto' and
    whose categories match 'Restaurant', then reads the review file in chunks
    of 500000 lines, joins every chunk with the kept businesses on
    ``business_id`` and writes the concatenation of all joined chunks to
    ``yelp_dataset_merged.csv``.

    Returns:
        None. The result is only written to disk; when the merged file is
        already present a message is printed and nothing is recomputed.
    """
    merged_csv = 'Datasets/yelp dataset/yelp_dataset_merged.csv'
    if os.path.exists(merged_csv):
        print("merged dataset already exists, skipping merge...")
        return

    businesses = pd.read_json(
        'Datasets/yelp dataset/yelp_academic_dataset_business.json', lines=True
    )
    businesses = businesses[['business_id', 'city', 'categories', 'attributes']]
    # Keep only Toronto, the city with the most ratings.
    businesses = businesses[businesses['city'] == 'Toronto']
    # Keep only restaurants; rows without a categories string never match.
    is_restaurant = businesses['categories'].str.contains('Restaurant.*', na=False)
    businesses = businesses[is_restaurant].reset_index(drop=True)

    # The review file is large, so it is streamed chunk by chunk and each
    # chunk is reduced to the reviews of the kept businesses right away.
    review_chunks = pd.read_json(
        'Datasets/yelp dataset/yelp_academic_dataset_review.json',
        lines=True,
        chunksize=500000,
    )
    merged_chunks = [
        pd.merge(
            chunk[['user_id', 'business_id', 'stars', 'date']],
            businesses,
            on='business_id',
        )
        for chunk in tqdm(review_chunks)
    ]

    # Persist the merged dataset as CSV.
    pd.concat(merged_chunks).to_csv(merged_csv, index=False)
    
# Build the merged CSV if it is missing, then load it into memory.
merge_datasets()
df = pd.read_csv(filepath_or_buffer='Datasets/yelp dataset/yelp_dataset_merged.csv')

merged dataset already exists, skipping merge...


In [87]:
# Use short column names for the rest of the notebook.
df = df.rename(columns={'user_id': 'user', 'business_id': 'item', 'stars': 'rating'})

# Keep only the reviews written by users with more than 20 reviews.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignments below cannot raise SettingWithCopyWarning.
# Disabled alternative (more than 10 reviews per user AND per item):
# df = df[(df.groupby('user')['user'].transform('size') > 10) & (df.groupby('item')['item'].transform('size') > 10)]
df = df[df.groupby('user')['user'].transform('size') > 20].copy()

# Make the rating binary: 1 when stars > 3, otherwise 0 (vectorized).
df['rating'] = (df['rating'] > 3).astype('int64')

# Disabled alternative (sample 40000 rows per rating class):
# df = df.groupby('rating').apply(lambda x: x.sample(40000))
df = df.reset_index(drop=True)

print(f'row: {len(df)} \t user: {df.user.nunique()} \t item:{df.item.nunique()}')

# Replace the string ids with consecutive integer codes starting from 0.
df['user'] = pd.factorize(df['user'])[0]
df['item'] = pd.factorize(df['item'])[0]

print(f' rating: {df.rating.value_counts()}')

row: 174468 	 user: 3309 	 item:8362
 rating: 1    105110
0     69358
Name: rating, dtype: int64


## Extract new features

In [88]:
def season_from_date(date):
    """Return the season index (1 to 4) for the month of *date*.

    Months are grouped in consecutive blocks of three starting in January:
    Jan-Mar -> 1 (winter), Apr-Jun -> 2 (spring), Jul-Sep -> 3 (summer),
    Oct-Dec -> 4 (autumn).

    NOTE(review): these blocks are calendar quarters, so December is labelled
    autumn and March winter. Confirm this approximation is the intended one.

    Args:
        date: Any object exposing an integer ``month`` attribute in 1..12
            (e.g. ``datetime.date`` or ``pandas.Timestamp``).

    Returns:
        int: The season index, from 1 to 4.
    """
    # Integer division maps months 1-3 to 0, 4-6 to 1, 7-9 to 2, 10-12 to 3.
    return (date.month - 1) // 3 + 1

# Shared Canadian holiday calendar, created on first use.
_CANADA_HOLIDAYS = None


def _canada_holidays():
    """Return the shared ``holidays.Canada()`` calendar, building it once."""
    global _CANADA_HOLIDAYS
    if _CANADA_HOLIDAYS is None:
        _CANADA_HOLIDAYS = holidays.Canada()
    return _CANADA_HOLIDAYS


def holiday_from_date(date):
    """Tell whether *date* is a holiday in the ``holidays.Canada()`` calendar.

    The calendar object is built once and reused, because this function is
    applied to every row of the dataset and rebuilding the calendar on each
    call is needlessly slow.

    NOTE(review): which provincial holidays are included depends on the
    defaults of the installed ``holidays`` version. Confirm they match
    Toronto/Ontario if that matters.

    Args:
        date: The date to look up (e.g. a ``pandas.Timestamp``).

    Returns:
        bool: True when *date* is contained in the calendar.
    """
    return date in _canada_holidays()

In [89]:
# Parse the date strings so the calendar features can be derived from them.
df['date'] = pd.to_datetime(df['date'])

# Season index computed per review date by season_from_date.
df['season'] = df['date'].map(season_from_date)

# Day of the week, 0 to 6.
df['weekday'] = df['date'].dt.dayofweek

# Weekend flag: true when the weekday value is 5 or 6.
df['weekend'] = df['weekday'].isin([5, 6])

# Holiday flag computed per review date by holiday_from_date (Canada).
df['holiday'] = df['date'].map(holiday_from_date)

# The raw date and the business metadata are not needed any more.
df = df.drop(['date', 'city', 'categories', 'attributes'], axis=1)

## Encoding

In [90]:
# Columns holding categorical context features.
one_hot = 'season weekday weekend holiday'.split()

# Convert the categorical columns to one-hot encoding in a single call.
# The untouched columns stay first and the indicator columns follow in the
# order of `one_hot`, each named '<column>_<value>'.
df = pd.get_dummies(df, columns=one_hot, prefix=one_hot)

In [91]:
# Save the complete table, context features included.
df.to_csv('Datasets/yelp dataset/yelp_final.csv', index=False)

# For matrix factorization keep a single row per (user, item) pair (the
# first one encountered) and only the user, item and rating columns.
df = (
    df.drop_duplicates(subset=['user', 'item'])[['user', 'item', 'rating']]
    .reset_index(drop=True)
)
df.to_csv('Datasets/yelp dataset/yelp_matrix_factorization.csv', index=False)