# Yelp dataset

## Import

In [None]:
import pandas as pd
import os.path
import multiprocessing
import numpy as np
from tqdm import tqdm
from multiprocessing import  Pool
import swifter

## Clean and merge
Load the dataset in chunk to avoid memory overflow, delete not used column, and merge it with info about business activities

In [None]:
def build_yelp_dataset():
    if os.path.exists('yelp dataset/yelp_dataset_merged.csv'):
        return
    df = pd.read_json('yelp dataset/yelp_academic_dataset_business.json', lines=True)
    df = df[df.is_open == 1] # consider only opened business
    df = df.reset_index(drop=True)
    df = df[['business_id', 'latitude', 'longitude']] # keep only this columns

    chunk_list = []  # append each chunk df here 
    # open the big review dataset in little chunk
    for count,chunk in enumerate(pd.read_json('yelp dataset/yelp_academic_dataset_review.json', lines=True, chunksize=500000)):
        chunk = chunk[['user_id', 'business_id', 'stars', 'date']]
        chunk = pd.merge(chunk, df, on='business_id') # merge business dataset and review dataset chunk
        chunk_list.append(chunk)
        print(f'processed {count + 1} chunks')
    df_concat = pd.concat(chunk_list)
    df_concat.to_csv('yelp dataset/yelp_dataset_merged.csv', index = False) # save dataset to CSV file

## Dataset preprocessing
We use the following contextual features: 
- year
- month
- day of the week
- week number
- longitude
- latitude

We extracted further contextual features from the date and location features: 
- season
- isHoliday
- isWeekend.

In [None]:
df = pd.read_csv('yelp dataset/yelp_dataset_merged.csv')
df.rename(columns={'user_id': 'user', 'business_id': 'item'}, inplace=True)

# convert user and item IDs to int64
df.user = pd.factorize(df.user)[0]
df.item = pd.factorize(df.item)[0]
df

In [52]:
# return a season from a date in the format d/m/Y
def season_from_date(date):
    year = str(date.year)
    seasons = {1: pd.date_range(start='21/03/'+year, end='20/06/'+year), # spring
               2: pd.date_range(start='21/06/'+year, end='22/09/'+year), # summer
               3: pd.date_range(start='23/09/'+year, end='20/12/'+year)} # autumn
    if date in seasons[1]:
        return 'spring'
    elif date in seasons[2]:
        return 'summer'
    elif date in seasons[3]:
        return 'autumn'
    else:
        return 'winter'

def season_from_date_fast(date):
    winter = np.arange(1, 4)
    spring = np.arange(4, 7)
    summer = np.arange(7, 10)
    autumn = np.arange(10, 13)
    
season_from_date_fast(df)

[1 2 3]
[4 5 6]
[7 8 9]
[10 11 12]


In [None]:
# CONTEXT FEATURES TO ADD:
# year, month, day of the week, week number --> season, isHoliday, and isWeekend.
df['date'] = pd.to_datetime(df['date']).dt.strftime('%d/%m/%Y') # convert date from object to date type
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y') # convert from Y/m/d to d/m/Y

In [None]:
# tqdm.pandas() 
# df['season'] = df[:1000]['date'].progress_apply(season_from_date) # get season from month and day
#df

In [None]:
df['season'] = df[:10]['date'].swifter.apply(season_from_date) # run apply function in parallel with swifter to get season from date
df

In [17]:
df['weekday'] = df['date'].dt.dayofweek # get day of the week from date
df['weekend'] = (df['weekday'] == 6) | (df['weekday'] == 5) # if is weekend from week day
df['weeknumber'] = df['date'].dt.isocalendar().week # get week number
df.head(10)

Unnamed: 0,user,item,stars,date,latitude,longitude,season,weekday,weekend,weeknumber
0,0,0,2,2015-04-15,36.112896,-115.177637,spring,2,False,16
1,1,0,2,2014-03-14,36.112896,-115.177637,winter,4,False,11
2,2,0,3,2015-10-07,36.112896,-115.177637,autumn,2,False,41
3,3,0,2,2015-11-18,36.112896,-115.177637,autumn,2,False,47
4,4,0,3,2010-10-10,36.112896,-115.177637,autumn,6,True,40
5,5,0,1,2015-08-20,36.112896,-115.177637,summer,3,False,34
6,6,0,5,2012-03-31,36.112896,-115.177637,spring,5,True,13
7,7,0,3,2013-05-09,36.112896,-115.177637,spring,3,False,19
8,8,0,2,2013-05-11,36.112896,-115.177637,spring,5,True,19
9,9,0,4,2010-10-09,36.112896,-115.177637,autumn,5,True,40


In [21]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df

Unnamed: 0,user,item,stars,date,latitude,longitude,season,weekday,weekend,weeknumber,year,month
0,0,0,2,2015-04-15,36.112896,-115.177637,spring,2,False,16,2015,4
1,1,0,2,2014-03-14,36.112896,-115.177637,winter,4,False,11,2014,3
2,2,0,3,2015-10-07,36.112896,-115.177637,autumn,2,False,41,2015,10
3,3,0,2,2015-11-18,36.112896,-115.177637,autumn,2,False,47,2015,11
4,4,0,3,2010-10-10,36.112896,-115.177637,autumn,6,True,40,2010,10
...,...,...,...,...,...,...,...,...,...,...,...,...
6886520,1836147,159135,1,2019-12-09,43.686134,-79.607682,,0,False,50,2019,12
6886521,966056,167634,5,2019-02-13,36.128561,-115.171130,,2,False,7,2019,2
6886522,264157,156531,1,2019-12-04,33.304061,-111.979869,,2,False,49,2019,12
6886523,431787,166910,1,2012-07-31,36.112846,-115.225469,,1,False,31,2012,7


In [24]:
df = df[['user', 'item', 'stars', 'month', 'year', 'weekday', 'weekend', 'weeknumber', 'season', 'latitude', 'longitude']]
df.dtypes

user            int64
item            int64
stars           int64
month           int64
year            int64
weekday         int64
weekend          bool
weeknumber     UInt32
season         object
latitude      float64
longitude     float64
dtype: object

## Encoding

- Categorical features **weekday** and **weekend** are encoded with one hot encoding.

- Cyclical features **month**, **week number** and **season** are encoded into two dimensions using a sine and consine transformation:
$$
x_{sin} = \sin(\frac{2 \cdot \pi \cdot x}{max(x)}) \\
x_{cos} = \cos(\frac{2 \cdot \pi \cdot x}{max(x)})
$$
- Numeric features **year**, **latitude** and **longitude** are normalized


In [26]:
def sin_cos_encoding(df, col, max_val):
    df[col + '_sin'] = np.sin(2 * np.pi * df[col]/max_val)
    df[col + '_cos'] = np.cos(2 * np.pi * df[col]/max_val)
    return df

In [40]:
df = sin_cos_encoding(df, 'month', df.month.max()) # sin cos encoding for months
df = sin_cos_encoding(df, 'weeknumber', df.weeknumber.max()) # sin cos encoding for week number
df

Unnamed: 0,user,item,stars,month,year,weekday,weekend,weeknumber,season,latitude,longitude,month_sin,month_cos,weeknumber_sin,weeknumber_cos
0,0,0,2,4,2015,2,False,16,spring,36.112896,-115.177637,8.660254e-01,-5.000000e-01,0.947326,-0.320270
1,1,0,2,3,2014,4,False,11,winter,36.112896,-115.177637,1.000000e+00,6.123234e-17,0.964636,0.263587
2,2,0,3,10,2015,2,False,41,autumn,36.112896,-115.177637,-8.660254e-01,5.000000e-01,-0.989040,0.147647
3,3,0,2,11,2015,2,False,47,autumn,36.112896,-115.177637,-5.000000e-01,8.660254e-01,-0.652822,0.757511
4,4,0,3,10,2010,6,True,40,autumn,36.112896,-115.177637,-8.660254e-01,5.000000e-01,-0.999561,0.029633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6886520,1836147,159135,1,12,2019,0,False,50,,43.686134,-79.607682,-2.449294e-16,1.000000e+00,-0.348202,0.937420
6886521,966056,167634,5,2,2019,2,False,7,,36.128561,-115.171130,8.660254e-01,5.000000e-01,0.737833,0.674983
6886522,264157,156531,1,12,2019,2,False,49,,33.304061,-111.979869,-2.449294e-16,1.000000e+00,-0.456629,0.889657
6886523,431787,166910,1,7,2012,1,False,31,,36.112846,-115.225469,-5.000000e-01,-8.660254e-01,-0.508531,-0.861044


12