# Yelp dataset

## Import

In [None]:
import pandas as pd
import os.path
import numpy as np
from tqdm import tqdm
import swifter
from sklearn.preprocessing import MinMaxScaler
import holidays

## Clean and merge
Load the review dataset in chunks to avoid running out of memory, drop the unused columns, and merge each chunk with the location information of the businesses.

In [None]:
def build_yelp_dataset():
    """Build the merged review/business CSV unless it already exists.

    Reads the Yelp business file and keeps the id and coordinates of the
    businesses whose ``is_open`` flag is 1. The review file is then read in
    chunks of 500000 rows; each chunk is reduced to the user, business, stars
    and date columns and merged with the business table on ``business_id``.
    The concatenated chunks are written to
    ``yelp dataset/yelp_dataset_merged.csv``.

    Returns:
        None. The result is written to disk; progress is printed per chunk.
    """
    merged_path = 'yelp dataset/yelp_dataset_merged.csv'
    if os.path.exists(merged_path):
        return  # already built on a previous run

    businesses = pd.read_json('yelp dataset/yelp_academic_dataset_business.json', lines=True)
    still_open = businesses.is_open == 1  # consider only open businesses
    businesses = businesses[still_open].reset_index(drop=True)
    businesses = businesses[['business_id', 'latitude', 'longitude']]

    # Stream the large review file chunk by chunk instead of loading it at once.
    review_reader = pd.read_json('yelp dataset/yelp_academic_dataset_review.json',
                                 lines=True, chunksize=500000)
    merged_chunks = []
    for chunk_number, reviews in enumerate(review_reader, start=1):
        reviews = reviews[['user_id', 'business_id', 'stars', 'date']]
        # Attach the business coordinates to every review of this chunk.
        merged_chunks.append(pd.merge(reviews, businesses, on='business_id'))
        print(f'processed {chunk_number} chunks')

    pd.concat(merged_chunks).to_csv(merged_path, index=False)

## Dataset preprocessing
We use the following contextual features: 
- year
- month
- day of the week
- week number
- longitude
- latitude

We extracted further contextual features from the date and location features: 
- season
- isHoliday
- isWeekend.

In [None]:
# Load the merged review/business table written by build_yelp_dataset().
df = pd.read_csv('yelp dataset/yelp_dataset_merged.csv')
df = df.rename(columns={'user_id': 'user', 'business_id': 'item'})

# Replace the user and item IDs with integer codes; only the codes returned
# by factorize are kept, the original ID values are discarded.
df['user'] = pd.factorize(df['user'])[0]
df['item'] = pd.factorize(df['item'])[0]
df

In [92]:
def season_from_date(date):
    """Return the name of the season a date falls in.

    Only ``date.month`` and ``date.day`` are compared, so the time of day is
    ignored: a timestamp such as 2019-04-15 05:21:16 is classified by its
    calendar day instead of falling through to ``'winter'``.

    Args:
        date: Object exposing ``month`` and ``day`` attributes, e.g.
            ``datetime.date``, ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns:
        str: ``'spring'`` (21 Mar - 20 Jun), ``'summer'`` (21 Jun - 22 Sep),
        ``'autumn'`` (23 Sep - 20 Dec) or ``'winter'`` (any other day).
    """
    # (month, day) tuples compare lexicographically, i.e. in calendar order,
    # so no per-call date ranges or date-string parsing are needed.
    month_day = (date.month, date.day)
    if (3, 21) <= month_day <= (6, 20):
        return 'spring'
    if (6, 21) <= month_day <= (9, 22):
        return 'summer'
    if (9, 23) <= month_day <= (12, 20):
        return 'autumn'
    return 'winter'

# Faster version that ignores the day of the month: January, February and
# March count as winter, April to June as spring, and so on.
def season_from_date_fast(date):
    """Return the season code of a date, based on its month only.

    Args:
        date: Object exposing a ``month`` attribute in the range 1-12.

    Returns:
        int: 1 = winter (Jan-Mar), 2 = spring (Apr-Jun),
        3 = summer (Jul-Sep), 4 = autumn (Oct-Dec).
    """
    # Months are grouped three at a time; integer division yields the group
    # index directly, without building and searching an array on every call.
    return (date.month - 1) // 3 + 1

# US holiday calendar shared by every holiday_from_date call; created on first use.
_US_HOLIDAYS = None


def holiday_from_date(date):
    """Return whether a date is a United States holiday.

    The calendar is created once and reused, instead of building a new
    ``holidays.UnitedStates()`` object for every date that is checked.

    Args:
        date: Date to look up; it is tested with ``in`` against the calendar.

    Returns:
        bool: True when ``date`` is in the US holiday calendar.
    """
    global _US_HOLIDAYS
    if _US_HOLIDAYS is None:
        _US_HOLIDAYS = holidays.UnitedStates()
    return date in _US_HOLIDAYS

In [None]:
# Parse the 'date' column into pandas datetimes so its date parts can be read.
df['date'] = pd.to_datetime(df.date)
# Compute the month-based season code of every review, row by row through swifter.
df['season'] = df.date.swifter.apply(season_from_date_fast)

In [None]:
# Day of the week as an integer code (pandas convention: Monday = 0 ... Sunday = 6).
df['weekday'] = df['date'].dt.dayofweek
# Saturday (5) and Sunday (6) are the only day codes >= 5.
df['weekend'] = df['weekday'] >= 5
# ISO calendar week number of the review date.
df['weeknumber'] = df['date'].dt.isocalendar().week

In [93]:
# US holiday calendar; the reviews are assumed to all come from the US.
# NOTE(review): the sample output contains coordinates around (43.69, -79.61),
# which do not look like a US location -- confirm the US-only assumption.
# holiday_from_date does not read this variable.
us_holidays = holidays.UnitedStates()
# Flag every review whose date is a US holiday (row-wise lookup through swifter).
holiday_flags = df['date'].swifter.apply(holiday_from_date)
df['holiday'] = holiday_flags

Dask Apply:   0%|          | 0/8 [00:00<?, ?it/s]

In [102]:
# Keep only the columns used downstream, in their final order; the raw 'date'
# and the intermediate 'weekday' columns are dropped here.
feature_columns = ['user', 'item', 'stars', 'weekend', 'holiday',
                   'weeknumber', 'season', 'latitude', 'longitude']
df = df[feature_columns]
df

Unnamed: 0,user,item,stars,weekend,holiday,weeknumber,season,latitude,longitude
0,0,0,2,False,False,16,2,36.112896,-115.177637
1,1,0,2,False,False,11,1,36.112896,-115.177637
2,2,0,3,False,False,41,4,36.112896,-115.177637
3,3,0,2,False,False,47,4,36.112896,-115.177637
4,4,0,3,True,False,40,4,36.112896,-115.177637
...,...,...,...,...,...,...,...,...,...
6886520,1836147,159135,1,False,False,50,4,43.686134,-79.607682
6886521,966056,167634,5,False,False,7,1,36.128561,-115.171130
6886522,264157,156531,1,False,False,49,4,33.304061,-111.979869
6886523,431787,166910,1,False,False,31,3,36.112846,-115.225469


## Encoding

- Categorical feature **season** is encoded with one-hot encoding
- Numeric features **latitude**, **longitude** and **weeknumber** are min-max normalized to the [0, 1] range
- Boolean features **weekend** and **holiday** are converted to 0/1

In [103]:
# One-hot encode the season code into one indicator column per season value
# (season_1 ... season_4).
# dtype is pinned to uint8 so the indicators are stored and saved as 0/1 on
# every pandas version; since pandas 2.0 the default indicator dtype is bool,
# which would write True/False to the final CSV instead.
df = pd.get_dummies(df, columns=['season'], prefix=['season'], dtype='uint8')

In [104]:
# Min-max normalization of latitude, longitude and week number; the scaler is
# fitted on these three columns and its output overwrites them in place.
scaled_columns = ['latitude', 'longitude', 'weeknumber']
mms = MinMaxScaler()
df[scaled_columns] = mms.fit_transform(df[scaled_columns])

In [105]:
# Store the two boolean flags as 0/1 unsigned bytes.
df = df.astype({'weekend': 'uint8', 'holiday': 'uint8'})

In [106]:
# Print the column dtypes and the memory footprint of the encoded frame
# ('deep' also measures the contents of object columns), then display it.
df.info(memory_usage='deep')
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6886525 entries, 0 to 6886524
Data columns (total 12 columns):
 #   Column      Dtype  
---  ------      -----  
 0   user        int64  
 1   item        int64  
 2   stars       int64  
 3   weekend     uint8  
 4   holiday     uint8  
 5   weeknumber  float64
 6   latitude    float64
 7   longitude   float64
 8   season_1    uint8  
 9   season_2    uint8  
 10  season_3    uint8  
 11  season_4    uint8  
dtypes: float64(3), int64(3), uint8(6)
memory usage: 354.6 MB


Unnamed: 0,user,item,stars,weekend,holiday,weeknumber,latitude,longitude,season_1,season_2,season_3,season_4
0,0,0,2,0,0,0.288462,0.490413,0.502798,0,1,0,0
1,1,0,2,0,0,0.192308,0.490413,0.502798,1,0,0,0
2,2,0,3,0,0,0.769231,0.490413,0.502798,0,0,0,1
3,3,0,2,0,0,0.884615,0.490413,0.502798,0,0,0,1
4,4,0,3,1,0,0.750000,0.490413,0.502798,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6886520,1836147,159135,1,0,0,0.942308,0.744526,0.920192,0,0,0,1
6886521,966056,167634,5,0,0,0.115385,0.490939,0.502874,1,0,0,0
6886522,264157,156531,1,0,0,0.923077,0.396166,0.540322,0,0,0,1
6886523,431787,166910,1,0,0,0.576923,0.490412,0.502236,0,0,1,0


In [107]:
# Write the fully encoded dataset to '<save_folder>/yelp_final.csv'.
save_folder = 'final datasets'
# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, so no separate check-then-create step is needed.
os.makedirs(save_folder, exist_ok=True)
df.to_csv(os.path.join(save_folder, 'yelp_final.csv'), index=False)  # omit the index column