# Yelp dataset

## Import

In [1]:
import pandas as pd
import os.path
import multiprocessing
import numpy as np

## Clean and merge
Load the dataset in chunk to avoid memory overflow, delete not used column, and merge it with info about business activities

In [2]:
def build_yelp_dataset():
    if os.path.exists('yelp dataset/yelp_dataset_merged.csv'):
        return
    df = pd.read_json('yelp dataset/yelp_academic_dataset_business.json', lines=True)
    df = df[df.is_open == 1] # consider only opened business
    df = df.reset_index(drop=True)
    df = df[['business_id', 'latitude', 'longitude']] # keep only this columns

    chunk_list = []  # append each chunk df here 
    # open the big review dataset in little chunk
    for count,chunk in enumerate(pd.read_json('yelp dataset/yelp_academic_dataset_review.json', lines=True, chunksize=500000)):
        chunk = chunk[['user_id', 'business_id', 'stars', 'date']]
        chunk = pd.merge(chunk, df, on='business_id') # merge business dataset and review dataset chunk
        chunk_list.append(chunk)
        print(f'processed {count + 1} chunks')
    df_concat = pd.concat(chunk_list)
    df_concat.to_csv('yelp dataset/yelp_dataset_merged.csv', index = False) # save dataset to CSV file

## Dataset preprocessing
We use the following contextual features: 
- year
- month
- day of the week
- week number
- longitude
- latitude

We extracted further contextual features from the date and location features: 
- season
- isHoliday
- isWeekend.

In [3]:
df = pd.read_csv('yelp dataset/yelp_dataset_merged.csv')
df

Unnamed: 0,user_id,business_id,stars,date,latitude,longitude
0,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2,2015-04-15 05:21:16,36.112896,-115.177637
1,owbC7FP8SNAlwv6f9S5Stw,-MhfebM0QIsKt87iDN-FNw,2,2014-03-14 08:24:25,36.112896,-115.177637
2,v9vGnjphb0Hta0lvtf5haA,-MhfebM0QIsKt87iDN-FNw,3,2015-10-07 22:16:59,36.112896,-115.177637
3,AXuHgGQoNPkiSXTxHlQc0A,-MhfebM0QIsKt87iDN-FNw,2,2015-11-18 22:20:55,36.112896,-115.177637
4,LkWNo83Lg92C5V4JEyxOZA,-MhfebM0QIsKt87iDN-FNw,3,2010-10-10 01:27:31,36.112896,-115.177637
...,...,...,...,...,...,...
6886520,T85j1MMV_DJQd3gWQ1zt6w,FEC44uuZ4_FeDYZ-vTCQAg,1,2019-12-09 18:57:49,43.686134,-79.607682
6886521,XbIHnYAJW3UJ4uRDWkRFjQ,_2mogknn8udxrYCt3G-W6Q,5,2019-02-13 19:53:09,36.128561,-115.171130
6886522,QW-MVWcbmdOz1mCFuc3yPw,_F-rcHoYLX3r6B-ZUo7ukg,1,2019-12-04 19:01:23,33.304062,-111.979869
6886523,sVR6XQh9MZc2cpiZmK_lpg,bzS33HppGW7eHlFqi1wQGw,1,2012-07-31 03:34:14,36.112846,-115.225469


In [4]:
df.rename(columns={'user_id': 'user', 'business_id': 'item'}, inplace=True)
df

Unnamed: 0,user,item,stars,date,latitude,longitude
0,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2,2015-04-15 05:21:16,36.112896,-115.177637
1,owbC7FP8SNAlwv6f9S5Stw,-MhfebM0QIsKt87iDN-FNw,2,2014-03-14 08:24:25,36.112896,-115.177637
2,v9vGnjphb0Hta0lvtf5haA,-MhfebM0QIsKt87iDN-FNw,3,2015-10-07 22:16:59,36.112896,-115.177637
3,AXuHgGQoNPkiSXTxHlQc0A,-MhfebM0QIsKt87iDN-FNw,2,2015-11-18 22:20:55,36.112896,-115.177637
4,LkWNo83Lg92C5V4JEyxOZA,-MhfebM0QIsKt87iDN-FNw,3,2010-10-10 01:27:31,36.112896,-115.177637
...,...,...,...,...,...,...
6886520,T85j1MMV_DJQd3gWQ1zt6w,FEC44uuZ4_FeDYZ-vTCQAg,1,2019-12-09 18:57:49,43.686134,-79.607682
6886521,XbIHnYAJW3UJ4uRDWkRFjQ,_2mogknn8udxrYCt3G-W6Q,5,2019-02-13 19:53:09,36.128561,-115.171130
6886522,QW-MVWcbmdOz1mCFuc3yPw,_F-rcHoYLX3r6B-ZUo7ukg,1,2019-12-04 19:01:23,33.304062,-111.979869
6886523,sVR6XQh9MZc2cpiZmK_lpg,bzS33HppGW7eHlFqi1wQGw,1,2012-07-31 03:34:14,36.112846,-115.225469


In [5]:
# convert user and item IDs to int64
df.user = pd.factorize(df.user)[0]
df.item = pd.factorize(df.item)[0]
df

Unnamed: 0,user,item,stars,date,latitude,longitude
0,0,0,2,2015-04-15 05:21:16,36.112896,-115.177637
1,1,0,2,2014-03-14 08:24:25,36.112896,-115.177637
2,2,0,3,2015-10-07 22:16:59,36.112896,-115.177637
3,3,0,2,2015-11-18 22:20:55,36.112896,-115.177637
4,4,0,3,2010-10-10 01:27:31,36.112896,-115.177637
...,...,...,...,...,...,...
6886520,1836147,159135,1,2019-12-09 18:57:49,43.686134,-79.607682
6886521,966056,167634,5,2019-02-13 19:53:09,36.128561,-115.171130
6886522,264157,156531,1,2019-12-04 19:01:23,33.304062,-111.979869
6886523,431787,166910,1,2012-07-31 03:34:14,36.112846,-115.225469


In [35]:
# return a season from a date in the format d/m/Y
def season_from_date(date) -> int:
    year = str(date.year)
    seasons = {1: pd.date_range(start='21/03/'+year, end='20/06/'+year), # spring
               2: pd.date_range(start='21/06/'+year, end='22/09/'+year), # summer
               3: pd.date_range(start='23/09/'+year, end='20/12/'+year)} # autumn
    if date in seasons[1]:
        return 'spring'
    elif date in seasons[2]:
        return 'summer'
    elif date in seasons[3]:
        return 'autumn'
    else:
        return 'winter'

In [7]:
# CONTEXT FEATURES TO ADD:
# year, month, day of the week, week number --> season, isHoliday, and isWeekend.

df['date'] = pd.to_datetime(df['date']).dt.strftime('%d/%m/%Y') # convert date from object to date type
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y') # convert from Y/m/d to d/m/Y

In [36]:
df['season'] = df[:100]['date'].map(season_from_date) # get season based on date
df

Unnamed: 0,user,item,stars,date,latitude,longitude,season,weekday,weeknumber
0,0,0,2,2015-04-15,36.112896,-115.177637,spring,2,16
1,1,0,2,2014-03-14,36.112896,-115.177637,winter,4,11
2,2,0,3,2015-10-07,36.112896,-115.177637,autumn,2,41
3,3,0,2,2015-11-18,36.112896,-115.177637,autumn,2,47
4,4,0,3,2010-10-10,36.112896,-115.177637,autumn,6,40
...,...,...,...,...,...,...,...,...,...
6886520,1836147,159135,1,2019-12-09,43.686134,-79.607682,,0,50
6886521,966056,167634,5,2019-02-13,36.128561,-115.171130,,2,7
6886522,264157,156531,1,2019-12-04,33.304062,-111.979869,,2,49
6886523,431787,166910,1,2012-07-31,36.112846,-115.225469,,1,31


In [43]:
df['weekday'] = df['date'].dt.dayofweek # get day of the week from date
df['weekend'] = (df['weekday'] == 6) | (df['weekday'] == 5)
df.head(20)

Unnamed: 0,user,item,stars,date,latitude,longitude,season,weekday,weeknumber,weekend
0,0,0,2,2015-04-15,36.112896,-115.177637,spring,2,16,False
1,1,0,2,2014-03-14,36.112896,-115.177637,winter,4,11,False
2,2,0,3,2015-10-07,36.112896,-115.177637,autumn,2,41,False
3,3,0,2,2015-11-18,36.112896,-115.177637,autumn,2,47,False
4,4,0,3,2010-10-10,36.112896,-115.177637,autumn,6,40,True
5,5,0,1,2015-08-20,36.112896,-115.177637,summer,3,34,False
6,6,0,5,2012-03-31,36.112896,-115.177637,spring,5,13,True
7,7,0,3,2013-05-09,36.112896,-115.177637,spring,3,19,False
8,8,0,2,2013-05-11,36.112896,-115.177637,spring,5,19,True
9,9,0,4,2010-10-09,36.112896,-115.177637,autumn,5,40,True


In [27]:
df['weeknumber'] = df['date'].dt.isocalendar().week # get week number
df

Unnamed: 0,user,item,stars,date,latitude,longitude,season,weekday,weeknumber
0,0,0,2,2015-04-15,36.112896,-115.177637,1.0,2,16
1,1,0,2,2014-03-14,36.112896,-115.177637,0.0,4,11
2,2,0,3,2015-10-07,36.112896,-115.177637,3.0,2,41
3,3,0,2,2015-11-18,36.112896,-115.177637,3.0,2,47
4,4,0,3,2010-10-10,36.112896,-115.177637,3.0,6,40
...,...,...,...,...,...,...,...,...,...
6886520,1836147,159135,1,2019-12-09,43.686134,-79.607682,,0,50
6886521,966056,167634,5,2019-02-13,36.128561,-115.171130,,2,7
6886522,264157,156531,1,2019-12-04,33.304062,-111.979869,,2,49
6886523,431787,166910,1,2012-07-31,36.112846,-115.225469,,1,31
