# IMPORTS

In [1]:
import dask.dataframe as dd
import pandas as pd
import os
import gc
from tabulate import tabulate

# CONFIGURATION

In [2]:
# Number of JSON lines per Parquet chunk when a large file is split.
CHUNK_SIZE = 200_000
# When False, chunk_file() writes nothing and only returns the glob of chunk files.
CHUNK_ALLOWED = False

# Directory layout: raw Yelp JSON files, their Parquet chunks, and the reduced working set.
ORIGINAL_DATASET_DIR = 'data/original/yelp_dataset/'
CHUNK_DATASET_DIR = 'data/chunk/yelp_dataset/'
WORK_DATASET_DIR = 'data/work/yelp_dataset/'

# Prefix stripped from file names to obtain the short dataset name.
PREFIX_YELP = 'yelp_academic_dataset_'

# FONCTIONS

In [3]:
def list_files(dir, extension='json', print_result = True):
    """Map every dataset file found in ``dir`` to its path, keyed by short name.

    The short name is the file name up to its first ``.`` with the
    ``PREFIX_YELP`` prefix removed. Files that match the extension but do not
    carry the prefix are keyed by that leading part of their name instead of
    raising ``IndexError``.

    Args:
        dir: Directory to scan (with or without a trailing separator).
        extension: File extension to keep, without the leading dot.
        print_result: When True, print a table of file names and sizes.

    Returns:
        dict mapping short name -> file path, in sorted file-name order.
    """
    suffix = '.' + extension
    paths = {}
    # os.listdir() order is unspecified; sort so the mapping and the table are stable.
    for file in sorted(os.listdir(dir)):
        if not file.endswith(suffix):
            continue
        stem = file.split('.')[0]
        # Only strip the prefix when it is actually present.
        name = stem.split(PREFIX_YELP)[1] if PREFIX_YELP in stem else stem
        # os.path.join also works when `dir` has no trailing separator.
        paths[name] = os.path.join(dir, file)
    if print_result:
        files = [[os.path.basename(path), file_size(path)] for path in paths.values()]
        print(tabulate(files, headers=['Name', 'Size']))
    return paths

def file_size(path, unit='Gb', precision=2, only_value = False):
    """Return the size of the file at ``path`` expressed in ``unit``.

    Args:
        path: Path of the file to measure.
        unit: 'ko' (bytes / 1024), 'Mb' (bytes / 1024**2) or 'Gb'
            (bytes / 1024**3). Any other value is treated as 'Gb'.
        precision: Number of decimals kept by the rounding.
        only_value: When True, return the rounded number alone.

    Returns:
        The rounded size as a float, or as a string suffixed with the unit.
    """
    # Power of 1024 for each recognised unit; anything else falls back to 'Gb'.
    known_units = (('ko', 1), ('Mb', 2))
    power = next((p for label, p in known_units if unit == label), None)
    if power is None:
        power, unit = 3, 'Gb'

    size = round(os.path.getsize(path) / (1024 ** power), precision)
    return size if only_value else str(size) + unit

def chunk_file(name, files, chunksize = CHUNK_SIZE, destination = CHUNK_DATASET_DIR):
    """Split a line-delimited JSON file into numbered Parquet chunks.

    Chunks are only (re)written when the module-level ``CHUNK_ALLOWED`` flag is
    true; otherwise the function just reports and returns the glob pattern, and
    the chunk files are expected to exist already.

    Args:
        name: Key of the file in ``files``; also the chunk file-name prefix.
        files: Mapping of short dataset name to JSON file path.
        chunksize: Number of JSON lines per chunk.
        destination: Directory receiving ``<name>_<index>.parquet`` files.

    Returns:
        The glob pattern ``<destination><name>_*.parquet`` matching the chunks.
    """
    import glob  # local import: only needed when chunks are regenerated

    path = files[name]
    basename = os.path.basename(path)

    joker_file = os.path.join(destination, name + '_*.parquet')

    print(f'Chunk: {basename}\t=>\t{joker_file}')

    if CHUNK_ALLOWED:
        # The writer does not create missing directories.
        os.makedirs(destination, exist_ok=True)
        # Drop chunks left by a previous run: with a different chunksize the old
        # higher-numbered files would still match the glob and duplicate rows.
        for stale in glob.glob(joker_file):
            os.remove(stale)

        for index, chunk in enumerate(pd.read_json(path, lines=True, chunksize=chunksize)):
            chunk_path = os.path.join(destination, name + '_' + str(index) + '.parquet')
            print(f'\t created chunk => {chunk_path}')
            chunk.to_parquet(chunk_path, engine='pyarrow')

    return joker_file

def get_datasets(files, chunks = None, dest_dir = CHUNK_DATASET_DIR, chunk_size = CHUNK_SIZE):
    """Load every file of ``files`` into a DataFrame, chunking those listed in ``chunks``.

    Files named in ``chunks`` go through ``chunk_file`` and are then read with
    Dask from the returned Parquet glob. Every other file is read with pandas as
    line-delimited JSON.

    Args:
        files: Mapping of short dataset name to JSON file path.
        chunks: Names (keys of ``files``) to chunk; ``None`` or empty for none.
        dest_dir: Directory passed to ``chunk_file`` for the Parquet chunks.
        chunk_size: Rows per chunk passed to ``chunk_file``.

    Returns:
        dict mapping each name to the loaded pandas or Dask DataFrame.
    """
    # Chunk big files
    files_chunk = {**files}
    for chunk in chunks or []:
        # Update the accumulated mapping: rebuilding it from `files` each time
        # kept only the last chunked entry. dest_dir / chunk_size are forwarded
        # so the caller's values are honoured.
        files_chunk[chunk] = chunk_file(chunk, files, chunksize=chunk_size, destination=dest_dir)
        print('')

    # Create dataset dict
    dfs = {}
    infos = []

    for name, path in files_chunk.items():
        print(f'Chargement en cours: {name}')

        # A glob pattern marks a chunked dataset, which is read through Dask.
        if '*' in path:
            infos.append([path, 'DASK', f'dfs[\'{name}\']'])
            dfs[name] = dd.read_parquet(path)
        else:
            infos.append([path, 'PANDAS', f'dfs[\'{name}\']'])
            dfs[name] = pd.read_json(path, lines=True)

    print('')
    print(tabulate(infos, headers=['Fichier', 'Depuis', 'DataFrame']))

    return dfs

def preview_dataset(path, nrows=100):
    """Read the first rows of a line-delimited JSON file and display a summary.

    Displays the head of the frame and its dtypes through the notebook's
    ``display`` helper.

    Args:
        path: Path of the line-delimited JSON file.
        nrows: Number of lines to read from the start of the file.

    Returns:
        The pandas DataFrame holding the rows that were read.
    """
    df = pd.read_json(path, lines=True, nrows=nrows)
    display(df.head())
    display(df.dtypes)

    return df

# PREVISUALISATION DES DATASETS

In [4]:
# Index the original dataset files by short name, skipping the size table,
# then show the resulting name -> path mapping as the cell output.
files = list_files(ORIGINAL_DATASET_DIR, print_result=False)
files

{'business': 'data/original/yelp_dataset/yelp_academic_dataset_business.json',
 'checkin': 'data/original/yelp_dataset/yelp_academic_dataset_checkin.json',
 'review': 'data/original/yelp_dataset/yelp_academic_dataset_review.json',
 'tip': 'data/original/yelp_dataset/yelp_academic_dataset_tip.json',
 'user': 'data/original/yelp_dataset/yelp_academic_dataset_user.json'}

In [5]:
# Build a small preview DataFrame for every dataset file, keyed by short name.
preview_dfs = {}
for name, path in files.items():
    print(f'PREVIEW {name}\n=====================\n')
    preview_dfs[name] = preview_dataset(path)
    print('')

PREVIEW business



Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object


PREVIEW checkin



Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."


business_id    object
date           object
dtype: object


PREVIEW review



Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


review_id              object
user_id                object
business_id            object
stars                   int64
useful                  int64
funny                   int64
cool                    int64
text                   object
date           datetime64[ns]
dtype: object


PREVIEW tip



Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0


user_id                     object
business_id                 object
text                        object
date                datetime64[ns]
compliment_count             int64
dtype: object


PREVIEW user



Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


user_id                object
name                   object
review_count            int64
yelping_since          object
useful                  int64
funny                   int64
cool                    int64
elite                  object
friends                object
fans                    int64
average_stars         float64
compliment_hot          int64
compliment_more         int64
compliment_profile      int64
compliment_cute         int64
compliment_list         int64
compliment_note         int64
compliment_plain        int64
compliment_cool         int64
compliment_funny        int64
compliment_writer       int64
compliment_photos       int64
dtype: object




In [6]:
# Distinct star ratings found in the review preview (only the previewed rows).
preview_dfs['review']['stars'].unique()

array([3, 5, 4, 1, 2], dtype=int64)

In [7]:
# Distinct raw category strings in the business preview; each value is a
# comma-separated list of categories, as the output below shows.
preview_dfs['business']['categories'].unique()

array(['Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists',
       'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services',
       'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores',
       'Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries',
       'Brewpubs, Breweries, Food',
       'Burgers, Fast Food, Sandwiches, Food, Ice Cream & Frozen Yogurt, Restaurants',
       'Sporting Goods, Fashion, Shoe Stores, Shopping, Sports Wear, Accessories',
       'Synagogues, Religious Organizations',
       'Pubs, Restaurants, Italian, Bars, American (Traditional), Nightlife, Greek',
       'Ice Cream & Frozen Yogurt, Fast Food, Burgers, Restaurants, Food',
       'Department Stores, Shopping, Fashion',
       'Vietnamese, Food, Restaurants, Food Trucks',
       'American (Traditional), Restaurants, Diners, Breakfast & Brunch',
       'General Dentistry, Dentists, Health & Medic

# RECUPERATION DES DATASETS

In [11]:
# Leave the checkin, tip and user files out of the load; every other file is kept.
used_files = {
    name: path
    for name, path in files.items()
    if name not in ('checkin', 'tip', 'user')
}

# The review file is loaded through its Parquet chunks.
dfs = get_datasets(used_files, ['review'])

Chunk: yelp_academic_dataset_review.json	=>	data/chunk/yelp_dataset/review_*.parquet

Chargement en cours: business
Chargement en cours: review

Fichier                                                         Depuis    DataFrame
--------------------------------------------------------------  --------  ---------------
data/original/yelp_dataset/yelp_academic_dataset_business.json  PANDAS    dfs['business']
data/chunk/yelp_dataset/review_*.parquet                        DASK      dfs['review']


## ANALYSE DES CATEGORIES DE BUSINESS

In [12]:
def _split_categories(raw):
    """Split a comma-separated category string into a list of trimmed names."""
    # Empty or missing values yield an empty list.
    if not raw:
        return []
    return [part.strip() for part in raw.split(',')]

dfs['business']['_cats'] = dfs['business']['categories'].apply(_split_categories)

In [13]:
# Collect every distinct category name across all businesses.
# Iterating the '_cats' column directly avoids one label-based .loc lookup per
# row, which is slow on a large frame and misbehaves on duplicate index labels.
categories = {cat for cats in dfs['business']['_cats'] for cat in cats}

In [14]:
# Category names that mention "restaurant", ignoring case.
[category for category in categories if 'restaurant' in category.lower()]

['Restaurant Supplies', 'Restaurants', 'Pop-Up Restaurants']

In [15]:
# Category names that mention "food", ignoring case.
[category for category in categories if 'food' in category.lower()]

['Live/Raw Food',
 'Food Tours',
 'Food Delivery Services',
 'Food Stands',
 'Seafood',
 'Food Banks',
 'Comfort Food',
 'Do-It-Yourself Food',
 'Ethnic Food',
 'Food Court',
 'Seafood Markets',
 'Imported Food',
 'Food',
 'Fast Food',
 'Soul Food',
 'Food Trucks',
 'Specialty Food']

## FILTRAGE DES REVIEWS SUR CATEGORIE "RESTAURANTS"

In [16]:
# Businesses whose raw category string contains 'Restaurants'; missing
# category values count as non-matching.
is_restaurant = dfs['business']['categories'].str.contains('Restaurants', na=False)
restaurants_ids = dfs['business'].loc[is_restaurant, 'business_id'].to_list()

In [17]:
# Keep only the reviews whose business_id is one of the restaurant ids.
restaurant_reviews = dfs['review'][dfs['review']['business_id'].isin(restaurants_ids)]

In [18]:
# Count all reviews and the restaurant-only subset.
# NOTE(review): dfs['review'] is loaded through Dask, so each len() presumably
# triggers its own pass over the chunk files - confirm if this cell is slow.
review_len = len(dfs['review'])
review_len_restaurants= len(restaurant_reviews)

print(f'Il y a {review_len_restaurants} reviews de restanrants parmis les {review_len} reviews')

Il y a 4724471 reviews de restanrants parmis les 6990280 reviews


## REDUCTION DU DATAFRAME ET SAUVEGARDE

In [19]:
# Keep only the rating and the review text; every other column is dropped.
restaurant_reviews = restaurant_reviews.loc[:, ['stars', 'text']]

In [20]:
# Store the rating as an unsigned 8-bit integer to shrink the column.
# NOTE(review): the preview only showed values 1-5; assumes the full data matches.
restaurant_reviews['stars'] = restaurant_reviews['stars'].astype('uint8')

In [22]:
# Write the reduced reviews as a single Parquet part file in WORK_DATASET_DIR.
# row_group_size is forwarded to the pyarrow writer and caps each row group at
# CHUNK_SIZE rows. Without a cap, all review text can land in one row group;
# NOTE(review): that is the likely cause of the "offset overflow while
# concatenating arrays" error when the file is read back with pandas - confirm
# by re-running the save and the read.
restaurant_reviews.repartition(npartitions=1).to_parquet(
    WORK_DATASET_DIR,
    engine='pyarrow',
    row_group_size=CHUNK_SIZE,
)

In [24]:
# Drop the references to the large frames and force a garbage-collection pass;
# the number shown as output is gc.collect()'s return value.
del restaurant_reviews, preview_dfs, dfs
gc.collect()

26

In [28]:
# Reload the saved restaurant reviews into a pandas DataFrame.
# NOTE(review): the recorded run of this cell raised
# "ArrowInvalid: offset overflow while concatenating arrays"; presumably the
# text column of this single file exceeds what Arrow can hold in one string
# array - confirm, and consider writing smaller row groups when saving.
reviews = pd.read_parquet(WORK_DATASET_DIR + 'part.0.parquet', engine='pyarrow')

ArrowInvalid: offset overflow while concatenating arrays