In [1]:
# Guard against environment drift: this notebook pins Dask to exactly 2.17.2
# (not necessarily the newest release), so stop early if another version is
# installed and report which one was found.
import dask

assert dask.__version__ == "2.17.2", (
    f"Expected dask 2.17.2 but found {dask.__version__}"
)

### Instructions to use in SageMaker

To use Dask in a SageMaker notebook:

1. Activate Conda Python 3 `source activate python3`
2. Install dask and s3fs `conda install dask s3fs -c conda-forge -y`

In [2]:
import boto3
import dask.dataframe as dd
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fastparquet import ParquetFile

# Execution role returned by the SageMaker SDK for this notebook.
role = get_execution_role()

# Never truncate columns when a dataframe is rendered.
pd.options.display.max_columns = None

In [3]:
# Utility functions here

def create_s3_file_url(folder, file_name,bucket='tally-ai-dspt3'):
    """Build the ``s3://`` URL of an object.

    Parameters
    ----------
    folder : key prefix ("folder") inside the bucket.
    file_name : object name placed after the folder.
    bucket : bucket name; defaults to ``'tally-ai-dspt3'``.

    Returns
    -------
    str of the form ``s3://<bucket>/<folder>/<file_name>``.
    """
    return f's3://{bucket}/{folder}/{file_name}'


## Loading Yelp `business` data

In [4]:
# Location of the raw Yelp business dump on S3.
folder, file_name = 'yelp-kaggle-raw-data', 'yelp_academic_dataset_business.json'
s3_business = create_s3_file_url(folder, file_name)
print(f"Fetching data from {s3_business}")

# lines=True: one JSON record per line; blocksize asks Dask to split the
# file into partitions of roughly 32 MB each.
business = dd.read_json(s3_business, blocksize=32e6, lines=True)
business.head()

Fetching data from s3://tally-ai-dspt3/yelp-kaggle-raw-data/yelp_academic_dataset_business.json


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,10913 Bailey Rd,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...",f9NumwFMBDn751xgFiRbNA,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...",Cornelius,"{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",1,35.462724,-80.852612,The Range At Lake Norman,28031,36,3.5,NC
1,"8880 E Via Linda, Ste 107","{'GoodForKids': 'True', 'ByAppointmentOnly': '...",Yzvjg0SayhoZgCljUJRF9Q,"Health & Medical, Fitness & Instruction, Yoga,...",Scottsdale,,1,33.569404,-111.890264,"Carlos Santo, NMD",85258,4,5.0,AZ
2,3554 Rue Notre-Dame O,,XNoUzKckATkOD1hP6vghZg,"Pets, Pet Services, Pet Groomers",Montreal,,1,45.479984,-73.58007,Felinus,H4C 1P4,5,5.0,QC
3,1015 Sharp Cir,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",6OAZjbxqM5ol29BuHsil3w,"Hardware Stores, Home Services, Building Suppl...",North Las Vegas,"{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', ...",0,36.219728,-115.127725,Nevada House of Hose,89030,3,2.5,NV
4,4827 E Downing Cir,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",51M2Kk903DFYI6gnB5I6SQ,"Home Services, Plumbing, Electricians, Handyma...",Mesa,"{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",1,33.428065,-111.726648,USE MY GUY SERVICES LLC,85205,26,4.5,AZ


### Dropping Missing Values

Shape of `business` before dropping rows with a missing `categories` value:

In [5]:
# (rows, columns) without pulling the whole dataframe into local memory:
# len() on a Dask dataframe only sums per-partition row counts, whereas
# .compute() would build the full pandas dataframe first.
(len(business), len(business.columns))

(209393, 14)

Shape of `business` after dropping rows with a missing `categories` value:

In [6]:
# Drop businesses that have no `categories` value.
business = business.dropna(subset=['categories'])

# (rows, columns) via len() so the full dataframe is never materialized
# in local memory just to read its shape.
(len(business), len(business.columns))

(208869, 14)

### List of Unique Categories

In [7]:
# Distinct values of the `categories` column (each value is one
# comma-separated string of category names).
business.categories.unique().compute()

0         Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...
1         Health & Medical, Fitness & Instruction, Yoga,...
2                          Pets, Pet Services, Pet Groomers
3         Hardware Stores, Home Services, Building Suppl...
4         Home Services, Plumbing, Electricians, Handyma...
5         Auto Repair, Automotive, Oil Change Stations, ...
6         Dry Cleaning & Laundry, Local Services, Laundr...
7         Auto Repair, Oil Change Stations, Automotive, ...
8         Ethnic Food, Food Trucks, Specialty Food, Impo...
9         Martial Arts, Gyms, Fitness & Instruction, Act...
10                  Contractors, Landscaping, Home Services
11                                  Automotive, Auto Repair
12                Desserts, Food, Ice Cream & Frozen Yogurt
13               Contractors, Home Services, Local Services
14                                   Beauty & Spas, Tanning
15                             Local Services, Self Storage
16                           Shopping, S

## Filtering `restaurants` only

Before filtering for `restaurants` businesses, the number of unique `categories` strings (each one a comma-separated combination of categories) is:

In [8]:
# Number of distinct `categories` strings, shown as a one-element shape tuple.
business.categories.unique().compute().shape

(102494,)

In [9]:
# Boolean mask: True where the `categories` string mentions "Restaurants".
is_restaurant = business["categories"].str.contains("Restaurants")

# Keep only the matching businesses.
restaurants = business[is_restaurant]
restaurants.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
8,404 E Green St,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...",pQeaRpvuhoEqudo3uymHIQ,"Ethnic Food, Food Trucks, Specialty Food, Impo...",Champaign,"{'Monday': '11:30-14:30', 'Tuesday': '11:30-14...",1,40.110446,-88.233073,The Empanadas House,61820,5,4.5,IL
20,4508 E Independence Blvd,"{'RestaurantsGoodForGroups': 'True', 'OutdoorS...",CsLQLiRoafpJPJSkNX2h5Q,"Food, Restaurants, Grocery, Middle Eastern",Charlotte,,0,35.194894,-80.767442,Middle East Deli,28205,5,3.0,NC
24,"15480 Bayview Avenue, unit D0110","{'RestaurantsTableService': 'False', 'Restaura...",eBEfgOPG7pvFhb2wcG9I7w,"Restaurants, Cheesesteaks, Poutineries",Aurora,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,44.010962,-79.448677,Philthy Phillys,L4G 7J1,4,4.5,ON
25,300 John Street,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...",lu7vtrp_bE9PnxWfA8g4Pg,"Japanese, Fast Food, Food Court, Restaurants",Thornhill,,1,43.820492,-79.398466,Banzai Sushi,L3T 5W4,7,4.5,ON
30,13071 Yonge Street,"{'Ambience': '{'touristy': False, 'hipster': F...",9sRGfSVEfLhN_km60YruTA,"Persian/Iranian, Turkish, Middle Eastern, Rest...",Richmond Hill,"{'Tuesday': '12:0-21:0', 'Wednesday': '12:0-21...",1,43.947011,-79.454862,Apadana Restaurant,L4E 1A5,3,3.0,ON


After filtering for `restaurants`, the total number of businesses (rows) dropped to:

In [10]:
# (rows, columns) of the restaurant subset; len() sums per-partition row
# counts instead of materializing the whole dataframe with .compute().
(len(restaurants), len(restaurants.columns))

(63944, 14)

### Rename `stars` to `business_stars`

In [11]:
# Copy `stars` into a new trailing `business_stars` column, then drop the
# original -- presumably so it cannot clash with the `stars` column of the
# reviews data later on (confirm against the merge step).
restaurants = restaurants.assign(business_stars=restaurants['stars']).drop("stars", axis=1)
restaurants.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,state,business_stars
8,404 E Green St,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...",pQeaRpvuhoEqudo3uymHIQ,"Ethnic Food, Food Trucks, Specialty Food, Impo...",Champaign,"{'Monday': '11:30-14:30', 'Tuesday': '11:30-14...",1,40.110446,-88.233073,The Empanadas House,61820,5,IL,4.5
20,4508 E Independence Blvd,"{'RestaurantsGoodForGroups': 'True', 'OutdoorS...",CsLQLiRoafpJPJSkNX2h5Q,"Food, Restaurants, Grocery, Middle Eastern",Charlotte,,0,35.194894,-80.767442,Middle East Deli,28205,5,NC,3.0
24,"15480 Bayview Avenue, unit D0110","{'RestaurantsTableService': 'False', 'Restaura...",eBEfgOPG7pvFhb2wcG9I7w,"Restaurants, Cheesesteaks, Poutineries",Aurora,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,44.010962,-79.448677,Philthy Phillys,L4G 7J1,4,ON,4.5
25,300 John Street,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...",lu7vtrp_bE9PnxWfA8g4Pg,"Japanese, Fast Food, Food Court, Restaurants",Thornhill,,1,43.820492,-79.398466,Banzai Sushi,L3T 5W4,7,ON,4.5
30,13071 Yonge Street,"{'Ambience': '{'touristy': False, 'hipster': F...",9sRGfSVEfLhN_km60YruTA,"Persian/Iranian, Turkish, Middle Eastern, Rest...",Richmond Hill,"{'Tuesday': '12:0-21:0', 'Wednesday': '12:0-21...",1,43.947011,-79.454862,Apadana Restaurant,L4E 1A5,3,ON,3.0


## Exporting restaurant `business` data as CSV to S3

In [12]:
# Destination of the filtered restaurant businesses.
folder, file_name = 'yelp-restaurants', 'business.csv'
s3_file_name = create_s3_file_url(folder, file_name)
print(f"Storing to {s3_file_name}")

# single_file=True asks Dask for one CSV rather than one file per partition.
restaurants.to_csv(s3_file_name, single_file=True)

Storing to s3://tally-ai-dspt3/yelp-restaurants/business.csv


  warn("Appending data to a network storage system may not work.")


['tally-ai-dspt3/yelp-restaurants/business.csv']

## Loading Yelp `reviews` data (~6 GB)

In [13]:
# Location of the raw Yelp reviews dump on S3.
folder = 'yelp-kaggle-raw-data'
file_name = 'yelp_academic_dataset_review.json'
s3_reviews = create_s3_file_url(folder, file_name)
print(f"Fetching data from {s3_reviews}")

# blocksize chunking is only valid for line-delimited JSON, so state
# lines=True explicitly (as the business load does) instead of relying on
# the implicit default.
reviews = dd.read_json(s3_reviews, lines=True, blocksize=32e6)
reviews.head()

Fetching data from s3://tally-ai-dspt3/yelp-kaggle-raw-data/yelp_academic_dataset_review.json


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,-MhfebM0QIsKt87iDN-FNw,0,2015-04-15 05:21:16,0,xQY8N_XvtGbearJ5X4QryQ,2,"As someone who has worked with many museums, I...",5,OwjRMXRC0KyPrIlcjaXeFQ
1,lbrU8StCq3yDfr-QMnGrmQ,0,2013-12-07 03:16:52,1,UmFMZ8PyXZTY2QcwzsfQYA,1,I am actually horrified this place is still in...,1,nIJD_7ZXHq-FX8byPMOkMQ
2,HQl28KMwrEKHqhFrrDqVNQ,0,2015-12-05 03:18:11,0,LG2ZaYiOgpr2DK_90pYjNw,5,I love Deagan's. I do. I really do. The atmosp...,1,V34qejxNsCbcgD8C0HVk-Q
3,5JxlZaqCnk1MnbgRirs40Q,0,2011-05-27 05:30:52,0,i6g_oA9Yf9Y31qt0wibXpw,1,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",0,ofKDkJKXSKZXu5xJNGiiBQ
4,IS4cv902ykd8wj1TR0N3-A,0,2017-01-14 21:56:57,0,6TdNDKywdbjoTkizeMce8A,4,"Oh happy day, finally have a Canes near my cas...",0,UgMW8bLE0QMJDCkQ1Ax5Mg


#### Total Reviews before merging

In [14]:
# (rows, columns) of the reviews data. len() only sums per-partition row
# counts, so the multi-gigabyte dataset is never materialized in local
# memory the way .compute() would require.
(len(reviews), len(reviews.columns))

(8021122, 9)

### Rename `stars` to `reviews_stars`

In [16]:
# Copy `stars` into a new trailing `reviews_stars` column, then drop the
# original so the per-review rating has an unambiguous name.
reviews = reviews.assign(reviews_stars=reviews['stars']).drop("stars", axis=1)
reviews.head()

Unnamed: 0,business_id,cool,date,funny,review_id,text,useful,user_id,reviews_stars
0,-MhfebM0QIsKt87iDN-FNw,0,2015-04-15 05:21:16,0,xQY8N_XvtGbearJ5X4QryQ,"As someone who has worked with many museums, I...",5,OwjRMXRC0KyPrIlcjaXeFQ,2
1,lbrU8StCq3yDfr-QMnGrmQ,0,2013-12-07 03:16:52,1,UmFMZ8PyXZTY2QcwzsfQYA,I am actually horrified this place is still in...,1,nIJD_7ZXHq-FX8byPMOkMQ,1
2,HQl28KMwrEKHqhFrrDqVNQ,0,2015-12-05 03:18:11,0,LG2ZaYiOgpr2DK_90pYjNw,I love Deagan's. I do. I really do. The atmosp...,1,V34qejxNsCbcgD8C0HVk-Q,5
3,5JxlZaqCnk1MnbgRirs40Q,0,2011-05-27 05:30:52,0,i6g_oA9Yf9Y31qt0wibXpw,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",0,ofKDkJKXSKZXu5xJNGiiBQ,1
4,IS4cv902ykd8wj1TR0N3-A,0,2017-01-14 21:56:57,0,6TdNDKywdbjoTkizeMce8A,"Oh happy day, finally have a Canes near my cas...",0,UgMW8bLE0QMJDCkQ1Ax5Mg,4


## Merging `business` and `reviews` data

In [17]:
# Inner join on `business_id`: only rows whose business appears in both
# `restaurants` and `reviews` are kept.
restaurants_reviews = dd.merge(restaurants, reviews, how='inner', on="business_id")
restaurants_reviews.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,state,business_stars,cool,date,funny,review_id,text,useful,user_id,reviews_stars
0,7119 E Shea Blvd,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",S9RoY_Smsh0a2JPo90bkdg,"Burgers, Restaurants, Sandwiches, Salad, Break...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '6:30-14:30',...",1,33.580987,-111.927628,US Egg Restaurant,85254,164,AZ,4.0,1,2016-09-17 22:23:31,0,kurj_J7DPWGHOdrw-O-Dcw,"No fuss breakfast, roomy and spacious. Good se...",1,z_tqPytGQF_tDw_buHKJcw,4
1,7119 E Shea Blvd,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",S9RoY_Smsh0a2JPo90bkdg,"Burgers, Restaurants, Sandwiches, Salad, Break...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '6:30-14:30',...",1,33.580987,-111.927628,US Egg Restaurant,85254,164,AZ,4.0,0,2016-07-24 18:23:33,0,fsmC1XEZ0n-WXwNaPgGqlQ,I got the garden omelet when out for breakfast...,0,KVXnwCD3RM09IPUHsU0e4Q,1
2,7119 E Shea Blvd,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",S9RoY_Smsh0a2JPo90bkdg,"Burgers, Restaurants, Sandwiches, Salad, Break...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '6:30-14:30',...",1,33.580987,-111.927628,US Egg Restaurant,85254,164,AZ,4.0,0,2018-06-18 04:52:09,0,LKAwfY7o9rRJuOC1bhWzgQ,Update: My card was billed (even though they ...,0,PGas3x06gHXGQITDMnFyow,1
3,7119 E Shea Blvd,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",S9RoY_Smsh0a2JPo90bkdg,"Burgers, Restaurants, Sandwiches, Salad, Break...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '6:30-14:30',...",1,33.580987,-111.927628,US Egg Restaurant,85254,164,AZ,4.0,0,2018-04-02 21:10:15,0,QQ0Iw4lphpQGOfkZxGwmhw,Us egg is a fantastic place to eat the people ...,1,BLuiZbumq1whr5r7mlnZJA,5
4,7119 E Shea Blvd,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",S9RoY_Smsh0a2JPo90bkdg,"Burgers, Restaurants, Sandwiches, Salad, Break...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '6:30-14:30',...",1,33.580987,-111.927628,US Egg Restaurant,85254,164,AZ,4.0,1,2017-02-25 20:40:05,0,K3wEqye7Rm6v4ff_8tKDOg,We were looking for a breakfast spot with pati...,1,3AJRBXUUgWx0rLsFa_r0gw,5


#### Total Restaurant Reviews after merging

In [18]:
# (rows, columns) of the merged data; len() avoids materializing every
# merged row in local memory just to read the shape.
(len(restaurants_reviews), len(restaurants_reviews.columns))

(5055992, 22)

#### Filtering out columns from Restaurant Reviews

In [19]:
# Columns retained in the review dataset; everything else is discarded.
cols_to_keep = [
    "review_id",
    "business_id",
    "reviews_stars",
    "date",
    "text",
]
restaurants_reviews = restaurants_reviews[cols_to_keep]
restaurants_reviews.head()

Unnamed: 0,review_id,business_id,reviews_stars,date,text
0,dQmkOHSHFcXrvFKZ0h09rg,hEtb2D81MtRfKuAd1FdV6w,4,2009-03-08 21:31:58,Ordered 2 pizzas and 2 salads yesterday since ...
1,c0nFkxV4Ejmg9tsW-Kiusw,hEtb2D81MtRfKuAd1FdV6w,5,2008-09-07 21:16:32,It's a hole-ine-the-wall pizza place that does...
2,2y871RzqJOue5Z7L8DY38Q,hEtb2D81MtRfKuAd1FdV6w,1,2015-09-11 02:54:23,1brought my sibling here for her birthday and ...
3,SsMTHrx3GNJI2h32H9Ft-g,hEtb2D81MtRfKuAd1FdV6w,5,2012-02-08 01:34:53,Hands down still the best pizza around. I've ...
4,bAbfPJpwk8MlyP-IxcPnCw,hEtb2D81MtRfKuAd1FdV6w,4,2015-07-28 20:23:08,"After reading some of the reviews, I've realiz..."


### NOTE: Saving the `restaurants_reviews` Dask dataframe to a `.csv` file failed because of its large size.

### Exporting `reviews` as fastparquet to S3

In [20]:
# Destination of the restaurant reviews, written as gzip-compressed Parquet.
folder, file_name = 'yelp-restaurants', 'reviews.parquet.gzip'
s3_file_name = create_s3_file_url(folder, file_name)
print(f"Storing to {s3_file_name}")
restaurants_reviews.to_parquet(s3_file_name, compression="gzip")

Storing to s3://tally-ai-dspt3/yelp-restaurants/reviews.parquet.gzip
