In [1]:
import dask

# This notebook pins dask to one exact release (an exact match, not "latest").
# Fail early with a readable message if the installed version differs.
assert dask.__version__ == "2.17.2", (
    f"Expected dask 2.17.2 but found {dask.__version__}"
)


### Instructions to use in Sagemaker

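To use dask in a SageMaker notebook instance, run the following in a terminal:

1. Activate the Conda Python 3 environment: `source activate python3`
2. Install dask (the version this notebook asserts) and s3fs: `conda install dask=2.17.2 s3fs -c conda-forge -y`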

In [2]:
import boto3
import dask.dataframe as dd
from sagemaker import get_execution_role
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Look up the SageMaker execution role for the current session.
role = get_execution_role()

In [3]:
# Utility functions here

def create_s3_file_url(folder, file_name, bucket='tally-ai-dspt3'):
    """Build the `s3://` URL of an object.

    Parameters
    ----------
    folder : str
        Key prefix ("folder") inside the bucket.
    file_name : str
        Name of the object inside ``folder``.
    bucket : str, optional
        Bucket name; defaults to ``'tally-ai-dspt3'``.

    Returns
    -------
    str
        URL of the form ``s3://<bucket>/<folder>/<file_name>``.
    """
    return f's3://{bucket}/{folder}/{file_name}'


## Loading Yelp `business` data

In [4]:
# Location of the raw Yelp business dump on S3.
folder = 'yelp-kaggle-raw-data'
file_name = 'yelp_academic_dataset_business.json'
s3_business = create_s3_file_url(folder=folder, file_name=file_name)
print(f"Fetching data from {s3_business}")

# Line-delimited JSON, read lazily with a blocksize of 32e6 bytes.
business = dd.read_json(
    s3_business,
    blocksize=32e6,
    lines=True,
)
business.head()

Fetching data from s3://tally-ai-dspt3/yelp-kaggle-raw-data/yelp_academic_dataset_business.json


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,10913 Bailey Rd,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...",f9NumwFMBDn751xgFiRbNA,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...",Cornelius,"{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",1,35.462724,-80.852612,The Range At Lake Norman,28031,36,3.5,NC
1,"8880 E Via Linda, Ste 107","{'GoodForKids': 'True', 'ByAppointmentOnly': '...",Yzvjg0SayhoZgCljUJRF9Q,"Health & Medical, Fitness & Instruction, Yoga,...",Scottsdale,,1,33.569404,-111.890264,"Carlos Santo, NMD",85258,4,5.0,AZ
2,3554 Rue Notre-Dame O,,XNoUzKckATkOD1hP6vghZg,"Pets, Pet Services, Pet Groomers",Montreal,,1,45.479984,-73.58007,Felinus,H4C 1P4,5,5.0,QC
3,1015 Sharp Cir,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",6OAZjbxqM5ol29BuHsil3w,"Hardware Stores, Home Services, Building Suppl...",North Las Vegas,"{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', ...",0,36.219728,-115.127725,Nevada House of Hose,89030,3,2.5,NV
4,4827 E Downing Cir,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",51M2Kk903DFYI6gnB5I6SQ,"Home Services, Plumbing, Electricians, Handyma...",Mesa,"{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",1,33.428065,-111.726648,USE MY GUY SERVICES LLC,85205,26,4.5,AZ


### Dropping Missing Values

Shape before dropping rows with missing `categories`:

In [5]:
# (rows, columns) without collecting the whole frame into one pandas
# DataFrame: len() only counts the rows of each partition.
len(business), len(business.columns)

(209393, 14)

Shape after dropping rows with missing `categories`:

In [6]:
# Discard businesses that have no `categories` value; later cells call
# `.str.contains` on this column.
business = business.dropna(subset=['categories'])

# (rows, columns) without collecting the whole frame into one pandas
# DataFrame: len() only counts the rows of each partition.
len(business), len(business.columns)

(208869, 14)

### List of Unique Categories

In [7]:
# Distinct values of the raw `categories` column; each value is the full
# comma-separated category string of a business.
category_values = business["categories"].unique()
category_values.compute()

0         Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...
1         Health & Medical, Fitness & Instruction, Yoga,...
2                          Pets, Pet Services, Pet Groomers
3         Hardware Stores, Home Services, Building Suppl...
4         Home Services, Plumbing, Electricians, Handyma...
5         Auto Repair, Automotive, Oil Change Stations, ...
6         Dry Cleaning & Laundry, Local Services, Laundr...
7         Auto Repair, Oil Change Stations, Automotive, ...
8         Ethnic Food, Food Trucks, Specialty Food, Impo...
9         Martial Arts, Gyms, Fitness & Instruction, Act...
10                  Contractors, Landscaping, Home Services
11                                  Automotive, Auto Repair
12                Desserts, Food, Ice Cream & Frozen Yogurt
13               Contractors, Home Services, Local Services
14                                   Beauty & Spas, Tanning
15                             Local Services, Self Storage
16                           Shopping, S

## Filtering `restaurants` only

Before filtering for `restaurants` businesses, the number of unique `categories` values is:

In [11]:
# Number of distinct `categories` strings, reported as a 1-tuple shape.
unique_categories = business["categories"].unique().compute()
unique_categories.shape

(102494,)

In [8]:
# Boolean mask: True where the category string mentions "Restaurants".
is_restaurant = business["categories"].str.contains("Restaurants")

# Keep only the matching businesses and preview the first rows.
restaurants = business[is_restaurant]
restaurants.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
8,404 E Green St,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...",pQeaRpvuhoEqudo3uymHIQ,"Ethnic Food, Food Trucks, Specialty Food, Impo...",Champaign,"{'Monday': '11:30-14:30', 'Tuesday': '11:30-14...",1,40.110446,-88.233073,The Empanadas House,61820,5,4.5,IL
20,4508 E Independence Blvd,"{'RestaurantsGoodForGroups': 'True', 'OutdoorS...",CsLQLiRoafpJPJSkNX2h5Q,"Food, Restaurants, Grocery, Middle Eastern",Charlotte,,0,35.194894,-80.767442,Middle East Deli,28205,5,3.0,NC
24,"15480 Bayview Avenue, unit D0110","{'RestaurantsTableService': 'False', 'Restaura...",eBEfgOPG7pvFhb2wcG9I7w,"Restaurants, Cheesesteaks, Poutineries",Aurora,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,44.010962,-79.448677,Philthy Phillys,L4G 7J1,4,4.5,ON
25,300 John Street,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...",lu7vtrp_bE9PnxWfA8g4Pg,"Japanese, Fast Food, Food Court, Restaurants",Thornhill,,1,43.820492,-79.398466,Banzai Sushi,L3T 5W4,7,4.5,ON
30,13071 Yonge Street,"{'Ambience': '{'touristy': False, 'hipster': F...",9sRGfSVEfLhN_km60YruTA,"Persian/Iranian, Turkish, Middle Eastern, Rest...",Richmond Hill,"{'Tuesday': '12:0-21:0', 'Wednesday': '12:0-21...",1,43.947011,-79.454862,Apadana Restaurant,L4E 1A5,3,3.0,ON


After filtering for `restaurants`, the total number of businesses dropped to:

In [9]:
# (rows, columns) without collecting the whole frame into one pandas
# DataFrame: len() only counts the rows of each partition.
len(restaurants), len(restaurants.columns)

(63944, 14)

### Rename `stars` to `business_stars`

In [25]:
# Copy the business-level rating into `business_stars` (appended as the last
# column) and remove the original `stars` column, so the name no longer
# collides with the per-review `stars` column.
restaurants = (
    restaurants
    .assign(business_stars=restaurants['stars'])
    .drop("stars", axis=1)
)
restaurants.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,state,business_stars
8,404 E Green St,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...",pQeaRpvuhoEqudo3uymHIQ,"Ethnic Food, Food Trucks, Specialty Food, Impo...",Champaign,"{'Monday': '11:30-14:30', 'Tuesday': '11:30-14...",1,40.110446,-88.233073,The Empanadas House,61820,5,IL,4.5
20,4508 E Independence Blvd,"{'RestaurantsGoodForGroups': 'True', 'OutdoorS...",CsLQLiRoafpJPJSkNX2h5Q,"Food, Restaurants, Grocery, Middle Eastern",Charlotte,,0,35.194894,-80.767442,Middle East Deli,28205,5,NC,3.0
24,"15480 Bayview Avenue, unit D0110","{'RestaurantsTableService': 'False', 'Restaura...",eBEfgOPG7pvFhb2wcG9I7w,"Restaurants, Cheesesteaks, Poutineries",Aurora,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,44.010962,-79.448677,Philthy Phillys,L4G 7J1,4,ON,4.5
25,300 John Street,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...",lu7vtrp_bE9PnxWfA8g4Pg,"Japanese, Fast Food, Food Court, Restaurants",Thornhill,,1,43.820492,-79.398466,Banzai Sushi,L3T 5W4,7,ON,4.5
30,13071 Yonge Street,"{'Ambience': '{'touristy': False, 'hipster': F...",9sRGfSVEfLhN_km60YruTA,"Persian/Iranian, Turkish, Middle Eastern, Rest...",Richmond Hill,"{'Tuesday': '12:0-21:0', 'Wednesday': '12:0-21...",1,43.947011,-79.454862,Apadana Restaurant,L4E 1A5,3,ON,3.0


## Exporting `business` as CSV to S3

In [26]:
# Destination of the filtered restaurant businesses on S3.
folder = 'yelp-restaurants'
file_name = 'business.csv'
s3_file_name = create_s3_file_url(folder=folder, file_name=file_name)
print(f"Storing to {s3_file_name}")

# single_file=True writes one CSV rather than one file per partition.
restaurants.to_csv(s3_file_name, single_file=True)

Storing to s3://tally-ai-dspt3/yelp-restaurants/business.csv


['tally-ai-dspt3/yelp-restaurants/business.csv']

## Loading Yelp `reviews` ~ 6GB data

In [13]:
# Location of the raw Yelp reviews dump on S3.
folder = 'yelp-kaggle-raw-data'
file_name = 'yelp_academic_dataset_review.json'
s3_reviews = create_s3_file_url(folder, file_name)
print(f"Fetching data from {s3_reviews}")

# `lines=True` is passed explicitly, matching the business loader: chunked
# reading via `blocksize` is only allowed for line-delimited JSON, so the
# requirement is stated here instead of relying on the default.
reviews = dd.read_json(s3_reviews, lines=True, blocksize=32e6)
reviews.head()

Fetching data from s3://tally-ai-dspt3/yelp-kaggle-raw-data/yelp_academic_dataset_review.json


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,-MhfebM0QIsKt87iDN-FNw,0,2015-04-15 05:21:16,0,xQY8N_XvtGbearJ5X4QryQ,2,"As someone who has worked with many museums, I...",5,OwjRMXRC0KyPrIlcjaXeFQ
1,lbrU8StCq3yDfr-QMnGrmQ,0,2013-12-07 03:16:52,1,UmFMZ8PyXZTY2QcwzsfQYA,1,I am actually horrified this place is still in...,1,nIJD_7ZXHq-FX8byPMOkMQ
2,HQl28KMwrEKHqhFrrDqVNQ,0,2015-12-05 03:18:11,0,LG2ZaYiOgpr2DK_90pYjNw,5,I love Deagan's. I do. I really do. The atmosp...,1,V34qejxNsCbcgD8C0HVk-Q
3,5JxlZaqCnk1MnbgRirs40Q,0,2011-05-27 05:30:52,0,i6g_oA9Yf9Y31qt0wibXpw,1,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",0,ofKDkJKXSKZXu5xJNGiiBQ
4,IS4cv902ykd8wj1TR0N3-A,0,2017-01-14 21:56:57,0,6TdNDKywdbjoTkizeMce8A,4,"Oh happy day, finally have a Canes near my cas...",0,UgMW8bLE0QMJDCkQ1Ax5Mg


#### Total Reviews before merging

In [14]:
# (rows, columns) without calling .compute(), which would load the entire
# reviews dataset into memory as one pandas DataFrame just to read .shape.
# len() only counts the rows of each partition.
len(reviews), len(reviews.columns)

(8021122, 9)

## Merging `business` and `reviews` data

In [15]:
# Inner join on `business_id`: keeps only reviews whose business survived the
# restaurant filter, with the business columns attached to every review row.
restaurants_reviews = dd.merge(
    restaurants,
    reviews,
    how="inner",
    on="business_id",
)
restaurants_reviews.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars_x,state,cool,date,funny,review_id,stars_y,text,useful,user_id
0,15 N Butler St,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",5p5YW_5bMOSuLIQjkJPO3w,"Restaurants, Pizza, Italian, American (New)",Madison,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",1,43.077208,-89.381527,Naples 15,53703,247,4.0,WI,3,2015-09-24 01:48:06,0,ueAiZ8f-jH7gKKuG0Bv3Gw,5,The most fantastic Italian food in Madison---d...,3,tdS2jvf9LPfE4XS5Kka9sw
1,15 N Butler St,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",5p5YW_5bMOSuLIQjkJPO3w,"Restaurants, Pizza, Italian, American (New)",Madison,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",1,43.077208,-89.381527,Naples 15,53703,247,4.0,WI,2,2016-02-12 15:07:33,0,7zxEvUQwjXLl3vGl3aYhjQ,5,Naples 15 is wonderful and Chef Salvatore is a...,1,DBtHPn6TOBpXq6BAuFdcow
2,15 N Butler St,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",5p5YW_5bMOSuLIQjkJPO3w,"Restaurants, Pizza, Italian, American (New)",Madison,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",1,43.077208,-89.381527,Naples 15,53703,247,4.0,WI,1,2015-05-13 05:20:33,0,ljP15ya0teoDyZz1Ms1rqQ,4,My friends and I decided to check out Naples 1...,1,8EipJXehR14DD9x9N7RSSw
3,15 N Butler St,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",5p5YW_5bMOSuLIQjkJPO3w,"Restaurants, Pizza, Italian, American (New)",Madison,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",1,43.077208,-89.381527,Naples 15,53703,247,4.0,WI,0,2018-05-29 02:52:02,0,h2bXl2byo--CaBuf8QMUfA,5,I went to Naples 15 on the Friday of Memorial ...,1,6GC4KK_FW1pMGXt7qDUQGg
4,15 N Butler St,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",5p5YW_5bMOSuLIQjkJPO3w,"Restaurants, Pizza, Italian, American (New)",Madison,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",1,43.077208,-89.381527,Naples 15,53703,247,4.0,WI,0,2018-06-09 11:07:48,0,rxO7gNmdpjamcoHPCfdmZg,1,"Food is pricey, quantity moderate and quality ...",1,lX-V2SVG2rWtkeW6LRuCpg


#### Total Restaurant Reviews after merging

In [16]:
# (rows, columns) without calling .compute(), which would load the entire
# merged dataset into memory as one pandas DataFrame just to read .shape.
# len() only counts the rows of each partition.
len(restaurants_reviews), len(restaurants_reviews.columns)

(5055992, 22)

## Filtering out columns from Restaurant Reviews

In [19]:
# Sanity check: list every raw review of one known business.
sample_business_id = "5p5YW_5bMOSuLIQjkJPO3w"
reviews_for_business = reviews[reviews["business_id"] == sample_business_id]
reviews_for_business.compute()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
6122,5p5YW_5bMOSuLIQjkJPO3w,3,2015-09-24 01:48:06,0,ueAiZ8f-jH7gKKuG0Bv3Gw,5,The most fantastic Italian food in Madison---d...,3,tdS2jvf9LPfE4XS5Kka9sw
20739,5p5YW_5bMOSuLIQjkJPO3w,2,2016-02-12 15:07:33,0,7zxEvUQwjXLl3vGl3aYhjQ,5,Naples 15 is wonderful and Chef Salvatore is a...,1,DBtHPn6TOBpXq6BAuFdcow
25501,5p5YW_5bMOSuLIQjkJPO3w,1,2015-05-13 05:20:33,0,ljP15ya0teoDyZz1Ms1rqQ,4,My friends and I decided to check out Naples 1...,1,8EipJXehR14DD9x9N7RSSw
26200,5p5YW_5bMOSuLIQjkJPO3w,0,2018-05-29 02:52:02,0,h2bXl2byo--CaBuf8QMUfA,5,I went to Naples 15 on the Friday of Memorial ...,1,6GC4KK_FW1pMGXt7qDUQGg
27315,5p5YW_5bMOSuLIQjkJPO3w,0,2018-06-09 11:07:48,0,rxO7gNmdpjamcoHPCfdmZg,1,"Food is pricey, quantity moderate and quality ...",1,lX-V2SVG2rWtkeW6LRuCpg
31778,5p5YW_5bMOSuLIQjkJPO3w,2,2016-10-01 00:39:02,0,HWN6IRFZm9NFDwpezGCVNA,4,"I've been to this restaurant a few times now, ...",2,aZdsBvIJR03Vm99mKYBiew
37089,5p5YW_5bMOSuLIQjkJPO3w,0,2018-09-14 15:03:39,0,EAFUfuXcBIVlXNKrgkwEwQ,5,Fantastic Italian food. I am normally skeptica...,1,4oH2DWaYEn8bBFW9w8bScQ
6271,5p5YW_5bMOSuLIQjkJPO3w,0,2013-10-26 01:42:06,1,41CAcEh3lnhCL-Azg1cbRw,5,I had noticed a sign for Naples15 one time whe...,4,rEdLHGNhCfJXSFq4SQXhkQ
13663,5p5YW_5bMOSuLIQjkJPO3w,0,2015-07-17 14:50:55,0,w-tAhWGOV3jFG6FHiz9qPg,5,One of the top 5 meals in my life. The atmosp...,3,LMUn-OsM_AdIipX2CflLgQ
14634,5p5YW_5bMOSuLIQjkJPO3w,0,2017-10-26 00:50:17,0,Rsuzk8b1Jp6d6FAsh9iuYw,2,We had a Groupon for this restaurant and we w...,1,ffA7RP_FY9rKpTi4DRdWeA


In [20]:
# Sanity check: look up one specific review of one known business in the
# merged frame.
sample_business_id = "5p5YW_5bMOSuLIQjkJPO3w"
sample_review_id = "ueAiZ8f-jH7gKKuG0Bv3Gw"

same_business = restaurants_reviews["business_id"] == sample_business_id
same_review = restaurants_reviews["review_id"] == sample_review_id
restaurants_reviews[same_business & same_review].compute()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars_x,state,cool,date,funny,review_id,stars_y,text,useful,user_id
0,15 N Butler St,"{'BusinessAcceptsCreditCards': 'True', 'Busine...",5p5YW_5bMOSuLIQjkJPO3w,"Restaurants, Pizza, Italian, American (New)",Madison,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",1,43.077208,-89.381527,Naples 15,53703,247,4.0,WI,3,2015-09-24 01:48:06,0,ueAiZ8f-jH7gKKuG0Bv3Gw,5,The most fantastic Italian food in Madison---d...,3,tdS2jvf9LPfE4XS5Kka9sw


In [28]:
# Give the per-review rating an unambiguous name.
# The rating's column name depends on when the merge ran:
#   * `stars_y` if the business frame still had its own `stars` column
#     (pandas-style suffixing of the clashing names), or
#   * `stars`   if `stars` had already been renamed to `business_stars`,
#     which is the case when the notebook is run top to bottom.
# Both spellings are mapped; rename() ignores keys that are not present, so
# `review_stars` exists afterwards in either case.
restaurants_reviews = restaurants_reviews.rename(
    columns={'stars_y': 'review_stars', 'stars': 'review_stars'}
)

In [29]:
# Project the merged frame down to the review-level columns, in this order.
cols_to_keep = [
    "review_id",
    "business_id",
    "review_stars",
    "date",
    "text",
]
restaurants_reviews = restaurants_reviews.loc[:, cols_to_keep]
restaurants_reviews.head()

Unnamed: 0,review_id,business_id,review_stars,date,text
0,41CAcEh3lnhCL-Azg1cbRw,5p5YW_5bMOSuLIQjkJPO3w,5,2013-10-26 01:42:06,I had noticed a sign for Naples15 one time whe...
1,w-tAhWGOV3jFG6FHiz9qPg,5p5YW_5bMOSuLIQjkJPO3w,5,2015-07-17 14:50:55,One of the top 5 meals in my life. The atmosp...
2,Rsuzk8b1Jp6d6FAsh9iuYw,5p5YW_5bMOSuLIQjkJPO3w,2,2017-10-26 00:50:17,We had a Groupon for this restaurant and we w...
3,fvhaYWPyjRBK64xh62cpMg,5p5YW_5bMOSuLIQjkJPO3w,1,2014-06-13 00:42:39,Are you kidding me? Their prices are outrageo...
4,dtMfGwj0qcWIwkMORFuiIw,5p5YW_5bMOSuLIQjkJPO3w,5,2016-05-07 20:38:55,"This place is awesome. Salvatore Di Scala, own..."


## Exporting `reviews` as CSV to S3

In [None]:
# Destination of the restaurant reviews on S3.
folder = 'yelp-restaurants'
file_name = 'reviews.csv'
s3_file_name = create_s3_file_url(folder=folder, file_name=file_name)
print(f"Storing to {s3_file_name}")

# single_file=True writes one CSV rather than one file per partition.
restaurants_reviews.to_csv(s3_file_name, single_file=True)

Storing to s3://tally-ai-dspt3/yelp-restaurants/reviews.csv
