In [1]:
import boto3
import dask.dataframe as dd
from sagemaker import get_execution_role
import pandas as pd


# Execution role returned by the sagemaker SDK for this notebook.
role = get_execution_role()

# S3 bucket and folder (key prefix) that hold the raw Yelp files.
bucket, folder = 'tally-ai-dspt3', 'yelp-kaggle-raw-data'

print(f"S3 Bucket is {bucket}, and Folder is {folder}")

S3 Bucket is tally-ai-dspt3, and Folder is yelp-kaggle-raw-data


In [2]:
# Run the steps below verbatim in a terminal, except for the kernel restart, which is done from the notebook menu:
# source activate python3
# conda install dask -y
# conda install s3fs -c conda-forge -y
# restart kernel
# Note: to write parquet (an export format for S3 that greatly reduces the size of the dask output files), you also need to run the following command:
# conda install -c conda-forge fastparquet

# Loading Yelp Business `attributes`

In [3]:
# Read the line-delimited business JSON from the raw-data folder, then preview
# the first ten rows.
data_key = 'yelp_academic_dataset_business.json'
data_location = 's3://{}/{}/{}'.format(
    bucket,
    folder,
    data_key,
)
business = dd.read_json(data_location, lines=True)
business.head(n=10)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,10913 Bailey Rd,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...",f9NumwFMBDn751xgFiRbNA,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...",Cornelius,"{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",1,35.462724,-80.852612,The Range At Lake Norman,28031,36,3.5,NC
1,"8880 E Via Linda, Ste 107","{'GoodForKids': 'True', 'ByAppointmentOnly': '...",Yzvjg0SayhoZgCljUJRF9Q,"Health & Medical, Fitness & Instruction, Yoga,...",Scottsdale,,1,33.569404,-111.890264,"Carlos Santo, NMD",85258,4,5.0,AZ
2,3554 Rue Notre-Dame O,,XNoUzKckATkOD1hP6vghZg,"Pets, Pet Services, Pet Groomers",Montreal,,1,45.479984,-73.58007,Felinus,H4C 1P4,5,5.0,QC
3,1015 Sharp Cir,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",6OAZjbxqM5ol29BuHsil3w,"Hardware Stores, Home Services, Building Suppl...",North Las Vegas,"{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', ...",0,36.219728,-115.127725,Nevada House of Hose,89030,3,2.5,NV
4,4827 E Downing Cir,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",51M2Kk903DFYI6gnB5I6SQ,"Home Services, Plumbing, Electricians, Handyma...",Mesa,"{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",1,33.428065,-111.726648,USE MY GUY SERVICES LLC,85205,26,4.5,AZ
5,"1720 W Elliot Rd, Ste 105",{'BusinessAcceptsCreditCards': 'True'},cKyLV5oWZJ2NudWgqs8VZw,"Auto Repair, Automotive, Oil Change Stations, ...",Gilbert,"{'Monday': '7:0-18:0', 'Tuesday': '7:0-18:0', ...",1,33.350399,-111.827142,Oasis Auto Center - Gilbert,85233,38,4.5,AZ
6,"6870 S Rainbow Blvd, Ste 117","{'BusinessParking': '{'garage': False, 'street...",oiAlXZPIFm2nBCt0DHLu_Q,"Dry Cleaning & Laundry, Local Services, Laundr...",Las Vegas,"{'Monday': '7:0-19:0', 'Tuesday': '7:0-19:0', ...",1,36.063977,-115.241463,Green World Cleaners,89118,81,3.5,NV
7,6910 E Southern Ave,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...",ScYkbYNkDgCneBrD9vqhCQ,"Auto Repair, Oil Change Stations, Automotive, ...",Mesa,"{'Monday': '7:30-17:0', 'Tuesday': '7:30-17:0'...",1,33.393885,-111.682226,Junction Tire & Auto Service,85209,18,5.0,AZ
8,404 E Green St,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...",pQeaRpvuhoEqudo3uymHIQ,"Ethnic Food, Food Trucks, Specialty Food, Impo...",Champaign,"{'Monday': '11:30-14:30', 'Tuesday': '11:30-14...",1,40.110446,-88.233073,The Empanadas House,61820,5,4.5,IL
9,700 Kipling Avenue Etobicoke,"{'GoodForKids': 'True', 'ByAppointmentOnly': '...",EosRKXIGeSWFYWwpkbhNnA,"Martial Arts, Gyms, Fitness & Instruction, Act...",Toronto,"{'Monday': '5:30-23:0', 'Tuesday': '5:30-23:0'...",1,43.624539,-79.529108,Xtreme Couture,M8Z 5G3,16,3.0,ON


In order to use dask

1. Activate Conda Python 3 `source activate python3`
2. Install dask and s3fs `conda install dask s3fs -c conda-forge -y`

# Loading Yelp Business `reviews` ~ 6GB data

In [4]:
# Load the Yelp reviews as a partitioned dask dataframe. The full review set is
# very large; the suggestion is to create a subset on RDS/Postgres and load
# only what is necessary.
data_key = 'yelp_academic_dataset_review.json'
data_location = 's3://{}/{}/{}'.format(bucket, folder, data_key)
# Splitting by blocksize only works for line-delimited JSON, so lines=True is
# stated explicitly (matching the business load). blocksize is a byte count and
# is documented as an int, hence 32_000_000 rather than the float 32e6.
reviews = dd.read_json(data_location, lines=True, blocksize=32_000_000)

In [5]:
#reviews.head()

In [6]:
#need to join the three datasets together. A dask join works just like a pandas join
#once joined, filter with restaurants
#once filtered, only have restaurants in database, export as .csv
#once have .csv file load to S3 bucket

In [7]:
# Load the Yelp users as a partitioned dask dataframe. As with the reviews, the
# full file is large; the suggestion is to create a subset on RDS/Postgres and
# load only what is necessary.
data_key = 'yelp_academic_dataset_user.json'
data_location = 's3://{}/{}/{}'.format(bucket, folder, data_key)
# Splitting by blocksize only works for line-delimited JSON, so lines=True is
# stated explicitly (matching the business load). blocksize is a byte count and
# is documented as an int, hence 32_000_000 rather than the float 32e6.
users = dd.read_json(data_location, lines=True, blocksize=32_000_000)

In [8]:
#users.head()

In [9]:
#business_reviews = business.merge(reviews, on = 'business_id', how = 'inner')

In [10]:
#business_reviews.head()

In [11]:
#business_reviews_users = business_reviews.merge(users, on = 'user_id', how = 'inner')

In [12]:
#business_reviews_users.head()

In [13]:
#method to pull unique values out of column 'category' in business
# def unique_col(col):
#     return ','.join(set(col.split(',')))

# x = business.categories.apply(unique_col)

In [14]:
# Discard businesses whose `categories` value is missing, so the string match
# on that column in the next step only sees real values.
business2 = business.dropna(
    subset=['categories'],
)

In [15]:
# Boolean mask: True where the category string mentions 'Restaurants'.
is_restaurant = business2['categories'].str.contains('Restaurants')
restaurants = business2[is_restaurant]

In [16]:
#restaurants.head()

In [17]:
# Attach reviews to restaurants; the inner join keeps only reviews whose
# business_id belongs to a restaurant.
restaurants_reviews = dd.merge(
    restaurants,
    reviews,
    how='inner',
    on='business_id',
)

In [18]:
#restaurants_reviews.head()

In [19]:
# Attach the reviewing user's record to each restaurant review (inner join on
# user_id).
restaurants_reviews_users = dd.merge(
    restaurants_reviews,
    users,
    how='inner',
    on='user_id',
)

In [20]:
# Show every column when a dataframe is displayed (no column truncation).
pd.options.display.max_columns = None

In [21]:
#restaurants_reviews_users.head()

In [22]:
#restaurants_reviews_users1 = restaurants_reviews_users.drop(['address', 'business_id', 'hours', 'is_open', 'postal_code', 'date', 'friends', 'yelping_since'], axis=1)

In [25]:
# Trim further columns before the parquet export: RAM pressure was causing the
# kernel to restart, so the joined frame is made smaller first.
columns_to_drop = [
    'address',
    'hours',
    'is_open',
    'postal_code',
    'date',
    'friends',
    'yelping_since',
    'compliment_cool',
    'compliment_cute',
    'compliment_hot',
    'compliment_list',
    'compliment_more',
    'compliment_note',
    'compliment_photos',
    'compliment_plain',
    'compliment_profile',
]
restaurants_reviews_users2 = restaurants_reviews_users.drop(columns_to_drop, axis=1)

In [None]:
# 'compliment_cool', 'compliment_cute', 'compliment_hot', 'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos', 'compliment_plain', 'compliment_profile', 'compliment_write',

In [None]:
#add the following code to run parquet, in terminal:
#conda install -c conda-forge fastparquet

In [26]:
from fastparquet import ParquetFile

In [None]:
# Export the trimmed restaurants/reviews/users join to S3 as gzip-compressed
# parquet. The destination is derived from `bucket`/`folder` (set in the first
# cell) so it stays in sync with the paths used to load the raw data.
output_key = 'restaurants_reviews_users.parquet.gzip'
output_location = 's3://{}/{}/{}'.format(bucket, folder, output_key)
restaurants_reviews_users2.to_parquet(output_location, compression='gzip')

In [None]:
#dd.to_csv(restaurants_reviews_users,'s3://tally-ai-dspt3/yelp-kaggle-raw-data/restaurants_reviews_users.csv')