In [1]:
import os
import boto3
import codecs
import json
import time
import datetime

#### Define constants

In [2]:
# Name of the S3 bucket that holds the raw dataset files.
bucket = 'yummy-nlp'

# Local directory in which every downloaded and derived file is stored.
data_dir = 'data'

# Raw inputs, the restaurants-only subset, and the two review-text outputs;
# each path is the file name joined onto data_dir.
(review_filepath,
 business_filepath,
 restaurants_filepath,
 review_txt_filepath,
 review_txt50_filepath) = (
    os.path.join(data_dir, filename)
    for filename in (
        'review.json',
        'business.json',
        'restaurants.json',
        'review_txt_az.txt',
        'review_txt_az50.txt',
    )
)

### Function to Download file from an S3 bucket
Function takes **3** inputs:
- `bucket` name of the S3 bucket to download from
- `s3_filename` name of the file inside the bucket. For example to download `s3://tasty-nlp/data/review.json`, set `s3_filename` to _data/review.json_
- `local_filename` name of the file locally. To download the file into a sub-directory, prefix the filename with the sub-directory name. For example, to download it into the `data` sub-directory, set `local_filename` to `'data/review.json'`

**Note:** The directory must exist before downloading the file

In [3]:
def download_s3_file(bucket, s3_filename, local_filename):
    """Download one object from an S3 bucket into a local file.

    Args:
        bucket: Name of the S3 bucket to download from.
        s3_filename: Name (key) of the file inside the bucket.
        local_filename: Local path the file is saved to. Per the note that
            accompanies this function, its directory must already exist.
    """
    source_bucket = boto3.resource('s3').Bucket(bucket)
    source_bucket.download_file(s3_filename, local_filename)

### Download Business json file
This file contains information regarding a business. Let's download it from `s3://yummy-nlp/business.json` and save it locally in the `data` directory

In [4]:
# Key of the object inside the bucket.
s3_filename = 'business.json'
# Save to the shared business_filepath constant rather than a second
# hard-coded path, so the file lands exactly where the later cells read it
# from even if data_dir is changed.
local_filename = business_filepath

download_s3_file(bucket, s3_filename, local_filename)

# Report whether the file is now present on disk.
print("s3://{}/{} file downloaded successfully: {}"
      .format(bucket, s3_filename, os.path.exists(local_filename)))

s3://yummy-nlp/business.json file downloaded successfully: True


#### Read the first business record to see what information it has

In [5]:
# Peek at the first line of business.json to see which fields a record has.
with codecs.open(business_filepath, 'r', 'utf-8') as business_file:
    first_rec = business_file.readline()

print(first_rec)

{"business_id":"1SWheh84yJXfytovILXOAQ","name":"Arizona Biltmore Golf Club","address":"2818 E Camino Acequia Drive","city":"Phoenix","state":"AZ","postal_code":"85016","latitude":33.5221425,"longitude":-112.0184807,"stars":3.0,"review_count":5,"is_open":0,"attributes":{"GoodForKids":"False"},"categories":"Golf, Active Life","hours":null}



### Download Review json file
This file contains information regarding a review. Let's download it from `s3://yummy-nlp/review.json` and save it locally in the `data` directory

**Note:** This file is **5GB** in size

In [6]:
%%time
# Key of the object inside the bucket.
s3_filename = 'review.json'
# Save to the shared review_filepath constant rather than a second
# hard-coded path, so the file lands exactly where the later cells read it
# from even if data_dir is changed.
local_filename = review_filepath

download_s3_file(bucket, s3_filename, local_filename)

# Report whether the file is now present on disk.
print("s3://{}/{} file downloaded successfully: {}"
      .format(bucket, s3_filename, os.path.exists(local_filename)))

s3://yummy-nlp/review.json file downloaded successfully: True
CPU times: user 30.2 s, sys: 27.6 s, total: 57.8 s
Wall time: 4min 51s


#### Read the first review record to see what information it has

In [7]:
# Peek at the first line of review.json to see which fields a record has.
with codecs.open(review_filepath, 'r', 'utf-8') as review_file:
    first_rec = review_file.readline()

print(first_rec)

{"review_id":"Q1sbwvVQXV2734tPgoKj4Q","user_id":"hG7b0MtEbXx5QzbzE6C_VA","business_id":"ujmEBvifdJM6h6RLv4wQIg","stars":1.0,"useful":6,"funny":1,"cool":0,"text":"Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.","date":"2013-05-07 04:34:36"}



### Create restaurants.json
From `business.json`, extract the records of all restaurants and write them to `restaurants.json`. We can then use this file to filter further based on _state_, _stars_, etc. This file will contain information related to restaurants only.

1. Read in each business record and convert it to python dict
2. Filter out business records that aren't about restaurants
3. Create a set of restaurant ids

In [8]:
%%time
# Ids of every business whose categories mention Restaurants; the matching
# raw JSON lines are copied unchanged into restaurants.json as we go.
restaurant_ids = set()

with codecs.open(restaurants_filepath, 'w', encoding='utf-8') as restaurants_out, \
        codecs.open(business_filepath, encoding='utf-8') as business_in:
    for raw_record in business_in:
        # each line of business.json is parsed into a python dict
        record = json.loads(raw_record)
        categories = record[u'categories']

        # skip businesses with no categories or without the Restaurants tag
        if not categories or u'Restaurants' not in categories:
            continue

        restaurants_out.write(raw_record)
        restaurant_ids.add(record[u'business_id'])

print("{} restaurants in the dataset".format(len(restaurant_ids)))

59371 restaurants in the dataset
CPU times: user 4.99 s, sys: 319 ms, total: 5.31 s
Wall time: 5.35 s


### Find restaurants in a particular state (AZ)

In [9]:
%%time
# Scan restaurants.json (one JSON record per line) and keep the id of every
# restaurant whose state, compared case-insensitively, is AZ.
with codecs.open(restaurants_filepath, 'r', 'utf-8') as restaurants_in:
    az_restaurant_ids = set(
        record[u'business_id']
        for record in (json.loads(line) for line in restaurants_in)
        if record[u'state'].upper() == 'AZ'
    )

print("{} restaurants in AZ".format(len(az_restaurant_ids)))

11465 restaurants in AZ
CPU times: user 1.79 s, sys: 38 ms, total: 1.83 s
Wall time: 1.83 s


### Get reviews of restaurants in AZ
From the `review.json` file, read each review and check whether the review's business id is in the **az_restaurant_ids** set that was created in the previous step. If it is, write the review text to the `review_txt_az.txt` file.

In [10]:
%%time
# Output file: one review per line, for restaurants located in AZ.
review_txt_filepath = os.path.join(data_dir, 'review_txt_az.txt')

# Number of reviews written to the output file.
review_count = 0

with codecs.open(review_txt_filepath, 'w', encoding='utf-8') as review_txt_file:
    with codecs.open(review_filepath, encoding='utf-8') as review_json_file:
        for review_json in review_json_file:
            # each line of review.json is parsed into a python dict
            review = json.loads(review_json)

            if review[u'business_id'] in az_restaurant_ids:
                # Collapse every run of whitespace (including '\n' and '\r')
                # into a single space. Simply deleting the line breaks would
                # fuse the last word of one paragraph with the first word of
                # the next, and any leftover line separator would split a
                # review across several lines of the output file.
                review_txt_file.write(u' '.join(review[u'text'].split()))
                review_txt_file.write('\n')
                review_count += 1

print("Text from {} restaurant reviews written to the text file {}".format(review_count, review_txt_filepath))

Text from 1191691 restaurant reviews written to the text file data/review_txt_az.txt
CPU times: user 2min 4s, sys: 5.64 s, total: 2min 10s
Wall time: 2min 12s


### Restaurants in AZ, with 50 or more reviews

In [11]:
%%time
# Ids of AZ restaurants that have at least 50 reviews.
az50_restaurant_ids = set()

with codecs.open(restaurants_filepath, 'r', 'utf-8') as restaurants_in:
    for line in restaurants_in:
        # each line of restaurants.json is parsed into a python dict
        record = json.loads(line)

        # state is compared case-insensitively; anything outside AZ is skipped
        if record[u'state'].upper() != 'AZ':
            continue

        if record[u'review_count'] >= 50:
            az50_restaurant_ids.add(record[u'business_id'])

print("{} restaurants in AZ with 50 or more reviews".format(len(az50_restaurant_ids)))

5216 restaurants in AZ with 50 or more reviews
CPU times: user 1.72 s, sys: 53.9 ms, total: 1.78 s
Wall time: 1.79 s


In [12]:
%%time
# Write to the dedicated review_txt50_filepath constant instead of rebinding
# review_txt_filepath, so that name keeps pointing at review_txt_az.txt after
# this cell has run.

# Number of reviews written to the output file.
review_count = 0

with codecs.open(review_txt50_filepath, 'w', encoding='utf-8') as review_txt_file:
    with codecs.open(review_filepath, encoding='utf-8') as review_json_file:
        for review_json in review_json_file:
            # each line of review.json is parsed into a python dict
            review = json.loads(review_json)

            if review[u'business_id'] in az50_restaurant_ids:
                # Collapse every run of whitespace (including '\n' and '\r')
                # into a single space. Simply deleting the line breaks would
                # fuse the last word of one paragraph with the first word of
                # the next, and any leftover line separator would split a
                # review across several lines of the output file.
                review_txt_file.write(u' '.join(review[u'text'].split()))
                review_txt_file.write('\n')
                review_count += 1

print("Text from {} restaurant reviews written to the text file {}".format(review_count, review_txt50_filepath))

Text from 1075198 restaurant reviews written to the text file data/review_txt_az50.txt
CPU times: user 2min 13s, sys: 7.37 s, total: 2min 20s
Wall time: 2min 22s
