# Scraping data from YELP to create a New York City restaurant database
## 1. Importing packages

In [53]:
import json
import requests
import configparser
import os
import time
import logging

## 2. Reading the credentials from the configuration file and exporting them as environment variables

In [54]:
# Load the Yelp credentials from the local config file and publish each one
# as an environment variable of the same name.
config = configparser.ConfigParser()
config.read('./credentials/dwh.cfg')
for credential_name in ('YELP_CLIENT_ID', 'YELP_API_KEY'):
    os.environ[credential_name] = config.get('YELP', credential_name)

## 3. Initializing the variables: the URL of the API, the request headers, the category to search for, and the search locations, which in this case are districts of NYC

In [55]:
 # Yelp business-search endpoint queried by the crawler.
 yelp_url = 'https://api.yelp.com/v3/businesses/search'
 category = 'Restaurants'
 # Every request authenticates with the API key as a bearer token.
 headers = {'Authorization': 'Bearer ' + os.environ['YELP_API_KEY']}
 # Manhattan districts to crawl, one search location per entry.
 # Every entry must be followed by a comma: two adjacent string literals are
 # silently concatenated by Python into a single (invalid) district name.
 districts = [
     'Midtown West',
     'Greenwich Village',
     'East Harlem',
     'Upper East Side',
     'Midtown East',
     'Gramercy',
     'Little Italy',
     'Chinatown',
     'SoHo',
     'Harlem',
     'Upper West Side',
     'Tribeca',
     'Garment District',
     'Stuyvesant Town',
     'Financial District',
     'Chelsea',
     'Morningside Heights',
     'Times Square',
     'Murray Hill',
     'East Village',
     'Lower East Side',
     'Hells Kitchen',
     'Central Park',
 ]

## 4. Running the crawler. The responses are saved locally so that they can later be uploaded manually to S3

In [59]:
# Crawl the Yelp search endpoint district by district and store every page of
# results as a JSON Lines file (one business per line) under
# ./data/<run_time>/<district>/restaurants-<district>-<page>.json
run_time = int(time.time())  # To know when we ran the crawler. It'll be the name of the destination folder.

# Query settings shared by every request.
term = 'Restaurants'
search_limit = 20       # businesses requested per page
max_pages = 50          # upper bound on the pages requested per district
# NOTE(review): Yelp documents `categories` as a comma-delimited list of
# category aliases (e.g. 'restaurants'); confirm this value filters as intended.
categories = '(restaurants, ALL)'
sort_by = 'best_match'
request_timeout = 30    # seconds; without a timeout a stalled connection hangs forever

response = None  # last HTTP response received, kept for the debug summary below

for district, district_name in enumerate(districts):
    logging.info('-------------------\n Gathering data for {} \n ---------------------\n'.format(district_name))

    # requests URL-encodes the query parameters itself, so the location is
    # passed verbatim; replacing spaces with '+' beforehand would send
    # literal plus signs to the API.
    location = district_name + ', Manhattan, NY'
    district_slug = district_name.replace(' ', '-')
    directory = './data/{}/{}/'.format(run_time, district_slug)

    for step in range(max_pages):
        offset = step * search_limit
        url_params = {
            'location': location,
            'term': term,
            'limit': search_limit,
            'offset': offset,
            'categories': categories,
            'sort_by': sort_by,
        }

        response = requests.get(yelp_url, headers=headers, params=url_params,
                                timeout=request_timeout)

        try:
            payload = response.json()
        except ValueError:
            # The body is not JSON (e.g. an HTML error page): give up on this district.
            logging.warning('Non-JSON response for %s at offset %s: %s',
                            district_name, offset, response)
            break

        # The API signals its result limit with a JSON body that has no
        # 'businesses' key; an empty list means there are no more results.
        # Either way there is nothing left to fetch for this district.
        businesses = payload.get('businesses') if isinstance(payload, dict) else None
        if not businesses:
            break

        # Only now expose the page as `data`, so that `data` always refers to
        # the last page that actually contained businesses.
        data = payload

        logging.info('***** %s Restaurants #%s - #%s ....%s',
                     district_name, offset + 1, offset + search_limit, response)

        filename = 'restaurants-{}-{}.json'.format(district_slug, step)
        os.makedirs(directory, exist_ok=True)

        # This is in order to create a JSONL file, which can be read with a jsonpath file.
        # Records are newline-separated, with no trailing newline after the last one.
        with open(os.path.join(directory, filename), 'w') as data_object:
            data_object.write('\n'.join(json.dumps(business) for business in businesses))

# Debug summary of the last response received (skipped when nothing was requested).
if response is not None:
    logging.info(response)
    logging.info(type(response.text))
    try:
        logging.info(response.json().keys())
    except ValueError:
        logging.info('Last response body was not valid JSON')
    logging.info(response.text[:1000])



## To check one entry:

In [57]:
# Show the first business of the page currently held in `data` (assigned by the
# crawler loop) to inspect the fields of a single record.
data['businesses'][0]

{'id': 'skN-mgnOezFQr9YNh5G5TQ',
 'alias': 'lolitas-kitchen-new-york',
 'name': "Lolita's Kitchen",
 'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/D9LOW5MsId59pGuEBqvk3Q/o.jpg',
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/lolitas-kitchen-new-york?adjust_creative=57uLU1O-AjUbFSI7t6x7lw&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=57uLU1O-AjUbFSI7t6x7lw',
 'review_count': 109,
 'categories': [{'alias': 'mexican', 'title': 'Mexican'},
  {'alias': 'burgers', 'title': 'Burgers'}],
 'rating': 4.0,
 'coordinates': {'latitude': 40.7824207446896, 'longitude': -73.9534502033954},
 'transactions': ['pickup', 'delivery'],
 'price': '$',
 'location': {'address1': '1364 Lexington Ave',
  'address2': '',
  'address3': '',
  'city': 'New York',
  'zip_code': '10128',
  'country': 'US',
  'state': 'NY',
  'display_address': ['1364 Lexington Ave', 'New York, NY 10128']},
 'phone': '+12127225055',
 'display_phone': '(212) 722-5055',
 'distance': 1010.7100795492

0