# Data Collector Package
Designed to collect data needed for Austin Drinks

---

Imports and Constants:
* Imports the necessary packages and modules

Utility Methods:
* Methods for saving and loading data, logging errors, cleaning strings, and other miscellaneous methods

Gather Establishment Information Methods:
* Methods designed to work with yelp-specific website layout

Web Driver Method:
* Method that creates a BeautifulSoup object and returns it to the caller

Data Collecting Methods:
* Higher-level methods that utilize other package methods to complete a task or sequence of tasks


---


# TODO:
---
General:
* Clean up code to make consistent and add/remove comments where necessary
---
Utility Methods:
---
Gather Establishment Information Methods:
* Apply the Extract Method refactoring so that each component of information is gathered in its own method
---
Web Driver Method:
* add code using proxies to speed up the data collection process and avoid being flagged
---
Data Collecting Method:
---
Google:
* start probing Google to see how to scrape data
---

# Imports and Constants

In [None]:
from selenium.webdriver.support.ui import WebDriverWait       
from selenium.webdriver.common.by import By       
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime as dt
import csv
import io
import json
import os
import random
import re
import time
# Module-level state shared by the methods below; all four are (re)assigned by set_constants().
# SITES: the site configuration dictionary passed to set_constants()
SITES = None
# filenames found in <ROOT>/Data/search_result_links (extended by save_url_list)
URL_LIST_FILENAMES = []
# filenames found in <ROOT>/Data/json_by_city (extended by save_city_json)
CITY_JSON_FILENAMES = []
# ROOT: directory that contains the Data/ folder used for all reads and writes
ROOT = None

# Utility Methods

Methods for saving and loading data, logging errors, and other miscellaneous tasks

---
* __add_filename_to_url_list
* __add_filename_to_city_json_list
* get_url_list_filenames
* get_city_json_filenames
* set_constants
* __errorLog (typically used by other methods to log errors during the web scraping process)
* save_city_csv
* reformat_json
* save_city_json
* load_city_json
* save_url_list
* load_url_list
* clean_string
* clean_review_string
---

In [None]:
def __add_filename_to_url_list(filename):
  '''
  # record filename in URL_LIST_FILENAMES
  # a filename that is already recorded is not added a second time
  '''
  global URL_LIST_FILENAMES
  already_recorded = filename in URL_LIST_FILENAMES
  if already_recorded:
    return
  URL_LIST_FILENAMES.append(filename)

In [None]:
def __add_filename_to_city_json_list(filename):
  '''
  # record filename in CITY_JSON_FILENAMES
  # a filename that is already recorded is not added a second time
  '''
  global CITY_JSON_FILENAMES
  already_recorded = filename in CITY_JSON_FILENAMES
  if already_recorded:
    return
  CITY_JSON_FILENAMES.append(filename)

In [None]:
def get_url_list_filenames():
  '''
  # return the module-level list of url-list filenames
  # the list object itself is returned, not a copy
  '''
  return URL_LIST_FILENAMES

In [None]:
def get_city_json_filenames():
  '''
  # return the module-level list of city json filenames
  # the list object itself is returned, not a copy
  '''
  return CITY_JSON_FILENAMES

In [None]:
def set_constants(dic, root):
  '''
  # initialise the module-level state and print each value as it is set
  # dic  : site configuration dictionary, stored as SITES
  # root : root directory, stored as ROOT; Data/search_result_links and
  #        Data/json_by_city underneath it are listed to fill the filename lists
  '''
  global ROOT, SITES, URL_LIST_FILENAMES, CITY_JSON_FILENAMES

  ROOT = root
  print('root dir: ' + ROOT)

  SITES = dic
  print(SITES)

  url_list_dir = root + '/Data/search_result_links'
  URL_LIST_FILENAMES = os.listdir(url_list_dir)
  print(URL_LIST_FILENAMES)

  city_json_dir = root + '/Data/json_by_city'
  CITY_JSON_FILENAMES = os.listdir(city_json_dir)
  print(CITY_JSON_FILENAMES)

In [None]:
def __errorLog(link, message):
  '''
  # append one entry to Data/error_log.txt under ROOT
  # an entry consists of the current date and time, the link, and the error message
  '''
  log_path = ROOT + '/Data/error_log.txt'
  with open(log_path, 'a', encoding='utf-8') as log_file:
    timestamp = dt.now()
    entry = '{}\t\t{}\n\t{}\n\n'.format(timestamp, link, message)
    log_file.write(entry)

In [None]:
def save_city_csv(filename, data, mode = 'w'):
  '''
  # save collected data for a city to csv under ROOT/Data/csv_by_city
  # filename : target file name; '.csv' is appended when missing
  # data     : iterable of rows, each row holding one value per header column
  # mode     : file mode passed to open ('w' overwrites, 'a' appends)
  # note     : the header row is written on every call, including in append mode
  '''
  global ROOT
  if not filename.endswith('.csv'):
    filename += '.csv'
  # one column per value; 'address' used to be listed twice, which shifted
  # every following column by one relative to the data rows
  header = ['name', 'address', 'city', 'state', 'postal_code',
            'telephone', 'price', 'rating_value', 'review_count', 'status',
            'longitude', 'latitude', 'website', 'tags', 'days',
            'hours', 'accommodations', 'unaccommodations']
  # newline='' lets the csv module control line endings (avoids blank rows on Windows)
  with open(ROOT + '/Data/csv_by_city/' + filename, mode, encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(data)

In [None]:
def reformat_json(data):
  '''
  # rebuild the establishment dictionary with its keys in the preferred reading order
  # only the keys listed below are carried over; a missing key raises KeyError
  '''
  ordered_keys = (
    'name', 'open_for_business', 'tags', 'website',
    'longitude', 'latitude',
    'address', 'postal_code', 'city', 'state',
    'telephone', 'price', 'hours',
    'accommodations', 'unaccommodations',
    'rating_value', 'review_count', 'reviews',
  )
  return {key: data[key] for key in ordered_keys}

In [None]:
def save_city_json(filename, data):
  '''
  # write data as indented, non-ASCII-escaped JSON to ROOT/Data/json_by_city
  # '.json' is appended to filename when missing
  # the saved filename is then recorded in the city json filename list
  '''
  if not filename.endswith('.json'):
    filename = filename + '.json'
  target = ROOT + '/Data/json_by_city/' + filename
  with io.open(target, 'w', encoding='utf-8') as out_file:
    json.dump(data, out_file, ensure_ascii = False, indent = 4)
  print('city json saved to: ' + target)
  __add_filename_to_city_json_list(filename)

In [None]:
def load_city_json(filename):
  '''
  # load and return JSON data from a file in ROOT/Data/json_by_city
  # filename : name of the file; '.json' is appended when missing, which
  #            mirrors save_city_json so the same name works for both
  # returns  : the decoded JSON content
  '''
  global ROOT
  if not filename.endswith('.json'):
    filename += '.json'
  with io.open(ROOT + '/Data/json_by_city/' + filename, 'r', encoding='utf-8') as f:
    data = json.load(f)
  return data

In [None]:
def save_url_list(filename, data, mode = 'w'):
  '''
  # write the urls in data, one per line, to a text file in ROOT/Data/search_result_links
  # so they do not have to be scraped again; the caller chooses the file name
  # ('.txt' is appended when missing) and the saved name is recorded in the url list filenames
  '''
  if not filename.endswith('.txt'):
    filename = filename + '.txt'
  target = ROOT + '/Data/search_result_links/' + filename
  with open(target, mode, encoding='utf-8') as out_file:
    out_file.writelines(url + '\n' for url in data)
  print('URLs saved to: ' + target)
  __add_filename_to_url_list(filename)

In [None]:
def load_url_list(filename):
  '''
  # return the urls stored in a text file in ROOT/Data/search_result_links as a list
  # '.txt' is appended to filename when missing; newlines are stripped from each entry
  '''
  if not filename.endswith('.txt'):
    filename = filename + '.txt'
  source = ROOT + '/Data/search_result_links/' + filename
  with open(source, 'r', encoding='utf-8') as in_file:
    return [re.sub('\n', '', raw_line) for raw_line in in_file]

In [None]:
def clean_string(string):
  '''
  # return string with '&amp;' replaced by 'and', '&apos;' replaced by an
  # apostrophe, and newlines removed
  '''
  substitutions = (
    ('&amp;', 'and'),
    ('&apos;', '\''),
    ('\n', ''),
  )
  cleaned = string
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

In [None]:
def clean_review_string(string):
  '''
  # return review text with '&amp;' replaced by 'and', '&apos;' by an apostrophe,
  # '&quot;' by a double quote, and each newline by a single space
  '''
  substitutions = (
    ('&amp;', 'and'),
    ('&apos;', '\''),
    ('&quot;', '"'),
    ('\n', ' '),
  )
  cleaned = string
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

# Gather Establishment Information Methods

Methods that accommodate each website's individual layout to obtain establishment information

---
* __get_value_in_SITES
* __get_value_from_json
* __get_yelp_est_info
---

In [None]:
def __get_value_in_SITES(site, key):
  '''
  # look up SITES[site][key]
  # returns None when either the site or the key is not in the dictionary
  '''
  if site not in SITES or key not in SITES[site]:
    return None
  return SITES[site][key]

In [None]:
def __get_value_from_json(data, first_layer_key, second_layer_key = None):
  '''
  # fetch data[first_layer_key], or data[first_layer_key][second_layer_key]
  # when a second_layer_key is given
  # string values are passed through clean_string, other values are returned as they are
  # returns '' when a requested key is missing; a missing second_layer_key used to
  # fall through and return the whole first-layer value instead
  '''
  if first_layer_key not in data:
    return ''
  value = data[first_layer_key]
  if second_layer_key is not None:
    # the nested lookup only makes sense on a dictionary; anything else counts as missing
    if not isinstance(value, dict) or second_layer_key not in value:
      return ''
    value = value[second_layer_key]
  if isinstance(value, str):
    return clean_string(value)
  return value

In [None]:
def __get_yelp_est_info(site, link):
  '''
  # Method specifically tailored to yelp for collecting data from an establishment's page
  #
  # site : key into the SITES dictionary; its entry supplies the tag names, attributes
  #        and marker strings used to locate each piece of information on the page
  # link : url of the establishment's page
  #
  # returns a tuple (infoCSV, infoJSON)
  #   infoCSV  : list of values in this order: name, address, city, state, postal_code,
  #              telephone, price, rating_value, review_count, open_for_business,
  #              longitude, latitude, website, tags, days, hours,
  #              accommodations, unaccommodations
  #              (tags, days, hours and both accommodation columns are '|' separated strings)
  #   infoJSON : the same information as a dictionary, passed through reformat_json
  # if the page cannot be loaded the error is logged and ([], {}) is returned
  '''
  global SITES
  infoJSON = {}
  infoCSV = []
  html = getBS(link, site)
  if html is None:
    print('Error getting BeautifulSoup object for ' + link)
    __errorLog(link, 'Error getting BeautifulSoup object in __get_yelp_est_info')
    return infoCSV, infoJSON

  # get script tags of interest
  # which tag in the list of tags do we need?
  # whichever contains 'name' in the first set of keys for the json.
  siteJSON = None
  for tag in html.find_all('script', {'type' : 'application/ld+json'}):
    try:
      candidate = json.loads(''.join(tag.contents))
    except ValueError:
      # not valid JSON, try the next script tag
      continue
    if isinstance(candidate, dict) and 'name' in candidate:
      siteJSON = candidate
      break

  # TODO: EXTRACT INFORMATION FROM THE PAGE ITSELF IF JSON CANNOT BE FOUND
  # until then an empty dictionary makes every JSON lookup below yield '',
  # which keeps the csv columns aligned and every key reformat_json needs present
  if siteJSON is None:
    siteJSON = {}

  # name of establishment, taken from the page itself when the JSON has none
  infoJSON['name'] = __get_value_from_json(siteJSON, 'name')
  if not infoJSON['name']:
    name_tag = html.find(SITES[site]['establishment_name_tag'], SITES[site]['establishment_name_tag_attr_and_def'])
    infoJSON['name'] = clean_string(name_tag.text) if name_tag is not None else ''
  infoCSV.append(infoJSON['name'])
  print('Collecting information for ' + str(infoJSON['name']))

  # address
  infoJSON['address'] = __get_value_from_json(siteJSON, 'address', 'streetAddress')
  if isinstance(infoJSON['address'], str) and 'Get Directions' in infoJSON['address']:
    infoJSON['address'] = ''
  infoCSV.append(infoJSON['address'])

  # city
  infoJSON['city'] = __get_value_from_json(siteJSON, 'address', 'addressLocality')
  infoCSV.append(infoJSON['city'])

  # state
  infoJSON['state'] = __get_value_from_json(siteJSON, 'address', 'addressRegion')
  infoCSV.append(infoJSON['state'])

  # postal code
  infoJSON['postal_code'] = __get_value_from_json(siteJSON, 'address', 'postalCode')
  infoCSV.append(infoJSON['postal_code'])

  # phone number
  infoJSON['telephone'] = __get_value_from_json(siteJSON, 'telephone')
  infoCSV.append(infoJSON['telephone'])

  # dollar signs/price range
  infoJSON['price'] = __get_value_from_json(siteJSON, 'priceRange')
  infoCSV.append(infoJSON['price'])

  # review rating
  infoJSON['rating_value'] = __get_value_from_json(siteJSON, 'aggregateRating', 'ratingValue')
  infoCSV.append(infoJSON['rating_value'])

  # number of reviews
  infoJSON['review_count'] = __get_value_from_json(siteJSON, 'aggregateRating', 'reviewCount')
  infoCSV.append(infoJSON['review_count'])

  # reviews on establishment (json only, not part of the csv row)
  infoJSON['reviews'] = __get_value_from_json(siteJSON, 'review')

  # for some reason in Yelp's json data
  # each reviewer's rating value is nested into another dictionary by itself
  # let's pull it out and have one less layer deep to go for it..
  for review in infoJSON['reviews']:
    value = review['reviewRating']['ratingValue']
    review['reviewRating'] = value
    review['description'] = clean_review_string(review['description'])

  # menu or cuisine, if available (collected here but not kept by reformat_json)
  infoJSON['cuisine'] = __get_value_from_json(siteJSON, 'servesCuisine')

  # end json data extraction
  # everything below is extracted from the page because it is not in the json

  page_text = html.text

  # check if the establishment is permanently closed
  if SITES[site]['status_text'] in page_text.lower():
    infoJSON['open_for_business'] = 0
  else:
    infoJSON['open_for_business'] = 1
  infoCSV.append(infoJSON['open_for_business'])

  # map location
  tag = html.find(SITES[site]['location_tag'], SITES[site]['location_tag_attr_and_def'])
  longitude = 0.0
  latitude = 0.0
  img = tag.find('img') if tag is not None else None
  src = img.get('src') if img is not None else None
  if src:
    # %2C is the divider between longitude and latitude
    # there are multiple coordinates in this string
    # and the correct coordinates follow .png towards the end of the string.
    # layout: .png%7C00.000000%2C-00.000000&
    # .png%7C[]%2C[]&
    # NOTE(review): in the layout above the second value is the negative one; if yelp
    # lists latitude first, the two names below are swapped - confirm against a known address
    regex = re.search(SITES[site]['location_string_start'] + '+[A-Z0-9%.-]+' + SITES[site]['location_string_end'], src)

    if regex is not None:
      regex_str = regex.group(0)
      regex_str = re.sub(SITES[site]['location_string_start'], '', regex_str)
      regex_str = re.sub(SITES[site]['location_string_end'], '', regex_str)
      long_lat = regex_str.split(SITES[site]['location_string_delimiter'])
      if len(long_lat) >= 2:
        longitude = long_lat[0]
        latitude = long_lat[1]
  infoJSON['longitude'] = float(longitude)
  infoJSON['latitude'] = float(latitude)
  infoCSV.append(longitude)
  infoCSV.append(latitude)

  # website
  # <div class = ' css-1vhakgw border--top__373c0__19Owr border-color--default__373c0__2oFDT'>
    # <a>
  tag = html.find(SITES[site]['business_url_tag'], SITES[site]['business_url_tag_attr_and_def'])
  website = ''
  if tag is not None and tag.find('a'):
    website = tag.find('a').get_text()
  infoJSON['website'] = website
  infoCSV.append(website)

  # est. type.
  tags = html.find_all(SITES[site]['est_tags_tag'], SITES[site]['est_tags_tag_attr_and_def'])
  biz_tags = []
  tag_str = ''
  for tag in tags:
    for anc in tag.find_all('a'):
      anc_text = anc.get_text()
      if anc_text != 'Unclaimed':
        tag_str += anc_text + '|'    # cannot use commas as a delimiter because we are currently saving to csv format.
        biz_tags.append(anc_text)

  infoJSON['tags'] = biz_tags
  infoCSV.append(tag_str)

  # hours of operations, if available
    # this is specifically for yelp
    # <table class = ' hours-table__373c0__1S9Q_ table__373c0__1paZL table--simple__373c0__3hEOO' ...>
    # yes, there is a space at the front
      # <tr class = ' table-row__373c0__3xT3x' ...>
        # <th class = ' table-header-cell__373c0__OywTx' ...> gets the day of the week
        # <td class = ' table-cell__373c0__Hc_7A table-cell--top__373c0__1hgmO' ...> gets the hours of the day
        # and td of same class above also gets for current day "Open", "Closed now", etc.. if text is present, blank otherwise.

  days_hours = {}
  day = ''
  hours = ''
  table = html.find(SITES[site]['hoO_tag'], SITES[site]['hoO_tag_attr_and_def'])
  tbody = table.find('tbody') if table is not None else None
  if tbody is not None and tbody.contents:
    table_row = table.find_all('tr', SITES[site]['hoO_row_tag_attr_and_def'])
    for row in table_row:
      day_str = ''
      hours_str = ''
      # get th and td
      if row.find('th'):
        day_str = row.find('th').get_text()
        day += day_str + '|'
      else:
        day += '|'
      if row.find('td'):
        hours_str = row.find('td').get_text()
        # NOTE(review): the parentheses here are a regex group, so only the words
        # 'Next day' are removed and any literal parentheses in the text remain - confirm intent
        hours_str = re.sub('(Next day)', '', hours_str)
        hours += hours_str + '|'
      else:
        hours += '|'
      days_hours[day_str] = hours_str
  else:
    # no hours table: empty columns for 'day' and 'hours'.
    day = '||||||'
    hours = '||||||'

  # even if days_hours is empty, append it to the json file.
  infoJSON['hours'] = days_hours
  infoCSV.append(day)
  infoCSV.append(hours)

  # Amenities, if it exists.
  accommodation_list = []
  unaccommodation_list = []
  if SITES[site]['amenities_header_text'] in page_text:
    # what the company accommodates
    amenities_tags = html.find_all(SITES[site]['amenities_tag'], SITES[site]['amenities_tag_attr_and_def'])
    # what the company does not accommodate
    anti_amenities_tags = html.find_all(SITES[site]['anti_amenities_tag'], SITES[site]['anti_amenities_tag_attr_and_def'])
    accommodation_list = [tag.text for tag in amenities_tags]
    unaccommodation_list = [tag.text for tag in anti_amenities_tags]
  # every entry is followed by the '|' delimiter, including the last one
  accomm_str = ''.join(text + '|' for text in accommodation_list)
  unaccom_str = ''.join(text + '|' for text in unaccommodation_list)
  infoJSON['accommodations'] = accommodation_list
  infoJSON['unaccommodations'] = unaccommodation_list
  infoCSV.append(accomm_str)
  infoCSV.append(unaccom_str)

  # reformat the json structure for readability before returning the object
  return infoCSV, reformat_json(infoJSON)
  # end method

# Web Driver Creates BeautifulSoup Object Method
Method that creates a BeautifulSoup object and returns it to the caller

---
* __getBS_yelp_button_press
* getBS
* get_search_result_links
---

In [None]:
def __getBS_yelp_button_press(driver, url, source, site):
  '''
  # Method utilized to press the Attributes button to reveal all Attributes for an establishment
  #
  # driver : selenium web driver that already has the page at url loaded
  # url    : url of the page, used for error logging only
  # source : BeautifulSoup object of the page as it was loaded
  # site   : key into SITES that describes the button to look for
  #
  # returns a BeautifulSoup object built from the page after the click;
  # if every attempt fails the error is logged and source is returned unchanged
  '''
  global SITES

  hider_attr = SITES[site]['amenities_reveal_button_hider_attr']

  # fall back to the page as first loaded; previously this name was left
  # unassigned when all attempts failed and the return raised UnboundLocalError
  new_source = source

  # we repeat the process using the existing driver to click the button
  attempts = 0
  max_attempts = 2
  unsuccessful = True
  while unsuccessful:
    attempts += 1
    try:
      # the attribute looked up here is unique to the button we are looking for..
      buttons = source.find_all('button', {SITES[site]['amenities_reveal_button_tag_attr'] : SITES[site]['amenities_reveal_button_tag_attr_def']})
      for button in buttons:
        if SITES[site]['amenities_reveal_button_text'] in button.text:
          driver.execute_script(
              "arguments[0].click();",
              WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
                  (By.CSS_SELECTOR,
                   'button[' + hider_attr + '=' + button[hider_attr] + ']'))))
          time.sleep(random.randint(3,6))
          break
    # Exception rather than BaseException so Ctrl-C is not swallowed and retried
    except Exception as ex:
      if attempts < max_attempts:
        print('Unable to click button. Retrying in 30 seconds. \
              See exception repr for details: ')
        print(repr(ex))      # print object as string
        time.sleep(30)
      else:
        print('Too many attempts to click button, breaking loop')
        #log error to file.
        __errorLog(url, 'Exceeded click button request limit to extract Amenities in getBS_yelp_button_press\n' + repr(ex))
        break
    else:
      unsuccessful = False
      new_source = BeautifulSoup(driver.page_source, 'html.parser')

  return new_source


In [None]:
def getBS(url, site = None):
  '''
  # Selenium web driver is used to get a webpage's html and parse it to a BeautifulSoup object
  #
  # url  : address of the page to load
  # site : optional key into SITES; when it is 'yelp', the url contains 'yelp' and the
  #        page shows the amenities header, the amenities button is pressed before parsing
  #
  # returns the BeautifulSoup object, or None when the page could not be opened
  # after 4 attempts (the failure is written to the error log)
  '''
  global SITES

  chrome_options = Options()
  chrome_options.add_argument('--headless') # don't spawn window
  chrome_options.add_argument('--no-sandbox')
  chrome_options.add_argument('--disable-dev-shm-usage')
  # written as a regular --switch=value; with spaces around '=' the switch name
  # does not match and the user agent is not applied
  chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36')
  # NOTE(review): executable_path is deprecated in newer selenium releases - confirm the installed version
  driver = webdriver.Chrome(executable_path = '/usr/lib/chromium-browser/chromedriver', options = chrome_options)
  print('Building BS object at url @ ' + url)

  # first, get the page where the button we want is at
  source = None
  attempts = 0
  max_attempts = 4
  unsuccessful = True
  try:
    while unsuccessful:
      attempts += 1
      try:
        driver.get(url)
        time.sleep(random.randint(10,14))
      # Exception rather than BaseException so Ctrl-C is not swallowed and retried
      except Exception as ex:
        if attempts < max_attempts:
          print('Unable to open url. Retrying in 30 seconds. \
               See exception repr for details: ')
          print(repr(ex))      # print object as string
          time.sleep(30)
        else:
          print('Too many attepts, breaking loop')
          #log error to file.
          __errorLog(url, 'Too many attempts\n' + repr(ex))
          break
      else:
        unsuccessful = False
        source = BeautifulSoup(driver.page_source, 'html.parser')

        # second, if we are interested in clicking a button,
        # we repeat the process using the existing driver to click the button
        if site == 'yelp' and 'yelp' in url and SITES[site]['amenities_header_text'] in source.text:
          source = __getBS_yelp_button_press(driver, url, source, site)
  finally:
    # always end the browser session, even when parsing or the button press raises
    driver.quit()
  return source

In [None]:
# NOTE: this function works for yelp

def get_search_result_links(url_curr, lst, site):
  '''
  # fills lst with a list of complete url's to each establishment's review site
  # methods in place to skip potential anchor tags.
  #
  # url_curr : url of the first search result page
  # lst      : list that is extended in place; nothing is returned
  # site     : key into SITES describing the search result layout
  #
  # every following result page is visited through its "next page" anchor until there
  # is none; a page that cannot be loaded is logged and ends the collection
  '''
  from urllib.parse import urljoin   # local import: resolves a relative next-page href

  site_info = SITES[site]
  ad_marker = site_info.get('ad_characters')
  unwanted = site_info.get('remove_unwanted_characters_in_url')
  visited = set()

  # pages are walked in a loop rather than by recursion so a long
  # result set cannot exhaust the recursion limit
  while True:
    print('Gathering search result urls @ ' + url_curr)
    visited.add(url_curr)
    bs = getBS(url_curr)
    if bs is None:
      print('Error getting BeautifulSoup object for ' + url_curr)
      __errorLog(url_curr, 'Error getting BeautifulSoup object in get_search_result_links')
      return

    tags = bs.find_all(site_info['search_results_tags'], site_info['get_search_results_ext_tag_and_attribute'])
    for tag in tags:
      anchor = tag.find('a')
      href = anchor.get('href') if anchor is not None else None
      if not href:
        # a result tag without a usable anchor has nothing to collect
        continue
      # check if advertisement is expected in search results based on SITES
      # and skip this anchor tag if it contains advertisement string in the href
      if ad_marker is not None and ad_marker in href:
        continue
      url_ext = href
      if unwanted is not None:
        url_ext = re.sub(unwanted, '', url_ext)
      # check if the href contains the full url or extension only
      if site_info['url'] in url_ext:
        lst.append(url_ext)
      else:
        lst.append(site_info['url'] + url_ext)

    # get the next page anchor on current page and continue there until there are no more pages.
    next_page = bs.find('a', site_info['next_page_tag_attr_and_def'])
    if next_page is None or not next_page.get('href'):
      return
    next_url = urljoin(url_curr, next_page['href'])
    if next_url in visited:
      # a "next" link that leads back to a page already processed would loop forever
      return
    url_curr = next_url

  # end method

# Data Collecting Methods

These methods are the ones the user calls to access the methods above.
While the methods above can still be used directly, these are packaged methods that call them in sequence to collect data from yelp, google, etc.

---
Methods:
* collect_city_establishments_information
* collect_establishment_information
* get_search_result_url_list
---

In [None]:
def collect_city_establishments_information(site, city = 'austin', state_abbr = 'TX'):
  '''
  # collects information for every establishment whose url is stored in the
  # saved url lists of a city, then saves the result as json and as csv
  #
  # site       : site to collect from; only 'yelp' is handled, for any other
  #              value nothing is collected and empty files are saved
  # city       : city name; spaces are replaced with underscores to match the file names
  # state_abbr : state abbreviation; converted to upper case
  #
  # output files: <city>_<STATE>.json and <city>_<STATE>.csv
  '''
  global SITES, URL_LIST_FILENAMES

  city = re.sub(' ', '_', city)
  state_abbr = state_abbr.upper()
  base_name = city + '_' + state_abbr

  # get the url list files for this city (e.g. one for brewery, one for distillery, one for winery)
  filenames = [filename for filename in URL_LIST_FILENAMES if filename.startswith(base_name)]

  # merge the urls of all files, keeping the first occurrence of each url
  url_list = []
  seen = set()
  for filename in filenames:
    for url in load_url_list(filename):
      if url not in seen:
        seen.add(url)
        url_list.append(url)

  csv_mat = []
  json_list = []
  # retrieves yelp establishment information
  if site == 'yelp':
    for url in url_list:
      # receive csvlst and jsondict for one establishment
      csv_list, json_dict = __get_yelp_est_info(site, url)
      # leave out establishments for which nothing was collected
      if csv_list:
        csv_mat.append(csv_list)
      if json_dict:
        json_list.append(json_dict)

  # end collecting, save json_list to json file.
  save_city_json(base_name + '.json', json_list)

  # end collecting, save csv_mat to csv file
  # (this used to pass a '.json' name, which produced '<city>_<STATE>.json.csv')
  save_city_csv(base_name + '.csv', csv_mat)

In [None]:
def collect_establishment_information(site, link):
  '''
  # collect the data of a single establishment page for the chosen 'site'
  # returns 'infoCSV' and 'infoJSON' as produced by the site specific method
  # NOTE: only yelp is handled; for any other site None is returned
  # handy for one-off collections of a single establishment
  '''
  print('Attempting to collect from ' + link)

  if site != 'yelp':
    return None

  # retrieves yelp establishment information
  return __get_yelp_est_info(site, link)

In [None]:
def get_search_result_url_list(site = 'yelp', city = 'austin', state_abbr = 'TX', est_type = 'distilleries'):
  '''
  # returns a list of urls for all establishments in a search result based on SITES dictionary
  #
  # site       : site to search; only 'yelp' is supported
  # city       : city name; spaces and underscores become '+' in the search url
  # state_abbr : state abbreviation; converted to upper case
  # est_type   : search term for the kind of establishment
  #
  # the collected urls are also saved to <city>_<STATE>_<est_type>.txt through save_url_list
  # raises ValueError for an unsupported site; this used to fail later with an
  # unrelated TypeError because no search url had been built
  '''
  if site != 'yelp':
    raise ValueError('unsupported site for search results: ' + repr(site))

  url_list = []
  state_abbr = state_abbr.upper()
  city = re.sub('[ _]', '+', city) # replace spaces with plus signs for url text
  link = 'https://www.yelp.com/search?find_desc=' + est_type + '&find_loc=' + city + '%2C+' + state_abbr
  get_search_result_links(link, url_list, site)    # returns nothing but populates url_list
  city = re.sub('[+]', '_', city) # replace plus sign with underscore for filenaming
  save_url_list(city + '_' + state_abbr + '_' + est_type +'.txt', url_list)
  return url_list