### Import Statements

In [1]:
import time
from urllib.parse import parse_qs, urlsplit

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### Defining some global variables for use in all of my functions

In [6]:
# Shared result lists that scrapePage() appends to (one entry per search result).
# They are bound here because a module-level `global` statement does not create a name.
businessName = []
businessType = []
phoneNumber = []
address = []

# NOTE: `driver` (the Selenium WebDriver every function below relies on) is
# assigned in the "Main Run Script" cell before any of the functions are called.

# Frame that collects the scraped businesses until they are written out to CSV.
GOT_Data = pd.DataFrame(data={"Business Name":[], "Business Types": [],"Phone Number": [],"Address":[], "Latitude":[], "Longitude": []})

### Functions for navigating through pages

In [13]:
def openWindow(URL):
    """Open ``URL`` in a new browser window through the shared ``driver``.

    The URL is handed to the page as a script argument rather than being
    concatenated into the JavaScript source: concatenation produced
    ``window.open(https://...)`` (an unquoted URL), which is not valid
    JavaScript, and would also break on URLs containing quotes.

    Args:
        URL: address to open.
    """
    driver.execute_script("window.open(arguments[0]);", URL)


def closeWindow():
    """Run ``window.close()`` in the browser through the shared ``driver``."""
    # Possible alternative: create a second driver (xtraDriver) and load pages with xtraDriver.get instead.
    closeScript = "window.close();"
    driver.execute_script(closeScript)

def nextPage():
    """Advance the search results by clicking the "next" control.

    The control is taken to be the last ``arrange_unit`` element inside the
    ``pagination-block`` container at the bottom of the page.
    """
    pagination = driver.find_element_by_class_name("pagination-block")
    controls = pagination.find_elements_by_class_name("arrange_unit")
    nextButton = controls[-1]
    nextButton.click()


def isLastPage():
    """Tell whether the results page currently shown is the final one.

    Reads the ``page-of-pages`` label inside the ``search-pagination`` block,
    drops its first five characters, splits the remainder on " of " and
    compares the two parts as strings.

    Returns:
        True when the current page number equals the last page number,
        otherwise False.
    """
    pagination = driver.find_element_by_css_selector("div[class='search-pagination']")
    label = pagination.find_element_by_class_name("page-of-pages").text
    pieces = label[5:].split(" of ")
    current, last = pieces[0], pieces[1]
    return current == last
    

def getInnerPageDetails():
    """Read the star rating and review count from the page loaded in ``driver``.

    Uses the singular ``find_element_by_class_name`` lookups: the plural
    variants return a list, which has neither ``get_attribute`` nor ``text``,
    so the previous version raised AttributeError. The values are also
    returned now instead of being discarded.

    Returns:
        A ``(stars, reviewCount)`` tuple of strings, each being the first
        space-separated token of the ``i-stars`` title attribute and of the
        ``review-count`` text respectively.
    """
    starsTitle = driver.find_element_by_class_name("i-stars").get_attribute("title")
    stars = starsTitle.split(" ")[0]

    reviewText = driver.find_element_by_class_name("review-count").text
    reviewCount = reviewText.split(" ")[0]

    return stars, reviewCount



### Scraping Functions

In [12]:
def scrapeInnerPage(WebElement):
    """Open the business page linked from a search-result element.

    Looks up the ``biz-name`` child of ``WebElement``, reads its ``href``
    attribute and passes that URL to ``openWindow``.

    Args:
        WebElement: search-result element containing a ``biz-name`` link.
    """
    # Field checklist kept from the original notes:
    #   BusinessName √
    #   Rating √
    #   Number_of_Reviews √
    #   Address √
    #   Coordinates √
    bizLink = WebElement.find_element_by_class_name("biz-name")
    openWindow(bizLink.get_attribute("href"))

#scrape the data from the page
def _resultPhoneNumber(result):
    """Return the phone number text of a search result, or "N/A" when it has none."""
    try:
        return result.find_element_by_css_selector("span[class='biz-phone']").text
    except Exception:
        # not every business has their phone number listed
        print("No Phone Number")
        return "N/A"


def _resultAddress(result):
    """Return "<street address>, <city or neighborhood>" for a search result, or "N/A"."""
    try:
        justAddress = result.find_element_by_tag_name("address").text
        # append the city to the street address so we have a full address of the business for later
        return justAddress + ", " + result.find_element_by_css_selector("span[class='biz-city']").text
    except Exception:
        try:
            # no city on the listing: fall back to the neighborhood
            print('neighborhood')
            justAddress = result.find_element_by_tag_name("address").text
            return justAddress + ", " + result.find_element_by_css_selector("span[class='neighborhood-str-list']").text
        except Exception:
            # neither lookup worked; it is probably a food truck
            print("Error: Possible Food Truck")
            return "N/A"


def scrapePage():
    """Scrape every regular search result on the page currently loaded in ``driver``.

    For each ``li.regular-search-result`` element exactly one entry is appended
    to each of the shared lists ``businessType`` (a tuple, since some businesses
    have multiple types), ``businessName``, ``phoneNumber`` and ``address``.
    A missing phone number or address is recorded as "N/A".

    The stray ``latitude.append()`` call was removed: ``latitude`` is never
    defined and ``append`` was given no value, so it raised on the first result.
    All four values are read before anything is appended so the lists cannot
    end up with different lengths if a lookup fails part-way through a result.
    """
    for result in driver.find_elements_by_css_selector("li[class='regular-search-result']"):
        categoryList = result.find_element_by_css_selector("span[class='category-str-list']")
        types = tuple(link.text for link in categoryList.find_elements_by_tag_name("a"))
        name = result.find_element_by_class_name("biz-name").find_element_by_tag_name("span").text
        phone = _resultPhoneNumber(result)
        fullAddress = _resultAddress(result)

        businessType.append(types)
        businessName.append(name)
        phoneNumber.append(phone)
        address.append(fullAddress)
        

def getCoordinates(URL_String): #gets coordinates from yelp page
    """Extract the coordinates carried by a URL's ``center`` query parameter.

    Example: ``...&center=40.711898%2C-74.017490&...`` gives
    ``{"lat": "40.711898", "long": "-74.017490"}``.

    The previous version used the bare names ``lat`` and ``long`` as dict keys
    (a NameError) and failed with an unbound local when the URL had no
    ``center`` parameter.

    Args:
        URL_String: a full URL, or just its query string.

    Returns:
        dict with the keys "lat" and "long"; the values are kept as strings.

    Raises:
        ValueError: if there is no ``center`` parameter or it is not of the
            form "<lat>,<long>".
    """
    # Accept a bare query string ("center=...&zoom=...") as well as a full URL.
    query = urlsplit(URL_String).query or URL_String
    centers = parse_qs(query).get("center")  # parse_qs also decodes %2C to ","
    if not centers:
        raise ValueError("no 'center' parameter in URL: %r" % (URL_String,))

    # If the parameter is repeated, the last occurrence wins.
    latitude, separator, longitude = centers[-1].partition(",")
    if not (latitude and separator and longitude):
        raise ValueError("malformed 'center' parameter in URL: %r" % (URL_String,))

    return {"lat": latitude, "long": longitude}
        
def scrapeCity(location):
    """Search for ``location`` and scrape every page of its results.

    Fixes over the previous version:
      * the four result lists are declared ``global``; as plain locals they
        were invisible to ``scrapePage`` and the exported frame was always empty;
      * ``location`` is passed to the browser as a script argument instead of
        being spliced into the JavaScript, so names containing an apostrophe
        no longer break the script;
      * the last results page is scraped too (the old loop stopped as soon as
        it reached it, before scraping it).

    Args:
        location: text to put in the location search box, e.g. "Hoboken, NJ".
    """
    # TODO: needs to take into account food trucks which have a distinct html object format
    global businessName, phoneNumber, address, businessType

    businessName = []
    phoneNumber = []
    address = []
    businessType = []

    script = "arguments[0].setAttribute('value', arguments[1]);"

    #find location text box (its id differs between page layouts)
    try:
        searchBar = driver.find_element_by_xpath("//input[@id='search_location']")
    except Exception:
        searchBar = driver.find_element_by_xpath("//input[@id='dropperText_Mast']")

    locationTextBox = driver.find_element_by_xpath("//input[@name='find_loc']")
    #changes location to desired location
    driver.execute_script(script, locationTextBox, location)
    searchBar.send_keys(Keys.ENTER)
    print("Attempting to Scrape")

    pageNumber = 1
    while True: #runs through the last page of yelp search results for that U.S. city
        print(pageNumber) #just to see how long it takes to scrape one page
        scrapePage()
        if isLastPage():
            break
        nextPage()
        pageNumber += 1

    #export data - hand this city's rows over to the shared dataframe
    df = pd.DataFrame(data={"Business Name":businessName, "Business Types": businessType,"Phone Number": phoneNumber,"Address":address})
    addToDataFrame(df)
    

def scrapeYelpFromCityList(textFile): #loops each location and scrapes the pages around it
    """Scrape every location listed in ``textFile`` and write one CSV per location.

    Each non-blank line of the file is treated as a location: it is scraped
    with ``scrapeCity``, the shared ``GOT_Data`` frame is written to
    "<location>.csv" in the working directory, and ``GOT_Data`` is then reset
    to an empty frame for the next location.

    Fixes over the previous version: ``GOT_Data`` is declared ``global``
    (assigning it made it a local, so ``GOT_Data.to_csv`` raised
    UnboundLocalError), the file is closed via ``with``, blank lines are
    skipped, and the reset frame has the same columns as the initial
    ``GOT_Data`` declaration so every CSV shares one layout.

    Args:
        textFile: path of a text file with one location per line.
    """
    global GOT_Data

    with open(textFile, "r") as cityFile:
        for line in cityFile:
            location = line.strip()
            if not location:
                continue
            print(location)
            scrapeCity(location)
            GOT_Data.to_csv(str(location)+'.csv')
            GOT_Data = pd.DataFrame(data={"Business Name":[], "Business Types": [],"Phone Number": [],"Address":[], "Latitude":[], "Longitude": []})
        

In [11]:
def addToDataFrame(df): 
    """Merge ``df`` into the shared ``GOT_Data`` frame.

    ``GOT_Data`` is declared ``global`` here: without the declaration the
    assignment made it a local name and reading it on the right-hand side
    raised UnboundLocalError.

    Args:
        df: DataFrame of newly scraped rows.
    """
    global GOT_Data
    #drop duplicates because searching cities close by return the same businesses
    GOT_Data = pd.concat([GOT_Data, df]).drop_duplicates().reset_index(drop=True)
    

### Main Run Script

In [None]:
# Run configuration, kept in one place so it is easy to change.
CHROMEDRIVER_PATH = '/chromedriver'
START_URL = "https://www.yelp.com/search?find_desc=&find_loc=Hoboken%2C+NJ&ns=1" #starting point for easy searching on yelp
CITY_LIST_FILE = "USCities.txt" #USCities.txt is a list created for all USCities

driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
try:
    driver.get(START_URL)
    scrapeYelpFromCityList(CITY_LIST_FILE)
finally:
    # Always shut the browser down, even if scraping fails part-way through,
    # so no Chrome/chromedriver process is left running.
    driver.quit()