In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 

import time
import re

import pandas as pd

In [2]:
# loads all the branches in side panel by scrolling
def load_branches(driver, action, bank):
    """Scroll the results side panel to its end and return the branch anchors.

    Parameters
    ----------
    driver
        Selenium WebDriver already showing a Google Maps search-results page.
    action
        Accepted for backward compatibility only. It is not used: a fresh
        ``ActionChains`` is built from ``driver`` on every call.
    bank : str
        Bank name. Anchors are matched on an ``aria-label`` equal to this
        name, except for 'Bank of America', which is matched on its two
        label variants.

    Returns
    -------
    list
        The matching ``<a>`` WebElements (empty if nothing matches).

    Raises
    ------
    Whatever ``driver.find_element`` raises when no element with class
    ``hfpxzc`` exists on the page.
    """
    # An element inside the results side panel; the pointer is moved onto it
    # so that the PAGE_DOWN key presses scroll that panel.
    result_title = driver.find_element(By.CLASS_NAME, "hfpxzc")
    scroller = ActionChains(driver)
    scroller.move_to_element(result_title)

    # Keep paging down until the end-of-list element (class "HlvSq") is in the
    # DOM. find_elements returns an empty list when nothing matches, so no
    # exception has to be swallowed and real driver errors still surface.
    while not driver.find_elements(By.CLASS_NAME, "HlvSq"):
        scroller.send_keys(Keys.PAGE_DOWN).perform()
        time.sleep(1)

    # return the branches
    if bank == 'Bank of America':
        full_service = driver.find_elements(By.XPATH, '//a[@aria-label="Bank of America (with Drive-thru ATM)"]')
        lobby_only = driver.find_elements(By.XPATH, '//a[@aria-label="Bank of America (Lobby Service Only)"]')
        return full_service + lobby_only
    return driver.find_elements(By.XPATH, f'//a[@aria-label="{bank}"]')


# find address and parse
def get_address(driver):
    """Return the text of the address element on the current branch page.

    The element is located by an exact-match XPath on its class attribute;
    the text is returned as-is, with no parsing.
    """
    address_xpath = "//div[@class='Io6YTe fontBodyMedium kR99db ']"
    return driver.find_element(By.XPATH, address_xpath).text

# gets the number of total reviews for the branch
def get_number_reviews(driver):
    """Return the branch's total review count as an integer.

    Waits two seconds, then reads the text of the third element whose class is
    exactly ``fontBodySmall`` and parses the first number in it. Thousands
    separators are handled, so a label such as "1,234 reviews" yields 1234
    rather than 1.

    Raises
    ------
    IndexError
        If fewer than three ``fontBodySmall`` elements exist. The caller
        treats this as a branch with no reviews.
    ValueError
        If the element's text contains no digits.
    """
    # Give the reviews tab time to render before reading the counter.
    time.sleep(2)
    label = driver.find_elements(By.XPATH, "//div[@class='fontBodySmall']")[2].text

    # Allow commas inside the number so large counts are not truncated at the
    # first separator.
    match = re.search(r'\d[\d,]*', label)
    if match is None:
        raise ValueError(f"no review count found in {label!r}")
    return int(match[0].replace(',', ''))


# find and open "Reviews" tab
def reviews_tab(driver):
    """Open the Reviews tab of the current branch page.

    Clicks the first ``<button>`` whose class attribute is exactly
    ``'hh2c6 '`` (trailing space included). Raises ``IndexError`` when no
    such button exists.
    """
    driver.find_elements(By.XPATH, "//button[@class='hh2c6 ']")[0].click()

    
def load_all_reviews(driver, n_reviews, action=None, max_stalled_scrolls=20):
    """Page down in the reviews panel until all reviews are loaded.

    Parameters
    ----------
    driver
        Selenium WebDriver showing a branch's Reviews tab.
    n_reviews : int
        Total number of reviews the page reports for the branch.
    action : ActionChains, optional
        Action chain used to send the PAGE_DOWN key. When omitted, a new
        ``ActionChains(driver)`` is created, so the function no longer
        depends on a global variable named ``action``.
    max_stalled_scrolls : int, optional
        Stop after this many consecutive scrolls that load no additional
        review. This prevents an endless loop when the reported total never
        matches the number of review cards on the page.

    Returns
    -------
    None
    """
    review_xpath = "//div[@class='jftiEf fontBodyMedium ']"
    if action is None:
        action = ActionChains(driver)

    # gets empty space to click so it can page_down correctly
    empty = driver.find_element(By.XPATH, "//div[@class='cVwbnc IlRKB']")
    empty.click()

    loaded = len(driver.find_elements(By.XPATH, review_xpath))
    stalled = 0
    # "<" rather than "!=": loading more cards than reported must also stop.
    while loaded < n_reviews and stalled < max_stalled_scrolls:
        action.send_keys(Keys.PAGE_DOWN).perform()
        time.sleep(0.5)
        now_loaded = len(driver.find_elements(By.XPATH, review_xpath))
        # Count only consecutive scrolls without progress.
        stalled = stalled + 1 if now_loaded == loaded else 0
        loaded = now_loaded

    
# expands all the review texts
def expand_reviews(driver):
    """Click every review-expansion button on the page, in document order.

    Buttons are matched by an exact class attribute of ``w8nwRe kyuRq``.
    """
    more_xpath = '//button[@class="w8nwRe kyuRq"]'
    for more_button in driver.find_elements(By.XPATH, more_xpath):
        more_button.click()
        

# gets all review text
def get_reviews(driver):
    """Return the review text of every review container on the page.

    One entry is produced per ``div`` with class ``GHT2ce``, in document
    order. A container whose text span cannot be read contributes ``None``,
    so the result keeps one slot per container.

    Returns
    -------
    list
        Strings, or ``None`` where no text could be extracted.
    """
    reviews = driver.find_elements(By.XPATH, "//div[@class='GHT2ce']")
    review_text = []
    for rev in reviews:
        try:
            review_text.append(rev.find_element(By.XPATH, "div/div[@class='MyEned']//span[@class='wiI7pd']").text)
        except Exception:
            # Best-effort: a missing text span is recorded as None. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate so the scrape can still be stopped.
            review_text.append(None)
    return review_text


# gets star ratings
def get_stars(driver):
    """Return the star rating of each review as a list of integers.

    Each rating is the first run of digits in the ``aria-label`` attribute of
    a ``span`` with class ``kvMYJc``.
    """
    ratings = []
    for badge in driver.find_elements(By.XPATH, "//span[@class='kvMYJc']"):
        label = badge.get_attribute('aria-label')
        ratings.append(int(re.search(r'\d+', label)[0]))
    return ratings


# gets dates of review
def get_dates(driver):
    dates = driver.find_elements(By.XPATH, "//span[@class='rsqaWe']")
    dates = [date.text for date in dates]
    return dates

In [3]:
# Search space: every bank is searched for in every metro area.
banks = [
    'Frost Bank',
    'Wells Fargo Bank',
    'Chase Bank',
    'Bank of America',
]
metros = [
    'Austin',
    'San Antonio',
    'Dallas',
    'Houston',
]

# For a quick trial run, narrow both lists instead, e.g.:
# banks = ['Bank of America']
# metros = ['Dallas']

# Running identifiers assigned to scraped branches and reviews; both start at 1.
branch_id = review_id = 1

# Accumulators filled by the scraping loop: one dict per branch / per review.
branches_list, reviews_list = [], []

In [4]:
# Start the Chrome browser session that all scraping helpers operate on.
driver = webdriver.Chrome()
# Action chain bound to that session; used to send keyboard input
# (PAGE_DOWN scrolling) to the browser.
action = ActionChains(driver)

In [5]:
# Scrape every (metro, bank) combination: run the Maps search, collect the
# branch links, then visit each branch and record its address and reviews.
# Results accumulate in branches_list / reviews_list; branch_id and review_id
# are advanced as rows are added.
for metro in metros:
    for bank in banks:
        driver.get(f"https://www.google.com/maps/search/'{bank}'+{metro}/")
        time.sleep(5)

        # scrolls to load all branches
        branches = load_branches(driver, action, bank)

        # Read every href before navigating: the loop below leaves the results
        # page, after which these elements can presumably no longer be queried.
        branch_links = [branch.get_attribute('href') for branch in branches]

        # load branch page
        for link in branch_links:
            driver.get(link)

            # get the address information of branch
            address = get_address(driver)

            # click on the reviews tab
            reviews_tab(driver)

            # A branch without reviews makes get_number_reviews raise; such
            # branches are skipped (and not recorded in branches_list).
            time.sleep(1)
            try:
                n_reviews = get_number_reviews(driver)
                # scrolls until all reviews are loaded
                load_all_reviews(driver, n_reviews)

                # expand all review text by clicking all "More..." buttons
                time.sleep(2)
                expand_reviews(driver)

                # get all the review text for branch
                reviews_text = get_reviews(driver)

                # get star ratings
                star_ratings = get_stars(driver)

                # get dates
                dates = get_dates(driver)

                # zip stops at the shortest list, so rows are only produced
                # where a date, a rating and a text slot all exist.
                for date, stars, review in zip(dates, star_ratings, reviews_text):
                    reviews_list.append({
                        'review_id': review_id,
                        'branch_id': branch_id,
                        'date': date,
                        'stars': stars,
                        'review': review
                    })
                    review_id += 1

                branches_list.append({
                    'branch_id': branch_id,
                    'bank': bank,
                    'address': address,
                    'metro': metro
                })
                branch_id += 1
            except Exception as exc:
                # Still best-effort, but report what was skipped and why.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                print(f"Skipping branch {link}: {exc!r}")


In [6]:
# Turn the accumulated record dicts into DataFrames, one row per dict.
reviews_df, branches_df = pd.DataFrame(reviews_list), pd.DataFrame(branches_list)

In [7]:
# Write the raw scrape results to CSV without the DataFrame index column.
# The paths are relative, so they resolve against the current working directory.
reviews_df.to_csv('../../raw_data/reviews_raw.csv', index=False)
branches_df.to_csv('../../raw_data/branches_raw.csv', index=False)

In [9]:
# Regex notes for parsing the scraped address string:
#   (?<=, )\D+, \D{2}.+      extracts zip and city
#   (.*)(?=,\D+,\D{2}.+)     extracts address