In [1]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re
import csv
import pandas as pd

In [2]:
# Timeout handed to WebDriverWait when waiting for the sort menu to become clickable.
MAX_WAIT = 10
# Upper bound on attempts to click the sort menu before giving up.
MAX_RETRY = 10
# Upper bound on scroll iterations used to load more reviews for one URL.
MAX_SCROLLS = 40
# Column names written as the first row of the output CSV.
# ('adress' is spelled this way on purpose: it matches the column name read
# from the input CSV.)
HEADER = [
    'id_review',
    'review',
    'retrieval_date',
    'relative_date',
    'rating',
    'username',
    'adress',
]

In [3]:
class GoogleMaps:
    """Scrape Google Maps reviews for a list of places into a single CSV file.

    The input CSV (``;``-separated) is loaded with pandas; its ``urls`` and
    ``adress`` columns are used. Every URL is opened in Chrome and each review
    found on the page is appended to ``data/all_reviews_orange.csv`` together
    with the ``adress`` value of the same input row.

    Intended to be used as a context manager so that the browser and the
    output file are released when the block ends.
    """

    def __init__(self, csv):
        """Load the list of URLs, open the output file and start Chrome.

        Args:
            csv: path (or buffer) of the ``;``-separated input file, passed
                straight to ``pandas.read_csv``. Note that inside this method
                the name shadows the ``csv`` module.

        Raises:
            Whatever ``pandas.read_csv``, ``open`` or the Chrome driver raise;
            the output file is closed again if the later steps fail.
        """
        folder = "data/"
        file = "all_reviews_orange.csv"

        # Read the input first: if it is missing or malformed we fail before
        # truncating/creating the output file.
        self.df = pd.read_csv(csv, sep = ';')

        self.targetfile = open(folder + file, mode='w', encoding='utf-8', newline='\n')
        try:
            self.writer = self.__get_writer(HEADER)
            self.driver = self.__get_driver()
        except BaseException:
            # Do not leak the file handle when the browser cannot be started.
            self.targetfile.close()
            raise

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """Shut down the browser and close the output file."""
        print('Closing chromedriver...')
        try:
            # quit() ends the whole WebDriver session (all windows).
            self.driver.quit()
        finally:
            # Always release the file, even if the browser is already gone.
            self.targetfile.close()

    def __get_writer(self, header):
        """Create the CSV writer on the target file and write the header row.

        A ``csv.DictWriter`` is used so that every row is laid out by column
        name in the order given by ``header``; rows can therefore never drift
        out of sync with the header line.

        Args:
            header: sequence of column names.

        Returns:
            The ``csv.DictWriter`` bound to ``self.targetfile``.
        """
        writer = csv.DictWriter(self.targetfile, fieldnames=header, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        return writer

    def __get_driver(self, debug=True):
        """Build the Chrome WebDriver.

        Args:
            debug: when False, Chrome is started with ``--headless``.

        Returns:
            The ``webdriver.Chrome`` instance.
        """
        options = Options()
        if not debug:
            options.add_argument("--headless")
        options.add_argument("--window-size=1366,768")
        options.add_argument("--disable-notifications")
        options.add_argument("--lang=en")
        input_driver = webdriver.Chrome(options=options)
        return input_driver

    def get_reviews(self, all_reviews = True):
        """Scrape the reviews of every URL of the input file.

        For each row the page is opened, the review sort order is changed,
        the review list is scrolled until enough reviews are loaded, and the
        reviews are parsed and written to the CSV. The number of reviews
        written for each URL is printed.

        Args:
            all_reviews: when True, the total number of reviews is read from
                the page and the list is scrolled until that many are loaded
                (at most ``MAX_SCROLLS`` scrolls). When False, no scrolling is
                done and only the reviews present after the initial load are
                scraped.

        Returns:
            ``-1`` as soon as the sort order cannot be changed for a URL (the
            remaining URLs are not processed); ``None`` otherwise.
        """
        # zip pairs the two columns row by row, independent of the index labels.
        for adress, url in zip(self.df.adress, self.df.urls):
            self.adress = adress

            self.driver.get(url)
            wait = WebDriverWait(self.driver, MAX_WAIT)

            # the sort menu being clickable is used as the "page is ready" signal
            menu_bt = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.goog-inline-block.section-dropdown-menu-button-caption')))

            # number of reviews we try to load; 0 disables scrolling
            expected = self.__count_reviews() if all_reviews else 0

            # failed to change the filter
            if not self.__sort_reviews(menu_bt):
                return -1

            self.__load_reviews(expected)

            response = BeautifulSoup(self.driver.page_source, 'html.parser')
            reviews = response.find_all('div', class_='section-review-content')

            n_reviews = sum(self.__parse_reviews(review) for review in reviews)

            print('Scraped reviews: ', n_reviews)

    # read the total number of reviews announced by the current page
    def __count_reviews(self):
        """Return the review count shown on the page, or 0 if it cannot be read."""
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        caption = soup.find("div", {"class":"gm2-caption"})
        text = caption.string if caption is not None else None
        # keep digits only (drops labels and thousands separators)
        digits = re.sub(r"\D", "", text) if text else ""
        if not digits:
            print('Warning: could not read the number of reviews')
            return 0
        return int(digits)

    # change the sort order of the review list through the dropdown menu
    def __sort_reviews(self, menu_bt):
        """Open the sort menu and pick an option, retrying up to MAX_RETRY times.

        Returns True once both clicks succeeded, False if every attempt failed.
        """
        # sometimes problem in loading the event on this button
        for _ in range(MAX_RETRY):
            try:
                menu_bt.click()
                sort_options = self.driver.find_elements(By.XPATH, '//div[@role=\'option\']')
                # Index 2 is the third entry of the dropdown. The original note
                # called it "second element of the list: most recent".
                # NOTE(review): confirm which position holds the wanted order.
                sort_options[2].click()
            except Exception:
                print('Warning: failed to click recent button')
            else:
                # wait to load review (ajax call)
                time.sleep(5)
                return True
        return False

    # scroll the review panel until enough reviews are present in the DOM
    def __load_reviews(self, expected):
        """Scroll until ``expected`` reviews are loaded or MAX_SCROLLS is hit."""
        review_xpath = '//div[@class=\'section-review-content\']'

        n_reviews_loaded = len(self.driver.find_elements(By.XPATH, review_xpath))
        n_scrolls = 0
        while n_reviews_loaded < expected and n_scrolls < MAX_SCROLLS:

            # scroll to load more reviews
            scrollable_div = self.driver.find_element(
                By.CSS_SELECTOR,
                'div.section-layout.section-scrollbox.scrollable-y.scrollable-show')
            self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)

            # wait for other reviews to load (ajax)
            time.sleep(4)

            # expand review text
            self.__expand_reviews()

            n_reviews_loaded = len(self.driver.find_elements(By.XPATH, review_xpath))

            n_scrolls += 1

        # When no scroll was needed the loop above never expanded anything;
        # do it once so the already-loaded reviews are not left truncated.
        if n_scrolls == 0:
            self.__expand_reviews()

    def __parse_reviews(self, review):
        """Extract the fields of one review element and write them as a CSV row.

        Args:
            review: the parsed ``section-review-content`` element of a review.

        Returns:
            1, so that callers can sum the results to count written rows.
        """
        id_review = review.find('button', class_='section-review-action-menu')['data-review-id']
        username = review.find('div', class_='section-review-title').find('span').text

        # a review may consist of a rating only, without any text
        text_span = review.find('span', class_='section-review-text')
        review_text = self.__filter_string(text_span.text) if text_span is not None else None

        rating = review.find('span', class_='section-review-stars')['aria-label'].split(' ')[1]
        relative_date = review.find('span', class_='section-review-publish-date').text

        item = {
            'id_review': id_review,
            'review': review_text,
            # store datetime of scraping and apply further processing to calculate
            # correct date as retrieval_date - time(relative_date)
            'retrieval_date': datetime.now(),
            # depends on language, which depends on geolocation defined by Google Maps
            'relative_date': relative_date,
            'rating': rating,
            'username': username,
            'adress': self.adress,
        }

        # The DictWriter places each value under its own column name, so the
        # row always matches the header regardless of the dict's key order.
        self.writer.writerow(item)

        return 1

    # expand review description
    def __expand_reviews(self):
        """Click every "expand" button currently on the page, then pause 2 s."""
        # use XPath to load complete reviews
        links = self.driver.find_elements(By.XPATH, '//button[contains(@class, "section-expand-review blue-link")]')
        for link in links:
            link.click()
        time.sleep(2)

    # util function to clean special characters
    def __filter_string(self, str):
        """Return ``str`` with each carriage return, newline and tab replaced by a space.

        The parameter name shadows the builtin ``str`` inside this method; it
        is kept unchanged so the signature stays the same.
        """
        strOut = str.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')

        return strOut

In [4]:
# Run the scraper inside a `with` block so that __exit__ closes Chrome and the
# output file when the block ends, including when an exception is raised.
with GoogleMaps('urls_boutiques_orange.csv') as scraper:
    status = scraper.get_reviews()

# get_reviews() returns -1 when it gives up on changing the review sort order
# and stops early; report it instead of silently ending with a partial CSV.
if status == -1:
    print('Warning: scraping aborted, could not change the review sort order')

Scraped reviews:  410
Scraped reviews:  159
Scraped reviews:  140
Scraped reviews:  0
Scraped reviews:  149
Scraped reviews:  91
Scraped reviews:  100
Scraped reviews:  101
Scraped reviews:  139
Scraped reviews:  197
Scraped reviews:  100
Scraped reviews:  287
Scraped reviews:  307
Scraped reviews:  46
Scraped reviews:  106
Scraped reviews:  188
Scraped reviews:  120
Scraped reviews:  140
Scraped reviews:  30
Scraped reviews:  140
Closing chromedriver...
