In [None]:
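# INITIAL SETUP
# - create a new folder named "scraping_folder" (the code below expects it at /content/scraping_folder)
# - download the file from the following link and upload it into that folder;
#   the scraping cell reads /content/scraping_folder/places.txt, so the uploaded file must end up at that path
# https://drive.google.com/file/d/1udLhxoDYEmc9_aFKLuRT-eER2X_-H_BT/view?usp=sharing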

In [1]:
# Environment setup only: this cell configures apt so that Chromium can be installed from Debian buster.
# It contains no scraping logic.
# NOTE(review): IPython normally requires a cell magic such as %%shell to be the first line of its cell;
# confirm that this leading comment does not prevent the magic from being recognised.
%%shell

# register the Debian buster repositories (main, updates, security) as extra apt sources;
# each entry is tied to its own keyring file through signed-by
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# fetch the three Debian signing keys from the Ubuntu keyserver
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# export each key (selected by the last 8 hex digits of its ID) and write it, de-armoured,
# to the keyring file that the matching signed-by entry above points to
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# prefer debian repo for chromium* packages only:
# packages from deb.debian.org get priority 300 in general, but chromium* from deb.debian.org gets 700,
# which is above the 500 given to the distribution release
# note the double-blank lines between entries
# NOTE(review): the first stanza pins "release a=eoan", while the apt output recorded in this notebook
# lists "focal" -- confirm that the release name still matches the runtime
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

Executing: /tmp/apt-key-gpghome.oD9Zm7fpl3/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.cJv01ON3Zx/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.3Ht8vgOtzC/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1




In [2]:
# install the necessary extra libraries to run the script
!apt-get update
!apt-get install chromium chromium-driver
!pip install selenium
!pip install webdriver-manager

0% [Working]            Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [                                                                               Hit:2 http://archive.ubuntu.com/ubuntu focal InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [                                                                               Get:3 http://deb.debian.org/debian buster-updates InRelease [56.6 kB]
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.36)] [                                                                               Get:4 http://deb.debian.org/debian-security buster/updates InRelease [34.8 kB]
0% [Waiting for headers] [Connecting to security.ubuntu.com (185

In [3]:
# import selenium, BeautifulSoup and other libraries to run the script
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from IPython.display import Image, display, clear_output
import re
import time

# tuning constants used by the scraping code below
# MAX_WAIT: timeout (in seconds) passed to WebDriverWait when waiting for page elements
MAX_WAIT = 10
# N_MAX: further review pages are loaded while fewer than N_MAX reviews have been collected;
# whole pages are parsed, so the final count can exceed N_MAX
N_MAX = 20

In [4]:
# define a function to take a screenshot of the headless browser and display it
def take_screenshot(driver, path="/content/scraping_folder/pageImage.png"):
    """Save a screenshot of the current browser window and show it in the notebook.

    Parameters
    ----------
    driver : object exposing ``save_screenshot(path)`` (the Selenium driver in this notebook)
    path : str, optional
        Where the PNG is written and then read back for display. The default is the
        location this notebook has always used, so existing calls behave as before.
    """
    # save_screenshot reports failure through a falsy return value; displaying the
    # file anyway would show a stale image from a previous call or raise on a missing file
    if not driver.save_screenshot(path):
        print("Screenshot could not be saved to " + path)
        return

    display(Image(filename=path))
    # blank line to separate the image from whatever is printed next
    print()

In [5]:
# define a function to expand the review texts on the current page (when an expand button exists)
def expand_review(driver, settle_seconds=5):
    """Click the button that loads the complete review texts, if it is present.

    Parameters
    ----------
    driver : Selenium driver currently showing a page of reviews.
    settle_seconds : int or float, optional
        Pause after a successful click so the expanded texts can load
        (default 5, the delay this notebook has always used).

    Returns
    -------
    bool
        True when the button was found and clicked, False otherwise.
        Existing callers that ignore the return value are unaffected.
    """
    expand_button = (By.XPATH, '//*[@id="tab-data-qa-reviews-0"]/div/div[5]/div/button')

    try:
        # wait (at most MAX_WAIT) until the button is clickable, then click it
        WebDriverWait(driver, MAX_WAIT).until(
            EC.element_to_be_clickable(expand_button)
        ).click()
    # raised if there is no button for expansion (e.g.: set of short reviews);
    # this is deliberately best-effort, so the error is not reported
    # IMPORTANT: only an empty line is printed to avoid dirty outputs
    except Exception:
        print()
        return False

    # wait for the complete reviews to load
    time.sleep(settle_seconds)
    return True

In [None]:
# helper: turn a text such as "1.283 something" into the integer 1283
def _parse_review_count(text):
    """Return the integer made of the first space-separated token of ``text``.

    Dots are removed first, so they are treated as thousands separators.
    """
    return int(text.replace('.', '').split(' ')[0])


# function to parse a page of reviews
def get_review_data(resp, n_reviews, collected_data, reviews_per_page=10):
    """Parse one page of reviews and append one dict per review to ``collected_data``.

    Parameters
    ----------
    resp : BeautifulSoup
        Parsed HTML of the page currently shown by the browser.
    n_reviews : int
        Number of reviews collected so far; used as the ``id_review`` of the next item.
    collected_data : list
        List the review dicts are appended to (modified in place).
    reviews_per_page : int, optional
        Only the first ``reviews_per_page`` children of the review container are parsed;
        the following child holds the links to the next pages. Default 10.

    Returns
    -------
    tuple
        ``(n_reviews, collected_data)`` with the updated count and the same list.

    Notes
    -----
    A review whose user block lacks the expected elements now gets ``None`` for
    ``location`` and/or ``n_review_user``. Previously such a review either raised
    (IndexError / AttributeError / UnboundLocalError) or silently reused the value
    parsed for the previous review.
    """
    # NOTE: `driver` is not a parameter; it is looked up as a notebook-level variable
    # when the function runs, so the scraping cell must have created it beforehand
    take_screenshot(driver)

    # save place name along with review
    name = resp.find('h1', class_='eIegw').text

    # collect the reviews
    r_list = resp.find('div', class_='LbPSX').find('div').contents

    # for each review
    for idx, review in enumerate(r_list):
        # ignore last element (next pages)
        if idx == reviews_per_page:
            break

        # get review date
        # NOTE: date is a string, it needs further processing to become a Date object
        date_div = review.find('div', class_='RpeCd')
        if date_div is not None:
            # the field may contain other textual contents, hence we only take the first 8 characters
            review_date = date_div.text[0:8]
        else:
            # some reviews may have no date
            review_date = None

        # get reviewer information: username, number of reviews, location
        info_text = review.find('div', class_='zpDvc')
        username = info_text.find('a', class_='BMQDV').text

        # rows with the info about the user who wrote the review
        info_elements = info_text.find_all('div', class_='JINyA')

        # reset both fields for every review so that a value can never leak over
        # from the previous iteration when no branch below matches
        location = None
        n_user_reviews = None

        if info_elements:
            # the first row is the one that may contain the location
            location_div = info_elements[0]

            # it can be noticed that there are two different cases by inspecting the page
            if location_div.find('div', class_='biGQs') is not None:
                # case 1 - look for a span without a class
                location_span = location_div.find('span', class_=None)
                # case 2 - look for a span with an empty class
                if location_span is None:
                    location_span = location_div.find('span', class_="")
                if location_span is not None:
                    location = location_span.text

            # case 1 - there's only one row
            if len(info_elements) == 1:
                info_elem = info_elements[0].find('div', class_='biGQs')
                if info_elem is not None:
                    # subcase 1 - the row contains a div with a span with a specific class
                    count_span = info_elem.find('span', class_='IugUm')
                    # subcase 2 - the row contains a div with a span with an empty class field
                    if count_span is None:
                        count_span = info_elem.find('span', class_='')
                    if count_span is not None:
                        n_user_reviews = _parse_review_count(count_span.text)

            # case 2 - there are two rows
            elif len(info_elements) == 2:
                n_user_reviews = _parse_review_count(
                    info_elements[1].find('div', class_='biGQs').text
                )

        # get rating of review by looking at the aria-label of the svg with the dots
        # the two digits are read from fixed positions 10 and 12 of the label
        # NOTE(review): this presumably relies on a fixed-length label prefix in the page language -- confirm
        rating_raw = review.find('svg', class_='UctUV')['aria-label']
        rating_review = float(rating_raw[10:11] + "." + rating_raw[12:13])

        # get review title
        title = review.find('div', class_='qWPrE').find('span', class_='yCeTE').text

        # get review complete text
        caption = review.find('div', class_='FKffI').find('span', class_='yCeTE').text

        # build review item
        item = {
            'id_review': n_reviews, # auto-increment
            'name': name,
            'title': title,
            'caption': caption,
            'date': review_date,
            'rating': rating_review,
            'username': username,
            'n_review_user': n_user_reviews,
            'location': location
        }

        # count the number of reviews collected
        n_reviews = n_reviews + 1

        # add the item to the array of reviews collected
        collected_data.append(item)

    # leave the screenshot visible for a few seconds, then clear the cell output
    time.sleep(5)
    clear_output(wait=True)

    # return the review count and the data collected
    return n_reviews, collected_data

In [None]:
# kill the chrome process to ensure a fresh start
!pkill chrome

# define the options to run Chrome
# IMPORTANT: these options are quite essential, especially when running it in a remote environment like Google Colab
options = webdriver.ChromeOptions()
# define the size of the window
options.add_argument("--window-size=1920,1080")
# disable notifications to avoid useless interactions 
options.add_argument("--disable-notifications")
# disable the developer options
options.add_argument('--disable-dev-shm-usage')
# set the browser to be headless (i.e., the browser window will be hidden)
options.add_argument('--headless')
# set the browser to run all scripts
options.add_argument('--no-sandbox')

driver = None

# read the links from the places.txt file
urls = open('/content/scraping_folder/places.txt')

# for each one of the urls retrieved
for url in urls:
    
    # the page has 2 layouts one of which contains the link in the re.search function, if the link is found, the page is reloaded until the other one is achieved
    # all of this is necessary because the structure of the two pages are completely different
    attempt = 0
    while True:
        if(driver is None):
            # initialize Selenium (Chrome) driver using the set options
            driver = webdriver.Chrome(options=options)
        
        # get the page with Selenium
        driver.get(url)

        # collect the content of the page
        src = driver.page_source
        # look for a specific text within the page source
        text_found = re.search(r'http://opengraphprotocol.org/schema/', src)
        
        # if the text is not found or the browser found it more than 10 times, break the cycle
        if(text_found is None or attempt > 10):
            break;
        
        attempt += 1

        # close and reset the driver
        driver.close()
        driver = None

    # if the page with the right structure isn't retrieved, the iteration moves to the next cycle
    if(attempt > 10):
        print("Incorret Layout - Impossible to obtain data")
        driver.close()
        continue
    
    # (re)set the variables for the current cycle
    collected_data = []
    n_reviews = 0

    # scroll and move to the filter section
    driver.execute_script('window.scrollBy(0,2500)')

    # expand the list of reviews (if needed)
    expand_review(driver)

    # wait for the page to be loaded
    WebDriverWait(driver, MAX_WAIT).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="tab-data-qa-reviews-0"]/div/div[5]/div/div[1]/div/div/div[1]/div[1]/div[2]/span/a')))

    # send the page manipulated with Selenium to BeautifulSoup parser
    response = BeautifulSoup(driver.page_source, 'html.parser')

    # return the number of scraped reviews
    n_reviews, collected_data = get_review_data(response, n_reviews, collected_data)
    
    # create template for next pages urls
    # IMPORTANT: this templace will be used to change the displayed reviews (e.g., Reviews-or-30 displays the reviews from 20 to 30) 
    url = url.replace('Reviews-', 'Reviews-or{}-')

    # until enough reviews are collected
    while n_reviews < N_MAX:
        # Update the url by changing the {} with the number of collected reviews (e.g., 10, 20, ...)
        url_ = url.format(n_reviews)

        # apply the pipeline up to collect enough reviews
        driver.get(url_)
        
        # repeat the same process as before
        driver.execute_script('window.scrollBy(0,2500)')
        
        expand_review(driver)
        
        WebDriverWait(driver, MAX_WAIT).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="tab-data-qa-reviews-0"]/div/div[5]/div/div[1]/div/div/div[1]/div[1]/div[2]/span/a')))
        
        response = BeautifulSoup(driver.page_source, 'html.parser')
        
        n_reviews, collected_data = get_review_data(response, n_reviews, collected_data)

    # print the list of collected elements
    for element in collected_data:
        print(element)

    time.sleep(5)
    clear_output(wait=True)
        
# close resources
urls.close()
driver.close()
driver.quit()

{'id_review': 0, 'name': 'Empire State Building', 'title': 'Esaltante', 'caption': "E' imperdibile la salita all'Empire State Building, ne vale la pena per il bellissimo panorama a  gradi su New York e anche per il piccolo museo interattivo sulla storia del grattacielo. Organizzazione ottima. Consigliato!", 'date': 'apr 2023', 'rating': 5.0, 'username': 'musicabarocca', 'n_review_user': 283, 'location': 'Savona, Italia'}
{'id_review': 1, 'name': 'Empire State Building', 'title': 'Imperdibile', 'caption': 'Da vedere assolutamente, esperienza mozzafiato in un luogo celebre in tutto il mondo. Vista imprendibile su New York. Un classico che non puo’ mancare in un soggiorno a Midtown Manhattan', 'date': None, 'rating': 5.0, 'username': 'wondertiz', 'n_review_user': 40, 'location': 'Lugano, Svizzera'}
{'id_review': 2, 'name': 'Empire State Building', 'title': 'Imperdibile', 'caption': 'Essendo una delle icone di NYC la visita è immancabile. Percorso obbligato bene organizzato, viene racconta