In [1]:
# INITIAL SETUP
# - create a new folder named "scraping_folder" under /content (screenshots are saved to /content/scraping_folder/pageImage.png)

In [2]:
%%shell
# This cell only prepares the environment needed to run the scraper.
# The %%shell cell magic has to be the very first line of the cell, so all
# comments live below it (a leading comment stops IPython from recognising it).

# add debian buster: register the buster, buster-updates and buster security
# repositories, each tied to its own keyring file via signed-by
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# add keys: fetch the three Debian signing keys from the Ubuntu keyserver
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# export each key into the keyring file referenced by the matching
# signed-by entry in debian.list above
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# prefer debian repo for chromium* packages only
# (priority 700 for chromium* from deb.debian.org, 300 for everything else from it)
# note the double-blank lines between entries
# NOTE(review): the first entry pins release "eoan", while the apt output recorded
# in this notebook lists a "focal" repository - confirm this pin still matches the host.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

Executing: /tmp/apt-key-gpghome.TOPejJ5uWp/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.m6OwgYJCwi/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.XBHiVzQD9k/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1




In [3]:
# install the necessary extra libraries to run the script
!apt-get update
!apt-get install chromium chromium-driver
!pip install selenium
!pip install webdriver-manager

0% [Working]            Get:1 http://deb.debian.org/debian buster InRelease [122 kB]
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub                                                                               Get:2 http://deb.debian.org/debian buster-updates InRelease [56.6 kB]
                                                                               Get:3 http://deb.debian.org/debian-security buster/updates InRelease [34.8 kB]
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub                                                                               Get:5 https://developer.download.n

In [4]:
# import selenium, BeautifulSoup and other libraries to run the script
import os
import re
import time

from bs4 import BeautifulSoup
from IPython.display import Image, display, clear_output
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Tunable limits for the scraper.
# MAX_WAIT: timeout handed to WebDriverWait when the explicit-wait helper is created.
MAX_WAIT = 20
# N_MAX: meant as the maximum amount of reviews to collect.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
N_MAX = 20

In [5]:
def take_screenshot(driver, path="/content/scraping_folder/pageImage.png"):
    """Save a screenshot of the (headless) browser and display it inline.

    Args:
        driver: object exposing ``save_screenshot(path)``; the notebook passes
            a Selenium Chrome driver.
        path: destination of the PNG file. Defaults to the location the
            notebook has always used. The parent folder is created when it
            does not exist yet, so no manual setup is needed.
    """
    # a missing folder would make the screenshot fail and Image() raise afterwards
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

    driver.save_screenshot(path)
    display(Image(filename=path))
    # blank line separating the image from whatever gets printed next
    print()

In [6]:
def take_screenshot_and_clear(driver, path="/content/scraping_folder/pageImage.png", delay=3):
    """Show a screenshot of the (headless) browser, then schedule its removal.

    The image is saved, displayed inline, kept on screen for ``delay`` seconds
    and finally the cell output is cleared with ``wait=True`` (i.e. it is
    replaced as soon as new output arrives).

    Args:
        driver: object exposing ``save_screenshot(path)``; the notebook passes
            a Selenium Chrome driver.
        path: destination of the PNG file. Defaults to the location the
            notebook has always used; the parent folder is created if missing.
        delay: seconds to keep the image visible before clearing (default 3).
    """
    # a missing folder would make the screenshot fail and Image() raise afterwards
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

    driver.save_screenshot(path)
    display(Image(filename=path))

    time.sleep(delay)
    clear_output(wait=True)

In [7]:
def _parse_dotted_int(text):
    """Convert an integer written with '.' as thousands separator ('1.234') to int."""
    return int(text.replace('.', ''))


def get_place_data(driver, response, wait):
    """Extract the details of a single place from its parsed page.

    Args:
        driver: browser currently showing the place page; only used to take a
            screenshot before parsing.
        response: parsed page (the notebook passes a BeautifulSoup document)
            queried through ``find`` / ``find_all``.
        wait: accepted for interface compatibility; not used by this function.

    Returns:
        dict with the keys ``name``, ``review`` (int), ``rating`` (str with a
        '.' decimal separator), ``address`` (str, empty when the page has no
        address box), ``ranking_str``, ``ranking_abs`` (int), ``ranking_rel``
        (float, rank divided by the ranking length) and ``tags`` (list of str).

    Raises:
        AttributeError, IndexError, TypeError, ValueError: when an expected
            element is missing or its text does not have the expected shape.
    """
    take_screenshot(driver)

    # prepare a dictionary to store results
    place = {}

    # get the place's name
    place['name'] = response.find('h1', class_='eIegw').text

    # get the number of reviews: the first space-separated token is the count
    reviews_text = response.find('span', class_="KAVFZ").text
    place['review'] = _parse_dotted_int(reviews_text.split(' ')[0])

    # the rating, ranking and tags are read from the 1st, 2nd and 3rd 'kUaIL' blocks
    info_blocks = response.find_all('div', class_="kUaIL")

    # get the overall rating: characters 10-12 of the aria-label hold the score
    # NOTE(review): assumes a fixed 10-character prefix before the score - confirm
    rating_label = info_blocks[0].find('div', class_='GOdjs')['aria-label']
    place['rating'] = rating_label[10:13].replace(',', '.')

    # get the place's address, if it exists
    address_box = response.find('div', class_='wgNTK')
    if address_box is not None:
        complete_address = address_box.find_all("span", class_='biGQs')[0].text
    else:
        complete_address = ""
    place['address'] = complete_address

    # get ranking: the 2nd token is the rank, the 4th is the ranking length
    ranking_string = info_blocks[1].find("div", class_='KxBGd').text
    ranking_tokens = ranking_string.split(' ')

    # both numbers may carry a '.' thousands separator, so strip it from the
    # rank too (previously only the length was cleaned and ranks >= 1000 failed)
    absolute_rank = _parse_dotted_int(ranking_tokens[1])
    ranking_length = float(ranking_tokens[3].replace('.', ''))

    place['ranking_str'] = ranking_string
    place['ranking_abs'] = absolute_rank
    place['ranking_rel'] = float(absolute_rank) / ranking_length

    # get the tag list
    place['tags'] = info_blocks[2].find("div", class_='fIrGe').text.split(' • ')

    # return the dictionary describing the place
    return place

In [11]:
# kill the chrome process to ensure a fresh start
!pkill chrome

# define the options to run Chrome
# IMPORTANT: these options are quite essential, especially when running it in a remote environment like Google Colab
options = webdriver.ChromeOptions()
# define the size of the window
options.add_argument("--window-size=1920,1080")
# disable notifications to avoid useless interactions 
options.add_argument("--disable-notifications")
# disable the developer options
options.add_argument('--disable-dev-shm-usage')
# set the browser to be headless (i.e., the browser window will be hidden)
options.add_argument('--headless')
# set the browser to run all scripts
options.add_argument('--no-sandbox')

# initialize Selenium (Chrome) driver using the set options
driver = webdriver.Chrome(options=options)

# tripadvisor's main page; also used further down as the prefix of each result url
webpage = 'https://www.tripadvisor.it'

# text typed into the search bar
query = 'Milano'

# category of elements to look for (e.g., restaurants, hotels, places, etc.),
# given as the text of the link to click on the results page
subfilter = 'Cose da fare'

# helper that waits (up to MAX_WAIT) until a condition on the page holds
wait = WebDriverWait(driver, MAX_WAIT)

# the finally clause shuts the browser down even when a wait times out or a
# selector no longer matches, so no Chrome process is left behind
try:
    driver.get(webpage)

    # wait until the search bar is located on the screen, then click it
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'icwvJ'))).find_element(By.CLASS_NAME, 'qjfqs').click()

    take_screenshot_and_clear(driver)

    # dismiss the privacy banner, if it exists; find_elements returns an empty
    # list (instead of raising) when the banner is absent
    if driver.find_elements(By.ID, 'onetrust-accept-btn-handler'):
        try:
            wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))).click()
        except WebDriverException as exc:
            # best effort: report the problem and carry on with the search
            print(f"Could not dismiss the privacy banner ({type(exc).__name__}); continuing")

    # find the search bar on the screen
    search_bar = driver.find_element(By.CLASS_NAME, 'icwvJ').find_element(By.CLASS_NAME, 'qjfqs')

    # add the query string content to the search bar and then press enter
    search_bar.send_keys(query)
    search_bar.send_keys(Keys.RETURN)

    # wait for the banner with the results to be loaded
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.search-results-list')))

    take_screenshot_and_clear(driver)

    # wait for the category link to be clickable and click it
    wait.until(EC.element_to_be_clickable((By.LINK_TEXT, subfilter))).click()

    # wait for the filtered search results to load
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.search-results-list')))

    take_screenshot_and_clear(driver)

    # send the page manipulated with Selenium to BeautifulSoup parser
    response = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # quit() closes every window and ends the session
    driver.quit()

# get the result entries; each one carries the place url in its 'onclick' attribute
results_list = response.find_all('div', class_='result-title')

# upper bound on how many times a place page is loaded while hunting for the
# parsable layout (same bound as before: the first load plus 11 reloads)
MAX_PAGE_LOADS = 12

# for each place found
for elem in results_list:
    # get the element containing its informations
    features = elem['onclick'].split(',')

    # extract their url, type and id
    url = webpage + features[3].lstrip()[1:-1]
    elem_type = features[4].split(': ')[1][1:-1]
    locationId = int(features[8].split(': ')[1][1:-1])

    # (re)set the driver
    driver_new_page = None
    layout_ok = False

    # the finally clause shuts the browser down on every path out of this
    # iteration (success, skipped place, or an exception while scraping)
    try:
        # the page has 2 layouts, one of which contains the link searched below;
        # when that link is found the page is reloaded in a fresh browser until
        # the other layout shows up, because the two structures are completely different
        for attempt in range(MAX_PAGE_LOADS):
            # initialize Selenium (Chrome) driver using the set options
            driver_new_page = webdriver.Chrome(options=options)

            # get the page with Selenium
            driver_new_page.get(url)

            # collect the content of the page
            src = driver_new_page.page_source
            # look for a specific text within the page source
            text_found = re.search(r'http://opengraphprotocol.org/schema/', src)

            # the marker is absent: this is the layout the scraper understands,
            # and it is accepted even when it only appears on the last load
            if text_found is None:
                layout_ok = True
                break

            # wrong layout: close and reset the driver before trying again
            driver_new_page.quit()
            driver_new_page = None

        # if the page with the right structure isn't retrieved, move to the next place
        if not layout_ok:
            print("Incorret Layout - Impossible to obtain data")
            continue

        # send the page manipulated with Selenium to BeautifulSoup parser
        resp = BeautifulSoup(driver_new_page.page_source, 'html.parser')

        # scrape place data; `wait` belongs to the search browser and is only
        # passed to satisfy the signature (get_place_data does not use it)
        place_data = get_place_data(driver_new_page, resp, wait)

        # print the collected data
        print(place_data)

        # leave the result on screen for a moment before it gets replaced
        time.sleep(5)
        clear_output(wait=True)
    finally:
        # close the driver, if one is still open
        if driver_new_page is not None:
            driver_new_page.quit()

https://www.tripadvisor.it/Attraction_Review-g187849-d202624-Reviews-Piazza_del_Duomo-Milan_Lombardy.html
0


KeyboardInterrupt: ignored