### Social Media Analytics
# Web scraping exercise

(c) Nuno António 2021-2025 - Version 2.17 (2025-04-18)

In [1]:
# Import the main libraries
from bs4 import BeautifulSoup           # to process html
import ssl                              # to process ssl certificates
import time                             # for time related functions

In [2]:
# Let's use Selenium to open a browser
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent

In [None]:
# Installations - If not installed
# Selenium: conda install -c conda-forge selenium
# Gecko driver: conda install -c conda-forge geckodriver

In [None]:
# If you don't have Firefox, please install it

# If you have any problem installing the GeckoDriver
# Link https://github.com/mozilla/geckodriver/releases/tag/v0.28.0
# Unzip and copy to a folder (e.g., the Anaconda3 folder)
# OR use "brew install geckodriver" 
# Copy the file path (MacOS right-click + Alt to get full pathname)
# Macs with Catalina must run: xattr -r -d com.apple.quarantine geckodriver
#    in the folder cd /Users/nunoantonio/anaconda3

In [3]:
# URL of the web page to scrape: the Tripadvisor (UK site) page of the hotel whose reviews we will read
url = "https://www.tripadvisor.co.uk/Hotel_Review-g189158-d229324-Reviews-Sheraton_Lisboa_Hotel_Spa-Lisbon_Lisbon_District_Central_Portugal.html"

In [4]:
# Open the hotel web page and "inspect" the code

In [5]:
# Generate a random user-agent string
# NOTE(review): because the string is random, the site may serve a different layout from run to run - confirm
ua = UserAgent()
user_agent = ua.random

In [8]:
# Get Firefox options (configurations)
options = Options()

# Override the user agent that Firefox reports with the string stored in user_agent
options.set_preference("general.useragent.override", user_agent)

# Add this argument to Options to hide Firefox (make it not visible)
# options.add_argument('--headless') 

# Open the browser (it stays open until browser.quit() is called)
browser = webdriver.Firefox(options=options)

In [9]:
# Get the page
browser.get(url) 
# Wait 4 seconds for the page to load its dynamic content
# NOTE(review): a fixed pause is a heuristic; on a slow connection the content may not be ready yet
time.sleep(4)

In [10]:
# Read the rendered HTML from the browser and then close the browser
html_source = browser.page_source  
browser.quit()

In [11]:
# Parse the HTML captured from the browser
# Name the parser explicitly: without it Beautiful Soup picks whichever parser is installed,
# which emits a warning and can give different trees on different machines
soup = BeautifulSoup(html_source, "html.parser")

In [12]:
# Let's examine all the text inside. It's essentially the text of the elements you see in the "inspect" of the webpage
print(soup.get_text())

SHERATON LISBOA HOTEL & SPA - Updated 2025 Reviews, Photos & PricesSkip to main contentDiscoverTripsReviewGBPSign inLisbonHotelsThings to DoRestaurantsFlightsHoliday RentalsCruisesCar HireForumsEuropePortugalCentral PortugalLisbon DistrictLisbonLisbon HotelsSheraton Lisboa Hotel & SpaPayments made by partners impact the order of prices displayed. Room types may vary.Learn moreThe listings of booking offers for each property take into account the compensation paid to us by our partners. Prices shown may be for varying room types. Prices displayed are the lowest available, corresponding to the least expensive room type available, as provided from our partners as of the time of the user’s search.Sheraton Lisboa Hotel & SpaSave4.34.3 of 5 bubbles(2,837 reviews) AI Reviews Summary #84 of 351 hotels in LisbonRua Latino Coelho, 1, Lisbon 1069 PortugalVisit hotel website21 312 0000Write a reviewCheck availability4 people are viewing this hotelMarriott Bonvoy member rate availableLog in to view

In [13]:
# Tags that are unique in the content can be accessed directly as attributes of the soup, for example the title
soup.title

<title>SHERATON LISBOA HOTEL &amp; SPA - Updated 2025 Reviews, Photos &amp; Prices</title>

In [14]:
# If you want just the text of the title, add the "string" attribute
soup.title.string

'SHERATON LISBOA HOTEL & SPA - Updated 2025 Reviews, Photos & Prices'

In [15]:
# But we can be more specific
# Let's get a list of all image tags (find_all returns every match)
images = soup.find_all("img")
images

[,
 <img alt="Tripadvisor" class="XpHHt" src="https://static.tacdn.com/img2/brand_refresh/Tripadvisor_lockup_horizontal_secondary_registered.svg"/>,
 <img alt="Booking.com" class="WtVkd Vm" src="https://static.tacdn.com/img2/branding/hotels/Booking_Com_v2_384x164_Blue.png"/>,
 <img alt="Swimming Pool" fetchpriority="high" height="500" src="https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2e/d5/1a/ad/swimming-pool.jpg?w=900&amp;h=500&amp;s=1" srcset="https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2e/d5/1a/ad/swimming-pool.jpg?w=900&amp;h=500&amp;s=1 1x,https://dynamic-media-cdn.tripadvisor.com/media/photo-o/2e/d5/1a/ad/swimming-pool.jpg?w=1400&amp;h=800&amp;s=1 2x" width="900"/>,
 

In [17]:
# Every object has different properties that can be accessed separately
# Name (the tag name)
images[0].name

'img'

In [18]:
# For example, the inline style (tag attributes are read with dictionary-style indexing)
images[0]["style"]

'display:block;width:1px;height:1px'

In [19]:
# or the source URL of the image (for this first image it is an inline "data:" URI, not a link)
images[0]["src"]

'data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACwAAAAAAQABAAACAkQBADs='

In [20]:
# You can use the "find" and "find_all" methods to search for specific content
# Let's try to read the hotel page content
# Let's start with the "Total number of reviews"
# Open the inspect next to the "bubbles" and then see the tag and identifiable attributes
# <div class="biGQs _P pZUbB KxBGd" data-automation="bubbleReviewCount">
# find_all returns every div with that attribute, so expect more than one match
soup.find_all("div", {"data-automation": "bubbleReviewCount"})

[<div class="biGQs _P pZUbB KxBGd" data-automation="bubbleReviewCount">(2,837 reviews) </div>,
 <div class="biGQs _P pZUbB KxBGd" data-automation="bubbleReviewCount">(2,837 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(4)</div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(167 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(766 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(439 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(947 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(1,156 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(49 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(51 reviews) </div>,
 <div class="biGQs _P pZUbB hmDzD" data-automation="bubbleReviewCount">(55 re

In [21]:
# If we want to store it in a variable
# The first match is the hotel's own review count; get_text() returns it as a string
nreviews = soup.find_all("div", {"data-automation": "bubbleReviewCount"})[0].get_text()
nreviews

'(2,837 reviews) '

In [22]:
# If you just want the result as an integer
# Keep only the digits (this drops the parentheses, the thousands separator, and the word "reviews")
# str() makes the cell safe to re-run: on a second run nreviews is already an integer
nreviews = int(''.join(filter(str.isdigit, str(nreviews))))
nreviews

2837

In [23]:
# What about the rating, how could we read it?

# <svg class="evwcZ"></svg>
# Let's divide the problem into parts

# 1st: Let's read everything whose class contains "evwcZ"
# You can use the "select" method and the "*=" operator (that means "contains") - .find_all is also ok
r = soup.select("svg[class*=evwcZ]")
r

[<svg aria-labelledby=":lithium-r7r:" class="evwcZ" data-automation="bubbleRatingImage" height="16" viewbox="0 0 128 24" width="88"><title id=":lithium-r7r:">4.3 of 5 bubbles</title><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform=""></path><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform="translate(26 0)"></path><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform="translate(52 0)"></path><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform="translate(78 0)"></path><path d="M 12 0C5.389 0 0 5.389 0 12c0 6.62 5.389 12 12 12 6.62 0 12-5.379 12-12S18.621 0 12 0zm0 2a9.984 9.984 0 0110 10 9.976 9.976 0 01-10 10z" transform="translate(104 0)"></path></svg>,
 <svg aria-labelledby=":lithium-r7v:" class="evwcZ" data-automation="bubbleRatingImage" height="16" viewbox="0 0 128 24" width="88"><title id=

In [24]:
# There are many similar results. Why? Let's see the page
# There are ratings per concept, per other hotel, and per review
# As such we should be more specific
# The main rating is the first result, and its text is inside the title element: <title id=" ...
r1=r[0].select("title")[0].string
r1

'4.3 of 5 bubbles'

In [25]:
# 2nd: There are two numbers in the string, so let's break it into parts (split on whitespace)
parts = r1.split()
parts

['4.3', 'of', '5', 'bubbles']

In [26]:
# 3rd: Let's keep only the parts of the list that are numbers
numbers = []
# Try to convert each part; the words raise ValueError and are skipped
for part in parts:
    try:
        num = float(part)
    except ValueError:
        continue
    numbers.append(num)
numbers

[4.3, 5.0]

In [27]:
# 4th: The first one is the rating (the second one is the maximum of the scale)
rating = numbers[0]
rating

4.3

In [28]:
# What about each review rating?

# We can see that the reviews are inside <div class="" data-test-target="HR_CC_CARD">, which are inside <div class="" data-test-target="reviews-tab">
# find returns None when nothing matches, so the find_all line fails if the reviews tab is not in the page
reviewsContainer = soup.find("div", {"data-test-target": "reviews-tab"}) 
reviewsList = reviewsContainer.find_all("div", {"data-test-target": "HR_CC_CARD"})
reviewsList

[<div class="azLzJ MI R2 Gi z Z BB kYVoW tpnRZ" data-test-target="HR_CC_CARD"><div class="MD"><div class="bEBin _Y"><div class="iiEgb"><a class="BMQDV _F Gv wSSLS SwZTJ" href="/Profile/rrippy"><div><div class=""><div class="RvMjF ccudK Rb I o"><div class=""><div class="NhWcC _R o afQPz eXZKw" style="width: 40px; height: 40px;"></div></div></div></div></div></a></div><div class="w o"><div class="tFTbB"><div class="tVWyV _Z o S4 H3 Ci"><a class="BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS" href="/Profile/rrippy"><span class="CjfFL LJbhp">R. Rippy</span></a> wrote a review 8 Apr</div><span class="xLwBc S2 H2 Ch d">Wayne, Pennsylvania</span><span class="xLwBc S2 H2 Ch d"><span><span class="b Ch">243</span> contributions</span></span><span class="xLwBc S2 H2 Ch d"><span><span class="b Ch">121</span> helpful votes</span></span></div></div><div class="sapGY _Y F"><div class="overflow"><span class="GXoUa NF S8 _S MVUuZ"><svg aria-hidden="true" class="d Vb UmNoP" height="1em" viewbox="0 0 24 24" width=

In [29]:
# Let's find each review rating
# The rating is in the class "evwcZ" we saw before
# Let's look inside the first review card
rr1 = reviewsList[0].select("svg[class*=evwcZ]")
rr1

[<svg aria-labelledby=":lithium-rdp:" class="evwcZ" data-automation="bubbleRatingImage" height="16" viewbox="0 0 128 24" width="88"><title id=":lithium-rdp:">5 of 5 bubbles</title><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform=""></path><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform="translate(26 0)"></path><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform="translate(52 0)"></path><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform="translate(78 0)"></path><path d="M 12 0C5.388 0 0 5.388 0 12s5.388 12 12 12 12-5.38 12-12c0-6.612-5.38-12-12-12z" transform="translate(104 0)"></path></svg>,
 <svg aria-labelledby=":lithium-rdq:" class="evwcZ" data-automation="bubbleRatingImage" height="16" viewbox="0 0 128 24" width="88"><title id=":lithium-rdq:">5 of 5 bubbles</title><path d="M 12 0C5.388

In [30]:
# Apply the same steps as before to the first rating found in the card
rr2 = rr1[0].select("title")[0].string

parts = rr2.split()
numbers = []
# Try to convert each part; the words raise ValueError and are skipped
for part in parts:
    try:
        num = float(part)
    except ValueError:
        continue
    numbers.append(num)
reviewRating = numbers[0]
reviewRating

5.0

In [31]:
# To read the user who posted the review, it's similar
# The tag is <span class="CjfFL ...">Name</span> (inside the link to the user's profile)
reviewsList[0].select("span[class*=CjfFL]")[0].string

'R. Rippy'

In [32]:
# QUESTION: And what about the full review text?
# Read the "span" whose class contains "orRIx": <span class="orRIx ...">
q = reviewsList[0].select('span[class*="orRIx"]')[0]
q

<span class="orRIx Ci _a C" data-automation="reviewText_1001605926"><span>Excellent property in a good location. Above average fitness center. Superb staff. Room was real dark and quiet and well appointed. Very big lounge/restaraunt that was great for food and drinks and had live music on a weeknight which is impressive. I would definitely stay at this property again.</span></span>

In [33]:
# So, to read the text, it is just...
q.get_text()

'Excellent property in a good location. Above average fitness center. Superb staff. Room was real dark and quiet and well appointed. Very big lounge/restaraunt that was great for food and drinks and had live music on a weeknight which is impressive. I would definitely stay at this property again.'

In [34]:
# What if we want to get all reviews' ratings, users, and texts?
# Let's loop over the review cards and run the same code on each one
for review in reviewsList:
    # rating
    rr1 = review.select("svg[class*=evwcZ]")
    try:
        rr2=rr1[0].select("title")[0].string
        parts = rr2.split()
        numbers = []
        # Try to convert each part; the words raise ValueError and are skipped
        for part in parts:
            try:
                num = float(part)
            except ValueError:
                continue
            numbers.append(num)
        reviewRating=numbers[0]

        #user
        user = review.select("span[class*=CjfFL]")[0].string

        # text
        q = review.select('span[class*="orRIx"]')[0]
        reviewText = q.get_text()

        # print
        print(user, '\n', reviewRating, '\n' ,reviewText, '\n\n')
    except (IndexError, AttributeError):
        # Catch only the errors a malformed card can produce, so that real bugs
        # and a manual interrupt of the cell are not silently swallowed:
        # IndexError - an expected element is missing from the card
        # AttributeError - the rating title has no single string (rr2 is None)
        print("not valid card")
    

R. Rippy 
 5.0 
 Excellent property in a good location. Above average fitness center. Superb staff. Room was real dark and quiet and well appointed. Very big lounge/restaraunt that was great for food and drinks and had live music on a weeknight which is impressive. I would definitely stay at this property again. 


Dan K 
 5.0 
 Well this was by far the best Sheraton I have ever stayed in.  What a pleasant surprise and a great way to end a long vacation.What made this hotel so special?  Well the staff, from the porter, to the check-in desk to the personnel working the lounge, all were noteworthy for how pleasant, helpful, competent and warm spirited they all were.The hotel rooms were really nice, comfortable and our room had a beautiful view of downtown/the bay area.The hotel rooftop/highest floor provided a stunning view of Lisbon, the water, bridge, sunset etc.  The area was also comfortable, provided swinging chairs and coffee/water.There is a spa service at the hotel as well as a h