# Web parsing with Python, Beautiful Soup and Selenium

### 1. Get any HTML

It's very easy to extract the source code of a web page in Python

In [7]:
import requests

In [3]:
# a very lightweight website
url = 'https://lite.cnn.com/en'

In [4]:
# Let's render it here (I love Jupyter)
from IPython.display import IFrame
IFrame(src=url, width='100%', height='250ps')

In [8]:
answer = requests.get(url)

In [9]:
# what could we do with an answer
print(answer.url)
print(answer.status_code)
print(answer.reason)

https://lite.cnn.com/en
200
OK


In [10]:
print(answer.content)



**That looks like a lot of things. We have to somehow navigate through HTML**

### 2. Use BS

In [12]:
from bs4 import BeautifulSoup

In [13]:
soup = BeautifulSoup(answer.content, 'html.parser')

In [14]:
# now we can recognize some structure
print(soup.prettify())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="/static/main.650c7b8fbdce6a28161cf1157d8d5800.css" rel="stylesheet" type="text/css"/>
  <title data-react-helmet="true">
   CNN - Breaking News, Latest News and Videos
  </title>
  <meta content="View the latest news and breaking news today for U.S., world, weather, entertainment, politics and health at CNN.com." data-react-helmet="true" name="description"/>
 </head>
 <body>
  <div id="mount">
   <div>
    <div style="padding:10px;">
     <a href="/es">
      <strong style="color:#c00;">
       CNN
      </strong>
     </a>
     |
     <span>
      5/7/2022
     </span>
     |
     <a href="/en/audioplayer">
      Listen
     </a>
    </div>
    <hr/>
    <div class="afe4286c">
     <strong>
      Main Stories
     </strong>
     <ul>
      <li>
       <

In [15]:
soup.title

<title data-react-helmet="true">CNN - Breaking News, Latest News and Videos</title>

In [16]:
# let's find the links
soup.find_all('a')[:10]

[<a href="/es"><strong style="color:#c00;">CNN</strong></a>,
 <a href="/en/audioplayer">Listen</a>,
 <a href="/en/article/h_5f0abdd04d273724488ab299e0e0b835">Sailors say aircraft carrier that had multiple suicides occur among crew was uninhabitable</a>,
 <a href="/en/article/h_326e1c1bc0874ecee3926bc8f1a634be">More than 200 sailors moved off aircraft carrier after multiple suicides</a>,
 <a href="/en/article/h_d8f86ff27f835eed5c15bdbf80f64e50">Parents of sailor who died by suicide on USS George Washington blast Navy's 'ridiculous' response</a>,
 <a href="/en/article/h_c5fb1c3d2d41ec684f29d940ce033a99">Navy opens investigation after 4 deaths by suicide among aircraft carrier crew</a>,
 <a href="/en/article/h_79a4fbbc97d656c08f8e6cd0aa2c67a7">Authorities made a key discovery in their search for the missing Alabama officer and inmate. It led to more questions</a>,
 <a href="/en/article/h_a16bd1e151b99270554991c790ef47bc">Coronavirus wave this fall and winter could potentially infect 100 m

In [17]:
# and get the title of one
soup.find_all('a')[5].get_text()

'Navy opens investigation after 4 deaths by suicide among aircraft carrier crew'

Now to serious business!

### 3. Scrape Airbnb page

Let's get to the website and look for some apartments

In [38]:
# Let's plan a trip to Austrian Alps
airbnb_url = 'https://www.airbnb.ca/s/Canmore--Alberta--Canada/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=may&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&query=Canmore%2C%20Alberta%2C%20Canada&place_id=ChIJMWNFlZXFcFMRLmkenl8xtkY&checkin=2022-05-20&checkout=2022-05-27&adults=2&source=structured_search_input_header&search_type=autocomplete_click'

In [39]:
soup = BeautifulSoup(requests.get(airbnb_url).content, 'html.parser')

In [40]:
print(soup.prettify())

<!DOCTYPE html>
<html data-is-hyperloop="true" dir="ltr" lang="en-CA">
 <meta charset="utf-8"/>
 <meta content="en-CA" name="locale"/>
 <meta content="notranslate" name="google"/>
 <meta content="authenticity_token" id="csrf-param-meta-tag" name="csrf-param"/>
 <meta content="" id="csrf-token-meta-tag" name="csrf-token"/>
 <meta content="" id="english-canonical-url"/>
 <meta content="on" name="twitter:widgets:csp"/>
 <meta content="yes" name="mobile-web-app-capable"/>
 <meta content="yes" name="apple-mobile-web-app-capable"/>
 <meta content="Airbnb" name="application-name"/>
 <meta content="Airbnb" name="apple-mobile-web-app-title"/>
 <meta content="#ffffff" name="theme-color"/>
 <meta content="#ffffff" name="msapplication-navbutton-color"/>
 <meta content="black-translucent" name="apple-mobile-web-app-status-bar-style"/>
 <meta content="/?utm_source=homescreen" name="msapplication-starturl"/>
 <script>
  (function() {
  var pgRequest = new XMLHttpRequest();
  var diffStamp = Date.now(

### 4. Inspect elements

Press F12 ;)

### 5. Scrape 1 element

In [44]:
soup.find_all('div', '_gig1e7')

[<div class="_gig1e7"><div class="c4mnd7m dir dir-ltr"><div class="c1pbo4kt cikyho8 dir dir-ltr" style="--transition-element_transition-delay:0ms;--transition-element_transition-duration:200ms"><div class="_11ry7lz" data-testid="shimmer-legacy-listing-section-item"><div class="_o1549l" style="margin-top:12px;margin-bottom:24px"></div><div class="c1k4n2ms dir dir-ltr"><div class="mertcch dir dir-ltr"><span aria-busy="true" style="display:block;height:100%;width:100%;background-color:white;border-radius:var(--i-g-gvoq)"><span class="_8kplbn" style="animation-play-state:running;height:100%;width:100%;border-radius:var(--i-g-gvoq)"></span></span></div><div class="iabnva0 dir dir-ltr"><div style="margin-bottom:4px"><span aria-busy="true" style="display:block;height:18px;width:40%;background-color:white;border-radius:10px"><span class="_8kplbn" style="animation-play-state:running;height:100%;width:100%;border-radius:10px"></span></span></div><span aria-busy="true" style="display:block;height

In [53]:
# we can also extract its child tag
soup.find_all('div', 'c4mnd7m')

[<div class="c4mnd7m dir dir-ltr"><div class="c1pbo4kt cikyho8 dir dir-ltr" style="--transition-element_transition-delay:0ms;--transition-element_transition-duration:200ms"><div class="_11ry7lz" data-testid="shimmer-legacy-listing-section-item"><div class="_o1549l" style="margin-top:12px;margin-bottom:24px"></div><div class="c1k4n2ms dir dir-ltr"><div class="mertcch dir dir-ltr"><span aria-busy="true" style="display:block;height:100%;width:100%;background-color:white;border-radius:var(--i-g-gvoq)"><span class="_8kplbn" style="animation-play-state:running;height:100%;width:100%;border-radius:var(--i-g-gvoq)"></span></span></div><div class="iabnva0 dir dir-ltr"><div style="margin-bottom:4px"><span aria-busy="true" style="display:block;height:18px;width:40%;background-color:white;border-radius:10px"><span class="_8kplbn" style="animation-play-state:running;height:100%;width:100%;border-radius:10px"></span></span></div><span aria-busy="true" style="display:block;height:24px;width:90%;backg

In [54]:
listings = soup.find_all('div', 'c4mnd7m')

In [55]:
listings[0]

<div class="c4mnd7m dir dir-ltr"><div class="c1pbo4kt cikyho8 dir dir-ltr" style="--transition-element_transition-delay:0ms;--transition-element_transition-duration:200ms"><div class="_11ry7lz" data-testid="shimmer-legacy-listing-section-item"><div class="_o1549l" style="margin-top:12px;margin-bottom:24px"></div><div class="c1k4n2ms dir dir-ltr"><div class="mertcch dir dir-ltr"><span aria-busy="true" style="display:block;height:100%;width:100%;background-color:white;border-radius:var(--i-g-gvoq)"><span class="_8kplbn" style="animation-play-state:running;height:100%;width:100%;border-radius:var(--i-g-gvoq)"></span></span></div><div class="iabnva0 dir dir-ltr"><div style="margin-bottom:4px"><span aria-busy="true" style="display:block;height:18px;width:40%;background-color:white;border-radius:10px"><span class="_8kplbn" style="animation-play-state:running;height:100%;width:100%;border-radius:10px"></span></span></div><span aria-busy="true" style="display:block;height:24px;width:90%;backgr

In [56]:
listings[0].find_all('a')[0].get('href')

'/rooms/42088343?adults=2&check_in=2022-05-20&check_out=2022-05-27&previous_page_section_name=1000&federated_search_id=71748c10-400f-4f25-90be-9c1ade63dee2'

In [57]:
listings[0].get_text()

'SUPERHOSTEntire condo in Canmore1BR*stunning views*hot tubs*steam room*gym & more4 guests · 1 bedroom · 2 beds · 1 bathKitchen · WifiRare find,&nbsp;\xa0·\xa0This place is usually booked.Rare find,&nbsp;\xa0·\xa0This place is usually booked.4.88\xa0(94 reviews)$248 CAD\xa0night$248 CAD per night$1,736 CAD total$1,736 CAD totalView price breakdown'

### 6. Inspect all data elements on search page

**smithio.medium.com**

<img src='https://miro.medium.com/max/700/1*GLNHp0QOf5qZiHa1bnaRvg.png'>

In [50]:
# url: tag=a, get=href
# name: tag=div, class=_hxt6u1e, get=aria-label
# header: tag=div, class=_b14dlit

### 7. Write a scraping function

In [70]:
# First Generation :)
def extract_basic_features(listing_html):
    features_dict = {}
    
    url = listing_html.find('a').get('href')
    name = listing_html.find("div", {"class": "c1bx80b8"}).get_text()
    header = listing_html.find("div", {"class": "mj1p6c8"}).get_text()
    
    features_dict['url'] = url
    features_dict['name'] = name
    features_dict['header'] = header
    
    return features_dict

In [71]:
extract_basic_features(listings[0])

{'url': '/rooms/42088343?adults=2&check_in=2022-05-20&check_out=2022-05-27&previous_page_section_name=1000&federated_search_id=71748c10-400f-4f25-90be-9c1ade63dee2',
 'name': '1BR*stunning views*hot tubs*steam room*gym & more',
 'header': 'Entire condo in Canmore'}

In [None]:
# what if the tag is not found?
listings[0].find('b').get_text()

AttributeError: 'NoneType' object has no attribute 'get_text'

In [58]:
# Second Generation :)
def extract_basic_features(listing_html):
    features_dict = {}
    
    try:
        url = listing_html.find('b').get('href')
    except:
        url = 'empty'
    try:
        name = listing_html.find("div", {"class": "_hxt6u1e"}).get('aria-label')
    except:
        name = 'empty'
    try:
        header = listing_html.find("div", {"class": "_b14dlit"}).text
    except:
        header = 'empty'
    
    
    features_dict['url'] = url
    features_dict['name'] = name
    features_dict['header'] = header
    
    return features_dict

In [59]:
extract_basic_features(listings[0])

{'url': 'empty', 'name': 'empty', 'header': 'empty'}

In [None]:
# too many separate extractions
RULES_SEARCH_PAGE = {
    'url': {'tag': 'a', 'get': 'href'},
    'name': {'tag': 'div', 'class': '_hxt6u1e', 'get': 'aria-label'},
    'header': {'tag': 'div', 'class': '_b14dlit'},
    'rooms': {'tag': 'div', 'class': '_kqh46o'},
    'facilities': {'tag': 'div', 'class': '_kqh46o', 'order': 1},
    'badge': {'tag': 'div', 'class': '_17bkx6k'},
    'rating_n_reviews': {'tag': 'span', 'class': '_18khxk1'},
    'price': {'tag': 'span', 'class': '_1p7iugi'},
    'superhost': {'tag': 'div', 'class': '_ufoy4t'},
}

In [None]:
# Third Generation :)
def extract_element(listing_html, params):
    # 1. Find the right tag
    if 'class' in params:
        elements_found = listing_html.find_all(params['tag'], params['class'])
    else:
        elements_found = listing_html.find_all(params['tag'])

    # 2. Extract the right element
    tag_order = params.get('order', 0)
    element = elements_found[tag_order]
        
    # 3. Get text
    if 'get' in params:
        output = element.get(params['get'])
    else:
        output = element.get_text()

    return output

In [None]:
print(extract_element(listings[0], RULES_SEARCH_PAGE['header']))
print(extract_element(listings[0], RULES_SEARCH_PAGE['url']))

In [None]:
for feature in RULES_SEARCH_PAGE:
    print(f"{feature}: {extract_element(listings[0], RULES_SEARCH_PAGE[feature])}")

In [None]:
for feature in RULES_SEARCH_PAGE:
    try:
        print(f"{feature}: {extract_element(listings[0], RULES_SEARCH_PAGE[feature])}")
    except:
        print(f"{feature}: empty")

YAY!!! We're extracted all the features from one listing!

### 8. Explore pagination

<img src='https://miro.medium.com/max/564/1*Q9iBSu5nniBwc8Wt2-8Ujw.png'>

In [None]:
airbnb_url

In [None]:
# let's finally write this function
def get_listings(search_page):
    soup = BeautifulSoup(requests.get(search_page).content, 'html.parser')
    listings = soup.find_all('div', '_8s3ctt')

    return listings

In [None]:
# it works
len(get_listings(airbnb_url))

In [None]:
# let's try next page
new_url = airbnb_url + '&items_offset=20'
len(get_listings(new_url))

In [None]:
# checking the content, if the data is there
print(extract_element(get_listings(airbnb_url)[0], RULES_SEARCH_PAGE['name']))
print(extract_element(get_listings(new_url)[0], RULES_SEARCH_PAGE['name']))

### 9. Collect all urls

In [None]:
# let's iterate through all 15 pages
all_listings = []
for i in range(15):
    offset = 20 * i
    new_url = airbnb_url + f'&items_offset={offset}'
    new_listings = get_listings(new_url)
    all_listings.extend(new_listings)
    
    # let's check if it's scraping
    print(len(all_listings))

In [None]:
# why? maybe Airbnb tries to prevent scraping
# let's wait a couple of seconds after every iteration
import time

all_listings = []
for i in range(15):
    offset = 20 * i
    new_url = airbnb_url + f'&items_offset={offset}&section_offset=3'
    new_listings = get_listings(new_url)
    all_listings.extend(new_listings)
    
    # let's check if it's scraping
    print(len(all_listings))

    time.sleep(2)

Not perfect but some improvement

In [None]:
# another random check, if the data is there
print(extract_element(all_listings[113], RULES_SEARCH_PAGE['name']))

### 10. Scrape all search pages

1. build all urls
2. iteratively scrape them

In [None]:
# 1. build all urls
def build_urls(main_url, listings_per_page=20, pages_per_location=15):
    url_list = []
    for i in range(pages_per_location):
        offset = listings_per_page * i
        url_pagination = main_url + f'&items_offset={offset}'
        url_list.append(url_pagination)
    
    return url_list

In [None]:
# safe function to extract all features from one page
def extract_page_features(soup, rules):
    features_dict = {}
    for feature in rules:
        try:
            features_dict[feature] = extract_element(soup, rules[feature])
        except:
            features_dict[feature] = 'empty'
    
    return features_dict

In [None]:
# 2. Iteratively scrape pages
def process_search_pages(url_list):
    features_list = []
    for page in url_list:
        listings = get_listings(page)
        for listing in listings:
            features = extract_page_features(listing, RULES_SEARCH_PAGE)
            features_list.append(features)

    return features_list

In [None]:
# build a list of URLs
url_list = build_urls(airbnb_url)

In [None]:
url_list

In [None]:
# try for one page
base_features = process_search_pages(url_list[:1])

In [None]:
base_features

### 11. Look at it

https://github.com/x-technology/airbnb-analytics/blob/main/Part%201%20-%20Web%20Scraping/data_sample.csv

# All imports in one cell (just in case)

In [None]:
# all imports
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import json
import time

import pandas as pd

from multiprocessing import Pool

import os