[Reference](https://medium.com/@datajournal/how-to-parse-html-with-python-94495c11bc96)

# BeautifulSoup

In [1]:
import requests
from bs4 import BeautifulSoup
# Fetch the HTML content of the webpage.
url = "https://example.com"
# requests applies no timeout by default, so an unresponsive server would
# hang this cell indefinitely; bound the wait explicitly.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()
# Parse the HTML content using BeautifulSoup (pure-Python "html.parser" backend).
soup = BeautifulSoup(response.text, "html.parser")
# Extract the title of the webpage. soup.title is None when the document has
# no <title> element, so guard the attribute access.
title = soup.title.text if soup.title is not None else None
print("Page Title:", title)

Page Title: Example Domain


# lxml

In [5]:
pip install lxml



In [6]:
from lxml import html
import requests
# Fetch the HTML content.
url = "https://example.com"
# requests applies no timeout by default; bound the wait so the cell cannot
# hang forever on an unresponsive server.
response = requests.get(url, timeout=10)
# Raise on 4xx/5xx responses rather than parsing an error page.
response.raise_for_status()
# Parse the raw response bytes with lxml.
tree = html.fromstring(response.content)
# Extract the title of the webpage; findtext returns None when there is no
# <title> element.
title = tree.findtext('.//title')
print("Page Title:", title)

Page Title: Example Domain


In [8]:
# Collect the href attribute of every <a> element via XPath
# (relies on `tree` built in the previous cell).
links = tree.xpath('//a/@href')
# Emit one URL per line; print nothing at all when no links were found.
if links:
    print("\n".join(links))

https://www.iana.org/domains/example


# html.parser

In [10]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that prints each parsing event to stdout."""

    def handle_starttag(self, tag, attrs):
        """Print the name of an opening tag; its attributes are not reported."""
        print(f"Start tag: {tag}")

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print(f"End tag: {tag}")

    def handle_data(self, data):
        """Print a run of text found between tags (whitespace included)."""
        print(f"Data: {data}")

# Sample HTML to parse
html_content = """
<html>
<head><title>Example</title></head>
<body><p>Hello, world!</p></body>
</html>
"""
# Create an instance of the parser and feed it the HTML content
parser = MyHTMLParser()
parser.feed(html_content)
# feed() may hold back incomplete trailing input; close() forces any buffered
# data to be processed as if end-of-file had been reached.
parser.close()

Data: 

Start tag: html
Data: 

Start tag: head
Start tag: title
Data: Example
End tag: title
End tag: head
Data: 

Start tag: body
Start tag: p
Data: Hello, world!
End tag: p
End tag: body
Data: 

End tag: html
Data: 



# Selenium

In [11]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [13]:
from selenium import webdriver
from bs4 import BeautifulSoup
# Set up the Selenium driver (ensure you have a driver like ChromeDriver installed)
driver = webdriver.Chrome()
try:
    # Open the webpage
    url = "https://example.com"
    driver.get(url)
    # Snapshot the browser's current DOM as an HTML string. Content that
    # JavaScript injects asynchronously after the initial load may need an
    # explicit wait before this line.
    html_content = driver.page_source
finally:
    # Always shut the browser down, even if navigation fails, so no orphaned
    # browser/driver process is left running.
    driver.quit()
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
# Extract the title; soup.title is None when the page has no <title> element
title = soup.title.text if soup.title is not None else None
print("Page Title:", title)