Beautiful Soup transforms a complex HTML document into a complex tree of Python objects.

In [18]:
""" bs4 overview (1/5) """

# Reference documentation for everything demonstrated in this notebook.
bs4_documentation = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"

# pip install beautifulsoup4
from bs4 import BeautifulSoup

# Pass the encoding explicitly: without it, open() decodes with the platform's
# locale default, so the same file can be read differently on another machine.
# NOTE(review): assumes index.html is saved as UTF-8 -- confirm.
with open('index.html', 'r', encoding='utf-8') as f:
    # create bs4 class instance; the file is consumed here, so `doc` remains
    # usable after the `with` block has closed the file
    doc = BeautifulSoup(f, "html.parser")


print("# title tag:")
# Attribute access (doc.title) returns the first tag of that name in the doc.
print(doc.title.prettify())

print("# accessing tag attributes and contents:")
print("name: ", doc.title.name)
print("attributes: ", doc.title.attrs)
# Subscripting a tag looks up a single attribute by name.
print("type attribute: ", doc.title['type'])
# .string is the tag's text; strip() drops the surrounding whitespace.
print("title tag contents 1: ", doc.title.string.strip())

# Attribute access can be chained to walk down through nested tags.
print("Nested tag: ", doc.body.p.b.u.string)

# title tag:
<title class="title class2" type="boldest">
 The Dormouse's story
</title>

# accessing tag attributes and contents:
name:  title
attributes:  {'class': ['title', 'class2'], 'type': 'boldest'}
type attribute:  boldest
title tag contents 1:  The Dormouse's story
Nested tag:  Dormouse's


In [19]:
""" bs4 overview (2/6) """

# Grab a handle on the <title> tag. It is a reference into the parsed tree,
# not a copy, so edits made through it are visible when reading `doc` again.
title_tag = doc.title

print("# 1 - original title tag:")
print(title_tag.prettify())

# Overwrite an existing attribute ('type') and add a brand-new one ('test').
# NOTE: this mutates `doc` in place -- re-running the cell without re-parsing
# the file will show the already-altered tag under heading 1.
for attr_name, attr_value in (('type', 'hello'), ('test', 'hello_again')):
    title_tag[attr_name] = attr_value

print("# 2 - altered title tag:")
print(doc.title.prettify())

# 1 - original title tag:
<title class="title class2" type="boldest">
 The Dormouse's story
</title>

# 2 - altered title tag:
<title class="title class2" test="hello_again" type="hello">
 The Dormouse's story
</title>



In [12]:
""" bs4 overview (3/6) """

p_tag = doc.body.p

# Three ways of looking below a tag. Only .contents is a list; the other two
# are lazy iterables, so printing them shows an object repr instead of nodes.
for label, view in (
    ("# contents: ", p_tag.contents),
    ("# children: ", p_tag.children),
    ("# descendants: ", p_tag.descendants),
):
    print(label, view)

print()
print("# iterate through children:")
# direct children only (one level below the <p> tag)
for child in p_tag.children:
    print("   ", child)

print()
print("# iterate through descendents:")
# every nested node: each tag is followed by its own contents
for child in p_tag.descendants:
    print("   ", child)

# contents:  [<b>The <u>Dormouse's</u></b>, <i>story</i>, ' of fear']
# children:  <list_iterator object at 0x0000018A1C8CAFD0>
# descendants:  <generator object Tag.descendants at 0x0000018A1C91F900>

# iterate through children:
    <b>The <u>Dormouse's</u></b>
    <i>story</i>
     of fear

# iterate through descendents:
    <b>The <u>Dormouse's</u></b>
    The 
    <u>Dormouse's</u>
    Dormouse's
    <i>story</i>
    story
     of fear


In [13]:
""" bs4 overview (4/6) """
# find() and find_all() are the two core search methods -- learn these first!

body_tag = doc.body

# find() hands back the first match on its own.
print("find() result:")
res = body_tag.find(name="tr")
print(res)

print()

# find_all() collects matches in a list; `limit` caps how many are gathered.
# On find(tag) vs find_all(tag, limit=1): the same tag is located either way,
# but find_all() still wraps it in a one-element list.
print("find_all() result:")
res = body_tag.find_all(name="tr", limit=3)
print(res)

find() result:
<tr class="other">row 1: 77</tr>

find_all() result:
[<tr class="other">row 1: 77</tr>, <tr>row 2: 50</tr>, <tr>row 3: 6</tr>]


In [14]:
""" bs4 overview (5/6) """
# find() and find_all() accept further arguments that narrow the search

# any tag whose id attribute is "link2" (a keyword argument filters on
# the attribute of the same name)
a = body_tag.find_all(id="link2")

# the same kind of filter, written as an explicit attrs dict
b = body_tag.find_all(attrs={"class": "other"})

# tag name plus one attribute; spelled `class_` because `class` is reserved
c = body_tag.find_all(name="a", class_="sister")

# tag name plus several attributes given together in one dict
d = body_tag.find_all(name="a", attrs={"class": "sister", "id": "link1"})


In [15]:
# show every match held in `a`, prefixed with its position in the result list
for i, res in enumerate(a):
    print(i, ': ', res)

0 :  <a class="sister" href="http://example.com/lacie" id="link2">
                Lacie
            </a>


In [16]:
""" bs4 overview (6/6) """
# regular expressions documentation: https://docs.python.org/3/library/re.html
import re

# Compiled patterns can stand in for literal values in a search filter.

# string= filters on text: keep the strings that begin with "row"
pattern1 = re.compile("^row")
res = body_tag.find_all(string=pattern1)
print(res)

print()

# a keyword filter tests that attribute: href values that end with "tillie"
pattern2 = re.compile("tillie$")
res = body_tag.find_all(href=pattern2)
# res[0] is the first matching tag; subscripting pulls out its href value
print(res[0]['href'])



['row 1: 77', 'row 2: 50', 'row 3: 6']

http://example.com/tillie
