Beautiful Soup transforms a complex HTML document into a complex tree of Python objects.

In [49]:
""" bs4 overview (1/5) """

bs4_documentation = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"

# pip install beautifulsoup4
from bs4 import BeautifulSoup

with open('index.html', 'r') as f:
    # create bs4 class instance
    doc = BeautifulSoup(f, "html.parser")

#print(doc.prettify())

# directly access html tags. Returns first tag in doc
# each returned bs4 object has the same methods as its parent bs4 object
title_tag = doc.title # accesses the first appearance of the "title" tag independently on how nested it is
print("# 1 - original title tag:")
print(title_tag.prettify())

print("# 2 - accessing tag attributes and contents:")
print("name: ", title_tag.name)
print("attributes: ", title_tag.attrs)
print("type attribute: ", title_tag['type'])
print("title tag contents 1: ", title_tag.string.strip())
print("title tag contents 2: ", title_tag.text.strip())


# it is possible to change the html as well
# changing a filtered tag element will change the original doc object
title_tag['type'] = 'hello'
title_tag['test'] = 'hello_again'
print("")
print("# 3 - altered title tag:")
print(doc.title.prettify())


# 1 - original title tag:
<title class="title class2" type="boldest">
 The Dormouse's story
</title>

# 2 - accessing tag attributes and contents:
name:  title
attributes:  {'class': ['title', 'class2'], 'type': 'boldest'}
type attribute:  boldest
title tag contents 1:  The Dormouse's story
title tag contents 2:  The Dormouse's story

# 3 - altered title tag:
<title class="title class2" test="hello_again" type="hello">
 The Dormouse's story
</title>



In [50]:
""" bs4 overview (2/5) """

# additional bs4 object methods
print("# contents: ", doc.body.p.contents)
print("# children: ", doc.body.p.children)
print("# descendants: ", doc.body.p.descendants)

print("")
print("# iterate through children:")
for child in doc.body.p.children:
    print("   ", child)

print("")
print("# iterate through descendents:")
for child in doc.body.p.descendants:
    print("   ", child)

# contents:  [<b>The Dormouse's</b>, <i>story</i>]
# children:  <list_iterator object at 0x00000210106D2580>
# descendants:  <generator object Tag.descendants at 0x00000210106D1970>

# iterate through children:
    <b>The Dormouse's</b>
    <i>story</i>

# iterate through descendents:
    <b>The Dormouse's</b>
    The Dormouse's
    <i>story</i>
    story


In [51]:
""" bs4 overview (3/5) """
# find() and find_all() - most important methods do learn!

body_tag = doc.body

print("find() result:")
res = body_tag.find("tr") # difference between this line and body_tag.tr ??
print(res)

print("")
print("find_all() result:")
res = body_tag.find_all("tr", limit=3) # difference between find(tag) and find_all(tag, limit=1) ??
print(res)

find() result:
<tr class="other">row 1: 77</tr>

find_all() result:
[<tr class="other">row 1: 77</tr>, <tr>row 2: 50</tr>, <tr>row 3: 6</tr>]


In [63]:
""" bs4 overview (4/5) """
# other arguments possibilities for find() and find_all() to further filter the search

# through attributes
a = body_tag.find_all(id="link2")

# search specific tag that has a given class
b = body_tag.find_all("a", class_="sister")

# search specific tag that has specific attributes
c = body_tag.find_all(attrs={"class": "other"})

# search specific tag that has specific attributes
d = body_tag.find_all("a", {"class": "sister", "id": "link1"})


In [65]:
# list every match stored in `c` (the class="other" search), numbered from 0
for i, res in enumerate(c):
    print(i, ': ', res)

0 :  <a class="sister other" href="http://example.com/elsie" id="link1">
                Elsie
            </a>
1 :  <tr class="other">row 1: 77</tr>


In [71]:
""" bs4 overview (5/5) """
# regular expressions documentation: https://docs.python.org/3/library/re.html
import re

res = body_tag.find_all("tr", string=re.compile("^row"))
print(res)

print("")

res = body_tag.find("a", href=re.compile(".*tillie"))
print(res['href'])



[<tr class="other">row 1: 77</tr>, <tr>row 2: 50</tr>, <tr>row 3: 6</tr>]

http://example.com/tillie
