Beautiful Soup transforms a complex HTML document into a complex tree of Python objects.

In [48]:
""" bs4 overview (1/5) """

bs4_documentation = "https://www.crummy.com/software/BeautifulSoup/bs4/doc/"

# pip install beautifulsoup4
from bs4 import BeautifulSoup

# Pass the encoding explicitly: without it, text mode decodes with the
# platform's locale default, so the same file could parse differently
# (or garble non-ASCII characters) from one machine to the next.
# NOTE(review): assumes index.html is saved as UTF-8 - adjust if it is not.
with open('index.html', 'r', encoding='utf-8') as f:
    # parse the markup; doc is the root object of the resulting tree
    doc = BeautifulSoup(f, "html.parser")

# Uncomment to dump the whole parsed document:
# print(doc.prettify())

# Accessing a tag name as an attribute (doc.title) returns the first tag with
# that name, regardless of how deeply it is nested.
# Each returned bs4 object has the same methods as its parent bs4 object.
title_tag = doc.title
print("# 1 - original title tag:")
print(title_tag.prettify())

print("# 2 - accessing tag attributes and contents:")
print("name: ", title_tag.name)
# .attrs holds all attributes as a dict (the recorded output shows `class` as a list)
print("attributes: ", title_tag.attrs)
# a single attribute can be read with dict-style indexing on the tag
print("type attribute: ", title_tag['type'])
print("title tag contents 1: ", title_tag.string.strip())


# It is possible to change the HTML as well.
# title_tag is a reference into the tree, not a copy, so changing it changes
# the original doc object (section 3 below prints doc.title to show this).
title_tag['type'] = 'hello'        # overwrite an existing attribute
title_tag['test'] = 'hello_again'  # add a new attribute
print("")
print("# 3 - altered title tag:")
print(doc.title.prettify())


# 1 - original title tag:
<title class="title class2" type="boldest">
 The Dormouse's story
</title>

# 2 - accessing tag attributes and contents:
name:  title
attributes:  {'class': ['title', 'class2'], 'type': 'boldest'}
type attribute:  boldest
title tag contents 1:  The Dormouse's story

# 3 - altered title tag:
<title class="title class2" test="hello_again" type="hello">
 The Dormouse's story
</title>



In [49]:
""" bs4 overview (2/5) """

# first <p> tag found under <body>; everything below navigates from it
p_tag = doc.body.p

# additional bs4 object methods
# .contents is a list of the tag's direct children (nested tags and text)
print("# contents: ", p_tag.contents)
# .children and .descendants are lazy: printing them directly only shows an
# iterator / generator object, so they are looped over further down
print("# children: ", p_tag.children)
print("# descendants: ", p_tag.descendants)

print("")
print("# iterate through children:")
# direct children only - the same nodes that .contents lists
for child in p_tag.children:
    print("   ", child)

print("")
print("# iterate through descendents:")
# descendants also walks into nested tags: each tag is followed by the nodes inside it
for child in p_tag.descendants:
    print("   ", child)

# contents:  [<b>The <u>Dormouse's</u></b>, <i>story</i>, ' of fear']
# children:  <list_iterator object at 0x0000023C0C0524C0>
# descendants:  <generator object Tag.descendants at 0x0000023C0DB7B2E0>

# iterate through children:
    <b>The <u>Dormouse's</u></b>
    <i>story</i>
     of fear

# iterate through descendents:
    <b>The <u>Dormouse's</u></b>
    The 
    <u>Dormouse's</u>
    Dormouse's
    <i>story</i>
    story
     of fear


In [50]:
""" bs4 overview (3/5) """
# find() and find_all() - most important methods to learn!

# search only inside <body>; body_tag is reused by the later cells
body_tag = doc.body

# find() returns the first matching tag only
print("find() result:")
res = body_tag.find("tr")
print(res)

print("")
print("find_all() result:")
# find_all() returns a list of matches; limit=3 stops the search after three hits.
# find(tag) vs find_all(tag, limit=1): both stop at the first match, but find()
# returns the tag itself (None if nothing matches), whereas find_all() returns a
# list holding that one tag (an empty list if nothing matches).
res = body_tag.find_all("tr", limit=3)
print(res)

find() result:
<tr class="other">row 1: 77</tr>

find_all() result:
[<tr class="other">row 1: 77</tr>, <tr>row 2: 50</tr>, <tr>row 3: 6</tr>]


In [51]:
""" bs4 overview (4/5) """
# other argument forms for find() and find_all() that narrow the search further
# (body_tag is the <body> tag obtained in the previous cell)

# keyword argument: any tag whose id attribute is "link2"
a = body_tag.find_all(id="link2")

# attrs dict: any tag whose class attribute includes "other"
b = body_tag.find_all(attrs={"class": "other"})

# tag name + class_ keyword: <a> tags with the class "sister"
# (spelled class_ because class is a reserved word in Python)
c = body_tag.find_all("a", class_="sister")

# tag name + dict as the second positional argument (attrs):
# <a> tags that match every attribute listed in the dict
d = body_tag.find_all("a", {"class": "sister", "id": "link1"})


In [54]:
# Print every tag returned by the id="link2" search (stored in `a`),
# prefixed with its position in the result list.
for i, res in enumerate(a):
    print(i, ': ', res)

0 :  <a class="sister" href="http://example.com/lacie" id="link2">
                Lacie
            </a>


In [53]:
""" bs4 overview (5/5) """
# regular expressions documentation: https://docs.python.org/3/library/re.html
import re

# a compiled pattern can be used as a filter wherever a plain string is accepted
pattern1 = re.compile("^row") # matches strings that start with "row"
# string=... filters on text; the result holds the matching strings, not the tags around them
res = body_tag.find_all(string=pattern1)
print(res)

print("")

pattern2 = re.compile("tillie$") # matches strings that end with "tillie"
# here the pattern is tested against the value of each tag's href attribute
res = body_tag.find_all(href=pattern2)
# res[0] is the first matching tag; indexing raises IndexError if nothing matched
print(res[0]['href'])



['row 1: 77', 'row 2: 50', 'row 3: 6']

http://example.com/tillie
