In [3]:
# Sample HTML document used by the cells below.
# Note that <body> and <html> are never closed in this markup; the parsed
# tree displayed later shows them closed.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [4]:
# Echo the raw string; its repr shows the embedded newlines as \n.
html_doc

'\n<html><head><title>The Dormouse\'s story</title></head>\n<body>\n<p class="title"><b>The Dormouse\'s story</b></p>\n\n<p class="story">Once upon a time there were three little sisters; and their names were\n<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,\n<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and\n<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</p>\n\n<p class="story">...</p>\n'

In [5]:
from bs4 import BeautifulSoup
# Parse the sample document, naming the parser explicitly ('html.parser' is Python's built-in one).
soup = BeautifulSoup(html_doc, 'html.parser')

In [6]:
# Display the parsed tree; <body> and <html>, left open in the source string, are closed here.
soup


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [7]:
# prettify() returns the tree as an indented string with each tag and text node on its own line;
# print() is needed so the newlines are rendered rather than shown as \n.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [8]:
# Attribute access by tag name returns the matching tag, here <title>.
soup.title

<title>The Dormouse's story</title>

In [9]:
# .name is the tag's name as a string.
soup.title.name

'title'

In [10]:
# .parent is the enclosing element: <title> sits inside <head>.
soup.title.parent.name

'head'

In [11]:
# Two levels up from <title> is <html>.
soup.title.parent.parent.name

'html'

In [12]:
# Three levels up is the top of the tree, whose name is reported as '[document]'.
soup.title.parent.parent.parent.name

'[document]'

In [13]:
# .string is the text contained in <title>.
soup.title.string

"The Dormouse's story"

In [14]:
# The document has three <p> tags; attribute access returns only the first one.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [15]:
# Likewise, soup.a is the first <a> in the document.
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [16]:
# find_all() returns every matching tag as a list.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [17]:
# All three <p> tags, each displayed with its full contents.
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [18]:
# find() with a keyword argument matches on that attribute and returns a single tag.
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [28]:
# For every <a>: its href, the tag itself, its text and its id, followed by a separator line.
for link in soup.find_all('a'):
    print(link.get('href'), link, link.string, link.get('id'), '---', sep='\n')

http://example.com/elsie
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
link1
---
http://example.com/lacie
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
link2
---
http://example.com/tillie
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
link3
---


In [29]:
# get_text() returns the document's text with the tags removed; the whitespace
# between tags is kept, hence the blank lines in the output.
print(soup.get_text())


The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [33]:
# The &eacute; entity is converted to the corresponding character on parsing.
# The parser is named explicitly instead of letting bs4 pick one: the recorded
# output (wrapped in <html><body><p>, no <head>) is what lxml produces, so lxml
# is pinned to keep the result the same wherever the notebook runs.
BeautifulSoup("Sacr&eacute; bleu!", 'lxml')

<html><body><p>Sacré bleu!</p></body></html>

In [34]:
# Rebind `soup` to a one-tag document for the Tag examples that follow.
# The parser is pinned to lxml (matching the <html><body> wrapping in the recorded
# output) so the tree does not depend on which parsers happen to be installed.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')

In [35]:
# Keep a reference to the <b> tag; the following cells rename and modify it through this name.
tag = soup.b

In [36]:
# Elements of the tree are bs4.element.Tag objects.
type(tag)

bs4.element.Tag

In [37]:
# Show the tree built from the fragment: the <b> tag ends up wrapped in <html><body>.
print(soup.prettify())

<html>
 <body>
  <b class="boldest">
   Extremely bold
  </b>
 </body>
</html>


In [38]:
# The tag's current name.
tag.name

'b'

In [39]:
# Assigning to .name renames the tag in place; later displays of `tag` show <blockquote>.
tag.name = "blockquote"

In [40]:
# Confirm the rename.
tag.name

'blockquote'

In [41]:
# The tag has no "id" attribute yet, so item access raises KeyError.
# The error is the point of this cell, so catch and print it rather than let it
# propagate: an uncaught exception would stop Restart & Run All at this cell.
try:
    tag['id']
except KeyError as err:
    print("KeyError:", err)

KeyError: 'id'

In [42]:
# .attrs is the tag's attribute dictionary.
tag.attrs

{'class': ['boldest']}

In [43]:
# Attributes are read like dictionary items; 'class' comes back as a list.
tag['class']

['boldest']

In [44]:
# Attributes are added by item assignment. The integer value is rendered as "1"
# when the tag is displayed.
tag['id'] = 'verybold'
tag['another-attribute'] = 1

In [45]:
# Display the tag with its new name and the two added attributes.
tag

<blockquote another-attribute="1" class="boldest" id="verybold">Extremely bold</blockquote>

In [46]:
# Attributes are removed with `del`, as with a dictionary.
del tag['id']

In [47]:
# The id attribute is gone; the other attributes remain.
tag

<blockquote another-attribute="1" class="boldest">Extremely bold</blockquote>

In [48]:
# A <p> with a single CSS class. The parser is named explicitly (lxml, consistent
# with the other fragment cells) so the result does not depend on which parsers
# are installed.
css_soup = BeautifulSoup('<p class="body"></p>', 'lxml')

In [49]:
# Even a single class value is returned as a one-element list.
css_soup.p['class']

['body']

In [50]:
# Same element, now with two space-separated CSS classes. The parser is named
# explicitly (lxml, consistent with the other fragment cells) so the result does
# not depend on which parsers are installed.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'lxml')

In [51]:
# The space-separated class value is split into a list with one entry per class.
css_soup.p['class']

['body', 'strikeout']

In [52]:
# `rel` is returned as a list, just like `class`.
# The parser is named explicitly (lxml, consistent with the other fragment cells)
# so the result does not depend on which parsers are installed.
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'lxml')
rel_soup.a['rel']

['index']

In [53]:
# Assigning a list to a multi-valued attribute: the values are joined with a
# space when the tag is written out.
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

<p>Back to the <a rel="index contents">homepage</a></p>


In [59]:
# The text inside the (renamed) tag.
tag.string

'Extremely bold'

In [60]:
# Text inside a tag is a NavigableString, not a plain str.
type(tag.string)

bs4.element.NavigableString

In [64]:
# str() converts the NavigableString into a plain Python string; show the value and its type.
unicode_string = str(tag.string)
print(unicode_string, type(unicode_string), sep='\n')

Extremely bold
<class 'str'>


In [65]:
# replace_with() swaps the string inside the tree and returns the string that was replaced.
tag.string.replace_with("No longer bold")

'Extremely bold'

In [66]:
# The tag now contains the replacement text.
tag

<blockquote another-attribute="1" class="boldest">No longer bold</blockquote>

In [67]:
# The BeautifulSoup object itself reports the special name '[document]'.
soup.name

'[document]'

In [68]:
# The only child of <b> is an HTML comment; .string returns it as a Comment object.
# The parser is named explicitly (lxml, consistent with the other fragment cells)
# so the tree does not depend on which parsers are installed.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'lxml')
comment = soup.b.string
type(comment)

bs4.element.Comment

In [69]:
# Parse the same markup again and display the whole tree.
# Note that this rebinds `soup` to a new tree; a `comment` obtained from an
# earlier parse still belongs to that earlier tree.
# The parser is pinned to lxml, matching the <html><body> wrapping in the recorded output.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'lxml')
soup

<html><body><b><!--Hey, buddy. Want to buy a used parser?--></b></body></html>

In [70]:
# Pretty-printed, the comment is shown with its <!-- --> delimiters.
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [71]:
from bs4 import CData

In [72]:
cdata = CData("A CDATA block")
# NOTE: `comment` was taken from the tree parsed in In [68]; In [69] has since
# rebound `soup` to a freshly parsed tree. The replacement therefore happens in
# the old tree and the current `soup` is left unchanged (the complete, working
# sequence is in In [76]).
comment.replace_with(cdata)

'Hey, buddy. Want to buy a used parser?'

In [73]:
# Check the result of the replacement: the output still shows the comment,
# i.e. the tree currently bound to `soup` was not modified.
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [76]:
from bs4 import CData
# Complete sequence in a single cell: parse, grab the comment from that same
# tree, and replace it with a CDATA block, so the change is visible in `soup`.
# The parser is named explicitly (lxml, consistent with the other fragment cells)
# so the tree does not depend on which parsers are installed.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, 'lxml')
comment = soup.b.string
cdata = CData("A CDATA block")
comment.replace_with(cdata)
print(soup.b.prettify())

<b>
 <![CDATA[A CDATA block]]>
</b>


In [77]:
# `comment` still holds the original Comment object after it was replaced in the tree.
comment

'Hey, buddy. Want to buy a used parser?'

In [78]:
# Re-create the sample document and parse it again: `soup` was rebound to other
# markup in the preceding cells, and the navigation examples below need the
# original tree.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [79]:
# Navigate down by tag name: the <head> element.
soup.head

<head><title>The Dormouse's story</title></head>

In [80]:
# <title> is reachable directly from the document even though it is nested inside <head>.
soup.title

<title>The Dormouse's story</title>

In [81]:
# Tag-name attributes can be chained: the first <b> beneath <body>.
soup.body.b

<b>The Dormouse's story</b>

In [82]:
# The first <a> in the document.
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [83]:
# Keep a reference to <head> for the child-navigation examples below.
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [84]:
# .contents is the list of a tag's direct children; <head> has just one, the <title>.
head_tag.contents


[<title>The Dormouse's story</title>]

In [85]:
# The first (and only) child of <head>.
title_tag = head_tag.contents[0]

In [86]:
# Display it: this is the <title> tag.
title_tag

<title>The Dormouse's story</title>

In [87]:
# The document object has two direct children (shown in the next cells:
# the leading newline string and the <html> tag).
len(soup.contents)

2

In [88]:
# The two top-level children: a '\n' string followed by the <html> tag.
soup.contents

['\n', <html><head><title>The Dormouse's story</title></head>
 <body>
 <p class="title"><b>The Dormouse's story</b></p>
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 </body></html>]

In [89]:
# First child: the newline that precedes <html> in the source string.
soup.contents[0]

'\n'

In [90]:
# Second child: the <html> tag with everything inside it.
soup.contents[1]

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [91]:
# .children is an iterator over the direct children rather than a list,
# so displaying it only shows the iterator object.
title_tag.children

<list_iterator at 0x215856cd198>

In [92]:
# The tag whose children are iterated in the next cell.
title_tag

<title>The Dormouse's story</title>

In [93]:
# Iterate over the direct children of <title>: there is one, its text string.
for child in title_tag.children:
    print(child)

The Dormouse's story


In [94]:
# <head> has a single direct child...
head_tag.contents

[<title>The Dormouse's story</title>]

In [95]:
# ...but .descendants walks the whole subtree: first the <title> tag, then the string inside it.
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [96]:
# Display <head> again for reference.
head_tag

<head><title>The Dormouse's story</title></head>

In [97]:
# <title> has a single string child, which .string returns.
title_tag.string


"The Dormouse's story"

In [98]:
# .strings yields every text node in the document, including whitespace-only ones;
# repr() makes the newlines visible.
for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [99]:
# .stripped_strings skips whitespace-only strings and strips leading/trailing
# whitespace from the others; a newline in the middle of a string is kept.
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [100]:
# Start from <title> again for the upward-navigation examples.
title_tag = soup.title

In [101]:
# Display the starting tag.
title_tag

<title>The Dormouse's story</title>

In [110]:
# Three .parent hops from <title>: <head>, then <html>, then the document object,
# which displays as the whole document.
title_tag.parent.parent.parent


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [112]:
# A string's .parent is the tag that contains it.
title_tag.string.parent

<title>The Dormouse's story</title>

In [113]:
# The parent of the top-level <html> tag is the BeautifulSoup object itself.
html_tag = soup.html
type(html_tag.parent)

bs4.BeautifulSoup

In [114]:
# The BeautifulSoup object has no parent. print() is used because a bare None
# would produce no cell output.
print(soup.parent)

None


In [115]:
# Start from the first <a> for the .parents example.
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [116]:
# Walk upward from the first <a> to the top of the tree, printing each ancestor's name.
# .parents ends at the BeautifulSoup object ('[document]') and never yields None,
# so no None check is needed.
for parent in link.parents:
    print(parent.name)
    

p
body
html
[document]


In [117]:
# Small tree in which <b> and <c> are siblings under <a>.
# The stray closing </b> that followed <c> in the original literal has been dropped
# (it closed nothing), and the parser is pinned to lxml, matching the <html><body>
# wrapping in the recorded output.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'lxml')
print(sibling_soup.prettify())

<html>
 <body>
  <a>
   <b>
    text1
   </b>
   <c>
    text2
   </c>
  </a>
 </body>
</html>


In [123]:
# <b> and <c> share the parent <a>, so the previous sibling of <c> is <b>.
sibling_soup.c.previous_sibling

<b>text1</b>

In [124]:
# The text inside <b>.
sibling_soup.b.string

'text1'

In [125]:
# Back to the main document: the first <a>.
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [126]:
# The next sibling of the first <a> is the text that follows it (a comma and a
# newline), not the next <a> tag.
link.next_sibling

',\n'

In [127]:
# .next_siblings iterates over everything after the first <a> inside the same <p>,
# text nodes and tags alike.
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


In [128]:
# .previous_siblings walks backwards from the last <a> to the start of the <p>.
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


In [129]:
# Define the sample document and parse it once more, so that the search examples
# below start from a freshly parsed tree.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [130]:
# A string filter matches tags by exact name: only <b>, not <body>.
soup.find_all('b')

[<b>The Dormouse's story</b>]

In [131]:
import re
# A compiled regular expression is matched against tag names: "^b" selects the
# names that start with "b".
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b


In [132]:
# Without an anchor the pattern may match anywhere in the name: both "html" and
# "title" contain a "t".
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

html
title


In [138]:
# True matches every tag in the document; text strings are not included.
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


In [137]:
# A list filter matches any tag whose name is in the list; results come back in document order.
soup.find_all(["a", "b", "head"])

[<head><title>The Dormouse's story</title></head>,
 <b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]