# BeautifulSoup

In [25]:
from bs4 import BeautifulSoup
import re

In [2]:
# Sample HTML document parsed by the cells below: a <title>, a "title"
# paragraph wrapping a <b> tag, and two "story" paragraphs, the first of which
# holds three <a class="sister"> links (the first link contains only a comment).
html = """
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
"""

基础部分

In [3]:
# Parse the sample document with the lxml parser; later cells reuse `soup`.
soup = BeautifulSoup(html, 'lxml')
# Emit the three values one per line, exactly as three separate prints would.
print(
    soup.title.string.strip(),  # .string gives the text held inside the tag
    soup.p.name,                # .name gives the tag's name
    soup.p.attrs,               # .attrs gives the tag's attribute dict
    sep='\n',
)

The Dormouse's story
p
{'class': ['title'], 'name': 'dromouse'}


嵌套选择

In [4]:
print(soup.head.title.string.strip()) # chaining tag1.tag2 selects a node in the level below tag1

The Dormouse's story


关联选择(直接获取所选元素的子节点)

In [8]:
# .contents returns the direct children of the first <p> as a list
# (whitespace text nodes are included alongside the <b> tag).
print(soup.p.contents)

['\n', <b>
    The Dormouse's story
   </b>, '\n']


In [15]:
# .children is an iterator, so printing it shows only the iterator object.
print(soup.body.children)
# Walk the direct children of <body>; each child (whitespace text nodes
# included) is followed by a divider line.
for child in soup.body.children:
    print(child, '---------', sep='\n')

<list_iterator object at 0xaddcbe70>


---------
<p class="title" name="dromouse">
<b>
    The Dormouse's story
   </b>
</p>
---------


---------
<p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
---------


---------
<p class="story">
   ...
  </p>
---------


---------


### find_all: `find_all(name, attrs, recursive, text, **kwargs)`

(1)name

In [20]:
# The first argument of find_all (name) is the tag type to search for.
print(soup.find_all('b'))

[<b>
    The Dormouse's story
   </b>]


(2)attrs

In [22]:
# Select tags by their attributes, given as an {attribute: value} dict.
print(soup.find_all(attrs={'class': 'sister'}))

[<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>, <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>, <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>]


In [24]:
# The attribute name with a trailing underscore (class_) works as a keyword
# argument and selects the same tags as the attrs dict form.
print(soup.find_all(class_='sister'))

[<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>, <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>, <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>]


(3)text

In [41]:
# Match on a tag attribute: <a> tags whose href satisfies the compiled regex.
print(soup.find_all(name='a', attrs={'href': re.compile(r'http://(.*?)')}))

[<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>, <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>, <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>]


In [42]:
# Match on tag contents: return the text nodes that satisfy the regex.
# `string` is the current name of this filter; `text` is its legacy alias
# (renamed in Beautiful Soup 4.4.0), so the current name is used here.
print(soup.find_all(string=re.compile('Dormouse')))

["\n   The Dormouse's story\n  ", "\n    The Dormouse's story\n   "]


## CSS选择器

In [44]:
print(soup.select('.sister')) # select every tag whose class is "sister"

[<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>, <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>, <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>]
