### 基本使用

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
from bs4 import BeautifulSoup

# Parse the markup above with the lxml parser.
soup = BeautifulSoup(html_doc, features='lxml')

# Print the re-indented document, then the text inside <title>.
# A newline separator produces the same output as two consecutive print() calls.
print(soup.prettify(), soup.title.string, sep='\n')

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story


#### 标签选择器

In [30]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p class="story">Once upon a time there were three little sisters; and their names were</p>
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, features='lxml')

# Selecting elements: attribute-style access (soup.<tag>) returns a Tag.
print(soup.title, type(soup.head), sep='\n')
print(soup.p)   # only the first matching <p> is printed

# Tag name
print(soup.title.name)

# Attribute value -- the two spellings below are equivalent
print(soup.p.attrs['class'])
print(soup.p['class'])

# Text content
print(soup.p.string)

# Nested selection
print(soup.head.title.string)

# Children and descendants.
# Note: in the markup above the first <p> is closed right after its sentence,
# so the only node it contains is that single piece of text.
print("-----------子节点和子孙节点---------------")
print(soup.p.contents)
print(soup.p.children)  # an iterator over the direct children
for numbered_child in enumerate(soup.p.children):
    print(*numbered_child)

print(soup.p.descendants)  # an iterator over all descendants
for numbered_descendant in enumerate(soup.p.descendants):
    print(*numbered_descendant)

# Parent and ancestors
print("-----------父节点和祖先节点-----------------")
print(soup.span.parent)
print(soup.span.parents)   # the ancestors, exposed as a generator

# Siblings of the first <a>: the following ones first, then the preceding ones
print("----------获取兄弟结点---------------")
for sibling_iter in (soup.a.next_siblings, soup.a.previous_siblings):
    print(list(enumerate(sibling_iter)))

<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
<p class="story">Once upon a time there were three little sisters; and their names were</p>
title
['story']
['story']
Once upon a time there were three little sisters; and their names were
The Dormouse's story
-----------子节点和子孙节点---------------
['Once upon a time there were three little sisters; and their names were']
<list_iterator object at 0x06CEB7F0>
0 Once upon a time there were three little sisters; and their names were
<generator object descendants at 0x06CE1480>
0 Once upon a time there were three little sisters; and their names were
-----------父节点和祖先节点-----------------
<a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
<generator object parents at 0x06CE14B0>
----------获取兄弟结点---------------
[(0, ',\n'), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' and\n'), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, ';\nand 

#### 标准选择器
##### find_all

In [41]:
html_doc = """
<div class="panel">
	<div class="panel-heading">
		<h4>hello</h4>
	</div>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</div>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

# Search by tag name. find_all() returns a list of Tag objects, and each
# Tag can be searched again with its own find_all().
ul_tags = soup.find_all('ul')   # query once, reuse for the three demos below
print(ul_tags)
print(type(ul_tags[0]))
for ul in ul_tags:
    print(ul.find_all('li'))

# Search by attributes
print("-----------attrs------------")
print(soup.find_all(attrs={'id':'list-1'}))
print(soup.find_all(attrs={'class':'element'}))
# Keyword shortcuts for the two most common attributes: id and class.
# `class` is a reserved word in Python, hence the trailing underscore.
print(soup.find_all(id="list-1"))
print(soup.find_all(class_='element'))

# Search by text content. `string` is the current name of this argument;
# `text` is the legacy spelling from before Beautiful Soup 4.4.0.
print(soup.find_all(string="Foo"))

print("-----------find------------")
# find() returns a single element instead of a list.
# Related search methods:
# find_parents()            find_parent()
# find_next_siblings()      find_next_sibling()
# find_previous_siblings()
# find_all_next()           find_next()
# find_all_previous()


[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<class 'bs4.element.Tag'>
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
-----------attrs------------
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
['Foo', 'Foo']
-----------find------------

#### CSS 选择器

In [52]:
html_doc = """
<div class="panel">
	<div class="panel-heading">
		<h4>hello</h4>
	</div>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</div>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, features='lxml')

# Run each CSS selector in turn: class inside class, tag inside tag,
# and an id nested under a class. select() returns a list of matches.
for css_query in (".panel .panel-heading", 'ul li', '.panel-body #list-1'):
    print(soup.select(css_query))

# Reading attributes
print('---------获取属性-------------')
ul_matches = soup.select('ul')
print(ul_matches[0]['id'])          # subscript form
print(ul_matches[1].attrs['id'])    # .attrs form

# Reading text
print("------------获取内容-----------")
# Only the first <li> is printed here; loop over the select() result
# with a for statement to get the text of every match.
first_li = soup.select('li')[0]
print(first_li.get_text())

[<div class="panel-heading">
<h4>hello</h4>
</div>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
---------获取属性-------------
list-1
list-2
------------获取内容-----------
Foo
