# BeautifulSoup4 使用

### 获取tag文本

In [23]:
import re
from bs4 import BeautifulSoup
from pprint import pprint


html = """<table class="diff" id="difflib_chg_to0__top" cellspacing="0" cellpadding="0" rules="groups">
          <tbody>
            <tr>
                <td class="diff_next"><a href="#difflib_chg_to0__3">n</a></td>
                <td class="diff_header" id="from0_3099">3099</td>
                <td nowrap="nowrap">&lt;div&nbsp;id="overall-toast"&nbsp;class="overall-toast&nbsp;overall-toast-hide"&gt;&lt;span&gt;&lt;/span&gt;&lt;/div&gt;&lt;script&nbsp;src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=2018052<span class="diff_sub">3</span>10<span class="diff_chg">00</span>"&gt;&lt;/script&gt;</td>
                <td class="diff_next"><a href="#difflib_chg_to0__3">n</a></td>
                <td class="diff_header" id="to0_3107">3107</td>
                <td nowrap="nowrap">&lt;div&nbsp;id="overall-toast"&nbsp;class="overall-toast&nbsp;overall-toast-hide"&gt;&lt;span&gt;&lt;/span&gt;&lt;/div&gt;&lt;script&nbsp;src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805210<span class="diff_chg">942</span>"&gt;&lt;/script&gt;</td>
            </tr>
         </tbody>
         </table>"""

# Parse the diff-table markup above with the lxml backend (the lxml package must be installed).
soup = BeautifulSoup(html, 'lxml')

def handle_line_NO(tag):
    """Print the line numbers shown on each side of one diff-table row.

    Args:
        tag: An element exposing ``find_all`` (here a ``<tr>``); every
            ``<td class="diff_header">`` beneath it is inspected.

    Prints:
        A dict ``{'line_a': ..., 'line_b': ...}``. A header cell whose ``id``
        contains ``'from'`` supplies ``line_a``; otherwise a cell whose ``id``
        contains ``'to'`` supplies ``line_b``. A side with no matching cell
        keeps the default ``0``.
    """
    line_a, line_b = 0, 0
    for td in tag.find_all('td', class_='diff_header'):
        # A header cell may carry no id attribute at all (difflib.HtmlDiff
        # omits it on blank filler lines), so indexing td['id'] would raise
        # KeyError. Treat a missing id as "matches neither side".
        cell_id = td.get('id') or ''
        if 'from' in cell_id:
            line_a = td.get_text()
        elif 'to' in cell_id:
            line_b = td.get_text()
    print({'line_a': line_a, 'line_b': line_b})
    
def handle_raw_text(tag):
    """Print the complete text of the first two content cells of a diff row.

    Args:
        tag: An element exposing ``find_all`` (here a ``<tr>``); its
            ``<td nowrap="nowrap">`` cells hold the compared source lines.

    Prints:
        A dict ``{'text_a': ..., 'text_b': ...}`` in which each value is every
        string inside the cell (nested tags included) joined in document order.

    Raises:
        IndexError: if fewer than two such cells are found.
    """
    cell_texts = [
        ''.join(cell.strings)
        for cell in tag.find_all('td', attrs={'nowrap': 'nowrap'})
    ]
    text_a, text_b = cell_texts[0], cell_texts[1]
    print({'text_a': text_a, 'text_b': text_b})
    
def handle_change_text(tag):
    """Print the highlighted change fragments of the first two content cells.

    Args:
        tag: An element exposing ``find_all`` (here a ``<tr>``); the ``<span>``
            tags inside each ``<td nowrap="nowrap">`` mark the changed text.

    Prints:
        A dict ``{'change_a': [...], 'change_b': [...]}``. Each list holds one
        single-entry dict per span, mapping the span's first CSS class to the
        span's ``.string``.

    Raises:
        IndexError: if fewer than two such cells are found.
    """
    per_cell_changes = [
        [{span['class'][0]: span.string} for span in cell.find_all('span')]
        for cell in tag.find_all('td', attrs={'nowrap': 'nowrap'})
    ]
    print({'change_a': per_cell_changes[0], 'change_b': per_cell_changes[1]})

# Take the first table row and run each extractor over it in turn.
tr = soup.find('tr')
for handler in (handle_line_NO, handle_raw_text, handle_change_text):
    handler(tr)

# For comparison: the row's entire text via get_text(), split into lines.
row_lines = tr.get_text().splitlines()
pprint(row_lines)

{'line_a': '3099', 'line_b': '3107'}
{'text_a': '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0overall-toast-hide"><span></span></div><script\xa0src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805231000"></script>', 'text_b': '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0overall-toast-hide"><span></span></div><script\xa0src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805210942"></script>'}
{'change_a': [{'diff_sub': '3'}, {'diff_chg': '00'}], 'change_b': [{'diff_chg': '942'}]}
['',
 'n',
 '3099',
 '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0'
 'overall-toast-hide"><span></span></div><script\xa0'
 'src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805231000"></script>',
 'n',
 '3107',
 '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0'
 'overall-toast-hide"><span></span></div><script\xa0'
 'src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805210942"></script>']


### string, strings和get_text()

.string
如果tag只有一个 NavigableString 类型子节点,那么这个tag可以使用 .string 得到子节点
如果一个tag仅有一个子节点,那么这个tag也可以使用 .string 方法,输出结果与当前唯一子节点的 .string 结果相同
如果tag包含了多个子节点,tag就无法确定 .string 应该返回哪个子节点的内容,此时 .string 的结果是 None
如果tag中包含多个字符串,可以使用 .strings 来循环获取:
for string in soup.strings:
    print(repr(string))
    
输出的字符串中可能包含了很多空格或空行,使用 .stripped_strings 可以去除多余空白:
for string in soup.stripped_strings:
    print(repr(string))
    
get_text()
获取到tag中包含的所有文本内容(包括子孙tag中的内容),并将结果作为Unicode字符串返回

### 判断标签是否含有某个attr

In [3]:
from bs4 import BeautifulSoup

html = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<title>欢迎使用 得仕通</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<script type="text/javascript">
function doSubmit(){
	document.subform.action = "/customer/preLogin.do";
	document.subform.submit();
}
</script>
<body onload="doSubmit()"></body>
<form name="subform" method="post"></form>
</html>
"""


# Parse the page and look at its first <meta> tag.
soup = BeautifulSoup(html, 'lxml')
# Tip: soup.find_all(True) would return every tag in the document.
meta_tag = soup.meta
print(meta_tag.name, meta_tag.attrs, sep='\n')
# Tag.has_attr(name) performs the same membership test as `name in tag.attrs`.
if meta_tag.has_attr('content'):
    print('has content attr')


meta
{'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'}
has content attr


In [None]:
'''
BeautifulSoup 不仅支持 HTML 解析器,还支持一些第三方的解析器，如，lxml，XML，html5lib 但是需要安装相应的库。
要解析的文档类型: 目前支持, html, xml 和 html5
指定使用解析器: 目前支持, lxml, html5lib 和 html.parser
Beautiful Soup 将复杂 HTML 文档转换成一个复杂的树形结构,每个节点都是 Python 对象,
所有对象可以归纳为 4 种: Tag , NavigableString , BeautifulSoup , Comment.
Tag就是 HTML 中的一个个标签, Tag 有两个重要的属性: name 和 attrs, name 是标签的名字(如 'p', 'a')，attrs 是一个字典, 包含该标签的全部属性(如 class, id, href)。
NavigableString：获取标签内部的文字，如，soup.p.string。
BeautifulSoup：表示一个文档的全部内容。
Comment：Comment 对象是一个特殊类型的 NavigableString 对象，其输出的内容不包括注释符号.
'''
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# The BeautifulSoup constructor accepts either a string or an open file handle
# and returns an object representing the whole document. While parsing, the
# document is converted to Unicode and HTML entities become Unicode characters.
soup = BeautifulSoup(html_doc, features='html.parser')

# prettify() re-serialises the tree with one tag or string per line.
pretty_html = soup.prettify()
print(pretty_html)

<html>                                                                            
 <head>                                                                           
  <title>                                                                         
   The Dormouse's story                                                           
  </title>                                                                        
 </head>                                                                          
 <body>                                                                           
  <p class="title">                                                               
   <b>                                                                            
    The Dormouse's story                                                          
   </b>                                                                           
  </p>                                                                            
  <p class="story">                                                               
   Once upon a time there were three little sisters; and their names were         
   <a class="sister" href="http://example.com/elsie" id="link1">                  
    Elsie                                                                         
   </a>                                                                           
   ,                                                                              
   <a class="sister" href="http://example.com/lacie" id="link2">                  
    Lacie                                                                         
   </a>                                                                           
   and                                                                            
   <a class="sister" href="http://example.com/tillie" id="link3">                 
    Tillie                                                                        
   </a>                                                                           
   ; and they lived at the bottom of a well.                                      
  </p>                                                                            
  <p class="story">                                                               
   ...                                                                            
  </p>                                                                            
 </body>                                                                          
</html>                                                                           

# 返回整个title                                                                                  
In [21]: soup.title                                                               
Out[21]: <title>The Dormouse's story</title>                                      
# 返回标签名                                                                                  
In [22]: soup.title.name                                                          
Out[22]: 'title'                                                                  
# 返回标签内的文本                                                                                  
In [23]: soup.title.string                                                        
Out[23]: "The Dormouse's story"                                                   
# 返回父标签的名字                                                                                  
In [24]: soup.title.parent.name                                                   
Out[24]: 'head'                                                                   
                                                                                  
In [25]: soup.p                                                                   
Out[25]: <p class="title"><b>The Dormouse's story</b></p>                         
                                                                                  
In [26]: soup.p['class']                                                          
Out[26]: ['title']                                                                
                                                                                  
In [27]: soup.a                                                                   
Out[27]: <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>   
                                                                                  
In [28]: soup.find_all('a')                                                       
Out[28]:                                                                          
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,          
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,          
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]        
                                                                                  
In [29]: soup.find(id='link3')                                                    
Out[29]: <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> 