# BeautifulSoup4 使用

### 获取tag文本

In [23]:
import re
from bs4 import BeautifulSoup
from pprint import pprint


# Sample input: a single-row diff table. Judging by the "difflib_chg_*" ids
# this looks like output of difflib.HtmlDiff -- TODO confirm the producer.
# For each side the row carries a "diff_next" navigation cell, a "diff_header"
# line-number cell (id prefixed "from"/"to") and a nowrap cell holding the
# HTML-escaped source line, with changed fragments wrapped in
# <span class="diff_*">.
html = """<table class="diff" id="difflib_chg_to0__top" cellspacing="0" cellpadding="0" rules="groups">
          <tbody>
            <tr>
                <td class="diff_next"><a href="#difflib_chg_to0__3">n</a></td>
                <td class="diff_header" id="from0_3099">3099</td>
                <td nowrap="nowrap">&lt;div&nbsp;id="overall-toast"&nbsp;class="overall-toast&nbsp;overall-toast-hide"&gt;&lt;span&gt;&lt;/span&gt;&lt;/div&gt;&lt;script&nbsp;src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=2018052<span class="diff_sub">3</span>10<span class="diff_chg">00</span>"&gt;&lt;/script&gt;</td>
                <td class="diff_next"><a href="#difflib_chg_to0__3">n</a></td>
                <td class="diff_header" id="to0_3107">3107</td>
                <td nowrap="nowrap">&lt;div&nbsp;id="overall-toast"&nbsp;class="overall-toast&nbsp;overall-toast-hide"&gt;&lt;span&gt;&lt;/span&gt;&lt;/div&gt;&lt;script&nbsp;src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805210<span class="diff_chg">942</span>"&gt;&lt;/script&gt;</td>
            </tr>
         </tbody>
         </table>"""

# Parse the snippet with the 'lxml' parser backend.
soup = BeautifulSoup(html, 'lxml')

def handle_line_NO(tag):
    """Print the line numbers found in the ``diff_header`` cells of ``tag``.

    Every ``<td class="diff_header">`` below ``tag`` is inspected: a cell
    whose ``id`` contains ``'from'`` supplies ``line_a``; otherwise a cell
    whose ``id`` contains ``'to'`` supplies ``line_b``.  A side with no
    matching cell keeps the default ``0``.

    Args:
        tag: A BeautifulSoup element (for example one ``<tr>``) to search.

    Prints:
        A dict of the form ``{'line_a': ..., 'line_b': ...}``.
    """
    line_a, line_b = 0, 0
    for td in tag.find_all('td', class_='diff_header'):
        # difflib leaves the id attribute off the header cell of a blank
        # filler line, so read it with a default; td['id'] would raise
        # KeyError on such rows.
        cell_id = td.get('id', '')
        if 'from' in cell_id:
            line_a = td.get_text()
        elif 'to' in cell_id:
            line_b = td.get_text()
    print({'line_a': line_a, 'line_b': line_b})
    
def handle_raw_text(tag):
    """Print the full text of the first two ``nowrap`` cells of ``tag``.

    For each ``<td nowrap="nowrap">`` below ``tag`` all of its strings
    (including those inside nested tags) are concatenated in order.  The
    first cell is reported as ``text_a`` and the second as ``text_b``; a
    missing cell is reported as an empty string.

    Args:
        tag: A BeautifulSoup element (for example one ``<tr>``) to search.

    Prints:
        A dict of the form ``{'text_a': ..., 'text_b': ...}``.
    """
    raw_text_list = [
        ''.join(td.strings)
        for td in tag.find_all('td', attrs={'nowrap': 'nowrap'})
    ]
    # Pad to two entries so a row with fewer than two content cells prints
    # empty text instead of raising IndexError below.
    raw_text_list += [''] * (2 - len(raw_text_list))
    print({'text_a': raw_text_list[0], 'text_b': raw_text_list[1]})
    
def handle_change_text(tag):
    """Print the highlighted fragments of the first two ``nowrap`` cells.

    For each ``<td nowrap="nowrap">`` below ``tag`` every ``<span>`` is
    turned into a one-item dict mapping the span's first CSS class to its
    ``.string``.  The first cell's list is reported as ``change_a`` and the
    second as ``change_b``; a missing cell is reported as an empty list.

    Assumes every ``<span>`` carries a ``class`` attribute, as in the sample
    markup; a class-less span still raises ``KeyError``.

    Args:
        tag: A BeautifulSoup element (for example one ``<tr>``) to search.

    Prints:
        A dict of the form ``{'change_a': [...], 'change_b': [...]}``.
    """
    change_list = [
        [{span['class'][0]: span.string} for span in td.find_all('span')]
        for td in tag.find_all('td', attrs={'nowrap': 'nowrap'})
    ]
    # Pad to two entries so a row with fewer than two content cells prints
    # empty change lists instead of raising IndexError below.
    change_list += [[] for _ in range(2 - len(change_list))]
    print({'change_a': change_list[0], 'change_b': change_list[1]})

# Take the first <tr> of the parsed document and run each handler on it,
# in the same order: line numbers, raw text, then highlighted changes.
tr = soup.find('tr')
for handler in (handle_line_NO, handle_raw_text, handle_change_text):
    handler(tr)

# For comparison, dump the row's entire text split into lines.
row_lines = tr.get_text().splitlines()
pprint(row_lines)

{'line_a': '3099', 'line_b': '3107'}
{'text_a': '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0overall-toast-hide"><span></span></div><script\xa0src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805231000"></script>', 'text_b': '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0overall-toast-hide"><span></span></div><script\xa0src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805210942"></script>'}
{'change_a': [{'diff_sub': '3'}, {'diff_chg': '00'}], 'change_b': [{'diff_chg': '942'}]}
['',
 'n',
 '3099',
 '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0'
 'overall-toast-hide"><span></span></div><script\xa0'
 'src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805231000"></script>',
 'n',
 '3107',
 '<div\xa0id="overall-toast"\xa0class="overall-toast\xa0'
 'overall-toast-hide"><span></span></div><script\xa0'
 'src="//passport.baidu.com/passApi/js/uni_login_wrapper.js?cdnversion=201805210942"></script>']


### string, strings和get_text()

.string
如果tag只有一个 NavigableString 类型子节点,那么这个tag可以使用 .string 得到子节点
如果一个tag仅有一个子节点,那么这个tag也可以使用 .string 方法,输出结果与当前唯一子节点的 .string 结果相同
如果tag包含了多个子节点,tag就无法确定 .string 方法应该调用哪个子节点的内容,此时 .string 的输出结果是 None
如果tag中包含多个字符串,可以使用 .strings 来循环获取:
for string in soup.strings:
    print(repr(string))
    
输出的字符串中可能包含了很多空格或空行,使用 .stripped_strings 可以去除多余空白:
for string in soup.stripped_strings:
    print(repr(string))
    
get_text()
获取到tag中包含的所有文本内容,包括子孙tag中的内容,并将结果作为Unicode字符串返回