# Chapter 2

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url1 = 'http://www.pythonscraping.com/pages/warandpeace.html'
# Open the page inside a context manager so the HTTP response is closed
# as soon as its body has been read, instead of being left open.
with urlopen(url1) as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

In [3]:
# Use find_all to pull out the text that sits inside <span class="green"></span> tags
nameList = bs.find_all('span', attrs={'class': 'green'})
name_total = len(nameList)
print(f'Length of namelist: {name_total}')
for green_span in nameList:
    # get_text() yields only the text of the span, without markup
    print(green_span.get_text())

Length of namelist: 41
Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


In [4]:
# Collect every <h1> element, then show each one three ways:
# the raw tag, its text, and its Python type.
h1List = bs.find_all('h1')
for heading in h1List:
    # get_text() removes all HTML tags and keeps only the text inside them
    print(heading, heading.get_text(), type(heading), sep='\n')

<h1>War and Peace</h1>
War and Peace
<class 'bs4.element.Tag'>


In [5]:
# Inspect the container type that find_all() handed back
# (the cell output shows it is a bs4 ResultSet, not a plain list).
type(h1List)

bs4.element.ResultSet

## find()和find_all()

函数定义：
find_all(tag, attributes, recursive, string, limit, keywords)
find(tag, attributes, recursive, string, keywords)

In [6]:
# tag: either a single tag name or a list of tag names to match
heading_tags = [f'h{level}' for level in range(1, 7)]  # 'h1' .. 'h6'
titleList = bs.find_all(heading_tags)
titleList

[<h1>War and Peace</h1>, <h2>Chapter 1</h2>]

In [7]:
# attributes: a dict that maps attribute names of a tag to the values to match.
# Passing a list as the value matches spans of either class, as the output shows.
span_classes = ['green', 'red']
colorList = bs.find_all('span', attrs={'class': span_classes})
colorList

[<span class="red">Well, Prince, so Genoa and Lucca are now just family estates of the
 Buonapartes. But I warn you, if you don't tell me that this means war,
 if you still try to defend the infamies and horrors perpetrated by
 that Antichrist- I really believe he is Antichrist- I will have
 nothing more to do with you and you are no longer my friend, no longer
 my 'faithful slave,' as you call yourself! But how do you do? I see
 I have frightened you- sit down and tell me all the news.</span>,
 <span class="green">Anna
 Pavlovna Scherer</span>,
 <span class="green">Empress Marya
 Fedorovna</span>,
 <span class="green">Prince Vasili Kuragin</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">St. Petersburg</span>,
 <span class="red">If you have nothing better to do, Count [or Prince], and if the
 prospect of spending an evening with a poor invalid is not too
 terrible, I shall be very charmed to see you tonight between 7 and 10-
 Annette Scherer.</span>,
 <span clas

In [8]:
# recursive (bool): whether to search recursively (default True); rarely needs changing
# string: match on text content instead of on tags; in this cell the matches
# are only counted, not inspected
prince_text = 'the prince'
textList = bs.find_all(string=prince_text)
len(textList)

7

In [9]:
# limit: keep only the first k matches found in the HTML
# find() is the k=1 case of find_all()
max_matches = 3
limitList = bs.find_all('span', attrs={'class': 'green'}, limit=max_matches)
print(limitList)
# keyword (**kwargs): select tags that carry a given attribute
# (not recommended; the attributes dict can be used instead)

[<span class="green">Anna
Pavlovna Scherer</span>, <span class="green">Empress Marya
Fedorovna</span>, <span class="green">Prince Vasili Kuragin</span>]


## 导航树（Navigating Trees）

In [40]:
url2 = 'http://www.pythonscraping.com/pages/page3.html'  # mock online-shopping page
# Parse inside a context manager so the HTTP response is closed once the
# document has been read, instead of being left open.
with urlopen(url2) as html2:
    bs2 = BeautifulSoup(html2, 'html.parser')

# "Children" are the nodes one level below a tag; "descendants" are all nodes
# anywhere beneath it (children are a subset of descendants).
# Look the table up once and fail with a clear message if the page no longer
# contains it, rather than hitting an AttributeError on None.
gift_table = bs2.find('table', {'id': 'giftList'})
if gift_table is None:
    raise ValueError("no <table id='giftList'> found in " + url2)

# .children iterates over the direct children of the tag
# NOTE(review): the count presumably includes text nodes between tags as well
# as tags themselves — confirm against the bs4 documentation.
children_count = sum(1 for _ in gift_table.children)
print(f"children_count: {children_count}")

# .descendants iterates over everything nested under the tag, at any depth
descendant_count = sum(1 for _ in gift_table.descendants)
print(f"descendant_count: {descendant_count}")

children_count: 13
descendant_count: 86


In [43]:
# .children belongs to the Tag object (what .find() returns) and yields its
# items lazily; the output below shows it is a generator.
# The result of .find_all() is not a Tag, so .children cannot be used on it directly.
gift_table = bs2.find('table', {'id': 'giftList'})
print(type(gift_table.children))  # type: ignore
print(type(gift_table.descendants))  # type: ignore

<class 'generator'>
<class 'generator'>


In [51]:
# find_all() returns a ResultSet, which is used much like a list/set.
# Every element stored in it is a Tag, so .children works on the elements.
matching_tables = bs2.find_all('table', {'id': 'giftList'})
print('find_all():\t\t', type(matching_tables))
first_table = matching_tables[0]
print('find_all[0]:\t\t', type(first_table))
print('find_all[0].children:\t', type(first_table.children))  # type: ignore

# Count the direct children of the first (and only expected) matching table
children_count = sum(1 for _ in first_table.children)  # type: ignore
print(children_count)

find_all():		 <class 'bs4.element.ResultSet'>
find_all[0]:		 <class 'bs4.element.Tag'>
find_all[0].children:	 <class 'generator'>
13
