# BeautifulSoup

In [2]:
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

In [3]:
# Download the Baidu home page and parse it. BeautifulSoup reads the whole
# response while constructing the tree, so the connection can be released as
# soon as the constructor returns; the `with` block guarantees that.
with urllib.request.urlopen("http://www.baidu.com") as html:
    bs_obj = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
print("title tag: ", bs_obj.title)

title tag:  <title>百度一下，你就知道</title>


* 查找节点

In [5]:
# Sample markup used by the node-lookup cells in this notebook.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Build the parse tree from the string above.
bs_obj = BeautifulSoup(html_doc, features='html.parser')

* 按类型查找节点

In [6]:
# Look up nodes by tag name: collect every <a> element and show its
# tag name, href attribute and text.
print('1. 提取所有链接')
link_list = bs_obj.find_all('a')
for link in link_list:
    print(f"{link.name} {link['href']} {link.get_text()}")

1. 提取所有链接
a http://example.com/elsie Elsie
a http://example.com/lacie Lacie
a http://example.com/tillie Tillie


* 按属性查找节点

In [7]:
# Look up a single node by attribute: the <a> element whose id is "link2".
print('2. 提取一条链接')
link = bs_obj.find('a', attrs={'id': 'link2'})
print(f"{link.name} {link['href']} {link.get_text()}")

2. 提取一条链接
a http://example.com/lacie Lacie


In [8]:
# Look up a node by CSS class. find() stops at the first match, so only one
# of the "sister" links is reported.
print('3. 再提取一条链接')
link = bs_obj.find('a', attrs={'class': 'sister'})
print(f"{link.name} {link['href']} {link.get_text()}")

3. 再提取一条链接
a http://example.com/elsie Elsie


* 异常处理

In [12]:
# Handling server and URL errors.
# urlopen signals an unreachable host with URLError; HTTPError (an error
# status returned by the server) is a subclass, so one handler covers both.
# Anything else is a programming error and should not be silenced here.
try:
    html = urllib.request.urlopen("http://www.wandu.com")
except urllib.error.URLError as e:
    print(e)
else:
    # Only reachability is being demonstrated; release the connection.
    html.close()

<urlopen error [Errno 11004] getaddrinfo failed>


In [13]:
# Handling access to tags that do not exist.
# Asking for a missing child tag yields None rather than raising, so a chain
# through a missing tag ends up as attribute access on None -> AttributeError.
try:
    tag_content = bs_obj.title.yy_tag
    # Alternative that exercises the except branch instead:
    # tag_content = bs_obj.xx_tag.yy_tag
except AttributeError:
    print("xx_tag 标签不存在")
else:
    print("yy_tag 标签不存在" if tag_content is None else tag_content)

yy_tag 标签不存在


In [14]:
# 创建一个完整的函数处理title

def get_html_title(url):
    """Fetch the page at ``url`` and return its ``<title>`` tag.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The ``title`` attribute of the parsed document (the title tag
        element, not just its text), or ``None`` when the page cannot be
        opened, read or parsed, or when the document has no title.
    """
    try:
        # The context manager closes the HTTP response even when read()
        # fails, so no connection is left open.
        with urllib.request.urlopen(url) as response:
            markup = response.read()
    except Exception:
        # Best-effort helper: any failure to fetch is reported as None.
        return None

    try:
        return BeautifulSoup(markup, 'html.parser').title
    except Exception:
        return None

# Try the helper on a live site; it returns None on failure, so fall back
# to a failure message in that case.
title = get_html_title("http://www.taobao.com")
print("Title获取失败！" if title is None else title)

<title>淘宝海外全球站 - 购物首选，淘你喜欢！</title>
