# Chapter03 编写网络爬虫

六度分隔理论（英语：Six Degrees of Separation）认为世界上任何互不相识的两人，只需要很少的中间人就能够建立起联系。哈佛大学心理学教授斯坦利·米尔格拉姆于1967年根据这个概念做过一次连锁信实验，尝试证明平均只需要6步就可以联系任何两个互不相识的人。

## 遍历单个域名

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:SpecialPages
/wiki/Main_Page
/wiki/Special:Search
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Early_life_and_education
#Acting_career
#Early_work
#1980s
#1990s
#2000s
#2010s
#Other_ventu

为了能够在维基百科页面进行正确的跳转，我们需要观察那些指向词条页面（不是指向其他内部页面）的链接所具有的特征。
1. 都在id是bodyContent的div标签里（确保不包含侧边栏、页眉和页脚的链接）
2. URL不包含冒号
3. URL都以/wiki/开头
为此设计正则表达式：
    ^(/wiki/)((?!:).)*$

In [2]:
import re

# 先用.find找到id为bodyContent的正文内容
# 然后使用.find找到正文中和正则表达式匹配的项
for link in bs.find('div', {'id': 'bodyContent'}).find_all(
    'a', href=re.compile('^(/wiki/)((?!:).)*$')
):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Tribeca_Festival
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Leading_man
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/National_Lampoon%27s_Animal_House
/wiki/Diner_(1982_film)
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Frost/Nixon_(film)
/wiki/Friday_the_13th_(1980_film)
/wiki/Tremors_(1990_film)
/wiki/The_River_Wild
/wiki/Balto_(film)
/wiki/The_Woodsman_(2004_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Patriots_Day_(film)
/wiki/Losing_Chase
/wiki/Loverboy_(2005_film)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Miniseries_or_Television_Film
/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Male_Actor_in_a_Miniseries_or_Television_Movie
/wiki/Michael_Strobl
/wiki/HBO
/wiki/Taking_Chance
/wiki/Fox_Broadcasting_Company
/wiki/The_F

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import random
import time
import re

random.seed(time.time()) # 设定随机种子
def getLinks(articleUrl):
    """
    获取articleUrl下所有词条页面链接
    """
    html = urlopen(f'http://en.wikipedia.org{articleUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id': 'bodyContent'}).find_all(
        'a', href=re.compile('^(/wiki/)((?!:).)*$'))

links = getLinks('/wiki/Kevin_Bacon')
print(len(links))

cnt = 0
# 不断遍历wiki百科词条（未进行数据处理，因此引入cnt以确保能够退出）
while len(links) > 0 and cnt < 5:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)
    cnt += 1

476
/wiki/Giffoni_Film_Festival
/wiki/France
/wiki/Secularism_in_France
/wiki/Radical_Whigs
/wiki/Classical_radicalism


## 3.2 抓取整个网站

In [4]:
# 爬取整个网站的信息
pages = set() # 使用集合存储网页，避免重复
def getLinks(pageUrl):
    # 获取pageUrl网站所有链接网址
    global pages
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                if random.randint(0, 1) == 0: # 随机跳出避免持续运行
                    return
                getLinks(newPage) # 使用递归获取所有链接
getLinks('')

/wiki/Main_Page


In [5]:
pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen(f'http://en.wikipedia.org{pageUrl}')
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print('h1标题：', bs.h1.get_text()) # 获取标题
        print('正文第一段：', bs.find(id='mw-content-text').find_all('p')[0]) # 获取正文第一段
        print('修改按钮链接：', bs.find(id='ca-edit').find('a').attrs['href']) # 获取修改按钮链接（与原书有修改）
    except:
        print("页面缺少信息")

    # for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
    #     if 'href' in link.attrs:
    #         if link.attrs['href'] not in pages:
    #             newPage = link.attrs['href']
    #             print('-'*20)
    #             print(newPage)
    #             pages.add(newPage)
    #             getLinks(newPage)

getLinks('/wiki/Awards_ceremony')

h1标题： Awards ceremony
正文第一段： <p>
An <b>awards ceremony</b> is a type of <a href="/wiki/Ceremony" title="Ceremony">ceremony</a> where <a href="/wiki/Award" title="Award">awards</a> are given out. The ceremony may be arranged by a government organization, a society, a school, a trade association or even a company that specializes in running awards ceremonies. Typically a <a href="/wiki/Master_of_ceremonies" title="Master of ceremonies">master of ceremonies</a> presents award winners, speaks to the audience, entertains people, and generally keeps the ceremony moving.
</p>
修改按钮链接： /w/index.php?title=Awards_ceremony&action=edit


##  在互联网上抓取

In [6]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import time
import random

pages = set()
random.seed(time.time())
url = 'http://www.oreilly.com'
# url = 'http://www.baidu.com'

# 获取页面中所有内链的列表
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme,
                                    urlparse(includeUrl).netloc)
    # print(includeUrl)
    internalLinks = []
    # 找出所有以“/”开头的链接
    for link in bs.find_all('a',
                            href=re.compile(f'^(/|{includeUrl}.*)')):
        # 此处的正则表达式相较原书有修改
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks: # 避免重复保存
                if (link.attrs['href'].startswith('/')):    # 组合链接
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

In [7]:
# 获取页面中所有外链的列表
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    # 找出所有以http和www开头且不包含当前URL的链接
    for link in bs.find_all('a',
                            href=re.compile(f'^(http|www)((?!{excludeUrl}).)*$')):
        if (href:=link.attrs['href']) is not None:
            if href not in externalLinks:
                externalLinks.append(href)
    return externalLinks

In [8]:
# 测试
url = 'http://www.baidu.com'
html = urlopen(url)
bs = BeautifulSoup(html, 'html.parser')

internalLinks = getInternalLinks(bs, url)
print('内链数量', len(internalLinks))
print(internalLinks)

externalLinks = getExternalLinks(bs, url)
print('外链数量', len(externalLinks))
print(externalLinks)

内链数量 17
['http://www.baidu.com//news.baidu.com/', 'http://www.baidu.com//www.hao123.com/', 'http://www.baidu.com//map.baidu.com/', 'http://www.baidu.com//live.baidu.com/', 'http://www.baidu.com//haokan.baidu.com/?sfrom=baidu-top', 'http://www.baidu.com//tieba.baidu.com/', 'http://www.baidu.com//xueshu.baidu.com/', 'http://www.baidu.com//www.baidu.com/more/', 'http://www.baidu.com//www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1', 'http://www.baidu.com//www.baidu.com/', 'http://www.baidu.com//home.baidu.com/', 'http://www.baidu.com//ir.baidu.com/', 'http://www.baidu.com//www.baidu.com/duty', 'http://www.baidu.com//help.baidu.com/', 'http://www.baidu.com//www.beian.gov.cn/portal/registerSystemInfo?recordcode=11000002000001', 'http://www.baidu.com//beian.miit.gov.cn/', 'http://www.baidu.com//www.baidu.com/licence/']
外链数量 11
['https://top.baidu.com/board?platform=pc&sa=pcindex_entry', 'https://www.baidu.com/s?wd=%E6%80%BB%E4%B9%A6%E8%AE%B0%E7%9A%

In [9]:
def getRandomExternalLinks(startingPage):
    # 随机返回该网站的一个外链，若startingPage中无外链，则在该网站其他内链页面中寻找
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(startingPage).netloc) # 获取外链列表
    if len(externalLinks) == 0:
        # 若该页面无外链则递归地在该网站其他内链页面中寻找
        print('无外链')
        domain = '{}://{}'.format(urlparse(startingPage).scheme,
                                    urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)    # 获取内链列表
        return getRandomExternalLinks(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

In [11]:
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLinks(startingSite)
    print("Random external link is:", externalLink)
    followExternalOnly(str(externalLink)) # 递归调用，一直顺着外链“爬行”
    
followExternalOnly('http://oreilly.com')

Random external link is: https://itunes.apple.com/us/app/safari-to-go/id881697395
Random external link is: https://www.apple.com/iphone/
Random external link is: https://www.goldmansachs.com/terms-and-conditions/Apple-Card-Customer-Agreement.pdf
无外链


ValueError: empty range for randrange() (0, 0, 0)

## 附：urllib.parse解析

In [None]:
from urllib.parse import urlparse

url='http://user:pwd@domain:80/path;params?query=queryarg#fragment'

parsed_result=urlparse(url)

print('parsed_result 包含了',len(parsed_result),'个元素')
print(parsed_result)


parsed_result 包含了 6 个元素
ParseResult(scheme='http', netloc='user:pwd@domain:80', path='/path', params='params', query='query=queryarg', fragment='fragment')


In [None]:
print('scheme  :', parsed_result.scheme)
print('netloc  :', parsed_result.netloc)
print('path    :', parsed_result.path)
print('params  :', parsed_result.params)
print('query   :', parsed_result.query)
print('fragment:', parsed_result.fragment)
print('username:', parsed_result.username)
print('password:', parsed_result.password)
print('hostname:', parsed_result.hostname)
print('port    :', parsed_result.port)


scheme  : http
netloc  : user:pwd@domain:80
path    : /path
params  : params
query   : query=queryarg
fragment: fragment
username: user
password: pwd
hostname: domain
port    : 80
