### 多进程分布式： 加速爬虫

In [1]:
import multiprocessing as hmp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://morvanzhou.github.io"
# Be polite: do not over-crawl a website, or you may get blocked from it.
# Crawling is restricted whenever base_url points anywhere other than the
# tutorial site itself.
restricted_crawl = base_url != "https://morvanzhou.github.io"

## Create a crawl function to open a url in parallel
### 创建一个抓取函数来并行打开网址

In [3]:
def crawl(url, delay=0.1):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch; handed unchanged to ``urlopen``.
        delay: Seconds to sleep after the connection is opened and before
            the body is read. Defaults to 0.1 so consecutive requests are
            slightly throttled.

    Returns:
        The response body decoded with the default codec of
        ``bytes.decode()`` (UTF-8).

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot retrieve ``url``.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response (and its underlying socket)
    # even if reading or decoding fails, so repeated calls do not leak
    # connections.
    with urlopen(url) as response:
        time.sleep(delay)   # slightly delay for download
        return response.read().decode()

## Create a parse function to find all results we need in parallel
### 创建一个解析函数来并行寻找我们所需要的结果

In [4]:
def parse(html):
    """Extract the title, internal links and canonical URL from one page.

    Args:
        html: Markup of a single page, as text.

    Returns:
        A ``(title, page_urls, url)`` tuple where

        * ``title`` is the stripped text of the first ``<h1>``, or ``''``
          when the page has no ``<h1>``;
        * ``page_urls`` is the set of absolute URLs obtained by joining
          ``base_url`` with every ``<a href>`` matching ``^/.+?/$``;
        * ``url`` is the ``content`` attribute of the
          ``<meta property="og:url">`` tag, or ``None`` when it is absent.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Keep only site-relative links that begin and end with a slash.
    anchors = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    page_urls = {urljoin(base_url, anchor['href']) for anchor in anchors}

    # find() returns None when nothing matches; guard before dereferencing so
    # a page without these tags does not abort the whole crawl.
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''

    og_url = soup.find('meta', {'property': "og:url"})
    url = og_url.get('content') if og_url is not None else None
    return title, page_urls, url
    

## Normal way
#### First, measure the speed without multiprocessing. We keep the URLs we have already seen and the ones we have not yet visited in two Python sets.

In [7]:
# URLs waiting to be fetched, and URLs that have already been fetched
unseen = {base_url}
seen = set()

count = 1            # running number printed in front of each page
t1 = time.time()     # start of the timed run

# Continue while there is something left to visit; when the crawl is
# restricted, stop once more than 20 URLs have been seen.
while unseen and not (restricted_crawl and len(seen) > 20):
    print('\nDistricted Crawling...')
    htmls = list(map(crawl, unseen))

    print('\nDistrcted Parsing...')
    results = list(map(parse, htmls))

    print('\nAnalysing...')
    seen |= unseen      # everything just crawled now counts as seen
    unseen.clear()      # the next round starts from an empty set

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen |= page_urls - seen   # queue only links not crawled yet
print('Total time: %.1f s' %(time.time()-t1, ))


Districted Crawling...

Distrcted Parsing...

Analysing...
1 教程 https://morvanzhou.github.io/

Districted Crawling...

Distrcted Parsing...

Analysing...
2 为了更优秀 https://morvanzhou.github.io/support/
3 高级爬虫: 让 Selenium 控制你的浏览器帮你爬 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-01-selenium/
4 Git 版本管理 教程系列 https://morvanzhou.github.io/tutorials/others/git/
5 Linux 简易教学 https://morvanzhou.github.io/tutorials/others/linux-basic/
6 迁移学习 Transfer Learning https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/5-16-transfer-learning/
7 迁移学习 Transfer Learning https://morvanzhou.github.io/tutorials/machine-learning/ML-intro/2-9-transfer-learning/
8 Pytorch 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/torch/
9 高级爬虫: 高效无忧的 Scrapy 爬虫库 https://morvanzhou.github.io/tutorials/data-manipulation/scraping/5-02-scrapy/
10 强化学习 Reinforcement Learning 教程系列 https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
11 Python基础 教程系列 https