# 全站爬虫

### 爬虫引擎介绍

* 爬取指定站点所有链接及html,支持过滤
* 使用gevent和requests来实现协程异步高并发
* 使用生产者-消费者模式
* 使用mongodb作数据存储
* 性能: 约每分钟500+次请求
* 配置灵活

### 功能介绍

1. 支持总量控制
2. 支持深度控制
3. 支持cookies
4. 支持代理proxy
5. 支持GET和POST方式请求
6. 支持允许域名限制,支持域名通配符匹配
7. 支持关键字排除特定url
8. 支持HTML的content-length过滤
9. 自动过滤重复url
10. 支持对重定向历史(history)响应的爬取、解析和过滤

### 配置参数介绍

配置参数是一个dict, 默认参数如下(可更改):
```python
# Crawler parameters
PARAMS = {
    # Start URL
    'start_url': 'https://www.baidu.com',
    'start_url_request_method': 'GET',
    # URL (client) type
    'end_type': 'PC',
    # Crawl depth; if depth is not positive, then no depth limit
    'depth': -1,
    # Content-Length limit for a single HTML page: 1 MiB
    'content_length': 1 * 1024 * 1024,
    # Total number of URLs to crawl for the whole site; if amount is not
    # positive, then no amount limit to urls
    'amount': 100,
    # Network request timeout
    'timeout': 5,
    # Timeout when taking an item from a queue
    'queue_timeout': 1,
    # Crawl delay
    'delay': -1,  # not supported yet
    # Queue size
    'queue_size': 500,
    # Allowed domains
    'allowed_domains': ['www.baidu.com'],
    # Keywords that exclude a URL
    'exclude_keywords': [],
    # Cookies to send
    'cookies': {}
}
```

### 如何使用

In [None]:
from crawler.allsitespider import AllSiteSpider

def crawl_all(params=None):
    """Crawl a whole site using the given configuration.

    Args:
        params: Optional dict of configuration values; it is handed
            unchanged to the spider constructor.
    """
    # The only spider class in scope in this snippet is AllSiteSpider;
    # the bare name ``Spider`` would raise NameError here.
    allsite_spider = AllSiteSpider(params)
    allsite_spider.crawl()

### 代码实现

In [None]:
# -*- coding:utf-8 -*-
import logging
import logging.handlers
import os
import re
import time
from fnmatch import fnmatch
from html import unescape
from pprint import pprint
from urllib.parse import urlparse, urljoin

import gevent
import requests
from bs4 import BeautifulSoup
from gevent import monkey, pool
from gevent.queue import Queue
from pymongo import MongoClient
# Apply gevent's socket monkey patch (done after all imports above).
monkey.patch_socket()

# Crawler parameters (defaults; Spider merges caller overrides on top)
PARAMS = {
    # Start URL
    'start_url': 'https://www.baidu.com',
    # NOTE(review): not read by the code visible in this file - confirm usage
    'start_url_request_method': 'GET',
    # URL (client) type; copied into every url item as 'end_type'
    'end_type': 'PC',
    # Concurrency
    # NOTE(review): Spider.crawl uses a fixed pool size and does not read this
    'concurrency': 5,
    # Crawl depth; if depth is not positive, then no depth limit
    'depth': -1,
    # Content-Length limit for a single HTML page: 1 MiB
    'content_length': 1 * 1024 * 1024,
    # Total number of URLs to crawl for the whole site; if amount is not
    # positive, then no amount limit to urls
    'amount': 500,
    # Network request timeout (passed to requests as ``timeout``)
    'timeout': 5,
    # Timeout when taking an item from a queue
    'queue_timeout': 1,
    # Crawl delay
    # NOTE(review): not read by the code visible in this file
    'delay': -1,
    # Queue size (maxsize of both the URL queue and the response queue)
    'queue_size': 500,
    # Allowed domains (matched with fnmatch patterns)
    'allowed_domains': ['www.baidu.com'],
    # Keywords that exclude a URL when they occur in it
    'exclude_keywords': [],
    # Cookies sent with every request
    'cookies': {}
}

# HTTP request headers sent with downloads
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate'
}


# MongoDB connection parameters (placeholder values)
MONGODB = {
    "user": "xxxx",
    "passwd": "xxxx",
    "host": "127.0.0.1:27017",
    "dbname": "xxxxx"
}


def init_root_logger_settings(log_name='spiders', logConsole=True):
    """Configure the root logger with a daily-rotating file handler.

    The log file lives in a ``logs`` directory located in the parent of
    the directory that contains this file. Calling the function again
    with the same arguments does not attach duplicate handlers.

    Args:
        log_name: Base file name of the log file inside the log directory.
        logConsole: When true, also attach a console (stream) handler.
    """
    log_format = "%(asctime)s [%(levelname)s] [%(filename)s] [%(lineno)d]: %(message)s"
    log_dir = os.path.join(os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))), "logs")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_name)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt=log_format, datefmt="%m/%d/%Y %H:%M:%S")

    # Handlers are tagged with a name so that a repeated call can recognise
    # the ones it already installed instead of emitting every record twice.
    installed = {handler.get_name() for handler in root_logger.handlers}

    file_handler_name = 'spider-file:{}'.format(log_path)
    if file_handler_name not in installed:
        fh = logging.handlers.TimedRotatingFileHandler(
            filename=log_path, when='midnight', interval=1, encoding='utf-8')
        fh.set_name(file_handler_name)
        fh.setLevel(logging.INFO)
        fh.suffix = "%Y-%m-%d.log"
        fh.setFormatter(formatter)
        root_logger.addHandler(fh)

    console_handler_name = 'spider-console'
    if logConsole and console_handler_name not in installed:
        ch = logging.StreamHandler()
        ch.set_name(console_handler_name)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        root_logger.addHandler(ch)


def connect_mongo(MONGODB):
    """Open a MongoDB connection and return the configured database.

    Args:
        MONGODB: Mapping with the keys ``user``, ``passwd``, ``host`` and
            ``dbname``.

    Returns:
        The database object ``client[MONGODB['dbname']]``.
    """
    from urllib.parse import quote_plus

    # Credentials are percent-escaped so characters such as '@', ':' or '/'
    # in a user name or password cannot corrupt the connection URI.
    uri = 'mongodb://{}:{}@{}/{}'.format(quote_plus(str(MONGODB['user'])),
                                         quote_plus(str(MONGODB['passwd'])),
                                         MONGODB['host'],
                                         MONGODB['dbname'])
    client = MongoClient(uri)
    return client[MONGODB['dbname']]


class Spider(object):
    """Crawl coordinator built on gevent greenlets and two bounded queues.

    ``urlQ`` holds url items waiting to be downloaded; ``respQ`` holds
    downloaded responses waiting to be parsed and stored.
    """

    def __init__(self, params):
        """Build the crawler components.

        Args:
            params: Dict of overrides merged over the module defaults, or
                ``None`` to use the defaults unchanged.

        Raises:
            TypeError: If ``params`` is neither a dict nor ``None``.
        """
        if params is not None and not isinstance(params, dict):
            raise TypeError('params must be a dict or None, got {}'.format(
                type(params).__name__))
        self.init_spider_params(params)
        self.manager = UrlManager(self.params)
        self.downloader = HtmlDownloader(self.params)
        self.parser = HtmlParser(self.params)
        self.data_item = DataItem(self.params)
        self.urlQ = Queue(maxsize=self.params.get('queue_size'))
        self.respQ = Queue(maxsize=self.params.get('queue_size'))
        self.request_num = 0
        self.response_num = 0
        # Number of tasks that have taken an item off a queue and are still
        # working on it (downloading, parsing or saving).
        self._active = 0

    def init_spider_params(self, params):
        """Merge ``params`` over a copy of the module-level defaults."""
        # Copy first: updating PARAMS in place would leak one spider's
        # overrides into every spider created afterwards.
        self.params = dict(PARAMS)
        if params:
            self.params.update(params)

    def _is_idle(self):
        """Return True when both queues are empty and no task is mid-work."""
        return self.urlQ.empty() and self.respQ.empty() and self._active == 0

    def _stop_if_idle(self):
        """Flag the parser to stop once there is no work left anywhere."""
        if self._is_idle():
            self.parser.stop_parse = True

    def is_running(self):
        """Return False only after parsing was stopped and all work drained."""
        return not (self.parser.stop_parse and self._is_idle())

    def init_task(self):
        """Seed the url queue with the configured start url."""
        start_url = self.params.get('start_url')
        url_item = self.manager.patch_url(start_url)
        self.parser.all_urls.add(url_item.get('url'))
        self.urlQ.put_nowait(url_item)

    def consume_task(self):
        """Take one url item, download it and queue the responses."""
        try:
            url_item = self.urlQ.get(timeout=self.params.get('queue_timeout'))
        except Exception:
            # Nothing arrived within the timeout. Only stop the crawl when
            # no other task is still working; a download in progress may
            # yet produce responses and, from them, new urls.
            self._stop_if_idle()
            return
        self._active += 1
        try:
            self.request_num += 1
            resp_list = self.downloader.get_response(url_item)
            for response in resp_list:
                self.respQ.put_nowait(response)
        except Exception as e:
            # put_nowait raises when respQ is full; the remaining responses
            # of this url are dropped, so make that visible.
            logging.warning('Consume task failed: {!r}'.format(e))
        finally:
            self._active -= 1
        self._stop_if_idle()

    def produce_task(self):
        """Take one response, queue the urls parsed from it and store it."""
        try:
            response = self.respQ.get(timeout=self.params.get('queue_timeout'))
        except Exception:
            self._stop_if_idle()
            return
        self._active += 1
        try:
            self.response_num += 1
            url_items = self.parser.parse(response)
            for url_item in url_items:
                if self.parser.stop_parse:
                    break
                self.urlQ.put_nowait(url_item)
            self.data_item.save(response)
        except Exception as e:
            logging.warning('Produce task failed: {!r}'.format(e))
        finally:
            self._active -= 1
        self._stop_if_idle()

    def crawl(self):
        """Run the crawl until no work is left, then log summary counters."""
        # NOTE(review): the pool size is fixed at 50; the 'concurrency'
        # parameter is not consulted here.
        tp = pool.Pool(50)
        logging.info('Spider start crawl.')
        logging.info('Init first task.')
        tp.add(gevent.spawn(self.init_task))
        while self.is_running():
            try:
                pending_urls = self.urlQ.qsize()
                pending_resps = self.respQ.qsize()
                # Always one consumer and one producer; the longer queue
                # gets one extra worker of the matching kind.
                jobs = [self.consume_task]
                if pending_urls > pending_resps:
                    jobs.append(self.consume_task)
                jobs.append(self.produce_task)
                if pending_urls < pending_resps:
                    jobs.append(self.produce_task)
                for job in jobs:
                    tp.add(gevent.spawn(job))
            except Exception as e:
                logging.error('corutine error: {}'.format(e))
        tp.join()
        logging.info('Request num: {}.'.format(self.request_num))
        logging.info('Response num: {}.'.format(self.response_num))
        logging.info('History num: {}.'.format(self.downloader.history_num))
        logging.info('Abstract urls: {}.'.format(self.parser.all_abstract_urls))
        logging.info('Filters urls: {}.'.format(self.parser.all_passed_urls))
        logging.info('Error urls: {}.'.format(self.downloader.error_urls + self.data_item.error_urls))
        logging.info('All urls: {}.'.format(len(self.parser.all_urls)))
        logging.info('Spider closed.')


class UrlManager(object):
    """Turns raw links into normalized url-item dicts."""

    def __init__(self, params):
        self.params = params

    def normal_url(self, url, base_url):
        """Return ``url`` as an absolute URL without one trailing slash.

        Surrounding whitespace is stripped and HTML entities are unescaped.
        A link that does not start with ``http://`` or ``https://`` is
        resolved against ``base_url``.
        """
        cleaned = unescape(url.strip())
        if re.match('(http|https)://', cleaned) is None:
            cleaned = urljoin(base_url, cleaned)
        if cleaned.endswith('/'):
            return cleaned[:-1]
        return cleaned

    def patch_url(self, url, method='GET', data=None, parent_url_obj=None):
        """Build the url-item dict describing one request to make.

        Args:
            url: Raw link text.
            method: Request method stored in the item.
            data: Request data stored in the item.
            parent_url_obj: Url item of the page the link was found on, or
                a falsy value for a root url.

        Returns:
            Dict with the keys ``base_url``, ``depth``, ``url``, ``domain``,
            ``end_type``, ``method`` and ``data``.
        """
        if parent_url_obj:
            base_url = parent_url_obj.get('url')
            depth = parent_url_obj.get('depth') + 1
        else:
            # A root url has no base and starts at depth 1.
            base_url = ''
            depth = 1
        full_url = self.normal_url(url, base_url)
        return {
            'base_url': base_url,
            'depth': depth,
            'url': full_url,
            'domain': urlparse(full_url).netloc,
            'end_type': self.params.get('end_type'),
            'method': method,
            'data': data,
        }


class HtmlDownloader(object):
    """Downloads url items with ``requests`` and counts failures."""

    def __init__(self, params):
        self.params = params
        # Number of url items whose download raised an exception.
        self.error_urls = 0
        # Number of redirect-history responses collected by get_response.
        self.history_num = 0

    def get_headers(self):
        """Return the request headers (the module-level HEADER dict)."""
        # The previous ``get_header(...)`` helper is not defined anywhere in
        # this module, so every download failed with NameError.
        return HEADER

    def get_proxy(self):
        return self.params.get('proxy')

    def get_cookies(self):
        return self.params.get('cookies')

    def get_timeout(self):
        return self.params.get('timeout')

    def _is_oversized(self, response):
        """Return True if the declared Content-Length reaches the limit.

        A response without a Content-Length header counts as size 0.
        """
        declared = int(response.headers.get('Content-Length', 0))
        return declared >= self.params.get('content_length')

    def download(self, url_item):
        """Request the url described by ``url_item``.

        Returns:
            The ``requests`` response, or ``None`` when the request failed
            or the declared Content-Length reaches the configured limit.
        """
        url, method, data = url_item.get('url'), url_item.get(
            'method'), url_item.get('data')
        try:
            request_kwargs = dict(headers=self.get_headers(),
                                  proxies=self.get_proxy(), stream=True,
                                  cookies=self.get_cookies(),
                                  timeout=self.get_timeout())
            if method == 'GET':
                response = requests.get(url, **request_kwargs)
            else:
                response = requests.post(url, data=data, **request_kwargs)
            # Skip html whose declared Content-Length reaches the limit.
            if self._is_oversized(response):
                logging.warning('Content length > 1M, url: {}'.format(url))
                # stream=True keeps the connection open until the body is
                # read or the response is closed; release it explicitly.
                response.close()
                return None
            return response
        except Exception as e:
            logging.error('Download html error: {}'.format(e))
            logging.error('Error url info: {}'.format(url_item))
            self.error_urls += 1
            return None

    def get_response(self, url_item):
        """Download ``url_item`` and return its response records.

        Returns:
            List of ``{'url_item': ..., 'response': ...}`` dicts: the final
            response first, then one entry per item in its redirect
            ``history``. Empty when nothing usable was downloaded.
        """
        resp_list = []
        response = self.download(url_item)
        # NOTE(review): requests responses are falsy for 4xx/5xx status
        # codes, so error pages are skipped here as well - confirm intent.
        if response:
            resp_list.append({'url_item': url_item, 'response': response})
            for history_item in response.history:
                resp_list.append({'url_item': url_item,
                                  'response': history_item})
                self.history_num += 1
        return resp_list


class HtmlParser(object):
    """Extracts, filters and de-duplicates urls found in downloaded html."""

    def __init__(self, params):
        self.params = params
        self.manager = UrlManager(self.params)
        # Every distinct url seen so far. Used only for de-duplication; it
        # is not the number of urls actually requested.
        self.all_urls = set()
        # Set to True to stop extracting urls.
        self.stop_parse = False
        # Number of urls rejected by filter_url.
        self.all_passed_urls = 0
        # Number of urls extracted and accepted.
        self.all_abstract_urls = 0
        self.parse_url_list = []

    def filter_url(self, url_item):
        """Return True if ``url_item`` must be dropped, False to keep it.

        A kept url is recorded in ``all_urls``. Exceeding the amount or
        depth limit also sets ``stop_parse``.
        """
        url = url_item.get('url')
        depth = url_item.get('depth')
        amount = self.params.get('amount')
        # With an amount limit configured, stop extracting once exceeded.
        if amount > 0 and self.all_abstract_urls > amount:
            self.stop_parse = True
            logging.info('Current url amount {} > {}, stop parse urls.'.format(
                self.all_abstract_urls, self.params.get('amount')))
            return True

        # Depth limit.
        depth_limit = self.params.get('depth')
        if depth_limit > 0 and depth > depth_limit:
            self.stop_parse = True
            logging.info('Current url depth {} > {}, stop parse urls.'.format(
                depth, self.params.get('depth')))
            return True

        # Domain filter: keep the url when its domain matches at least ONE
        # allowed pattern. Rejecting on the first non-matching pattern made
        # any list with two different domains reject everything.
        allowed_domains = self.params.get('allowed_domains') or []
        domain = urlparse(url).netloc
        if allowed_domains and not any(
                fnmatch(domain, pattern) for pattern in allowed_domains):
            return True

        # Excluded keywords.
        if any(keyword in url
               for keyword in self.params.get('exclude_keywords') or []):
            return True

        # Duplicates.
        if url in self.all_urls:
            return True
        self.all_urls.add(url)
        return False

    def parse_form_data(self, tag):
        """Collect the ``<input>`` values of a form tag into a dict.

        Text and password inputs map name -> value, a submit input is
        stored under the key ``'submit'``, and any other named input maps
        name -> list of values. Inputs without a name are skipped.
        """
        data = {}
        for field in tag.find_all('input'):
            name = field.get('name')
            field_type = field.get('type')
            value = field.get('value', '')
            if field_type == 'submit':
                data['submit'] = value
            elif not name:
                # A nameless input is not part of the submitted form and
                # used to end up under the key None.
                continue
            elif field_type in ('text', 'password'):
                data[name] = value
            else:
                existing = data.get(name)
                if existing is None:
                    # [value], not list(value): the latter splits the
                    # string into single characters.
                    data[name] = [value]
                elif isinstance(existing, list):
                    existing.append(value)
                else:
                    data[name] = [existing, value]
        return data

    def abstract_urls(self, response):
        """Return the accepted url items found in one response record.

        Reads ``action`` from forms, ``src`` from scripts and ``href`` from
        every other tag; rejected urls increment ``all_passed_urls``.
        """
        url_items = []
        url_item = response.get('url_item')
        resp = response.get('response')
        soup = BeautifulSoup(resp.content, 'lxml')
        for tag in soup.find_all(True):
            if self.stop_parse:
                logging.info('Stop abstract urls.')
                break
            method, data = 'GET', None
            if tag.name == 'form':
                url = tag.get('action', '')
                # A form without a method attribute submits with GET, and
                # the attribute is case-insensitive; normalise it so the
                # downloader's ``method == 'GET'`` check works.
                method = (tag.get('method') or 'GET').upper()
                data = self.parse_form_data(tag)
            elif tag.name == 'script':
                url = tag.get('src', '')
            else:
                url = tag.get('href', '')
            sub_url_item = self.manager.patch_url(
                url, method=method, data=data, parent_url_obj=url_item)
            if not self.filter_url(sub_url_item):
                url_items.append(sub_url_item)
                self.parse_url_list.append(sub_url_item.get('url'))
            else:
                self.all_passed_urls += 1
        return url_items

    def parse(self, response):
        """Extract url items from ``response`` unless parsing was stopped."""
        url_items = []
        if response and not self.stop_parse:
            url_items = self.abstract_urls(response)
        self.all_abstract_urls += len(url_items)
        return url_items


class DataItem(object):
    """Stores crawled responses in the ``crawler_urls`` MongoDB collection."""

    def __init__(self, params):
        self.params = params
        # ``ConnectMongo`` is not defined in this module; use the
        # module-level connect_mongo helper and MONGODB settings.
        db = connect_mongo(MONGODB)
        self.movie = db['crawler_urls']
        # Number of responses that failed to be saved.
        self.error_urls = 0
        # Urls already stored; used to drop duplicates before inserting.
        self.res_urls = set()
        self.duplicate_urls = 0

    def handle_url(self, url):
        """Return ``url`` with an explicit port in its network location.

        A url without a port gets 443 for ``https`` and 80 otherwise; a url
        that already names a port is left as it is.
        """
        url_obj = urlparse(url)
        if url_obj.port is None:
            default_port = 443 if url_obj.scheme == 'https' else 80
            # rstrip handles the "host:" form with an empty port.
            new_netloc = '{}:{}'.format(url_obj.netloc.rstrip(':'),
                                        default_port)
        else:
            # Any explicit port is kept; a substring test for ':80'/':443'
            # appended a second port to urls such as host:3000.
            new_netloc = url_obj.netloc

        # Reassemble with the separators urlparse strips off:
        # ';' before path params, '?' before the query, '#' before the
        # fragment.
        new_params = ';{}'.format(url_obj.params) if url_obj.params else ''
        new_query = '?{}'.format(url_obj.query) if url_obj.query else ''
        new_fragment = '#{}'.format(url_obj.fragment) if url_obj.fragment else ''

        return '{}://{}{}{}{}{}'.format(url_obj.scheme, new_netloc,
                                        url_obj.path, new_params,
                                        new_query, new_fragment)

    def handle_title(self, data):
        """Return the text of the html ``<title>`` tag, or ''."""
        title = ''
        response = data.get('response')
        html = response.content
        if html:
            soup = BeautifulSoup(html, 'lxml')
            title_tag = soup.title
            title = title_tag.get_text() if title_tag else ''
        return title

    def handle_data(self, data):
        """Split a response record into request and response documents.

        Returns:
            ``(request, resp)`` dicts, or ``None`` when the normalized url
            was already stored (``duplicate_urls`` is incremented).
        """
        response = data.get('response')
        request, resp = {}, {}
        # Header mappings are copied into plain dicts before storage.
        request['headers'] = dict(response.request.headers)
        request['url'] = response.request.url
        request['method'] = response.request.method

        resp['headers'] = dict(response.headers)
        resp['content'] = response.content
        resp['status_code'] = response.status_code
        url = self.handle_url(request['url'])
        resp['url'] = url
        if url in self.res_urls:
            self.duplicate_urls += 1
            return None
        self.res_urls.add(url)
        return request, resp

    def save(self, response):
        """Insert one response record into MongoDB.

        Any failure is logged and counted in ``error_urls`` instead of
        being raised.
        """
        doc = {}
        url_item = response.get('url_item')
        try:
            data = self.handle_data(response)
            if data:
                request, resp = data
                doc['request'] = request
                doc['response'] = resp
                doc['title'] = self.handle_title(response)
                doc['site'] = self.params.get('start_url')
                doc['end_type'] = self.params.get('end_type')
                doc['time'] = int(time.time() * 1000)
                # Collection.insert is deprecated (removed in PyMongo 4).
                self.movie.insert_one(doc)
        except Exception as e:
            logging.error('Save data to mongodb error: {}'.format(e))
            logging.error('Error url info: {}'.format(url_item))
            self.error_urls += 1


if __name__ == '__main__':
    # Script entry point: crawl using the module-level default parameters.
    spider = Spider(PARAMS)
    spider.crawl()


05/07/2018 15:55:21 [INFO] [allspider.py] [173]: Spider start crawl.
05/07/2018 15:55:21 [INFO] [allspider.py] [174]: Init first task.
05/07/2018 15:55:24 [INFO] [allspider.py] [319]: Current url amount 546 > 500, stop parse urls.
05/07/2018 15:55:24 [INFO] [allspider.py] [372]: Stop abstract urls.
05/07/2018 15:55:30 [ERROR] [allspider.py] [276]: Download html error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
05/07/2018 15:55:30 [ERROR] [allspider.py] [277]: Error url info: {'base_url': 'http://www.baidu.com/more', 'depth': 3, 'url': 'http://www.baidu.com/s', 'domain': 'www.baidu.com', 'end_type': 'PC', 'method': None, 'data': {'bs': ['l', 'v'], 'f': ['8'], 'rsv_bp': ['1'], 'rsv_spt': ['3'], 'wd': [], 'submit': '百度一下'}}
05/07/2018 15:55:45 [ERROR] [allspider.py] [276]: Download html error: HTTPSConnectionPool(host='gupiao.baidu.com', port=443): Read timed out. (read timeout=5)
05/07/2018 15:55:45 [ERROR] [allspider.py] [277]: Error url info: {'base_url': 'https://www.baidu.com/s?wd=%E5%87%A4%E5%87%B0%E7%BD%91&tn=SE_PclogoS_8whnvm25&usm=2&ie=utf-8&rsv_cq=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B&rsv_dl=0_right_recommends_merge_20826&euri=9c5765965dd84ec5980f63887e9e4881', 'depth': 4, 'url': 'http://www.baidu.com/link?url=ooX2ACdw5niMsKgSaUSqoIq0lt_NrSLXFM9Wpfq7WbHCBBydeIn69l6JObBWGlWuCw77KNsB7cYHm3Li8VlShLMwlxoidSK-uhRu1h8uOYy', 'domain': 'www.baidu.com', 'end_type': 'PC', 'method': 'GET', 'data': None}
05/07/2018 15:56:26 [ERROR] [allspider.py] [467]: Save data to mongodb error: HTTPConnectionPool(host='image.baidu.com', port=80): Read timed out.
05/07/2018 15:56:26 [ERROR] [allspider.py] [468]: Error url info: {'base_url': 'https://www.baidu.com/s?wd=%E7%99%BE%E5%BA%A6%E5%A5%BD%E7%9C%8B&tn=SE_PclogoS_8whnvm25&usm=2&ie=utf-8&rsv_cq=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B&rsv_dl=0_right_recommends_merge_20826&euri=6b0794120dec11e68e38008cfaeb7e18', 'depth': 4, 'url': 'http://www.baidu.com/link?url=nwwUQ8I1ZkG9-pfL9CPKMUoz1pJrfZkS8zhtlqZ3Xn9akBV5XmISb_KOe-XBD3wy', 'domain': 'www.baidu.com', 'end_type': 'PC', 'method': 'GET', 'data': None}
05/07/2018 15:56:31 [INFO] [allspider.py] [196]: Create 1404 corutines.
05/07/2018 15:56:31 [INFO] [allspider.py] [197]: Request num: 544.
05/07/2018 15:56:31 [INFO] [allspider.py] [198]: Response num: 767.
05/07/2018 15:56:31 [INFO] [allspider.py] [199]: History num: 225.
05/07/2018 15:56:31 [INFO] [allspider.py] [201]: Abstract urls: 546.
05/07/2018 15:56:31 [INFO] [allspider.py] [202]: Filters urls: 12423.
05/07/2018 15:56:31 [INFO] [allspider.py] [203]: Error urls: 3.
05/07/2018 15:56:31 [INFO] [allspider.py] [204]: All urls: 547.
05/07/2018 15:56:31 [INFO] [allspider.py] [205]: Spider closed.

### 全站爬虫使用asyncio

In [None]:
# -*- coding:utf-8 -*-
import re
import os
import time
import logging
import logging.handlers
import asyncio
import requests
from html import unescape
from fnmatch import fnmatch
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from pymongo import MongoClient

import traceback
from pprint import pprint

# Crawler parameters (defaults; Spider merges caller overrides on top)
PARAMS = {
    # Start URL
    'start_url': 'https://www.baidu.com',
    # 'start_url': 'http://2code.top/gbk.php',
    # NOTE(review): not read by the code visible in this file - confirm usage
    'start_url_request_method': 'GET',
    # URL (client) type; copied into every url item as 'end_type'
    'end_type': 'PC',
    # Crawl depth; if depth is not positive, then no depth limit
    'depth': -1,
    # Content-Length limit for a single HTML page: 1 MiB
    'content_length': 1 * 1024 * 1024,
    # Total number of URLs to crawl for the whole site; if amount is not
    # positive, then no amount limit to urls
    'amount': 500,
    # Network request timeout (passed to requests as ``timeout``)
    'timeout': 5,
    # Timeout when taking an item from a queue
    'queue_timeout': 1,
    # Crawl delay
    # NOTE(review): not read by the code visible in this file
    'delay': -1,
    # Queue size (maxsize of both the URL queue and the response queue)
    'queue_size': 500,
    # Allowed domains (matched with fnmatch patterns)
    'allowed_domains': ['www.baidu.com'],
    # 'allowed_domains': [],
    # Keywords that exclude a URL when they occur in it
    'exclude_keywords': [],
    # Cookies sent with every request
    'cookies': {}
}

# HTTP request headers sent with downloads
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate'
}

# MongoDB connection parameters (placeholder values)
MONGODB = {
    "user": "xxx",
    "passwd": "xxxx",
    "host": "127.0.0.1:27017",
    "dbname": "xxxx"
}


def init_root_logger_settings(log_name='spiders', logConsole=True):
    """Configure the root logger with a daily-rotating file handler.

    The log file lives in a ``logs`` directory located in the parent of
    the directory that contains this file. Calling the function again
    with the same arguments does not attach duplicate handlers.

    Args:
        log_name: Base file name of the log file inside the log directory.
        logConsole: When true, also attach a console (stream) handler.
    """
    log_format = "%(asctime)s [%(levelname)s] [%(filename)s] [%(lineno)d]: %(message)s"
    log_dir = os.path.join(os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))), "logs")
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_name)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt=log_format, datefmt="%m/%d/%Y %H:%M:%S")

    # Handlers are tagged with a name so that a repeated call can recognise
    # the ones it already installed instead of emitting every record twice.
    installed = {handler.get_name() for handler in root_logger.handlers}

    file_handler_name = 'spider-file:{}'.format(log_path)
    if file_handler_name not in installed:
        fh = logging.handlers.TimedRotatingFileHandler(
            filename=log_path, when='midnight', interval=1, encoding='utf-8')
        fh.set_name(file_handler_name)
        fh.setLevel(logging.INFO)
        fh.suffix = "%Y-%m-%d.log"
        fh.setFormatter(formatter)
        root_logger.addHandler(fh)

    console_handler_name = 'spider-console'
    if logConsole and console_handler_name not in installed:
        ch = logging.StreamHandler()
        ch.set_name(console_handler_name)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        root_logger.addHandler(ch)


def connect_mongo(MONGODB):
    """Open a MongoDB connection and return the configured database.

    Args:
        MONGODB: Mapping with the keys ``user``, ``passwd``, ``host`` and
            ``dbname``.

    Returns:
        The database object ``client[MONGODB['dbname']]``.
    """
    from urllib.parse import quote_plus

    # Credentials are percent-escaped so characters such as '@', ':' or '/'
    # in a user name or password cannot corrupt the connection URI.
    uri = 'mongodb://{}:{}@{}/{}'.format(quote_plus(str(MONGODB['user'])),
                                         quote_plus(str(MONGODB['passwd'])),
                                         MONGODB['host'],
                                         MONGODB['dbname'])
    client = MongoClient(uri)
    return client[MONGODB['dbname']]


class Spider(object):
    """Crawl coordinator built on asyncio tasks and two bounded queues.

    ``urlQ`` holds url items waiting to be downloaded; ``respQ`` holds
    downloaded responses waiting to be parsed and stored.
    """

    def __init__(self, params):
        """Build the crawler components.

        Args:
            params: Dict of overrides merged over the module defaults, or
                ``None`` to use the defaults unchanged.

        Raises:
            TypeError: If ``params`` is neither a dict nor ``None``.
        """
        if params is not None and not isinstance(params, dict):
            raise TypeError('params must be a dict or None, got {}'.format(
                type(params).__name__))
        self.init_spider_params(params)
        self.manager = UrlManager(self.params)
        self.downloader = HtmlDownloader(self.params)
        self.parser = HtmlParser(self.params)
        self.data_item = DataItem(self.params)
        self.urlQ = asyncio.Queue(maxsize=self.params.get('queue_size'))
        self.respQ = asyncio.Queue(maxsize=self.params.get('queue_size'))
        self.request_num = 0
        self.response_num = 0
        # Defined here so the summary in crawl() can always read it, even
        # when the crawl fails before tasks() has started.
        self.task_num = 0
        # Number of tasks that have taken an item off a queue and are still
        # working on it (downloading, parsing or saving).
        self._active = 0

    def init_spider_params(self, params):
        """Merge ``params`` over a copy of the module-level defaults."""
        # Copy first: updating PARAMS in place would leak one spider's
        # overrides into every spider created afterwards.
        self.params = dict(PARAMS)
        if params:
            self.params.update(params)

    def _is_idle(self):
        """Return True when both queues are empty and no task is mid-work."""
        return self.urlQ.empty() and self.respQ.empty() and self._active == 0

    def _stop_if_idle(self):
        """Flag the parser to stop once there is no work left anywhere."""
        if self._is_idle():
            self.parser.stop_parse = True

    def is_running(self):
        """Return False only after parsing was stopped and all work drained."""
        return not (self.parser.stop_parse and self._is_idle())

    async def init_task(self):
        """Seed the url queue with the configured start url."""
        start_url = self.params.get('start_url')
        url_item = self.manager.patch_url(start_url)
        self.parser.all_urls.add(url_item.get('url'))
        self.urlQ.put_nowait(url_item)

    async def consume_task(self):
        """Take one url item, download it and queue the responses."""
        try:
            url_item = await asyncio.wait_for(
                self.urlQ.get(), self.params.get('queue_timeout'))
        except Exception:
            # Nothing arrived within the timeout. Only stop the crawl when
            # no other task is still working; a download in progress may
            # yet produce responses and, from them, new urls.
            self._stop_if_idle()
            return
        self._active += 1
        try:
            self.request_num += 1
            resp_list = await self.downloader.get_response(url_item)
            for response in resp_list:
                self.respQ.put_nowait(response)
        except Exception as e:
            # put_nowait raises when respQ is full; the remaining responses
            # of this url are dropped, so make that visible.
            logging.warning('Consume task failed: {!r}'.format(e))
        finally:
            self._active -= 1
        self._stop_if_idle()

    async def produce_task(self):
        """Take one response, queue the urls parsed from it and store it."""
        try:
            response = await asyncio.wait_for(
                self.respQ.get(), self.params.get('queue_timeout'))
        except Exception:
            self._stop_if_idle()
            return
        self._active += 1
        try:
            self.response_num += 1
            url_items = await self.parser.parse(response)
            for url_item in url_items:
                if self.parser.stop_parse:
                    break
                self.urlQ.put_nowait(url_item)
            await self.data_item.save(response)
        except Exception as e:
            logging.warning('Produce task failed: {!r}'.format(e))
        finally:
            self._active -= 1
        self._stop_if_idle()

    async def tasks(self, loop):
        """Create consumer/producer batches until the crawl has drained.

        Each batch is awaited as a whole before the next one is created;
        ``task_num`` counts the tasks created.
        """
        logging.info('Start to create consume and produce tasks.')
        self.task_num = 0
        while self.is_running():
            task_list = []
            try:
                pending_urls = self.urlQ.qsize()
                pending_resps = self.respQ.qsize()
                # Always one consumer and one producer; the longer queue
                # gets one extra worker of the matching kind.
                jobs = [self.consume_task]
                if pending_urls > pending_resps:
                    jobs.append(self.consume_task)
                jobs.append(self.produce_task)
                if pending_urls < pending_resps:
                    jobs.append(self.produce_task)
                for job in jobs:
                    task_list.append(loop.create_task(job()))
                    self.task_num += 1
            except Exception as e:
                logging.error('corutine error: {}'.format(e))

            await asyncio.gather(*task_list, return_exceptions=True)

    def crawl(self):
        """Run the crawl to completion and log summary counters."""
        logging.info('Spider started!')
        start_time = datetime.now()
        loop = asyncio.get_event_loop()
        try:
            loop.run_until_complete(self.init_task())
            loop.run_until_complete(self.tasks(loop))
        except KeyboardInterrupt:
            # asyncio.Task.all_tasks was removed in Python 3.9; prefer the
            # module-level function and fall back on older interpreters.
            all_tasks = getattr(asyncio, 'all_tasks', None)
            if all_tasks is None:
                all_tasks = asyncio.Task.all_tasks
            for task in all_tasks(loop):
                task.cancel()
            # Run the loop once more so the cancellations are processed.
            loop.stop()
            loop.run_forever()
        finally:
            end_time = datetime.now()
            logging.info('Request num: {}.'.format(self.request_num))
            logging.info('Response num: {}.'.format(self.response_num))
            logging.info('History num: {}.'.format(self.downloader.history_num))
            logging.info('Abstract urls: {}.'.format(self.parser.all_abstract_urls))
            logging.info('Filters urls: {}.'.format(self.parser.all_passed_urls))
            logging.info('Error urls: {}.'.format(self.downloader.error_urls + self.data_item.error_urls))
            logging.info('All urls: {}.'.format(len(self.parser.all_urls)))
            logging.info('Time usage: {}'.format(end_time - start_time))
            logging.info('Spider finished!')
            logging.info('Create {} tasks'.format(self.task_num))
            loop.close()


class UrlManager(object):
    """Turns raw links into normalized url-item dicts."""

    def __init__(self, params):
        self.params = params

    def normal_url(self, url, base_url):
        """Return ``url`` as an absolute URL without one trailing slash.

        Surrounding whitespace is stripped and HTML entities are unescaped.
        A link that does not start with ``http://`` or ``https://`` is
        resolved against ``base_url``.
        """
        cleaned = unescape(url.strip())
        if re.match('(http|https)://', cleaned) is None:
            cleaned = urljoin(base_url, cleaned)
        if cleaned.endswith('/'):
            return cleaned[:-1]
        return cleaned

    def patch_url(self, url, method='GET', data=None, parent_url_obj=None):
        """Build the url-item dict describing one request to make.

        Args:
            url: Raw link text.
            method: Request method stored in the item.
            data: Request data stored in the item.
            parent_url_obj: Url item of the page the link was found on, or
                a falsy value for a root url.

        Returns:
            Dict with the keys ``base_url``, ``depth``, ``url``, ``domain``,
            ``end_type``, ``method`` and ``data``.
        """
        if parent_url_obj:
            base_url = parent_url_obj.get('url')
            depth = parent_url_obj.get('depth') + 1
        else:
            # A root url has no base and starts at depth 1.
            base_url = ''
            depth = 1
        full_url = self.normal_url(url, base_url)
        return {
            'base_url': base_url,
            'depth': depth,
            'url': full_url,
            'domain': urlparse(full_url).netloc,
            'end_type': self.params.get('end_type'),
            'method': method,
            'data': data,
        }


class HtmlDownloader(object):
    """Downloads url items with ``requests`` and counts failures."""

    def __init__(self, params):
        self.params = params
        # Number of url items whose download raised an exception.
        self.error_urls = 0
        # Number of redirect-history responses collected by get_response.
        self.history_num = 0

    def get_headers(self):
        """Return the request headers (the module-level HEADER dict)."""
        return HEADER

    def get_proxy(self):
        return self.params.get('proxy')

    def get_cookies(self):
        return self.params.get('cookies')

    def get_timeout(self):
        return self.params.get('timeout')

    def _is_oversized(self, response):
        """Return True if the declared Content-Length reaches the limit.

        A response without a Content-Length header counts as size 0.
        """
        declared = int(response.headers.get('Content-Length', 0))
        return declared >= self.params.get('content_length')

    def _request(self, url, method, data):
        """Perform the blocking HTTP request and return the response."""
        request_kwargs = dict(headers=self.get_headers(),
                              proxies=self.get_proxy(), stream=True,
                              cookies=self.get_cookies(),
                              timeout=self.get_timeout())
        if method == 'GET':
            return requests.get(url, **request_kwargs)
        return requests.post(url, data=data, **request_kwargs)

    async def download(self, url_item):
        """Request the url described by ``url_item``.

        Returns:
            The ``requests`` response, or ``None`` when the request failed
            or the declared Content-Length reaches the configured limit.
        """
        url, method, data = url_item.get('url'), url_item.get(
            'method'), url_item.get('data')
        try:
            # requests is a blocking library: calling it directly inside a
            # coroutine stalls the event loop for the whole request, so no
            # two downloads ever overlap. Run it in the loop's default
            # executor and await the result instead.
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(
                None, self._request, url, method, data)
            # Skip html whose declared Content-Length reaches the limit.
            if self._is_oversized(response):
                logging.warning('Content length > 1M, url: {}'.format(url))
                # stream=True keeps the connection open until the body is
                # read or the response is closed; release it explicitly.
                response.close()
                return None
            return response
        except Exception as e:
            logging.error('Download html error: {}'.format(e))
            logging.error('Error url info: {}'.format(url_item))
            self.error_urls += 1
            return None

    async def get_response(self, url_item):
        """Download ``url_item`` and return its response records.

        Returns:
            List of ``{'url_item': ..., 'response': ...}`` dicts: the final
            response first, then one entry per item in its redirect
            ``history``. Empty when nothing usable was downloaded.
        """
        resp_list = []
        response = await self.download(url_item)
        # NOTE(review): requests responses are falsy for 4xx/5xx status
        # codes, so error pages are skipped here as well - confirm intent.
        if response:
            resp_list.append({'url_item': url_item, 'response': response})
            for history_item in response.history:
                resp_list.append({'url_item': url_item,
                                  'response': history_item})
                self.history_num += 1
        return resp_list


class HtmlParser(object):
    """Collect the URLs referenced by a downloaded page and filter them.

    Filtering honours the 'amount', 'depth', 'allowed_domains' and
    'exclude_keywords' crawl parameters and drops URLs that were already
    accepted once.
    """

    def __init__(self, params):
        """Store the crawl parameters and reset all counters.

        Args:
            params: dict of crawl parameters; see ``filter_url`` for the
                keys that are read.
        """
        self.params = params
        self.manager = UrlManager(self.params)
        # URLs already accepted by filter_url. Only used to detect
        # duplicates; it is not the number of URLs actually requested.
        self.all_urls = set()
        # Set to True once the amount or depth limit is hit; URL extraction
        # stops from then on.
        self.stop_parse = False
        # Number of extracted URLs that were rejected by filter_url.
        self.all_passed_urls = 0
        # Number of extracted URLs that were accepted.
        self.all_abstract_urls = 0
        # Accepted URLs in extraction order.
        self.parse_url_list = []

    def filter_url(self, url_item):
        """Tell whether ``url_item`` must be dropped.

        Args:
            url_item: dict read for the keys 'url' and 'depth'.

        Returns:
            True when the URL is rejected, False when it is accepted. An
            accepted URL is remembered so that later duplicates are rejected.
            Hitting the amount or depth limit also sets ``self.stop_parse``.
        """
        url = url_item.get('url')
        depth = url_item.get('depth')

        # Total amount limit (disabled when 'amount' is not positive).
        amount = self.params.get('amount')
        if amount > 0 and self.all_abstract_urls > amount:
            self.stop_parse = True
            logging.info('Current url amount {} > {}, stop parse urls.'.format(
                self.all_abstract_urls, self.params.get('amount')))
            return True

        # Depth limit (disabled when 'depth' is not positive).
        max_depth = self.params.get('depth')
        if max_depth > 0 and depth > max_depth:
            self.stop_parse = True
            logging.info('Current url depth {} > {}, stop parse urls.'.format(
                depth, self.params.get('depth')))
            return True

        # Domain whitelist: the URL is kept when its netloc matches ANY of
        # the wildcard patterns. An empty whitelist allows every domain.
        domain = urlparse(url).netloc
        allowed_domains = self.params.get('allowed_domains')
        if allowed_domains and not any(
                fnmatch(domain, pattern) for pattern in allowed_domains):
            return True

        # Keyword blacklist.
        exclude_keywords = self.params.get('exclude_keywords') or []
        if any(keyword in url for keyword in exclude_keywords):
            return True

        # Duplicate detection.
        if url in self.all_urls:
            return True
        self.all_urls.add(url)
        return False

    def parse_form_data(self, tag):
        """Build the submission payload from the ``<input>`` tags of a form.

        Args:
            tag: form element exposing ``find_all('input')``; every input
                must expose ``get(attribute, default)``.

        Returns:
            dict mapping field names to values: a plain string for named
            text/password inputs, the key 'submit' for submit buttons, and a
            list of values for every other named input (so that repeated
            names such as checkboxes keep all their values). Inputs without
            a name are ignored, except submit buttons.
        """
        data = {}
        for field in tag.find_all('input'):
            name = field.get('name')
            field_type = field.get('type')
            value = field.get('value', '')
            if name and field_type in ['text', 'password']:
                data[name] = value
            elif field_type == 'submit':
                data['submit'] = value
            elif name:
                existing = data.get(name)
                if existing is None:
                    # Wrap in a list; list(value) would split the string
                    # into single characters.
                    data[name] = [value]
                elif isinstance(existing, list):
                    existing.append(value)
                else:
                    # The name was first seen as a text/password input.
                    data[name] = [existing, value]
        return data

    async def abstract_urls(self, response):
        """Extract the acceptable URLs from one downloaded page.

        Args:
            response: dict with 'url_item' (the page's own URL description)
                and 'response' (object whose ``content`` is the HTML).

        Returns:
            List of URL descriptions built by ``self.manager.patch_url`` that
            were not rejected by ``filter_url``. Rejected ones are counted in
            ``self.all_passed_urls``.
        """
        url_items = []
        url_item = response.get('url_item')
        resp = response.get('response')
        soup = BeautifulSoup(resp.content, 'lxml')
        for tag in soup.find_all(True):
            if self.stop_parse:
                logging.info('Stop abstract urls.')
                break
            method, data = 'GET', None
            if tag.name == 'form':
                url = tag.get('action', '')
                # A form without a method attribute submits with GET, and
                # the attribute value is case-insensitive in HTML.
                method = (tag.get('method') or 'GET').upper()
                data = self.parse_form_data(tag)
            elif tag.name == 'script':
                url = tag.get('src', '')
            else:
                url = tag.get('href', '')
            sub_url_item = self.manager.patch_url(
                url, method=method, data=data, parent_url_obj=url_item)
            if self.filter_url(sub_url_item):
                self.all_passed_urls += 1
            else:
                url_items.append(sub_url_item)
                self.parse_url_list.append(sub_url_item.get('url'))
        return url_items

    async def parse(self, response):
        """Extract URLs from ``response`` unless parsing has been stopped.

        Args:
            response: dict as accepted by ``abstract_urls``; a falsy value
                yields an empty result.

        Returns:
            List of accepted URL descriptions (possibly empty). Its length is
            added to ``self.all_abstract_urls``.
        """
        url_items = []
        if response and not self.stop_parse:
            url_items = await self.abstract_urls(response)
        self.all_abstract_urls += len(url_items)
        return url_items


class DataItem(object):
    """Store crawled responses in MongoDB, skipping URLs already stored."""

    def __init__(self, params):
        """Open the target collection and reset the counters.

        Args:
            params: dict of crawl parameters; 'start_url' and 'end_type' are
                copied into every stored document.
        """
        self.params = params
        # presumably connect_mongo returns a database handle - verify
        # against its definition.
        db = connect_mongo(MONGODB)
        self.movie = db['my_crawler_urls']
        self.error_urls = 0
        # Normalised URLs already handed out for storage; a second
        # de-duplication pass right before writing to the database.
        self.res_urls = set()
        self.duplicate_urls = 0

    def handle_url(self, url):
        """Return ``url`` with an explicit port in its network location.

        When the URL carries no port, 443 is added for the 'https' scheme
        and 80 for every other scheme. A URL that already has a port (or a
        malformed one) keeps its network location unchanged. Path, params,
        query and fragment are preserved with their regular separators.

        Args:
            url: absolute URL string.

        Returns:
            The normalised URL string.
        """
        url_obj = urlparse(url)
        try:
            has_port = url_obj.port is not None
        except ValueError:
            # Non-numeric or out-of-range port: leave the netloc alone.
            has_port = True
        if has_port:
            return url_obj.geturl()

        default_port = 443 if url_obj.scheme == 'https' else 80
        new_netloc = '{}:{}'.format(url_obj.netloc, default_port)
        return url_obj._replace(netloc=new_netloc).geturl()

    def handle_title(self, data):
        """Return the text of the page's ``<title>`` element.

        Args:
            data: dict whose 'response' entry exposes the HTML as
                ``content``.

        Returns:
            The title text, or '' when the body is empty or has no title.
        """
        title = ''
        response = data.get('response')
        html = response.content
        if html:
            soup = BeautifulSoup(html, 'lxml')
            title_tag = soup.title
            title = title_tag.get_text() if title_tag else ''
        return title

    def handle_data(self, data):
        """Split a response into the request and response parts to store.

        Args:
            data: dict whose 'response' entry exposes ``request`` (with
                ``headers``, ``url``, ``method``), ``headers``, ``content``
                and ``status_code``.

        Returns:
            A ``(request, resp)`` tuple of dicts, or None when the
            normalised URL was already returned before (in which case
            ``self.duplicate_urls`` is incremented).
        """
        response = data.get('response')
        request, resp = {}, {}
        request['headers'] = response.request.headers
        request['url'] = response.request.url
        request['method'] = response.request.method

        resp['headers'] = response.headers
        resp['content'] = response.content
        resp['status_code'] = response.status_code
        url = self.handle_url(request['url'])
        resp['url'] = url
        if url in self.res_urls:
            self.duplicate_urls += 1
            return None
        self.res_urls.add(url)
        return request, resp

    async def save(self, response):
        """Write one crawled response to the collection.

        Duplicates (see ``handle_data``) are skipped silently. Any exception
        raised while building or inserting the document is logged and
        counted in ``self.error_urls`` instead of being propagated.

        Args:
            response: dict with 'url_item' and 'response' entries.
        """
        doc = {}
        url_item = response.get('url_item')
        try:
            data = self.handle_data(response)
            if data:
                request, resp = data
                doc['request'] = request
                doc['response'] = resp
                doc['title'] = self.handle_title(response)
                doc['site'] = self.params.get('start_url')
                doc['end_type'] = self.params.get('end_type')
                # Milliseconds since the epoch.
                doc['time'] = int(time.time() * 1000)
                # Collection.insert was removed in PyMongo 4; insert_one is
                # its single-document replacement (available since 3.0).
                self.movie.insert_one(doc)
        except Exception as e:
            logging.error('Save data to mongodb error: {}'.format(e))
            logging.error('Error url info: {}'.format(url_item))
            self.error_urls += 1

if __name__ == '__main__':
    # Script entry point: run one crawl with the module-level PARAMS.
    # init_root_logger_settings presumably configures the root logger and
    # is therefore called before the spider is created - verify against its
    # definition.
    init_root_logger_settings()
    spider = Spider(PARAMS)
    spider.crawl()

05/08/2018 11:34:49 [INFO] [spiders.py] [199]: Spider started!
05/08/2018 11:34:49 [INFO] [spiders.py] [165]: Start to create consume and produce tasks.
05/08/2018 11:34:52 [INFO] [spiders.py] [336]: Current url amount 541 > 500, stop parse urls.
05/08/2018 11:34:52 [INFO] [spiders.py] [389]: Stop abstract urls.
05/08/2018 11:34:57 [ERROR] [spiders.py] [292]: Download html error: HTTPConnectionPool(host='pcdoodle.baidu.com', port=80): Max retries exceeded with url: /doodlebaike (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001BF9238C7B8>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))
05/08/2018 11:34:57 [ERROR] [spiders.py] [293]: Error url info: {'base_url': 'https://www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B&tn=SE_PclogoS_8whnvm25&sa=ire_dl_gh_logo&rsv_dl=igh_logo_pcs', 'depth': 3, 'url': 'http://www.baidu.com/link?url=MuTGTEGFr3teHcbL6WVBjAGg6N-DnpZr1ThrJuKFyI_xrPUcZCfH_9j_nxdJmJJr', 'domain': 'www.baidu.com', 'end_type': 'PC', 'method': 'GET', 'data': None}
05/08/2018 11:35:59 [ERROR] [spiders.py] [496]: Save data to mongodb error: HTTPConnectionPool(host='shouji.baidu.com', port=80): Read timed out.
05/08/2018 11:35:59 [ERROR] [spiders.py] [497]: Error url info: {'base_url': 'https://www.baidu.com/s?wd=%E7%99%BE%E5%BA%A6%E5%A5%BD%E7%9C%8B&tn=SE_PclogoS_8whnvm25&usm=3&ie=utf-8&rsv_cq=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B&rsv_dl=0_right_recommends_merge_20826&euri=6b0794120dec11e68e38008cfaeb7e18', 'depth': 4, 'url': 'http://www.baidu.com/link?url=qU-7h5sRYKczMwPlGECptMaN1iFtVxw7dpQI-pBMIEIfsMDWI29kc42vFzbc7HQQ_F8Zi8a8GD2CFF6ug4UmYK', 'domain': 'www.baidu.com', 'end_type': 'PC', 'method': 'GET', 'data': None}
05/08/2018 11:37:07 [INFO] [spiders.py] [212]: Request num: 535.
05/08/2018 11:37:07 [INFO] [spiders.py] [213]: Response num: 752.
05/08/2018 11:37:07 [INFO] [spiders.py] [214]: History num: 218.
05/08/2018 11:37:07 [INFO] [spiders.py] [215]: Abstract urls: 541.
05/08/2018 11:37:07 [INFO] [spiders.py] [216]: Filters urls: 6421.
05/08/2018 11:37:07 [INFO] [spiders.py] [217]: Error urls: 2.
05/08/2018 11:37:07 [INFO] [spiders.py] [218]: All urls: 542.
05/08/2018 11:37:07 [INFO] [spiders.py] [219]: Time usage: 0:02:18.153237
05/08/2018 11:37:07 [INFO] [spiders.py] [220]: Spider finished!
05/08/2018 11:37:07 [INFO] [spiders.py] [221]: Create 1341 tasks