# 全站爬虫

### 功能介绍

1. 支持深度控制

In [None]:
# -*- coding:utf-8 -*-
import os
import re
import time
import logging
import logging.handlers
from pymongo import MongoClient
import gevent
from gevent import monkey, pool
from gevent.queue import Queue
from urllib.parse import urlparse, urljoin
from html import unescape
import requests
from bs4 import BeautifulSoup
from pprint import pprint
monkey.patch_socket()

# Crawler parameters (defaults; Spider layers caller overrides on top of these)
PARAMS = {
    # Start URL
    'start_url': 'https://www.baidu.com',
    # Request method for the start URL
    # NOTE(review): this key is not read by the code visible in this file
    'start_url_request_method': 'GET',
    # URL type:
    'end_type': 'PC',
    # Concurrency level
    'concurrency': 5,
    # Crawl depth; if depth is not positive, there is no depth limit
    'depth': -1,
    # Content-Length limit for a single HTML page: 1 MB
    'content_length': 1 * 1024 * 1024,
    # Site-wide limit on the total number of URLs; if amount is not positive,
    # there is no limit on the number of URLs
    'amount': 500,
    # Network request timeout (passed to requests as ``timeout``)
    'timeout': 5,
    # Timeout when taking an item from a queue
    'queue_timeout': 1,
    # Crawl delay
    'delay': -1,
    # Queue size
    'queue_size': 500,
    # Allowed domains
    'allowed_domains': ['www.baidu.com'],
    # Keywords to exclude
    'exclude_keywords': [],
    # Cookies to send with requests
    'cookies': {}
}

# Request headers returned by HtmlDownloader.get_headers() and sent with every download.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate'
}


# MongoDB parameters; connect_mongo() reads these keys to build the connection URI.
MONGODB = {
    "user": "xxxx",
    "passwd": "xxxx",
    "host": "127.0.0.1:27017",
    "dbname": "xxxxx"
}


def init_root_logger_settings(log_name='spiders', logConsole=True):
    """Attach a daily-rotating file handler (and optionally a console handler) to the root logger.

    Log files are written to a ``logs`` directory located one level above the
    directory containing this file; the directory is created when missing.

    Args:
        log_name: Base file name of the log file inside the ``logs`` directory.
        logConsole: When true, also add a stream handler to the root logger.
    """
    LOG_FORMAT = "%(asctime)s [%(levelname)s] [%(filename)s] [%(lineno)d]: %(message)s"
    log_dir = os.path.join(os.path.dirname(
        os.path.dirname(os.path.abspath(__file__))), "logs")
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # ``exists()`` + ``mkdir()`` and also creates missing parent directories.
    os.makedirs(log_dir, exist_ok=True)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt=LOG_FORMAT, datefmt="%m/%d/%Y %H:%M:%S")

    # Rotate at midnight; rotated files get a dated ".log" suffix.
    fh = logging.handlers.TimedRotatingFileHandler(filename=os.path.join(log_dir, log_name),
                                                   when='midnight', interval=1, encoding='utf-8')
    fh.setLevel(logging.INFO)
    fh.suffix = "%Y-%m-%d.log"
    fh.setFormatter(formatter)
    root_logger.addHandler(fh)

    if logConsole:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        root_logger.addHandler(ch)


def connect_mongo(MONGODB):
    """Connect to MongoDB and return the database named in the configuration.

    Args:
        MONGODB: Mapping with the keys ``user``, ``passwd``, ``host`` and
            ``dbname``.

    Returns:
        The database object ``client[MONGODB['dbname']]``.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote_plus

    # NOTE: the original author left a remark here that one parameter is
    # still missing; which parameter was meant is not stated.
    # User name and password must be percent-escaped, otherwise reserved
    # characters such as '@', ':' or '/' corrupt the connection URI.
    uri = 'mongodb://{}:{}@{}/{}'.format(quote_plus(MONGODB['user']),
                                         quote_plus(MONGODB['passwd']),
                                         MONGODB['host'],
                                         MONGODB['dbname'])
    client = MongoClient(uri)
    return client[MONGODB['dbname']]


class Spider(object):
    """Drive one crawl: schedule URLs, download them, extract links and save results.

    Work flows through two bounded gevent queues: ``urlQ`` holds URL items
    waiting to be downloaded and ``respQ`` holds downloaded responses waiting
    to be parsed and saved.
    """

    def __init__(self, params):
        """Create the crawl components.

        Args:
            params: Optional dict of overrides applied on top of ``PARAMS``.
        """
        self.init_spider_params(params)
        self.manager = UrlManager(self.params)
        self.downloader = HtmlDownloader(self.params)
        self.parser = HtmlParser(self.params)
        self.data_item = DataItem(self.params)
        self.urlQ = Queue(maxsize=self.params.get('queue_size'))
        self.respQ = Queue(maxsize=self.params.get('queue_size'))
        self.request_num = 0
        self.response_num = 0
        # Number of queue items that a task has taken and is still processing.
        self._in_flight = 0
        # Becomes True once init_task has finished, so that the still-empty
        # queues at start-up are not mistaken for a finished crawl.
        self._seeded = False

    def init_spider_params(self, params):
        """Set ``self.params`` to the defaults overlaid with ``params``."""
        # Copy the defaults so that overrides never leak into the
        # module-level PARAMS shared by every spider.
        self.params = dict(PARAMS)
        if params:
            self.params.update(params)

    def is_running(self):
        """Return True while the crawl loop should keep spawning tasks."""
        has_queued_work = not self.urlQ.empty() or not self.respQ.empty()
        if self.parser.stop_parse:
            # A limit was hit: only drain what is already queued.
            return has_queued_work
        # Without a limit the crawl is finished once the start URL has been
        # queued, both queues are empty and no task is processing an item.
        return has_queued_work or self._in_flight > 0 or not self._seeded

    def init_task(self):
        """Queue the start URL as the first download task."""
        logging.info('Init first task.')
        try:
            start_url = self.params.get('start_url')
            url_item = self.manager.patch_url(start_url)
            self.parser.all_urls.add(url_item.get('url'))
            self.urlQ.put_nowait(url_item)
        finally:
            # Mark seeding as done even on failure so crawl() can terminate.
            self._seeded = True

    def consume_task(self):
        """Take one URL item, download it and queue the resulting responses."""
        from gevent.queue import Empty

        try:
            url_item = self.urlQ.get(timeout=self.params.get('queue_timeout'))
        except Empty:
            # Nothing to download right now.
            return
        self._in_flight += 1
        try:
            self.request_num += 1
            resp_list = self.downloader.get_response(url_item)
            for response in resp_list:
                self.respQ.put_nowait(response)
        except Exception as e:
            # Best effort: one failing item (e.g. a full queue) must not stop
            # the crawl, but it should be visible in the log.
            logging.warning('Consume task error: {!r}'.format(e))
        finally:
            self._in_flight -= 1

    def produce_task(self):
        """Take one response, queue the URLs extracted from it and save it."""
        from gevent.queue import Empty

        try:
            response = self.respQ.get(timeout=self.params.get('queue_timeout'))
        except Empty:
            # Nothing to parse right now.
            return
        self._in_flight += 1
        try:
            self.response_num += 1
            url_items = self.parser.parse(response)
            for url_item in url_items:
                if not self.parser.stop_parse:
                    self.urlQ.put_nowait(url_item)
                else:
                    break
            self.data_item.save(response)
        except Exception as e:
            logging.warning('Produce task error: {!r}'.format(e))
        finally:
            self._in_flight -= 1

    def crawl(self):
        """Run the crawl until is_running() turns False, then log statistics."""
        tp = pool.Pool(50)
        logging.info('Spider start crawl.')
        tp.add(gevent.spawn(self.init_task))
        corutine_num = 1
        while self.is_running():
            try:
                url_backlog = self.urlQ.qsize()
                resp_backlog = self.respQ.qsize()
                # Always one downloader and one parser; the longer queue
                # gets one extra worker.
                tasks = [self.consume_task, self.produce_task]
                if url_backlog > resp_backlog:
                    tasks.insert(0, self.consume_task)
                elif url_backlog < resp_backlog:
                    tasks.append(self.produce_task)
                for task in tasks:
                    tp.add(gevent.spawn(task))
                    corutine_num += 1
            except Exception as e:
                logging.error('corutine error: {}'.format(e))
        tp.join()
        logging.info('Create {} corutines.'.format(corutine_num))
        logging.info('Request num: {}.'.format(self.request_num))
        logging.info('Response num: {}.'.format(self.response_num))
        logging.info('History num: {}.'.format(self.downloader.history_num))
        logging.info('Abstract urls: {}.'.format(
            self.parser.all_abstract_urls))
        logging.info('Filters urls: {}.'.format(self.parser.all_passed_urls))
        logging.info('Error urls: {}.'.format(self.downloader.error_urls))
        logging.info('All urls: {}.'.format(len(self.parser.all_urls)))
        logging.info('Spider closed.')


class UrlManager(object):
    """Normalize URLs and wrap them in the ``url_item`` dicts used by the crawl."""

    def __init__(self, params):
        """Keep the crawl parameters; ``end_type`` is copied into every item."""
        self.params = params

    def normal_url(self, url, base_url):
        """Return ``url`` as a cleaned URL resolved against ``base_url``.

        Surrounding whitespace is stripped, HTML entities are unescaped,
        URLs that do not start with ``http://`` or ``https://`` are joined
        onto ``base_url``, and a single trailing ``/`` is removed.
        """
        # A missing attribute value is treated like an empty reference.
        new_url = unescape((url or '').strip())
        if not re.match(r'https?://', new_url):
            new_url = urljoin(base_url, new_url)
        return new_url[:-1] if new_url.endswith('/') else new_url

    def patch_url(self, url, method='GET', data=None, parent_url_obj=None):
        """Build the ``url_item`` dict for ``url``.

        Args:
            url: Raw URL or reference as found in the page.
            method: HTTP method; ``None`` or an empty value means ``GET`` and
                the value is upper-cased.
            data: Form data to send with the request, if any.
            parent_url_obj: ``url_item`` of the page the URL was found on;
                ``None`` for the start URL.

        Returns:
            Dict with the keys ``base_url``, ``depth``, ``url``, ``domain``,
            ``end_type``, ``method`` and ``data``.
        """
        url_item = {}
        url_item['base_url'] = parent_url_obj.get(
            'url') if parent_url_obj else ''
        url_item['depth'] = parent_url_obj.get(
            'depth') + 1 if parent_url_obj else 1
        url_item['url'] = self.normal_url(url, url_item.get('base_url'))
        url_item['domain'] = urlparse(url_item.get('url')).netloc
        url_item['end_type'] = self.params.get('end_type')
        # Form tags may carry no method attribute or a lower-case one; the
        # downloader compares against the exact string 'GET'.
        url_item['method'] = (method or 'GET').upper()
        url_item['data'] = data

        return url_item


class HtmlDownloader(object):
    """Fetch pages with ``requests`` and keep simple download statistics."""

    def __init__(self, params):
        self.params = params
        # Number of downloads that raised an exception.
        self.error_urls = 0
        # Number of redirect (history) responses collected.
        self.history_num = 0

    def get_headers(self):
        """Return the request headers to send."""
        return HEADER

    def get_proxy(self):
        """Return the ``proxy`` parameter (``None`` when not configured)."""
        return self.params.get('proxy')

    def get_cookies(self):
        """Return the ``cookies`` parameter."""
        return self.params.get('cookies')

    def get_timeout(self):
        """Return the ``timeout`` parameter."""
        return self.params.get('timeout')

    def download(self, url_item):
        """Request the URL described by ``url_item``.

        ``GET`` items are fetched with ``requests.get``; every other method
        is sent with ``requests.post`` together with the item's ``data``.

        Returns:
            The response object, or ``None`` when the request raised; the
            failure is logged and counted in ``error_urls``.
        """
        target = url_item.get('url')
        verb = url_item.get('method')
        payload = url_item.get('data')
        try:
            shared = dict(headers=self.get_headers(),
                          proxies=self.get_proxy(),
                          stream=True,
                          cookies=self.get_cookies(),
                          timeout=self.get_timeout())
            if verb == 'GET':
                return requests.get(target, **shared)
            return requests.post(target, data=payload, **shared)
        except Exception as e:
            logging.error('Download html error: {}'.format(e))
            logging.error('Error url info: {}'.format(url_item))
            self.error_urls += 1

    def get_response(self, url_item):
        """Download ``url_item`` and return the final response plus its redirects.

        Returns:
            List of ``{'url_item': ..., 'response': ...}`` dicts: the final
            response first, followed by one entry per item of its
            ``history``. Empty when the download yielded nothing truthy.
        """
        response = self.download(url_item)
        if not response:
            return []
        redirects = response.history
        resp_list = [{'url_item': url_item, 'response': response}]
        resp_list.extend({'url_item': url_item, 'response': hop}
                         for hop in redirects)
        self.history_num += len(redirects)
        return resp_list


class HtmlParser(object):
    """Extract candidate URLs from downloaded pages and decide which to crawl."""

    def __init__(self, params):
        self.params = params
        self.manager = UrlManager(self.params)
        # Set of all distinct URLs seen, including ones rejected by the later
        # filters; it is only used to drop duplicates, so its size is not the
        # number of URLs actually requested.
        self.all_urls = set()
        # When True, stop extracting URLs.
        self.stop_parse = False
        # Number of URLs that were filtered out.
        self.all_passed_urls = 0
        # Number of URLs that were extracted and accepted.
        self.all_abstract_urls = 0

    def filter_url(self, url_item):
        """Return True when ``url_item`` must NOT be crawled.

        Checks run in this order: total amount limit, duplicate, depth
        limit, allowed domains, excluded keywords. Exceeding the amount or
        depth limit also sets ``stop_parse``. A URL that passes the
        duplicate check is recorded in ``all_urls``.
        """
        url = url_item.get('url')
        depth = url_item.get('depth')
        amount = self.params.get('amount')
        # With an amount limit in use, stop extracting once it is exceeded.
        if amount > 0 and self.all_abstract_urls > amount:
            self.stop_parse = True
            logging.info('Current url amount {} > {}, stop parse urls.'.format(
                self.all_abstract_urls, self.params.get('amount')))
            return True

        # Drop duplicate URLs.
        if url in self.all_urls:
            return True
        else:
            self.all_urls.add(url)

        # Depth limit.
        if self.params.get('depth') > 0 and depth > self.params.get('depth'):
            self.stop_parse = True
            logging.info('Current url depth {} > {}, stop parse urls.'.format(
                depth, self.params.get('depth')))
            return True

        # Domain allow-list.
        domain = urlparse(url).netloc
        allowed_domains = self.params.get('allowed_domains')
        if allowed_domains and domain not in allowed_domains:
            return True

        # Excluded keywords; a missing setting means nothing is excluded.
        for keyword in self.params.get('exclude_keywords') or []:
            if keyword in url:
                return True

        return False

    def parse_form_data(self, tag):
        """Collect the ``<input>`` values of a form tag into a dict.

        ``text`` and ``password`` inputs map name -> value, a ``submit``
        input is stored under the key ``'submit'``, and every other named
        input is collected into a list of values per name. Inputs without a
        name (other than ``submit``) are skipped.
        """
        data = {}
        for field in tag.find_all('input'):
            name = field.get('name')
            field_type = field.get('type')
            value = field.get('value', '')
            if name and field_type in ['text', 'password']:
                data[name] = value
            elif field_type == 'submit':
                data['submit'] = value
            elif name:
                # Keep each value whole: list(value) would split the string
                # into single characters.
                existing = data.get(name)
                if existing is None:
                    data[name] = [value]
                elif isinstance(existing, list):
                    existing.append(value)
                else:
                    data[name] = [existing, value]
        return data

    def abstract_urls(self, response):
        """Return the accepted ``url_item`` dicts found in one response.

        Every tag is inspected: ``action`` for forms, ``src`` for scripts and
        ``href`` for all other tags. Rejected items are counted in
        ``all_passed_urls``.
        """
        url_items = []
        url_item = response.get('url_item')
        resp = response.get('response')
        soup = BeautifulSoup(resp.text, 'lxml')
        tags = soup.find_all(True)
        for tag in tags:
            if self.stop_parse:
                logging.info('Stop abstract urls.')
                break
            method, data = 'GET', None
            if tag.name == 'form':
                url = tag.get('action', '')
                # A form without a method attribute submits with GET, and
                # the attribute value may be written in any letter case.
                method = (tag.get('method') or 'GET').upper()
                data = self.parse_form_data(tag)
            elif tag.name == 'script':
                url = tag.get('src', '')
            else:
                url = tag.get('href', '')
            sub_url_item = self.manager.patch_url(
                url, method=method, data=data, parent_url_obj=url_item)
            if not self.filter_url(sub_url_item):
                url_items.append(sub_url_item)
            else:
                self.all_passed_urls += 1
        return url_items

    def parse(self, response):
        """Extract URL items from ``response`` unless parsing was stopped.

        The number of returned items is added to ``all_abstract_urls``.
        """
        url_items = []
        if response and not self.stop_parse:
            url_items = self.abstract_urls(response)
        self.all_abstract_urls += len(url_items)
        return url_items


class DataItem(object):
    """Turn downloaded responses into documents and store them in MongoDB."""

    def __init__(self, params):
        self.params = params
        db = connect_mongo(MONGODB)
        # Collection that receives one document per saved response.
        self.movie = db['my_crawler_urls']

    def handle_url(self, url):
        """Return ``url`` with an explicit port in its network location.

        A URL that already names a port is returned unchanged (apart from
        standard re-assembly); otherwise 443 is added for ``https`` and 80
        for every other scheme.
        """
        url_obj = urlparse(url)
        if url_obj.port is None:
            port = 443 if url_obj.scheme == 'https' else 80
            # rstrip(':') covers a netloc that ends in a bare colon.
            host = url_obj.netloc.rstrip(':')
            url_obj = url_obj._replace(netloc='{}:{}'.format(host, port))
        # geturl() re-joins the parts with the correct ';', '?' and '#'
        # separators.
        return url_obj.geturl()

    def handle_title(self, data):
        """Return the text of the page's ``<title>`` tag, or ``''``."""
        title = ''
        response = data.get('response')
        html = response.text
        if html:
            soup = BeautifulSoup(html, 'lxml')
            title_tag = soup.title
            title = title_tag.get_text() if title_tag else ''
        return title

    def handle_data(self, data):
        """Split a response into ``(request, resp)`` dicts for storage.

        ``request`` holds the sent headers, URL and method; ``resp`` holds
        the response headers, body, status code and the request URL with an
        explicit port.
        """
        response = data.get('response')
        request, resp = {}, {}
        request['headers'] = response.request.headers
        request['url'] = response.request.url
        request['method'] = response.request.method

        resp['headers'] = response.headers
        resp['content'] = response.content
        resp['status_code'] = response.status_code
        resp['url'] = self.handle_url(request['url'])
        return request, resp

    def save(self, response):
        """Insert one document describing ``response`` into the collection.

        Any failure is logged and swallowed so that a single bad page does
        not stop the crawl.
        """
        doc = {}
        try:
            request, resp = self.handle_data(response)
            title = self.handle_title(response)
            doc['request'] = request
            doc['response'] = resp
            doc['title'] = title
            doc['site'] = self.params.get('start_url')
            doc['end_type'] = self.params.get('end_type')
            # Timestamp in milliseconds.
            doc['time'] = int(time.time() * 1000)
            # Collection.insert() is deprecated and removed in PyMongo 4.
            self.movie.insert_one(doc)
        except Exception as e:
            logging.error('Save data to mongodb error: {}'.format(e))


if __name__ == '__main__':
    # Set up logging first so the whole crawl is recorded, then run one
    # crawl with the default parameters.
    init_root_logger_settings()
    Spider(PARAMS).crawl()


In [None]:
# -*- coding:utf-8 -*-
import re
import time
import logging
import logging.handlers
from fnmatch import fnmatch
import gevent
from gevent import monkey, pool
from gevent.queue import Queue
from urllib.parse import urlparse, urljoin
from html import unescape
import requests
from bs4 import BeautifulSoup
monkey.patch_socket()

from .config import PARAMS
from .config import get_header
from utils.connectmongo import ConnectMongo

class Spider(object):
    """Drive one crawl: schedule URLs, download them, extract links and save results.

    Work flows through two bounded gevent queues: ``urlQ`` holds URL items
    waiting to be downloaded and ``respQ`` holds downloaded responses waiting
    to be parsed and saved.
    """

    def __init__(self, params):
        """Create the crawl components.

        Args:
            params: Dict of overrides applied on top of ``PARAMS``.
        """
        assert isinstance(params, dict)
        self.init_spider_params(params)
        self.manager = UrlManager(self.params)
        self.downloader = HtmlDownloader(self.params)
        self.parser = HtmlParser(self.params)
        self.data_item = DataItem(self.params)
        self.urlQ = Queue(maxsize=self.params.get('queue_size'))
        self.respQ = Queue(maxsize=self.params.get('queue_size'))
        self.request_num = 0
        self.response_num = 0
        # Number of queue items that a task has taken and is still processing.
        self._in_flight = 0

    def init_spider_params(self, params):
        """Set ``self.params`` to the defaults overlaid with ``params``."""
        # Copy the defaults so that overrides never leak into the shared
        # PARAMS used by other spiders.
        self.params = dict(PARAMS)
        if params:
            self.params.update(params)

    def is_running(self):
        """Return True while the crawl loop should keep spawning tasks."""
        if not self.parser.stop_parse:
            return True
        # Parsing has stopped: only drain what is already queued.
        return not self.urlQ.empty() or not self.respQ.empty()

    def _stop_if_idle(self):
        """Set ``stop_parse`` once no work is queued or being processed."""
        # Empty queues alone are not enough: a download that is still in
        # progress will queue a response later.
        if self._in_flight == 0 and self.urlQ.empty() and self.respQ.empty():
            self.parser.stop_parse = True

    def init_task(self):
        """Queue the start URL as the first download task."""
        start_url = self.params.get('start_url')
        url_item = self.manager.patch_url(start_url)
        self.parser.all_urls.add(url_item.get('url'))
        self.urlQ.put_nowait(url_item)

    def consume_task(self):
        """Take one URL item, download it and queue the resulting responses."""
        from gevent.queue import Empty

        try:
            url_item = self.urlQ.get(timeout=self.params.get('queue_timeout'))
        except Empty:
            self._stop_if_idle()
            return
        self._in_flight += 1
        try:
            self.request_num += 1
            resp_list = self.downloader.get_response(url_item)
            for response in resp_list:
                self.respQ.put_nowait(response)
        except Exception as e:
            # Best effort: one failing item (e.g. a full queue) must not stop
            # the crawl, but it should be visible in the log.
            logging.warning('Consume task error: {!r}'.format(e))
        finally:
            self._in_flight -= 1

    def produce_task(self):
        """Take one response, queue the URLs extracted from it and save it."""
        from gevent.queue import Empty

        try:
            response = self.respQ.get(timeout=self.params.get('queue_timeout'))
        except Empty:
            self._stop_if_idle()
            return
        self._in_flight += 1
        try:
            self.response_num += 1
            url_items = self.parser.parse(response)
            for url_item in url_items:
                if not self.parser.stop_parse:
                    self.urlQ.put_nowait(url_item)
                else:
                    break
            self.data_item.save(response)
        except Exception as e:
            logging.warning('Produce task error: {!r}'.format(e))
        finally:
            self._in_flight -= 1

    def crawl(self):
        """Run the crawl until is_running() turns False, then log statistics."""
        tp = pool.Pool(50)
        logging.info('Spider start crawl.')
        logging.info('Init first task.')
        tp.add(gevent.spawn(self.init_task))
        while self.is_running():
            try:
                url_backlog = self.urlQ.qsize()
                resp_backlog = self.respQ.qsize()
                # Always one downloader and one parser; the longer queue
                # gets one extra worker.
                tasks = [self.consume_task, self.produce_task]
                if url_backlog > resp_backlog:
                    tasks.insert(0, self.consume_task)
                elif url_backlog < resp_backlog:
                    tasks.append(self.produce_task)
                for task in tasks:
                    tp.add(gevent.spawn(task))
            except Exception as e:
                logging.error('corutine error: {}'.format(e))
        tp.join()
        logging.info('Request num: {}.'.format(self.request_num))
        logging.info('Response num: {}.'.format(self.response_num))
        logging.info('History num: {}.'.format(self.downloader.history_num))
        logging.info('Abstract urls: {}.'.format(self.parser.all_abstract_urls))
        logging.info('Filters urls: {}.'.format(self.parser.all_passed_urls))
        logging.info('Error urls: {}.'.format(self.downloader.error_urls + self.data_item.error_urls))
        logging.info('All urls: {}.'.format(len(self.parser.all_urls)))
        logging.info('Spider closed.')


class UrlManager(object):
    """Normalize URLs and wrap them in the ``url_item`` dicts used by the crawl."""

    def __init__(self, params):
        """Keep the crawl parameters; ``end_type`` is copied into every item."""
        self.params = params

    def normal_url(self, url, base_url):
        """Return ``url`` as a cleaned URL resolved against ``base_url``.

        Surrounding whitespace is stripped, HTML entities are unescaped,
        URLs that do not start with ``http://`` or ``https://`` are joined
        onto ``base_url``, and a single trailing ``/`` is removed.
        """
        # A missing attribute value is treated like an empty reference.
        new_url = unescape((url or '').strip())
        if not re.match(r'https?://', new_url):
            new_url = urljoin(base_url, new_url)
        return new_url[:-1] if new_url.endswith('/') else new_url

    def patch_url(self, url, method='GET', data=None, parent_url_obj=None):
        """Build the ``url_item`` dict for ``url``.

        Args:
            url: Raw URL or reference as found in the page.
            method: HTTP method; ``None`` or an empty value means ``GET`` and
                the value is upper-cased.
            data: Form data to send with the request, if any.
            parent_url_obj: ``url_item`` of the page the URL was found on;
                ``None`` for the start URL.

        Returns:
            Dict with the keys ``base_url``, ``depth``, ``url``, ``domain``,
            ``end_type``, ``method`` and ``data``.
        """
        url_item = {}
        url_item['base_url'] = parent_url_obj.get(
            'url') if parent_url_obj else ''
        url_item['depth'] = parent_url_obj.get(
            'depth') + 1 if parent_url_obj else 1
        url_item['url'] = self.normal_url(url, url_item.get('base_url'))
        url_item['domain'] = urlparse(url_item.get('url')).netloc
        url_item['end_type'] = self.params.get('end_type')
        # Form tags may carry no method attribute or a lower-case one; the
        # downloader compares against the exact string 'GET'.
        url_item['method'] = (method or 'GET').upper()
        url_item['data'] = data

        return url_item


class HtmlDownloader(object):
    """Fetch pages with ``requests`` and keep simple download statistics."""

    def __init__(self, params):
        self.params = params
        # Number of downloads that raised an exception.
        self.error_urls = 0
        # Number of redirect (history) responses collected.
        self.history_num = 0

    def get_headers(self):
        """Return the request headers for the configured ``end_type``."""
        return get_header(self.params.get('end_type'))

    def get_proxy(self):
        """Return the ``proxy`` parameter (``None`` when not configured)."""
        return self.params.get('proxy')

    def get_cookies(self):
        """Return the ``cookies`` parameter."""
        return self.params.get('cookies')

    def get_timeout(self):
        """Return the ``timeout`` parameter."""
        return self.params.get('timeout')

    def _exceeds_size_limit(self, response):
        """Return True when the declared Content-Length reaches the limit.

        A missing Content-Length header counts as 0; a missing or
        non-positive ``content_length`` parameter means no limit.
        """
        limit = self.params.get('content_length')
        if not limit or limit <= 0:
            return False
        return int(response.headers.get('Content-Length', 0)) >= limit

    def download(self, url_item):
        """Request the URL described by ``url_item``.

        ``GET`` items are fetched with ``requests.get``; every other method
        is sent with ``requests.post`` together with the item's ``data``.

        Returns:
            The response object, or ``None`` when the request raised (logged
            and counted in ``error_urls``) or when the declared
            Content-Length reaches the ``content_length`` limit.
        """
        url, method, data = url_item.get('url'), url_item.get(
            'method'), url_item.get('data')
        try:
            if method == 'GET':
                response = requests.get(url, headers=self.get_headers(),
                                        proxies=self.get_proxy(), stream=True,
                                        cookies=self.get_cookies(),
                                        timeout=self.get_timeout())
            else:
                response = requests.post(url, headers=self.get_headers(),
                                         proxies=self.get_proxy(), data=data,
                                         stream=True,
                                         cookies=self.get_cookies(),
                                         timeout=self.get_timeout())
            # Skip downloads whose Content-Length is above the limit. The
            # request is streamed, so the body has not been fetched yet;
            # closing releases the connection without reading it.
            if self._exceeds_size_limit(response):
                logging.warning('Content length > 1M, url: {}'.format(url))
                response.close()
                return None
            return response
        except Exception as e:
            logging.error('Download html error: {}'.format(e))
            logging.error('Error url info: {}'.format(url_item))
            self.error_urls += 1

    def get_response(self, url_item):
        """Download ``url_item`` and return the final response plus its redirects.

        Returns:
            List of ``{'url_item': ..., 'response': ...}`` dicts: the final
            response first, followed by one entry per item of its
            ``history``. Empty when the download yielded nothing truthy.
        """
        resp_list = []
        response = self.download(url_item)
        # A requests Response is falsy for 4xx/5xx status codes, so error
        # pages are skipped here as well as failed downloads.
        if response:
            resp = {}
            resp['url_item'] = url_item
            resp['response'] = response
            resp_list.append(resp)
            for history_item in response.history:
                resp = {}
                resp['url_item'] = url_item
                resp['response'] = history_item
                resp_list.append(resp)
                self.history_num += 1
        return resp_list


class HtmlParser(object):
    """Extract candidate URLs from downloaded pages and decide which to crawl."""

    def __init__(self, params):
        self.params = params
        self.manager = UrlManager(self.params)
        # Set of distinct URLs accepted so far. De-duplication runs after the
        # other filters, so rejected URLs are not recorded here; the set is
        # only used to drop duplicates.
        self.all_urls = set()
        # When True, stop extracting URLs.
        self.stop_parse = False
        # Number of URLs that were filtered out.
        self.all_passed_urls = 0
        # Number of URLs that were extracted and accepted.
        self.all_abstract_urls = 0
        # URLs of all accepted items, in the order they were found.
        self.parse_url_list = []

    def filter_url(self, url_item):
        """Return True when ``url_item`` must NOT be crawled.

        Checks run in this order: total amount limit, depth limit, allowed
        domains, excluded keywords, duplicate. Exceeding the amount or depth
        limit also sets ``stop_parse``. A URL that passes every check is
        recorded in ``all_urls``.
        """
        url = url_item.get('url')
        depth = url_item.get('depth')
        amount = self.params.get('amount')
        # With an amount limit in use, stop extracting once it is exceeded.
        if amount > 0 and self.all_abstract_urls > amount:
            self.stop_parse = True
            logging.info('Current url amount {} > {}, stop parse urls.'.format(
                self.all_abstract_urls, self.params.get('amount')))
            return True

        # Depth limit.
        if self.params.get('depth') > 0 and depth > self.params.get('depth'):
            self.stop_parse = True
            logging.info('Current url depth {} > {}, stop parse urls.'.format(
                depth, self.params.get('depth')))
            return True

        # Domain allow-list with shell-style wildcards: the URL is kept when
        # its domain matches ANY pattern. Rejecting on the first pattern that
        # does not match would require a domain to match every pattern.
        domain = urlparse(url).netloc
        allowed_domains = self.params.get('allowed_domains')
        if allowed_domains and not any(
                fnmatch(domain, allowed_domain)
                for allowed_domain in allowed_domains):
            return True

        # Excluded keywords; a missing setting means nothing is excluded.
        for keyword in self.params.get('exclude_keywords') or []:
            if keyword in url:
                return True

        # Drop duplicate URLs.
        if url in self.all_urls:
            return True
        else:
            self.all_urls.add(url)

        return False

    def parse_form_data(self, tag):
        """Collect the ``<input>`` values of a form tag into a dict.

        ``text`` and ``password`` inputs map name -> value, a ``submit``
        input is stored under the key ``'submit'``, and every other named
        input is collected into a list of values per name. Inputs without a
        name (other than ``submit``) are skipped.
        """
        data = {}
        for field in tag.find_all('input'):
            name = field.get('name')
            field_type = field.get('type')
            value = field.get('value', '')
            if name and field_type in ['text', 'password']:
                data[name] = value
            elif field_type == 'submit':
                data['submit'] = value
            elif name:
                # Keep each value whole: list(value) would split the string
                # into single characters.
                existing = data.get(name)
                if existing is None:
                    data[name] = [value]
                elif isinstance(existing, list):
                    existing.append(value)
                else:
                    data[name] = [existing, value]
        return data

    def abstract_urls(self, response):
        """Return the accepted ``url_item`` dicts found in one response.

        Every tag is inspected: ``action`` for forms, ``src`` for scripts and
        ``href`` for all other tags. Accepted URLs are also appended to
        ``parse_url_list``; rejected items are counted in
        ``all_passed_urls``.
        """
        url_items = []
        url_item = response.get('url_item')
        resp = response.get('response')
        soup = BeautifulSoup(resp.content, 'lxml')
        tags = soup.find_all(True)
        for tag in tags:
            if self.stop_parse:
                logging.info('Stop abstract urls.')
                break
            method, data = 'GET', None
            if tag.name == 'form':
                url = tag.get('action', '')
                # A form without a method attribute submits with GET, and
                # the attribute value may be written in any letter case.
                method = (tag.get('method') or 'GET').upper()
                data = self.parse_form_data(tag)
            elif tag.name == 'script':
                url = tag.get('src', '')
            else:
                url = tag.get('href', '')
            sub_url_item = self.manager.patch_url(
                url, method=method, data=data, parent_url_obj=url_item)
            if not self.filter_url(sub_url_item):
                url_items.append(sub_url_item)
                self.parse_url_list.append(sub_url_item.get('url'))
            else:
                self.all_passed_urls += 1
        return url_items

    def parse(self, response):
        """Extract URL items from ``response`` unless parsing was stopped.

        The number of returned items is added to ``all_abstract_urls``.
        """
        url_items = []
        if response and not self.stop_parse:
            url_items = self.abstract_urls(response)
        self.all_abstract_urls += len(url_items)
        return url_items


class DataItem(object):
    """Turn downloaded responses into documents and store them in MongoDB."""

    def __init__(self, params):
        self.params = params
        db = ConnectMongo().db
        # Collection that receives one document per saved response.
        self.movie = db['crawler_urls']
        # Number of responses that failed to be saved.
        self.error_urls = 0
        # URLs already stored; used to drop duplicates once more right
        # before writing to the database.
        self.res_urls = set()
        # Number of responses skipped because their URL was already stored.
        self.duplicate_urls = 0

    def handle_url(self, url):
        """Return ``url`` with an explicit port in its network location.

        A URL that already names a port is returned unchanged (apart from
        standard re-assembly); otherwise 443 is added for ``https`` and 80
        for every other scheme.
        """
        url_obj = urlparse(url)
        if url_obj.port is None:
            port = 443 if url_obj.scheme == 'https' else 80
            # rstrip(':') covers a netloc that ends in a bare colon.
            host = url_obj.netloc.rstrip(':')
            url_obj = url_obj._replace(netloc='{}:{}'.format(host, port))
        # geturl() re-joins the parts with the correct ';', '?' and '#'
        # separators.
        return url_obj.geturl()

    def handle_title(self, data):
        """Return the text of the page's ``<title>`` tag, or ``''``."""
        title = ''
        response = data.get('response')
        html = response.content
        if html:
            soup = BeautifulSoup(html, 'lxml')
            title_tag = soup.title
            title = title_tag.get_text() if title_tag else ''
        return title

    def handle_data(self, data):
        """Split a response into ``(request, resp)`` dicts for storage.

        ``request`` holds the sent headers, URL and method; ``resp`` holds
        the response headers, body, status code and the request URL with an
        explicit port.

        Returns:
            ``(request, resp)``, or ``None`` when the normalized URL was
            already handled (counted in ``duplicate_urls``).
        """
        response = data.get('response')
        request, resp = {}, {}
        request['headers'] = response.request.headers
        request['url'] = response.request.url
        request['method'] = response.request.method

        resp['headers'] = response.headers
        resp['content'] = response.content
        resp['status_code'] = response.status_code
        url = self.handle_url(request['url'])
        resp['url'] = url
        if url not in self.res_urls:
            self.res_urls.add(url)
            return request, resp
        else:
            self.duplicate_urls += 1
            return None

    def save(self, response):
        """Insert one document describing ``response`` unless it is a duplicate.

        Any failure is logged, counted in ``error_urls`` and swallowed so
        that a single bad page does not stop the crawl.
        """
        doc = {}
        url_item = response.get('url_item')
        try:
            data = self.handle_data(response)
            if data:
                request, resp = data[0], data[1]
                title = self.handle_title(response)
                doc['request'] = request
                doc['response'] = resp
                doc['title'] = title
                doc['site'] = self.params.get('start_url')
                doc['end_type'] = self.params.get('end_type')
                # Timestamp in milliseconds.
                doc['time'] = int(time.time() * 1000)
                # Collection.insert() is deprecated and removed in PyMongo 4.
                self.movie.insert_one(doc)
        except Exception as e:
            logging.error('Save data to mongodb error: {}'.format(e))
            logging.error('Error url info: {}'.format(url_item))
            self.error_urls += 1


# if __name__ == '__main__':
#     for i in range(5):
#         spider = Spider(PARAMS)
#         spider.crawl()
#         time.sleep(5)
