In [1]:
import requests
import re
from lxml import html
import time

class MyCrawler:
    def __init__(self, filename):
        self.filename = filename
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4128.3 Safari/537.36'
        }

    def download(self, url: str):
        r = requests.get(url, headers=self.headers)
        return r.text

    def extract(self, content, pattern):
        res = re.findall(pattern, content)
        return res

    def save(self, info):
        with open(self.filename, 'a', encoding='utf-8') as f:
            for item in info:
                f.write(''.join(item) + '\n') 

    def crawl(self, url: str, pattern: str, headers=None):
        if headers:
            self.headers.update(headers)
        content = self.download(url)
        info = self.extract(content, pattern)
        self.save(info)

In [2]:
urls = [f'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={start_id}&type=T' for start_id in range(0, 200, 20)]
urls

['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T',
 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T']

In [3]:
%%time

import concurrent.futures
import requests

douban_crawler = MyCrawler('douban.txt')

def load_url(url):
    global douban_crawler
    return douban_crawler.download(url)

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    
    future_to_url = {executor.submit(load_url, url): url for url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))


'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52972 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52891 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54057 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52794 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54064 bytes
'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53494 bytes
Wall time: 1.02 s
