In [1]:
import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode #用于在网址链接中加入参数
import json
from bs4 import BeautifulSoup
import re
import os
from hashlib import md5

In [20]:
def get_page_index(offset, keyword, timeout=10):
    '''
    Desc:
        Fetch one page of search results. The search endpoint is paged
        through the ``offset`` query parameter (Ajax-style loading), so each
        call returns the raw response body for a single offset.
    param:
        offset -- paging offset sent to the search endpoint
        keyword -- search keyword
        timeout -- seconds to wait for the server before giving up (default 10)
    return:
        res.text -- the response body (decoded as UTF-8) when the status code
                    is 200; None for any other status or when the request fails
    '''
    url_param = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(url_param)
    try:
        # requests never times out by default, so a stalled connection would
        # block the notebook forever. A timeout is raised as a
        # RequestException subclass and is handled by the except clause below.
        res = requests.get(url, timeout=timeout)
        res.encoding = 'utf-8'
        # status_code is an int
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        print("请求索引页面出错")
        return None

In [21]:
def parse_page_index(html):
    '''
    Desc:
        Parse the JSON body of one search-result page and yield the article
        links it contains.
    param:
        html -- response text of one search-result page; may be None or
                empty when the request failed
    return:
        generator of item['article_url'] for every entry of the top-level
        'data' list that has a non-None 'article_url'. Nothing is yielded
        when the body is missing, is not valid JSON, or has no usable
        'data' list.
    '''
    # The fetch helper returns None on failure; json.loads(None) would raise.
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass (e.g. an HTML error page).
        print("解析索引页面出错")
        return
    if not isinstance(data, dict):
        return
    items = data.get('data')
    # The 'data' key can be present but null, which is not iterable.
    if not isinstance(items, list):
        return
    for item in items:
        if isinstance(item, dict) and item.get('article_url') is not None:
            yield item['article_url']

In [22]:
def get_page_detail(url, timeout=10):
    '''
    Desc:
        Fetch the content of one gallery detail page.
    param:
        url -- link to the gallery detail page
        timeout -- seconds to wait for the server before giving up (default 10)
    return:
        res.text -- the response body (decoded as UTF-8) when the status code
                    is 200; None for any other status or when the request fails
    '''
    try:
        headers = {
            'user-agent': 'Mozilla/5.0'
        }
        # requests never times out by default; a timeout is raised as a
        # RequestException subclass and is handled by the except clause below.
        res = requests.get(url, headers=headers, timeout=timeout)
        res.encoding = 'utf-8'
        # status_code is an int
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        print("请求详情页面出错")
        return None

In [26]:
def download_img(url, timeout=10):
    '''
    Desc:
        Download one image and store it in the current working directory.
        The file is named after the MD5 hex digest of the image bytes with a
        'jpg' extension, so identical images are written only once.
    param:
        url -- URL of a single image
        timeout -- seconds to wait for the server before giving up (default 10)
    return:
        None in every case; request failures are reported with a printed message
    '''
    try:
        headers = {
            'user-agent': 'Mozilla/5.0'
        }
        # A timeout is raised as a RequestException subclass and is handled
        # by the except clause below.
        res = requests.get(url, headers=headers, timeout=timeout)
        # status_code is an int
        if res.status_code == 200:
            # Image data is binary, so use the raw bytes (content), not text.
            content = res.content
            # os.path.join uses the platform separator instead of a hard-coded
            # '/', which produced mixed separators on Windows.
            file_name = os.path.join(
                os.getcwd(),
                '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
            )
            print("正在下载", url, " ", file_name)  # report the URL being saved
            if not os.path.exists(file_name):
                # The with-block closes the file; no explicit close() is needed.
                with open(file_name, 'wb') as f:
                    f.write(content)
        return None
    except RequestException:
        print("下载图片出错")
        return None

In [27]:
def _decode_gallery(raw):
    # Turn the text captured from JSON.parse("...") into a Python object, or None.
    try:
        # The capture is the body of a string literal whose value is JSON
        # text: undo the string-literal escaping first, then parse the JSON.
        # This keeps escaped quotes and \uXXXX sequences intact.
        return json.loads(json.loads('"' + raw + '"'))
    except ValueError:
        pass
    try:
        # Fallback: the earlier heuristic of dropping every backslash.
        return json.loads(raw.replace("\\", ""))
    except ValueError:
        return None


def parse_page_detail(html, url):
    '''
    Desc:
        Parse a gallery detail page: read the page title, extract the image
        URLs from the embedded ``gallery: JSON.parse("...")`` payload, and
        download every image via download_img.
    param:
        html -- HTML content of the gallery detail page
        url -- link to the gallery detail page
    return:
        a dict --
        {
            'title': title,   # title of the detail page ('' if it has none)
            'images': images, # image URLs found in the gallery
            'url': url        # link to the detail page
        }
        or None when the page has no gallery payload, the payload cannot be
        decoded, or it has no 'sub_images' key.
    '''
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # Indexing [0] directly would raise IndexError on a page without <title>.
    title = title_tags[0].text if title_tags else ''
    # Pull the JSON payload out of the inline script with a regex.
    img_pattern = re.compile('gallery: JSON\\.parse\\("(.*?)"\\),', re.S)
    result = re.search(img_pattern, html)
    if result is None:
        return None
    data = _decode_gallery(result.group(1))
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data.get('sub_images') or []
    # Skip entries without a URL so download_img is never called with None.
    images = [
        item.get('url') for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        download_img(image)
    return {
        'title': title,    # title of the detail page
        'images': images,  # image URLs of the gallery
        'url': url         # link to the detail page
    }

In [7]:
# Scratch check: run the gallery extraction by hand against one known detail page.
html = get_page_detail('https://www.toutiao.com/a6578799143752303112/')
# Default to an empty list so `images` is always defined, even when the
# fetch or the extraction below fails.
images = []
if html:
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    title = title_tags[0].text if title_tags else ''
    img_pattern = re.compile('gallery: JSON\\.parse\\("(.*?)"\\),', re.S)
    result = re.search(img_pattern, html)

    # re.search returns None when the page has no gallery payload.
    if result is not None:
        data = (result.group(1))
        # JSON needs double-quoted keys and values, and the captured text is
        # escaped inside a string literal, so the extra backslashes are removed.
        data = data.replace("\\","")
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]

In [8]:
# Display the image URLs collected from the detail page in the previous cell.
images

['http://p3.pstatp.com/origin/pgc-image/15317459528784e9ce3883e',
 'http://p9.pstatp.com/origin/pgc-image/15317459529033cc9d86194',
 'http://p9.pstatp.com/origin/pgc-image/153174595225483b078b96b',
 'http://p9.pstatp.com/origin/pgc-image/1531745953031dbe187e064',
 'http://p3.pstatp.com/origin/pgc-image/1531745953035b987f1502b']

In [28]:
# Crawl the first page of search results for the keyword and process every
# gallery detail page it links to.
index_html = get_page_index(0, '街拍')
# get_page_index returns None on failure; passing that on would crash in
# json.loads, so stop early instead.
if index_html:
    for url in parse_page_index(index_html):
        print(url)
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
else:
    print("未获取到索引页面内容")

http://toutiao.com/group/6579007595988648456/
正在下载 http://p3.pstatp.com/origin/pgc-image/1531794467973bbf54fa82d   E:\GitWorkstation\python-spider\toutiao_jiepai/c2fa42fcead98af5659e251feb4d731e.jpg
正在下载 http://p9.pstatp.com/origin/pgc-image/1531794476523de14f10abf   E:\GitWorkstation\python-spider\toutiao_jiepai/be1c6be7aa207bc95a6f3826280a0b27.jpg
正在下载 http://p3.pstatp.com/origin/pgc-image/1531794484584af262f3d67   E:\GitWorkstation\python-spider\toutiao_jiepai/db1667e1c8022324d41e03c68e5172f7.jpg
正在下载 http://p1.pstatp.com/origin/pgc-image/1531794491810a2b642d8c0   E:\GitWorkstation\python-spider\toutiao_jiepai/20079342511f64e179acf7c9ded5b2d2.jpg
正在下载 http://p3.pstatp.com/origin/pgc-image/153179450091015c515cf58   E:\GitWorkstation\python-spider\toutiao_jiepai/5d3904f145cb6a66c3d6e5d1c8702e79.jpg
http://toutiao.com/group/6578797336615453197/
正在下载 http://p3.pstatp.com/origin/pgc-image/15317453888010e6e32030c   E:\GitWorkstation\python-spider\toutiao_jiepai/c8e7f54ef8fdba5f5488977a743

正在下载 http://p9.pstatp.com/origin/pgc-image/15317474251436411ea4660   E:\GitWorkstation\python-spider\toutiao_jiepai/5d4e58b9d23b7bc85dc3368f7818cb25.jpg
正在下载 http://p3.pstatp.com/origin/pgc-image/1531747471802a6b588a36c   E:\GitWorkstation\python-spider\toutiao_jiepai/bad543ab8b1bb740b4165c6eaa76a06d.jpg
正在下载 http://p1.pstatp.com/origin/pgc-image/1531747572946c3b28429f0   E:\GitWorkstation\python-spider\toutiao_jiepai/7e1820b6737c699736bf096765938250.jpg
http://toutiao.com/group/6578808536753504782/
正在下载 http://p1.pstatp.com/origin/pgc-image/15317482076131a1d9ee109   E:\GitWorkstation\python-spider\toutiao_jiepai/b0f9aea64c8f9551a667ef84e360f804.jpg
正在下载 http://p3.pstatp.com/origin/pgc-image/153174820759530cd3ea8e2   E:\GitWorkstation\python-spider\toutiao_jiepai/8cbc8652d48dfaaed0e6648687c9bf58.jpg
正在下载 http://p3.pstatp.com/origin/pgc-image/1531748207659d5f19be4b2   E:\GitWorkstation\python-spider\toutiao_jiepai/43a46b471d9a017f74b148c0205d9315.jpg
正在下载 http://p3.pstatp.com/origin/pgc