In [4]:
import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode #用于在网址链接中加入参数
import json
from bs4 import BeautifulSoup
import re

In [5]:
def get_page_index(offset, keyword):
    '''
    Desc:
        抓取通过offset实现Ajax异步加载的URL，返回当前offset的内容
    param:
        offset -- 为了实现Ajax异步加载，使用offset实现动态分页
        keyword -- 搜索用的关键词
    return:
        res.text -- 当前offset的requests.text内容
    '''
    url_param ={
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(url_param)
    try:
        res = requests.get(url)
        res.encoding = 'utf-8'
        #返回的状态码是整型
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        print("请求索引页面出错")
        return None

In [6]:
def parse_page_index(html):
    '''
    Desc:
        解析当前offset的requests.text内容
    param:
        html -- 当前offset的res.text内容
    return:
        item.get('article_url') -- 组图中的文章链接
    '''
    data = json.loads(html) #将网页的js数据格式转换为json
    if data and 'data' in data.keys():
        for item in data.get('data'):
            if item.get('article_url') != None:
                yield item.get('article_url')

In [7]:
def get_page_detail(url):
    '''
    Desc:
        获取每一个组图详情页连接的内容
    param:
        url -- 组图详情页的链接
    return:
        res.text -- 每一个组图详情页链接的res.text内容
    '''
    try:
        headers = {
            'user-agent':'Mozilla/5.0'
        }
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        #返回的状态码是整型
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        print("请求详情页面出错")
        return None

In [21]:
def parse_page_detail(html, url):
    html = get_page_detail('https://www.toutiao.com/a6578799143752303112/')
    soup = BeautifulSoup(html, 'lxml')
    title = soup.select('title')[0].text
    img_pattern = re.compile('gallery: JSON\\.parse\\("(.*?)"\\),', re.S)
    result = re.search(img_pattern, html)
    if result != None:
        data = (result.group(1))
        #由于json要求键值都必须时双引号的字符串，而且这里做了反爬虫处理，因此，我们要删除多余的转义字符\\
        data = data.replace("\\","") 
        #将去除多余的转义字符的字符串转换为json格式的字典存储
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            return {
                'title': title, #街拍详情页的标题
                'images':images, #街拍详情页的组图图片
                'url':url #街拍详情页的网页链接
            }

In [22]:
html = get_page_index(0, '街拍')
for url in parse_page_index(html):
    html = get_page_detail(url)
    if html:
        result = parse_page_detail(html, url)
        print(result)
    
    

{'title': '路人街拍，夏天就要穿黑色，让你又美又干练！', 'images': ['http://p3.pstatp.com/origin/pgc-image/15317459528784e9ce3883e', 'http://p9.pstatp.com/origin/pgc-image/15317459529033cc9d86194', 'http://p9.pstatp.com/origin/pgc-image/153174595225483b078b96b', 'http://p9.pstatp.com/origin/pgc-image/1531745953031dbe187e064', 'http://p3.pstatp.com/origin/pgc-image/1531745953035b987f1502b'], 'url': 'http://toutiao.com/group/6578797336615453197/'}
请求详情页面出错
{'title': '路人街拍，夏天就要穿黑色，让你又美又干练！', 'images': ['http://p3.pstatp.com/origin/pgc-image/15317459528784e9ce3883e', 'http://p9.pstatp.com/origin/pgc-image/15317459529033cc9d86194', 'http://p9.pstatp.com/origin/pgc-image/153174595225483b078b96b', 'http://p9.pstatp.com/origin/pgc-image/1531745953031dbe187e064', 'http://p3.pstatp.com/origin/pgc-image/1531745953035b987f1502b'], 'url': 'http://toutiao.com/group/6578806473885745678/'}
{'title': '路人街拍，夏天就要穿黑色，让你又美又干练！', 'images': ['http://p3.pstatp.com/origin/pgc-image/15317459528784e9ce3883e', 'http://p9.pstatp.com/or

In [16]:
html = get_page_detail('https://www.toutiao.com/a6578799143752303112/')
soup = BeautifulSoup(html, 'lxml')
title = soup.select('title')[0].text
img_pattern = re.compile('gallery: JSON\\.parse\\("(.*?)"\\),', re.S)
result = re.search(img_pattern, html)

data = (result.group(1))
#由于json要求键值都必须时双引号的字符串，而且这里做了反爬虫处理，因此，我们要删除多余的转义字符\\
data = data.replace("\\","") 
data = json.loads(data)
if data and 'sub_images' in data.keys():
    sub_images = data.get('sub_images')
    images = [item.get('url') for item in sub_images]

In [17]:
images

['http://p3.pstatp.com/origin/pgc-image/15317459528784e9ce3883e',
 'http://p9.pstatp.com/origin/pgc-image/15317459529033cc9d86194',
 'http://p9.pstatp.com/origin/pgc-image/153174595225483b078b96b',
 'http://p9.pstatp.com/origin/pgc-image/1531745953031dbe187e064',
 'http://p3.pstatp.com/origin/pgc-image/1531745953035b987f1502b']