In [1]:
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time 
import random
import json
import os

In [58]:
# Define global config
# `headers` is the default header set that fetch() sends with every request.
headers = {
    'Host': 'm.weibo.cn',
    # Referer points at the mobile search page; its URL-encoded `q` value is
    # the Chinese phrase for "trade war".
    'Referer': 'https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E8%B4%B8%E6%98%93%E6%88%98',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
    # NOTE(review): the two headers below presumably make the request look like
    # an AJAX call issued by the mobile web app - confirm the API requires them.
    'X-Requested-With': 'XMLHttpRequest',
    'MWeibo-Pwa': '1'
}
# Directory that save_json() writes its '<title>.json' files into.
save_dir = './data'

In [59]:
# Request function, with retry
def fetch(url, headers=headers, max_retries=5, timeout=10):
    """GET ``url`` and return its decoded JSON body, retrying on failure.

    Args:
        url: Full URL to request.
        headers: HTTP headers to send; defaults to the module-level ``headers``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The parsed JSON payload, or ``None`` when every attempt failed
        (``is_res_valid`` treats ``None`` as an invalid response).
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            return response.json()
        except Exception as e:
            # Deliberately broad: network errors and non-JSON bodies are both
            # treated as "try again". The error is printed so that repeated
            # failures can be diagnosed from the cell output.
            print('Retry:', url, repr(e))
            # Back off before the next attempt, but do not waste time sleeping
            # once the retry budget is exhausted.
            if attempt < max_retries:
                time.sleep(random.randint(2, 4))

    return None
        

In [61]:
# Save JSON in data directory
def save_json(result, title):
    """Write ``result`` as pretty-printed UTF-8 JSON to ``<save_dir>/<title>.json``.

    Args:
        result: JSON-serializable object to store.
        title: File name without the ``.json`` extension.

    Raises:
        TypeError / ValueError: if ``result`` cannot be serialized by ``json.dumps``.
        OSError: if the directory or file cannot be created or written.
    """
    # Serialize first so that a serialization error cannot leave a truncated,
    # empty file behind (opening in 'wb' mode wipes any existing content).
    payload = json.dumps(result, ensure_ascii=False, indent=2).encode('utf-8')

    # Create the output directory on first use instead of failing with
    # FileNotFoundError on a fresh checkout.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    path = os.path.join(save_dir, '%s.json'%title)
    with open(path, 'wb') as f:
        f.write(payload)
        f.write('\r\n'.encode('utf-8'))
    print('save file at', path)

In [62]:
# Copy a JSON with some specific keys
def copy_dict(ori, keys):
    """Build a new dict containing only ``keys`` taken from ``ori``.

    Every requested key appears in the result; keys that ``ori`` does not
    have are mapped to ``None``. Values are copied by reference.
    """
    return {name: ori.get(name) for name in keys}

In [63]:
# Extract useful keys from JSON
def process_user(user):
    """Reduce a raw user record to the profile fields kept in the output.

    Fields missing from ``user`` are present in the result with value ``None``.
    """
    wanted_fields = [
        'id',
        'verified',
        'verified_type',
        'gender',
        'followers_count',
        'follow_count',
        'verified_type_ext',
        'screen_name',
    ]
    return copy_dict(user, wanted_fields)

In [64]:
# Extract useful keys from JSON
def process_comment(comment):
    """Reduce a raw comment record to its like count, timestamp, text and author.

    The author is stored under ``'user'`` in the trimmed form produced by
    ``process_user``.
    """
    summary = copy_dict(comment, ['like_counts', 'created_at', 'text'])
    summary.update(user=process_user(comment.get('user')))
    return summary

In [65]:
# Extract weibo information
def process_item(item):
    """Extract the fields of interest from one search-result card.

    Args:
        item: A card dict with ``card_type == 9`` that carries the post
            under its ``'mblog'`` key.

    Returns:
        A dict with ``id``, ``created_at``, ``attitudes_count``, ``text`` and
        ``user`` (the latter trimmed by ``process_user``).

    Raises:
        AssertionError: if ``item`` is not a ``card_type`` 9 card.
    """
    assert item.get('card_type') == 9, 'process_item expects a card_type 9 item'
    mblog = item.get('mblog')
    ret = copy_dict(mblog, ['id', 'created_at', 'attitudes_count'])

    # Prefer the full long-form content when it is actually available.
    # A `longText` entry that is null, not a dict, or lacks `longTextContent`
    # must not crash the crawl or wipe out the usable short text.
    text = mblog.get('text')
    long_text = mblog.get('longText')
    if isinstance(long_text, dict) and long_text.get('longTextContent') is not None:
        text = long_text['longTextContent']
    ret['text'] = text

    ret['user'] = process_user(mblog.get('user'))

    return ret

In [66]:
# Judge if response is valid
def is_res_valid(res):
    """Tell whether ``res`` is a usable API response.

    A response counts as valid when it is not ``None`` and its ``'ok'``
    field equals 1.
    """
    if res is None:
        return False
    return res.get('ok') == 1

In [69]:
# Fetch weibo list by query word IN SINGLE PAGE
def get_weibos(query, page=''):
    """Request a single page of search results for ``query``.

    The request URL is printed before it is fetched. Returns whatever
    ``fetch`` returns for that URL.
    """
    query_string = urlencode({
        'containerid': '100103type=1&q=%s'%query,
        'page_type': 'searchall',
        'page': page
    })
    url = 'https://m.weibo.cn/api/container/getIndex?' + query_string
    print(url)
    return fetch(url)

In [68]:
# Fetch comments of a weibo
def get_comments(id, start_page=1):
    """Collect the comments of one post, page by page.

    Args:
        id: Identifier of the post whose comments are requested.
        start_page: First comment page to request.

    Returns:
        A list of comment dicts as produced by ``process_comment``. Paging
        stops at the first invalid response or the first page without comments.
    """
    ret = []
    base_url = 'https://m.weibo.cn/api/comments/show?'
    page = start_page
    while True:
        url = base_url + urlencode({'id': id, 'page': page})

        response_json = fetch(url)
        if not is_res_valid(response_json):
            break

        # Either level of 'data' may be missing or null; treat that as
        # "no comments" instead of crashing on None.
        outer = response_json.get('data')
        data = outer.get('data') if isinstance(outer, dict) else None
        if not data:
            # An empty page means there is nothing more to read. Stopping here
            # also prevents an endless loop if the server keeps answering
            # ok == 1 with no comments.
            break

        ret.extend(process_comment(comment) for comment in data)

        page += 1
        # Random pause between pages to avoid hammering the server.
        time.sleep(random.randint(0, 3))

    return ret

In [77]:
# Fetch all weibos with query word, then form an output list
def collect_weibo(query, max_page=10, start_page=1):
    """Gather posts and their comments for ``query`` across result pages.

    Pages ``max(start_page, 1)`` through ``max_page`` are requested in order.
    Paging stops early at the first invalid response. Only ``card_type`` 9
    entries found inside a card's ``card_group`` are kept; each one gets its
    comments attached under ``'comments'``.

    Returns:
        The list of collected post dicts.
    """
    collected = []
    first_page = max(start_page, 1)

    for page in range(first_page, max_page + 1):
        payload = get_weibos(query, page)

        # Guard clause: give up on the first unusable page.
        if not is_res_valid(payload):
            print('Result invalid:')
            print('Query: %s, page: %d'%(query, page))
            break

        new_comments = 0
        for card in payload.get('data').get('cards', {}):
            if 'card_group' not in card:
                continue

            posts = (entry for entry in card['card_group'] if entry.get('card_type') == 9)
            for entry in posts:
                weibo = process_item(entry)
                weibo['comments'] = get_comments(weibo.get('id'))
                new_comments += len(weibo['comments'])
                collected.append(weibo)

        print('Fetch totally %d records. (%d new comments)'%(len(collected), new_comments))
        # Random pause between result pages.
        time.sleep(random.randint(1, 3))

    return collected

In [72]:
# For each query, output an JSON object
def run(querys, save=True, max_page=100, start_page=1):
    """Crawl each query and optionally save one JSON file per query.

    Args:
        querys: A single query string or an iterable of query strings.
        save: When true, write each result with ``save_json``; the file title
            is the query with spaces replaced by underscores.
        max_page: Last result page to request per query (previously fixed at 100).
        start_page: First result page to request per query (previously fixed at 1).
    """
    # Accept a bare string for convenience; iterating it directly would
    # otherwise crawl one query per character.
    if isinstance(querys, str):
        querys = [querys]

    for query in querys:
        print('---------------------------------------')
        print('Fetching "%s"'%query)
        result = collect_weibo(query, max_page, start_page=start_page)
        output = {
            'query': query,
            'result': result
        }

        if save:
            save_json(output, query.replace(' ', '_'))

In [78]:
# Search queries handed to run(). Commented-out entries are disabled and are
# kept so they can be re-enabled by uncommenting; only the uncommented entry
# is crawled.
keywords = [
#     'G20 重启经贸磋商',
#     '推迟 贸易代表莱特希泽 汽车贸易 谈判',
#     '美国进入紧急状态 电信设备 实体名单',
#     '国务院关税税则委员会 提高加征关税税率',
#     '美国贸易代表办公室 3000亿',
#     '美国对中国2000亿美元商品开始加征25%关税',
#     '商务部发言人 深表遗憾 不得不采取必要反制措施',
#     '美国贸易代表办公室宣布对华2000亿美元商品关税从10%提升到25%',
#     '开始对中国2000亿美元的输美商品加征25%关税',
#     '美国贸易代表团应邀访华 第八轮中美经贸高级别磋商',
#     '将推迟提高对华关税税率 并可能计划第二次中美首脑峰会',
    '中方牵头人刘鹤 新一轮中美经贸高级别磋商'
]

In [79]:
# Crawl every active keyword; with the default save=True each query's result
# is written to its own JSON file.
run(keywords)

---------------------------------------
Fetching "中方牵头人刘鹤 新一轮中美经贸高级别磋商"
https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E4%B8%AD%E6%96%B9%E7%89%B5%E5%A4%B4%E4%BA%BA%E5%88%98%E9%B9%A4+%E6%96%B0%E4%B8%80%E8%BD%AE%E4%B8%AD%E7%BE%8E%E7%BB%8F%E8%B4%B8%E9%AB%98%E7%BA%A7%E5%88%AB%E7%A3%8B%E5%95%86&page_type=searchall&page=1
Fetch totally 9 records. (5 new comments)
https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E4%B8%AD%E6%96%B9%E7%89%B5%E5%A4%B4%E4%BA%BA%E5%88%98%E9%B9%A4+%E6%96%B0%E4%B8%80%E8%BD%AE%E4%B8%AD%E7%BE%8E%E7%BB%8F%E8%B4%B8%E9%AB%98%E7%BA%A7%E5%88%AB%E7%A3%8B%E5%95%86&page_type=searchall&page=2
Fetch totally 12 records. (20 new comments)
https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E4%B8%AD%E6%96%B9%E7%89%B5%E5%A4%B4%E4%BA%BA%E5%88%98%E9%B9%A4+%E6%96%B0%E4%B8%80%E8%BD%AE%E4%B8%AD%E7%BE%8E%E7%BB%8F%E8%B4%B8%E9%AB%98%E7%BA%A7%E5%88%AB%E7%A3%8B%E5%95%86&page_type=searchall&page=3
Result invalid:
Query: 中方牵头人刘鹤 新一轮中美经贸高级别磋商, page: 3

In [49]:
# Test the run function with a single query string (run() wraps a bare str in a list)
query = '商务部发言人 深表遗憾 不得不采取必要反制措施'
run(query)


https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%95%86%E5%8A%A1%E9%83%A8%E5%8F%91%E8%A8%80%E4%BA%BA+%E6%B7%B1%E8%A1%A8%E9%81%97%E6%86%BE+%E4%B8%8D%E5%BE%97%E4%B8%8D%E9%87%87%E5%8F%96%E5%BF%85%E8%A6%81%E5%8F%8D%E5%88%B6%E6%8E%AA%E6%96%BD&page_type=searchall&page=1
Fetch 9 records and 5 comments
https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%95%86%E5%8A%A1%E9%83%A8%E5%8F%91%E8%A8%80%E4%BA%BA+%E6%B7%B1%E8%A1%A8%E9%81%97%E6%86%BE+%E4%B8%8D%E5%BE%97%E4%B8%8D%E9%87%87%E5%8F%96%E5%BF%85%E8%A6%81%E5%8F%8D%E5%88%B6%E6%8E%AA%E6%96%BD&page_type=searchall&page=2
Fetch 17 records and 6 comments
https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%95%86%E5%8A%A1%E9%83%A8%E5%8F%91%E8%A8%80%E4%BA%BA+%E6%B7%B1%E8%A1%A8%E9%81%97%E6%86%BE+%E4%B8%8D%E5%BE%97%E4%B8%8D%E9%87%87%E5%8F%96%E5%BF%85%E8%A6%81%E5%8F%8D%E5%88%B6%E6%8E%AA%E6%96%BD&page_type=searchall&page=3
Result invalid:
Query: 商务部发言人 深表遗憾 不得不采取必要反制措施, page: 3