In [27]:
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time 
import random
import json
import os

In [29]:
# Directory (relative to the working directory) where result files are written.
save_dir = './data'

# Endpoint that every search request URL is built on; callers append an
# urlencoded query string directly after the trailing '?'.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# HTTP headers sent with every request made through ``fetch``.
headers = {
    'Host': 'm.weibo.cn',
    # URL of an m.weibo.cn search page, sent as the referring page.
    'Referer': 'https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E8%B4%B8%E6%98%93%E6%88%98',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
    'X-Requested-With': 'XMLHttpRequest',
    'MWeibo-Pwa': '1',
}

In [104]:
def fetch(url, headers=headers, timeout=10):
    """Issue a GET request for ``url`` and return its decoded JSON body.

    Args:
        url: Fully-built request URL.
        headers: HTTP headers to send; defaults to the module-level ``headers``.
        timeout: Seconds to wait on the server before giving up, so a stalled
            connection cannot block the crawl forever.

    Returns:
        The parsed JSON payload, or ``None`` when the request fails, the
        server answers with an HTTP error status, or the body is not JSON.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures rather than parsing their bodies.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # Best-effort: report the problem and let callers see ``None``,
        # which ``is_res_valid`` treats as an invalid response.
        print('Error:', e)
        return None

In [111]:
def save_json(result, title):
    """Write ``result`` as UTF-8 JSON to ``<save_dir>/<title>.json``.

    Args:
        result: Any JSON-serializable object.
        title: File name without the ``.json`` extension.

    The file content is the JSON text (non-ASCII characters kept as-is)
    followed by a ``\\r\\n`` terminator. The final path is printed.
    """
    # Serialize first so a non-serializable ``result`` raises before the
    # target file is created or truncated.
    payload = (json.dumps(result, ensure_ascii=False) + '\r\n').encode('utf-8')

    # Create the output directory on first use instead of failing in open().
    os.makedirs(save_dir, exist_ok=True)
    path = os.path.join(save_dir, '%s.json'%title)
    with open(path, 'wb') as f:
        f.write(payload)
    print('save file at', path)

In [44]:
def copy_dict(ori, keys):
    """Return a new dict holding only ``keys`` taken from ``ori``.

    Keys absent from ``ori`` are still present in the result, mapped to
    ``None``. The result follows the order of ``keys``.
    """
    return {name: ori.get(name) for name in keys}

In [101]:
def process_user(user):
    """Reduce a user record to the fields kept in the saved output.

    Args:
        user: User dict as found in an API payload, or ``None`` when the
            enclosing record carries no user.

    Returns:
        A dict with the selected fields (missing ones mapped to ``None``),
        or ``None`` when ``user`` itself is ``None``.
    """
    # Callers pass ``record.get('user')``, which is None when the key is absent.
    if user is None:
        return None
    return copy_dict(user, [
        'id',
        'verified',
        'verified_type',
        'gender',
        'followers_count',
        'follow_count',
        'verified_type_ext',
        'screen_name',
    ])

In [113]:
def process_comment(comment):
    """Reduce a comment record to its like count, timestamp, text and author.

    The author is trimmed with ``process_user`` and stored under ``'user'``.
    """
    return {
        **copy_dict(comment, ['like_counts', 'created_at', 'text']),
        'user': process_user(comment.get('user')),
    }

In [46]:
def process_item(item):
    """Reduce a search-result card of type 9 to the fields that are saved.

    Returns a dict with the post's ``id``, ``created_at``, ``text`` and a
    trimmed ``user``. When the post carries a ``longText`` entry, its
    ``longTextContent`` is used as the text instead of ``text``.
    """
    assert item.get('card_type') == 9

    mblog = item.get('mblog')
    record = copy_dict(mblog, ['id', 'created_at'])

    # Prefer the long-text content over the regular text whenever it exists.
    if 'longText' in mblog:
        record['text'] = mblog['longText'].get('longTextContent')
    else:
        record['text'] = mblog.get('text')

    record['user'] = process_user(mblog.get('user'))
    return record

In [32]:
def is_res_valid(res):
    """Tell whether ``res`` is a response whose ``'ok'`` field equals 1.

    ``None`` (what ``fetch`` yields on failure) is never valid.
    """
    if res is None:
        return False
    return res.get('ok') == 1

In [50]:
def get_weibos(query, page=''):
    """Fetch one page of search results for ``query``.

    Builds the request URL from ``base_url`` plus the urlencoded search
    parameters, prints it, and returns whatever ``fetch`` returns for it.
    """
    container_id = '100103type=1&q=%s'%query
    query_string = urlencode({
        'containerid': container_id,
        'page_type': 'searchall',
        'page': page,
    })
    url = base_url + query_string
    print(url)
    return fetch(url)

In [108]:
def get_comments(id, start_page=1, max_page=None):
    """Collect the comments of one post, page by page.

    Args:
        id: Identifier of the post, sent as the ``id`` query parameter.
        start_page: First comment page to request.
        max_page: Last page to request (inclusive); ``None`` means keep
            going until the server stops returning comments.

    Returns:
        A list of comment dicts as produced by ``process_comment``.

    Paging stops at the first invalid response, or at the first valid
    response that carries no comments.
    """
    ret = []
    # Local endpoint; intentionally distinct from the module-level search URL.
    base_url = 'https://m.weibo.cn/api/comments/show?'
    page = start_page
    while max_page is None or page <= max_page:
        url = base_url + urlencode({'id': id, 'page': page})

        response_json = fetch(url)
        if not is_res_valid(response_json):
            break

        payload = response_json.get('data')
        data = payload.get('data') if isinstance(payload, dict) else None
        # A valid response without comments would otherwise either crash the
        # loop below (None) or keep the crawl paging forever (empty list).
        if not data:
            break

        ret.extend(process_comment(comment) for comment in data)

        page += 1
        # Random pause of up to two seconds between page requests.
        time.sleep(random.randint(0, 2000)/1000)

    return ret

In [96]:
def collect_weibo(query, max_page=10, save=False, start_page=1):
    """Crawl search results for ``query`` together with their comments.

    Args:
        query: Search keyword.
        max_page: Last result page to request (inclusive).
        save: When true, write the collected records with ``save_json``.
        start_page: First result page to request; values below 1 become 1.

    Returns:
        A list of post dicts (see ``process_item``), each extended with a
        ``'comments'`` list from ``get_comments``.

    Crawling stops at the first page whose response is invalid.
    """
    ret = []
    start_page = max(start_page, 1)
    if save:
        # makedirs also creates a missing ``save_dir`` and tolerates reruns.
        # NOTE(review): save_json writes into ``save_dir`` itself, not into
        # this per-query directory -- confirm the intended layout.
        os.makedirs(os.path.join(save_dir, query), exist_ok=True)

    # Last page that returned a valid response; used in the saved file name.
    # Kept separate from the loop variable, which is unbound when the page
    # range is empty.
    last_page = start_page
    for page in range(start_page, max_page+1):
        response_json = get_weibos(query, page)

        if not is_res_valid(response_json):
            print('Result invalid:')
            print('Query: %s, page: %d'%(query, page))
            break

        last_page = page
        data = response_json.get('data') or {}
        for card in data.get('cards') or []:
            for item in card.get('card_group') or []:
                # Only cards of type 9 are handled by process_item.
                if item.get('card_type') != 9:
                    continue

                weibo = process_item(item)
                weibo['comments'] = get_comments(weibo.get('id'))
                ret.append(weibo)

        # Pause one to three seconds between result pages.
        time.sleep(random.randint(1, 3))

    print('Fetch %d records'%len(ret))
    if save:
        save_json(ret, '%s(%d-%d)'%(query, start_page, last_page))
    return ret

In [114]:
# Crawl the first result page for the keyword and save the records to disk.
res = collect_weibo('中美', max_page=1, save=True, start_page=1)


https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E4%B8%AD%E7%BE%8E&page_type=searchall&page=1
Fetch 13 records
save file at ./data\中美(1-1).json
