In [3]:
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time 
import random
import json

In [4]:
# Endpoint of the mobile-site container API; query parameters are appended
# directly after the trailing '?'.
base_url = "https://m.weibo.cn/api/container/getIndex?"

# Default request headers sent with every call made through fetch().
# NOTE(review): these look like they imitate the mobile web client's XHR
# requests (Safari user agent, PWA flag) -- confirm they are still required.
headers = {
    "Host": "m.weibo.cn",
    # Long values are split across adjacent literals purely for readability.
    "Referer": (
        "https://m.weibo.cn/search"
        "?containerid=100103type%3D1%26q%3D%E8%B4%B8%E6%98%93%E6%88%98"
    ),
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/12.0.2 Safari/605.1.15"
    ),
    "X-Requested-With": "XMLHttpRequest",
    "MWeibo-Pwa": "1",
}

In [13]:
def fetch(url, headers=headers, timeout=10):
    """GET `url` and return its decoded JSON body, or None on any failure.

    Args:
        url: Fully built request URL.
        headers: HTTP headers to send; defaults to the notebook-level
            `headers` dict.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200 and a
        valid JSON body; otherwise None. Every failure path prints a short
        diagnostic so that a None result is never silent.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Network-level problems: DNS, refused connection, timeout, bad URL...
        print('Request to %s failed: %s' % (url, e))
        return None

    if response.status_code != 200:
        print('Request to %s returned HTTP %d' % (url, response.status_code))
        return None

    try:
        return response.json()
    except ValueError as e:
        # The body was not valid JSON (e.g. an HTML error or login page).
        print('Response from %s is not valid JSON: %s' % (url, e))
        return None

In [12]:
def query_page(query, page=''):
    """Request one page of search results for `query`.

    Builds the search URL from the notebook-level `base_url`, prints it, and
    returns whatever `fetch` returns for that URL.

    Args:
        query: Search keyword, interpolated into the `containerid` parameter.
        page: Page value sent as the `page` parameter; defaults to an empty
            string.
    """
    search_params = dict(
        containerid='100103type=1&q=%s' % query,
        page_type='searchall',
        page=page,
    )
    # urlencode percent-escapes the '=' and '&' embedded in containerid.
    request_url = ''.join([base_url, urlencode(search_params)])
    print(request_url)
    return fetch(request_url)

In [6]:
def save_json(result, title):
    """Append `result` as one JSON line to ./data/<title>.json.

    The target directory is created on demand, so the function also works on
    a fresh checkout where ./data does not exist yet.

    Args:
        result: Any JSON-serialisable object.
        title: File name without extension, relative to ./data.
    """
    import os  # local import keeps this block self-contained

    path = './data/%s.json'%title
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Binary append with explicit UTF-8 keeps non-ASCII text readable in the
    # file (ensure_ascii=False) regardless of the platform's default encoding.
    with open(path, 'ab') as f:
        f.write(json.dumps(result, ensure_ascii=False).encode('utf-8'))
        f.write('\r\n'.encode('utf-8'))

In [7]:
def process_item(item):
    """Reduce a post card (card_type 9) to the fields this notebook keeps.

    Args:
        item: A card dict whose 'card_type' is 9 and which carries the post
            under the 'mblog' key.

    Returns:
        A dict with the keys 'id', 'created_at', 'text' and 'user'. 'text' is
        the long-form content when the post provides a non-empty
        longText.longTextContent, otherwise the regular 'text' field.

    Raises:
        AssertionError: If the item's card_type is not 9.
    """
    assert item.get('card_type') == 9
    mblog = item.get('mblog')

    text = mblog.get('text')
    # 'longText' may be present but null or missing its content; only prefer
    # it when it really carries text, otherwise keep the regular text instead
    # of crashing or overwriting it with None.
    long_text = mblog.get('longText')
    if isinstance(long_text, dict) and long_text.get('longTextContent'):
        text = long_text['longTextContent']

    return {
        # NOTE(review): the original author was unsure whether 'mid' should
        # be used here instead of 'id' -- confirm against the API.
        'id': mblog.get('id'),
        'created_at': mblog.get('created_at'),
        'text': text,
        'user': mblog.get('user'),
    }

In [14]:
def get_comments(id, page=1):
    """Fetch one page of comments for the post identified by `id`.

    Args:
        id: Post identifier sent as the `id` query parameter.
        page: Comment page sent as the `page` parameter; defaults to 1.

    Returns:
        Whatever `fetch` returns for the comments URL.
    """
    # Named differently from the notebook-level `base_url` to avoid shadowing.
    comments_endpoint = 'https://m.weibo.cn/api/comments/show?'
    query_string = urlencode([('id', id), ('page', page)])
    return fetch(comments_endpoint + query_string)

In [8]:
def collect_weibo(query, max_page=10, save=False):
    """Search for `query` and collect the posts found on pages 1..max_page.

    Args:
        query: Search keyword passed to `query_page`.
        max_page: Last page to request (inclusive); pages start at 1.
        save: When true, each raw page response is appended to
            ./data/<query>-<page>.json via `save_json`.

    Returns:
        A list of dicts produced by `process_item`, one per card of
        card_type 9 found inside any card's 'card_group'.

    Side effects:
        Clears the notebook-level `all_items` and `invalid_items` lists,
        prints progress, and sleeps one second after every page.
    """
    ret = []
    all_items.clear()
    invalid_items.clear()
    for page in range(1, max_page + 1):
        response_json = query_page(query, page)
        if response_json is None:
            # fetch() yields None on network/HTTP/JSON failure; skip the page
            # instead of crashing on `.get` or saving a useless "null" file.
            print('No response for page %d; skipping' % page)
        else:
            if save:
                save_json(response_json, '%s-%d' % (query, page))
            if response_json.get('ok') == 1 and response_json.get('data') is not None:
                # `or []` also covers a 'cards' key that is present but null.
                cards = response_json.get('data').get('cards') or []
                for card in cards:
                    if 'card_group' not in card:
                        continue
                    for item in card['card_group']:
                        # Only card_type 9 entries are handled by process_item.
                        if item.get('card_type') != 9:
                            continue
                        ret.append(process_item(item))

        # Pause after every page -- including failed ones -- so a struggling
        # or rate-limiting server is not hit again immediately.
        time.sleep(1)

    print('Fetch %d records'%len(ret))
    return ret

In [10]:
# Notebook-level scratch lists; collect_weibo() clears both at the start of
# every run, so they must exist before it is called.
all_items, invalid_items = [], []

In [15]:
# Smoke run: fetch the first three result pages and keep the raw responses.
res = collect_weibo('塞尔达', 3, save=True)
# Guard the peek at the first record: the list is empty when every request
# failed or no post cards were returned, and res[0] would raise IndexError.
print(res[0] if res else 'No records fetched')

https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%A1%9E%E5%B0%94%E8%BE%BE&page_type=searchall&page=1
https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%A1%9E%E5%B0%94%E8%BE%BE&page_type=searchall&page=2
https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E5%A1%9E%E5%B0%94%E8%BE%BE&page_type=searchall&page=3
Fetch 32 records
{'id': '4401925803845953', 'created_at': '8小时前', 'text': '塞尔达？？？？//<a href=\'/n/芒果千层塔\'>@芒果千层塔</a>:恐怖//<a href=\'/n/二次元种草机\'>@二次元种草机</a>:点开吓死我<span class="url-icon"><img alt=[允悲] src="//h5.sinaimg.cn/m/emoticon/icon/default/d_yunbei-9aa3c436a4.png" style="width:1em; height:1em;" /></span>马上去看一眼//<a href=\'/n/氪爆\'>@氪爆</a>:草//<a href=\'/n/JimmeYoumu\'>@JimmeYoumu</a> :开塞尔达热死蚂蚁//<a href=\'/n/大隐于世_\'>@大隐于世_</a> : 还行//<a href=\'/n/稀有_幻影骑士团破沙袋\'>@稀有_幻影骑士团破沙袋</a> :草//<a href=\'/n/魔法少年纯洁菌\'>@魔法少年纯洁菌</a> :开塞尔达热死蚂蚁hhhhhhhhhhh', 'user': {'id': 3167305545, 'screen_name': '秋田六千', 'profile_image_url'

In [16]:
# Scratch check: dict.get returns the supplied default for a missing key.
test = dict(arg1='', arg2=None)
test.get('asd', 0)

0

In [16]:
# Show the first page of comments for the post id printed by the search run.
print(get_comments(id='4401925803845953', page=1))

{'ok': 1, 'msg': '数据获取成功', 'data': {'data': [{'id': 4402039712762356, 'created_at': '1小时前', 'source': '', 'user': {'id': 3949612207, 'screen_name': 'Operigo', 'profile_image_url': 'https://tvax1.sinaimg.cn/crop.0.0.512.512.180/eb6a4cafly8g2boq1oh25j20e80e8mxe.jpg?Expires=1565020306&ssig=kcgjUcDT46&KID=imgbed,tva', 'verified': False, 'verified_type': -1, 'followers_count': 1348, 'mbtype': 12, 'profile_url': 'https://m.weibo.cn/u/3949612207?uid=3949612207', 'remark': '', 'following': False, 'follow_me': False}, 'text': "回复<a href='https://m.weibo.cn/n/legolas阿鑫'>@legolas阿鑫</a>:我的妈 这啥玩意", 'reply_id': 4402007773349749, 'reply_text': "<a href='https://m.weibo.cn/n/Operigo'>@Operigo</a>", 'like_counts': 0, 'liked': False}, {'id': 4402022985865527, 'created_at': '2小时前', 'source': '', 'user': {'id': 5240370607, 'screen_name': 'w寿限无寿限无扔屎机w', 'profile_image_url': 'https://tva1.sinaimg.cn/crop.0.0.720.720.180/005IE3Mzjw8eq1zsz3a5uj30k00k0taj.jpg?Expires=1565020306&ssig=9ajBJpuM3U&KID=imgbed,tva',