In [2]:
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import time 
import random
import json

In [3]:
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E8%B4%B8%E6%98%93%E6%88%98',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.2 Safari/605.1.15',
    'X-Requested-With': 'XMLHttpRequest',
    'MWeibo-Pwa': '1'
}

In [4]:
max_page = 10

In [5]:
def get_page(page):
    params = {
        'containerid': '100103type=61&q=myz&t=0',
        'page_type': 'searchall',
        'page': page
    }
    url = base_url + urlencode(params)
    # print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)

In [6]:
def fix_date(orig):
    if u'前' in orig:
        return '07-29'
    elif u'昨天' in orig:
        return '07-28'
    elif u'前天' in orig:
        return '07-27'
    else:
        return orig

In [7]:
def parse_page(json_data, page: int):
    try:
        items = json_data.get('data').get('cards')[0].get('card_group')
    except:
        items = []
    for index, item in enumerate(items):
        item = item.get('mblog')
        weibo = {}
        weibo['id'] = item.get('id')
        weibo['created_at'] = fix_date(item.get('created_at'))
        weibo['text'] = pq(item.get('text')).text()
        weibo['attitudes'] = item.get('attitudes_count')
        weibo['comments'] = item.get('comments_count')
        weibo['reposts'] = item.get('reposts_count')
        try:
            weibo['origin_text'] = pq(item.get('retweeted_status').get('text')).text()
            weibo['origin_user'] = item.get('retweeted_status').get('user').get('screen_name')
            weibo['origin_created_at'] = fix_date(item.get('retweeted_status').get('created_at'))
        except:
            pass
        try:
            weibo['page_title'] = item.get('page_info').get('page_title')
            weibo['page_url'] = item.get('page_info').get('page_url')
        except:
            pass
        try:
            weibo['long_text'] = pq(item.get('longText').get('longTextContent')).text()
        except:
            pass
        yield weibo

In [8]:
def save_to_file(result):
    with open(result_path, 'ab') as f:
        f.write(json.dumps(result, ensure_ascii=False).encode('utf-8'))
        f.write('\r\n'.encode('utf-8'))

In [9]:
result_path = './data/keyword_raw_0729_3.json'

In [10]:
total = 0
for page in range(2,400):
    json_data = get_page(page)
    results = parse_page(*json_data)
    for result in results:
        save_to_file(result)
        total += 1
    if page % 5 == 0:
        print('Page %d done, total %d weibos'%(page, total))
    time.sleep(random.randint(2, 4))

Page 5 done, total 36 weibos
Page 10 done, total 83 weibos
Page 15 done, total 128 weibos
Page 20 done, total 175 weibos
Page 25 done, total 221 weibos
Page 30 done, total 266 weibos
Page 35 done, total 306 weibos
Page 40 done, total 352 weibos
Page 45 done, total 399 weibos
Page 50 done, total 446 weibos
Page 55 done, total 493 weibos
Page 60 done, total 540 weibos
Page 65 done, total 586 weibos
Page 70 done, total 635 weibos
Page 75 done, total 680 weibos
Page 80 done, total 727 weibos
Page 85 done, total 770 weibos
Page 90 done, total 813 weibos
Page 95 done, total 859 weibos
Page 100 done, total 899 weibos
Page 105 done, total 928 weibos
Page 110 done, total 937 weibos
Page 115 done, total 937 weibos
Page 120 done, total 937 weibos
Page 125 done, total 937 weibos
Page 130 done, total 937 weibos
Page 135 done, total 937 weibos
Page 140 done, total 937 weibos
Page 145 done, total 937 weibos
Page 150 done, total 937 weibos
Page 155 done, total 937 weibos
Page 160 done, total 937 weibo

In [13]:
json_data = get_page(2)

In [14]:
results = parse_page(*json_data)
for result in results:
    print(result)

{'id': '4401120409728409', 'created_at': '07-28', 'text': '比昨天还低myz能不能别打了，等我读完再打行不', 'attitudes': 0, 'comments': 0, 'reposts': 0}
{'id': '4401082136404278', 'created_at': '07-28', 'text': 'myz？', 'attitudes': 0, 'comments': 0, 'reposts': 0, 'origin_text': '一个死也要保持狮子的姿态，一个拼死也要让狮子见证弱者的力量。一场悲剧，两个英雄。happy热哥的秒拍视频', 'origin_user': '鲁国平先生', 'origin_created_at': '08-02'}
{'id': '4401040805746023', 'created_at': '07-28', 'text': '想哭想要抱抱想喝酒 要跟我的myz一起喝醉一次 还有十几天\n其实这个夏天想过那种自己在家一天只吃一个西瓜的日子 减肥还省事 一个人住多爽啊 但是是跟我妈一起生活的假期 我被喂的好肥。。。嗷嗷哭', 'attitudes': 3, 'comments': 0, 'reposts': 0}
{'id': '4400962250835104', 'created_at': '08-02', 'text': '发表了博文《糟心的市场，何时能飞扬？》周末将至，传出了MYZ的利空消息，显然我大A又被带了下节奏。对那些想要满仓甚至加高杠杆赌利好的资金来说，糟心的一周还是结束了，感觉像是被黑天鹅狠狠撞了一下腰。不过糟心的市场，何时能飞扬？', 'attitudes': 0, 'comments': 0, 'reposts': 0, 'page_title': '糟心的市场，何时能飞扬？', 'page_url': 'http://blog.sina.com.cn/s/blog_6031c28f0102yqmz.html?ref=weibocard&from=1110106030&weiboauthoruid=1613873807&sinainternalbrowser=topnav&share_menu=1&luicode=10000011&lf

2

In [25]:
json_data[0].get('data').get('cards')[0].get('card_group')[1].get('mblog')#.get('longText').get('longTextContent')

{'created_at': '昨天 06:00',
 'id': '4401082136404278',
 'idstr': '4401082136404278',
 'mid': '4401082136404278',
 'can_edit': False,
 'show_additional_indication': 0,
 'text': 'myz？',
 'source': '三星android智能手机',
 'favorited': False,
 'pic_types': '',
 'is_paid': False,
 'mblog_vip_type': 0,
 'user': {'id': 6801732679,
  'screen_name': '记忆不同',
  'profile_image_url': 'https://tvax1.sinaimg.cn/crop.56.130.326.326.180/007qjnnNly8fwo6irequzj30c80gbdks.jpg?Expires=1564910984&ssig=cVbyg%2Bw1Z8&KID=imgbed,tva',
  'profile_url': 'https://m.weibo.cn/u/6801732679?uid=6801732679&luicode=10000011&lfid=100103type%3D61%26q%3Dmyz%26t%3D0',
  'statuses_count': 441,
  'verified': False,
  'verified_type': -1,
  'close_blue_v': False,
  'description': '',
  'gender': 'm',
  'mbtype': 0,
  'urank': 9,
  'mbrank': 0,
  'follow_me': False,
  'following': False,
  'followers_count': 577,
  'follow_count': 2492,
  'cover_image_phone': 'https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq

In [60]:
page

294