## 基于POI爬微博

In [14]:
import json
import time
from datetime import datetime

import pandas as pd
import requests
from sqlalchemy import create_engine

# SQLAlchemy engine for the local PostgreSQL database `weibo_crawler`; every
# read_sql / to_sql call in this notebook goes through it.
# NOTE(review): the connection URL embeds the database password in source —
# consider loading it from an environment variable or an untracked config file.
engine = create_engine('postgresql://postgres:123@localhost:5432/weibo_crawler')

### 定义函数 

In [15]:
def get_data_page1(json_data):
    """Extract the posts contained in the first result page of a POI.

    The posts are read from ``json_data['data']['cards'][1]['card_group']``,
    skipping the first entry of that group (presumably a non-post header
    entry on page 1 — confirm against the API response).

    Args:
        json_data: The decoded JSON response of the first page.

    Returns:
        A ``pandas.DataFrame`` with one row per post and the columns
        ``created_at``, ``mid``, ``uid``, ``text``, ``source`` and ``pics``
        (``pics`` holds a list of picture URLs). Fields missing from a post
        are replaced by placeholder values instead of failing the page.

    Raises:
        KeyError, IndexError, TypeError: If the response does not have the
            expected card layout or an entry carries no ``'mblog'`` key.
    """
    columns = ['created_at', 'mid', 'uid', 'text', 'source', 'pics']

    def extract(mblog, path, default, convert=None):
        """Follow ``path`` into ``mblog``; fall back to ``default`` if absent."""
        try:
            value = mblog
            for key in path:
                value = value[key]
            return value if convert is None else convert(value)
        except (KeyError, TypeError):
            # Missing key, or an intermediate value that is None / not a mapping.
            return default

    def pic_urls(pics):
        """Collect the URL of every attached picture."""
        # The whole list is kept: slicing with [0:-1] would drop the last picture.
        return [pic['url'] for pic in pics]

    card = json_data['data']['cards'][1]
    entries = card['card_group'][1:]

    rows = []
    for entry in entries:
        mblog = entry['mblog']
        rows.append({
            # The placeholder timestamp keeps the value parseable downstream.
            'created_at': extract(mblog, ('created_at',), 'Tue Jan 01 00:00:00 +0800 2025'),
            'mid': extract(mblog, ('mid',), '无数据', str),
            'uid': extract(mblog, ('user', 'id'), '无数据', str),
            'text': extract(mblog, ('text',), '无数据'),
            'source': extract(mblog, ('source',), '无数据'),
            'pics': extract(mblog, ('pics',), [], pic_urls),
        })

    # Passing ``columns`` keeps the schema stable even when no posts were found.
    return pd.DataFrame(rows, columns=columns)

def get_data_page_(json_data):
    """Extract the posts contained in a follow-up (second or later) result page.

    The posts are read from every entry of
    ``json_data['data']['cards'][0]['card_group']``.

    Args:
        json_data: The decoded JSON response of a page after the first one.

    Returns:
        A ``pandas.DataFrame`` with one row per post and the columns
        ``created_at``, ``mid``, ``uid``, ``text``, ``source`` and ``pics``
        (``pics`` holds a list of picture URLs). Fields missing from a post
        are replaced by placeholder values instead of failing the page.

    Raises:
        KeyError, IndexError, TypeError: If the response does not have the
            expected card layout or an entry carries no ``'mblog'`` key.
    """
    columns = ['created_at', 'mid', 'uid', 'text', 'source', 'pics']

    def extract(mblog, path, default, convert=None):
        """Follow ``path`` into ``mblog``; fall back to ``default`` if absent."""
        try:
            value = mblog
            for key in path:
                value = value[key]
            return value if convert is None else convert(value)
        except (KeyError, TypeError):
            # Missing key, or an intermediate value that is None / not a mapping.
            return default

    def pic_urls(pics):
        """Collect the URL of every attached picture."""
        # The whole list is kept: slicing with [0:-1] would drop the last picture.
        return [pic['url'] for pic in pics]

    card = json_data['data']['cards'][0]
    entries = card['card_group']

    rows = []
    for entry in entries:
        mblog = entry['mblog']
        rows.append({
            # The placeholder timestamp keeps the value parseable downstream.
            'created_at': extract(mblog, ('created_at',), 'Tue Jan 01 00:00:00 +0800 2025'),
            'mid': extract(mblog, ('mid',), '无数据', str),
            'uid': extract(mblog, ('user', 'id'), '无数据', str),
            'text': extract(mblog, ('text',), '无数据'),
            'source': extract(mblog, ('source',), '无数据'),
            'pics': extract(mblog, ('pics',), [], pic_urls),
        })

    # Passing ``columns`` keeps the schema stable even when no posts were found.
    return pd.DataFrame(rows, columns=columns)

def convert_time(time_str):
    """Reformat a timestamp such as ``'Tue Jan 01 00:00:00 +0800 2025'``.

    The input is parsed with the pattern ``%a %b %d %H:%M:%S %z %Y`` and
    rendered as ``YYYY-MM-DD HH:MM:SS``. The clock time is kept as written in
    the input; the UTC offset is parsed but not included in the output.
    """
    moment = datetime.strptime(time_str, '%a %b %d %H:%M:%S %z %Y')
    return moment.strftime('%Y-%m-%d %H:%M:%S')

### 加载参数

In [16]:
# Read the user's cookies from the JSON document stored in the config file.
with open('config/weibo_cookies.txt', encoding='utf-8') as cookie_file:
    cookies_dict = json.load(cookie_file)

In [17]:
# Load every row of the `shanghai_poi_level1` table; the crawl loop further
# down iterates over the `poiid` and `title` columns of this frame.
df_poiid = pd.read_sql('select * from shanghai_poi_level1', con=engine)
df_poiid

Unnamed: 0,poiid,title,lon,lat,address,describe
0,B2094253D36FA7F84899,北随塘河路,121.29991,30.702904,上海市金山区,类型: 地点\n地点: 上海市金山区石化街区
1,B2094551D76FA6F54299,金山城市沙滩骑马场,121.336519951102,30.705700220516,大堤路(近卫二路),类型: 未知分类\n地点: 上海市金山区大堤路(近卫二路)
2,B2094757D66DA4F84599,金山城市沙滩世游赛赛场,121.34811634,30.706189957,上海金山新城路,类型: 未知分类\n地点: 上海市金山区石化街区上海金山新城路
3,B2094250D16CA7FC489A,鹦鹉洲生态湿地-科普长廊(北),121.33765,30.70624,沪杭公路7741临东南方向100米,类型: 博物馆\n地点: 上海市金山区沪杭公路7741临东南方向100米
4,B2094551D069A5FE4993,上海金山城市沙滩海景房—悠然假期普通公寓,121.349153,30.71006,上海金山区象州路59号石化十三村小区37号502,类型: 出行住宿\n地点: 上海市金山区石化街区上海象州路59号石化十三村小区37号502
...,...,...,...,...,...,...
8141,B2094757D069A1FE449D,明珠湖,121.261181,31.745504,三华公路333号,类型: 公园\n地点: 上海市崇明区绿华镇三华公路333号\n简介: 明珠湖地处崇明岛西部绿...
8142,B2094757D16DAAF9429F,崇明绿华镇,121.22053,31.76256,上海崇明最西端,类型: 乡镇\n地点: 上海市崇明区绿华镇上海崇明最西端
8143,B2094252D465A6F84598,上海玉海棠景区,121.3338895,31.7641816,北星公路１９９９号,类型: 景点\n地点: 上海市崇明区三星镇北星公路１９９９号
8144,B2094252D46CA3F9469A,崇明岛日落点,121.1704455,31.7925259,绿华西路与上景路交叉口西120米,类型: 休闲娱乐\n地点: 上海市崇明区绿华西路与上景路交叉口西120米


In [18]:
# Load the POI ids recorded in `shanghai_crawled_poiid` (the crawl loop appends
# a POI there once its first page has been stored).
df_crawled = pd.read_sql('select * from shanghai_crawled_poiid', con=engine)
df_crawled

Unnamed: 0,poiid
0,B2094253D36FA7F84899
1,B2094551D76FA6F54299
2,B2094757D66DA4F84599
3,B2094551D069A5FE4993
4,B2094757D069A0F5459B
...,...
2651,B2094654DA6DA4FA489E
2652,B209425DD26BA0F9499F
2653,B2094253D76CA7F84598
2654,B2094252D56AA2FD469D


In [19]:
# Load the POI ids recorded in `shanghai_error_poiid` (the crawl loop appends
# a POI there whenever fetching or parsing one of its pages fails).
df_error = pd.read_sql('select * from shanghai_error_poiid', con=engine)
df_error

Unnamed: 0,poiid
0,B2094250D16CA7FC489A
1,B2094251D764A6FF459F
2,B2094554D26FA5F9439F
3,B2094654DB69A0FC419D
4,B2094654DB69A1FF4793
...,...
545,B2094257D764AAF8419B
546,B2094251D76DABF5419B
547,B2094752D364A4FD4198
548,B209445DD76FA5FF469D


In [20]:
# Drop the POIs already recorded as crawled, then the ones recorded as failed,
# renumbering the rows after each step.
not_crawled = ~df_poiid['poiid'].isin(df_crawled['poiid'])
df_poiid = df_poiid.loc[not_crawled].reset_index(drop=True)
not_failed = ~df_poiid['poiid'].isin(df_error['poiid'])
df_poiid = df_poiid.loc[not_failed].reset_index(drop=True)
df_poiid

Unnamed: 0,poiid,title,lon,lat,address,describe
0,B2094255D56AA3F5489F,艮上(中海环宇荟店),121.476234,31.211243,黄陂南路838弄中海环宇荟F1,类型: 甜品店\n地点: 上海市黄浦区黄陂南路838弄中海环宇荟F1
1,B2094750D665A3F44492,胖子面,121.473877,31.211288,建国东路473号(马当路地铁站5号口步行320米),辣肉面 排骨年糕 浇头面 大肠面香菇面筋 辣肉 锅贴 菌菇浇头 爆炒猪肝面\n类型: 餐饮\...
2,B2094253D66CAAFE4898,夏朵花园(徐汇店),121.437777,31.211313,复兴西路268号,类型: 西餐厅\n地点: 上海市徐汇区复兴西路268号
3,B2094654DB6AA7FC489E,夏朵花园(徐汇店),121.437777,31.211313,复兴西路268号,鹅肝 牛排 下午茶 咖啡 芝士蛋糕 提拉米苏 熔岩巧克力蛋糕 鳕鱼 熔岩巧克力 龙虾意大利面...
4,B2094750D56AA3F54599,夏宫,121.61622,31.21132,香楠路４８８弄１～２０号,类型: 住宅\n地点: 上海市浦东新区香楠路４８８弄１～２０号
...,...,...,...,...,...,...
4985,B2094757D069A1FE449D,明珠湖,121.261181,31.745504,三华公路333号,类型: 公园\n地点: 上海市崇明区绿华镇三华公路333号\n简介: 明珠湖地处崇明岛西部绿...
4986,B2094757D16DAAF9429F,崇明绿华镇,121.22053,31.76256,上海崇明最西端,类型: 乡镇\n地点: 上海市崇明区绿华镇上海崇明最西端
4987,B2094252D465A6F84598,上海玉海棠景区,121.3338895,31.7641816,北星公路１９９９号,类型: 景点\n地点: 上海市崇明区三星镇北星公路１９９９号
4988,B2094252D46CA3F9469A,崇明岛日落点,121.1704455,31.7925259,绿华西路与上景路交叉口西120米,类型: 休闲娱乐\n地点: 上海市崇明区绿华西路与上景路交叉口西120米


### 抓取数据

In [21]:
# Sanity check: print the id of the first pending POI only (the loop stops
# after its first iteration).
for index, row in df_poiid[['poiid', 'title']].iterrows():
    poiid, name = row['poiid'], row['title']
    print(f'poiid: {poiid}')
    break

poiid: B2094255D56AA3F5489F


In [None]:
def _record_poiid(poiid, table):
    """Append ``poiid`` as a single-row frame to the bookkeeping ``table``."""
    pd.DataFrame(data={'poiid': [poiid]}).to_sql(
        table, con=engine, if_exists='append', index=False
    )


# Crawl every pending POI page by page and store the posts in the
# `shanghai_check-ins` table.
for index, row in df_poiid[['poiid', 'title']].iterrows():
    poiid, name = row['poiid'], row['title']
    print(f'poiid: {poiid}')
    page = 1

    while True:
        try:
            # The query is passed as a dict so that requests escapes the POI
            # title; the cookies go into the Cookie header rather than the URL.
            res = requests.get(
                'https://m.weibo.cn/api/container/getIndex',
                params={
                    'containerid': f'2306570042{poiid}',
                    'luicode': '10000011',
                    'lfid': '100103',
                    'q': name,
                    't': '',
                    'page': page,
                },
                cookies=cookies_dict,
                timeout=30,  # never hang forever on a stalled connection
            )
            json_data = json.loads(res.text)

            # `ok == 0` ends the paging for this POI; nothing is recorded.
            if json_data['ok'] == 0:
                break

            # The first page uses a different card layout than later pages.
            parse_page = get_data_page1 if page == 1 else get_data_page_
            df = parse_page(json_data)
            df['poiid'] = poiid
            df['created_at'] = df['created_at'].apply(convert_time)
            df.to_sql('shanghai_check-ins', con=engine, if_exists='append', index=False)

            # A POI counts as crawled as soon as its first page is stored.
            if page == 1:
                _record_poiid(poiid, 'shanghai_crawled_poiid')
            page += 1
        except Exception as e:
            # Any network, decoding, parsing or storage failure: remember the
            # POI as failed and move on. Leaving the loop here is essential,
            # otherwise a persistent failure would retry the same page forever.
            print(f'poiid {poiid} search failed: ', e)
            _record_poiid(poiid, 'shanghai_error_poiid')
            break
    print('\n')

poiid: B2094255D56AA3F5489F


poiid: B2094750D665A3F44492
poiid B2094750D665A3F44492 search failed:  Expecting value: line 1 column 1 (char 0)


poiid: B2094253D66CAAFE4898


poiid: B2094654DB6AA7FC489E
