# In [None]:
import json
import time
import random
import pandas as pd
import requests as rq
from copyheaders import headers_raw_to_dict

# Request headers shared by every HTTP call in this script.
# The raw "name: value" text (as copied from a browser) is handed to
# copyheaders.headers_raw_to_dict, which converts it to the mapping that is
# later passed as `headers=` to requests.
headers = headers_raw_to_dict(b"""
user-agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36
""")

def get_one_page_barrage(url, headers):
    """Fetch one page of danmu (bullet comments) from ``url`` as a DataFrame.

    The endpoint answers with JSONP (``callback({...})``); the JSON object is
    cut out of the wrapper and its ``comments`` list is flattened into rows.

    Args:
        url: Fully formatted danmu request URL.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A DataFrame with the columns
        ``['用户名','内容','会员等级','评论时间点','评论点赞数','评论id']``
        (user name, content, VIP level, comment time point, like count,
        comment id), one row per comment. It is empty, but still has these
        columns, when the page contains no comments.

    Raises:
        requests.RequestException: On network failure or timeout.
        json.JSONDecodeError: If no JSON object can be extracted from the body.
        KeyError: If the payload or a comment lacks an expected field.
    """
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = rq.get(url, headers=headers, timeout=10)
    text = response.text

    # Strip the JSONP wrapper by keeping everything from the first "{" to the
    # last "}". This does not depend on the body ending in exactly one ")".
    json_start = text.find("{")
    json_end = text.rfind("}") + 1
    # strict=False tolerates raw control characters inside comment strings.
    json_data = json.loads(text[json_start:json_end], strict=False)

    columns = ['用户名', '内容', '会员等级', '评论时间点', '评论点赞数', '评论id']
    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        [
            item['opername'],        # user name
            item['content'],         # danmu text
            item['uservip_degree'],  # VIP level
            item['timepoint'],       # time point of the comment
            item['upcount'],         # like count
            item['commentid'],       # danmu id
        ]
        for item in json_data['comments']
    ]
    return pd.DataFrame(rows, columns=columns)

def parse_epid(headers):
    """Fetch the id and basic attributes of every episode.

    Two hard-coded ``union.video.qq.com`` JSONP requests are issued, each
    carrying a fixed ``idlist``. Entries whose episode number parses to 0 are
    skipped, and duplicate ids are dropped (the first id list repeats
    ``x003061htl5``).

    Args:
        headers: HTTP headers sent with both requests.

    Returns:
        A DataFrame with the columns ``['id','title','播放量','集号']``
        (video id, title, view count, episode number), sorted by episode
        number with a fresh 0..n-1 index.

    Raises:
        requests.RequestException: On network failure or timeout.
        json.JSONDecodeError: If no JSON object can be extracted from a body.
        KeyError: If the payload lacks an expected field.
    """
    url1='https://union.video.qq.com/fcgi-bin/data?otype=json&tid=682&appid=20001238&appkey=6c03bbe9658448a4&idlist=x003061htl5,t00306i1e62,x003061htl5,b0030velala,w0030ilim7z,i0030r7v63u,z003044noq2,m0030sfinyr,c0030u884k7,k0030m5zbr7,l0030e5nglm,h0030b060vn,j003090ci7w,n0030falyoi,s00308u9kwx,p0030fohijf,g00303ob0cx,v0030960y6n,x0030bl84xw,v0030keuav1,t0030kups1i,n0030y2o52i,x0030s52mev,d0030xuekgw,o0030md1a2a,x0030peo3sk,d00303l5j4k,t0030aexmnt,a0030ybi45z,y0030wpe2wu&callback=jQuery19102114742155319942_1555398342372&_=1555398342375'
    url2='https://union.video.qq.com/fcgi-bin/data?otype=json&tid=682&appid=20001238&appkey=6c03bbe9658448a4&idlist=t0030epjqsi,g003035mi84,n00301fxqbh,h0030zivlrq,d0030qc1yu2,m0030q9ywxj,h0030j0eq19,j0030jks835,a00308xw434,l0030tb319m,x0030xogl32,g0030fju3w3,a0030vrcww0,l0030jzi1mi,c0030mq8yjr,u00302fdo8v,a0030w9g57k,n0030wnj6i8,j0030h91ouj,j00304eu73n,t00305kc1f5,i0030x490o2,u0030jtmlj2,d003031ey5h,w0850w594k6,l0854pfn9lg,f08546r7l7a,d0854s0oq1z,m08546pcd9k,p0854r1nygj&callback=jQuery19102114742155319942_1555398342367&_=1555398342376'
    columns = ['id', 'title', '播放量', '集号']
    rows = []

    for url in (url1, url2):
        # headers must be passed by keyword: the second positional parameter
        # of requests.get is `params`, which would put the header dict into
        # the query string instead of sending it as HTTP headers.
        response = rq.get(url, headers=headers, timeout=10)
        text = response.text
        # Strip the JSONP wrapper: keep the first "{" through the last "}".
        json_data = json.loads(text[text.find("{"):text.rfind("}") + 1])
        for item in json_data['results']:
            fields = item['fields']
            episode = int(fields['episode'])
            if episode == 0:
                continue
            rows.append(
                [item['id'], fields['title'], fields['view_all_count'], episode]
            )

    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0.
    df = pd.DataFrame(rows, columns=columns).drop_duplicates(['id'])
    return df.sort_values("集号").reset_index(drop=True)

def parse_targetId_by_vId(vId, headers):
    """Look up the danmu target id that belongs to a single video id.

    A JSON body is POSTed to the ``danmu_manage/regist`` endpoint, and the
    ``strDanMuKey`` string for ``vId`` is read from the response. The target
    id is the text between the ``targetid=`` marker and the character just
    before the first ``vid`` occurrence.
    NOTE(review): presumably the key looks like ``...targetid=<id>&vid=...``;
    confirm against a live response.

    Args:
        vId: Video id to register.
        headers: HTTP headers sent with the POST request.

    Returns:
        A ``(vId, target_id)`` tuple.
    """
    endpoint = 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
    request_body = json.dumps({
        "wRegistType": 2,
        "vecIdList": [vId],
        "wSpeSource": 0,
        "bIsGetUserCfg": 1,
        "mapExtData": {vId: {"strCid": "wu1e7mrffzvibjy", "strLid": ""}},
    })

    response = rq.post(endpoint, data=request_body, headers=headers)
    danmu_key = json.loads(response.text)['data']['stMap'][vId]['strDanMuKey']

    marker = "targetid="
    begin = danmu_key.find(marker) + len(marker)
    # Stop one character before "vid" to drop the separator preceding it.
    end = danmu_key.find("vid") - 1
    return (vId, danmu_key[begin:end])

def get_all_danmukey():
    """Build the mapping between every episode's video id and its target id.

    The episode table comes from ``parse_epid``; each id is then resolved with
    ``parse_targetId_by_vId``, pausing 0.5-1.5 s after every lookup. Both calls
    use the module-level ``headers``.

    Returns:
        The episode table inner-joined with the ``v_id`` / ``target_id``
        pairs (joined on ``id`` == ``v_id``).
    """
    episodes = parse_epid(headers)

    pairs = []
    for video_id in episodes['id']:
        pair = parse_targetId_by_vId(video_id, headers)
        if not pair:
            continue
        pairs.append(pair)
        # Randomised pause between consecutive lookups.
        time.sleep(0.5 + random.random())

    key_table = pd.DataFrame(pairs, columns=['v_id', 'target_id'])
    return pd.merge(episodes, key_table, left_on="id", right_on="v_id", how="inner")

def parse_urls(v_id, target_id, page):
    """Yield the danmu request URLs for one video.

    One URL is produced per page; the ``timestamp`` query value starts at 15
    and grows by 30 for each following page.

    Args:
        v_id: Video id placed after ``vid%3D`` in the URL.
        target_id: Danmu target id placed in the ``target_id`` parameter.
        page: Number of URLs to generate.

    Yields:
        Fully formatted request URLs, in increasing timestamp order.
    """
    base_url = "https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910029789363731311136_1555394558009&timestamp={page}&target_id={target_id}%26vid%3D{v_id}&count=80"
    first_timestamp, step = 15, 30
    timestamps = range(first_timestamp, step * page + first_timestamp, step)
    yield from (
        base_url.format(page=timestamp, target_id=target_id, v_id=v_id)
        for timestamp in timestamps
    )

def crawl_all(df, num, page, headers):
    """Crawl the danmu of the first ``num`` episodes listed in ``df``.

    Args:
        df: Episode table with at least the columns ``id`` and ``target_id``.
        num: Number of leading rows (episodes) of ``df`` to crawl.
        page: Number of danmu pages to request per episode.
        headers: HTTP headers forwarded to every page request.

    Returns:
        All fetched pages concatenated into one DataFrame. Row indices are
        not reset, so they restart with every page. An empty DataFrame is
        returned when nothing was fetched.
    """
    pages = []
    episodes = zip(df['id'][:num], df['target_id'][:num])
    for count, (v_id, target_id) in enumerate(episodes, start=1):
        print(f"正在爬取第{count}集")
        for url in parse_urls(v_id, target_id, page):
            pages.append(get_one_page_barrage(url, headers))
            # Randomised 0.5-1.5 s pause between page requests.
            time.sleep(0.5 + random.random())
    print("爬取结束")

    # pd.concat raises on an empty list, so keep the "nothing crawled" result
    # an empty frame as before.
    if not pages:
        return pd.DataFrame()
    # Concatenate once at the end: concatenating inside the loop recopies
    # every previously collected row on each page.
    return pd.concat(pages)
    
if __name__ == "__main__":
    # Step 1: resolve the video-id -> target-id mapping for every episode and
    # save it. "utf_8_sig" writes a BOM-prefixed UTF-8 file.
    df_id_mapping = get_all_danmukey()
    df_id_mapping.to_csv("danmukey.csv", index=False, encoding="utf_8_sig")

    # Step 2: crawl 5 danmu pages of the first episode only and save them.
    df_data = crawl_all(df_id_mapping, num=1, page=5, headers=headers)
    df_data.to_csv("danmu_data.csv", index=False, encoding="utf_8_sig")
                                              

