In [6]:
import re
import requests
import time
import bs4
import json
import pandas as pd
import jieba

def _parse_legacy_page(soup, title):
    """Read (vname, category, aid, cid) from the page layout that has a 'v-title' div.

    Returns None when the category links or the cid/aid markers are missing.
    """
    link_tags = soup.find_all('a', href=re.compile('/video'))
    if not link_tags:
        # The ids were historically only read while walking the category
        # links, so a page without such links yields no data.
        return None
    category = ''.join('/' + tag.text for tag in link_tags)

    id_holder = soup.find('div', class_='scontent')
    if id_holder is None:
        return None
    cid_match = re.search(r'cid=(\d+)', id_holder.text)
    aid_match = re.search(r'aid=(\d+)', id_holder.text)
    if cid_match is None or aid_match is None:
        return None
    return title.text, category, aid_match.group(1), cid_match.group(1)


def _parse_initial_state_page(soup):
    """Read (vname, category, aid, cid) from the embedded window.__INITIAL_STATE__ JSON.

    Returns None when no matching <script> tag exists or the JSON lacks the
    expected keys.
    """
    start_str = 'window.__INITIAL_STATE__='
    end_str = ';(function(){var s;(s=document.currentScript||document.scripts[document.scripts.length-1]).parentNode.removeChild(s);}());'

    # type=False selects <script> tags that carry no "type" attribute.
    for script_tag in soup.find_all('script', type=False):
        script_text = script_tag.text
        if not (script_text.startswith(start_str) and script_text.endswith(end_str)):
            continue
        try:
            # Strip the JS assignment prefix and the self-removing suffix so
            # that only the JSON object remains.
            data = json.loads(script_text[len(start_str):-len(end_str)])
            video = data['videoData']
            aid = data['aid']
            vname = str(video['title'])
            cid = str(video['pages'][0]['cid'])
            main_cat = "/" + video['breadcrumb']['first']['name']
            sec_cat = "/" + video['breadcrumb']['second']['name']
        except (ValueError, KeyError, IndexError, TypeError):
            return None
        return vname, main_cat + sec_cat + "/高级弹幕", aid, cid
    return None


def getBLVData(video_url, timeout=10):
    """
    Scrape the metadata and comment texts of one Bilibili video page.

    Parameters:
        video_url - url of the video page as a string
        timeout   - seconds to wait for each HTTP request (default 10)

    Return:
        a tuple (vname, category, aid, cid, bs_list) where bs_list is the
        list of texts of the <d p="..."> elements of the comment XML, or
        None when a request fails or the page cannot be parsed.
    """
    try:
        r = requests.get(video_url, timeout=timeout)
    except requests.RequestException:
        return None
    if not r.ok:
        return None

    soup = bs4.BeautifulSoup(r.text, 'html.parser')

    # Two page layouts are supported: one exposing a 'v-title' div and one
    # embedding its data as JSON inside a <script> tag.
    title = soup.find('div', class_='v-title')
    if title is not None:
        meta = _parse_legacy_page(soup, title)
    else:
        meta = _parse_initial_state_page(soup)
    if meta is None:
        return None
    vname, category, aid, cid = meta

    bsxml = 'http://comment.bilibili.tv/' + cid + '.xml'
    try:
        bsr = requests.get(bsxml, timeout=timeout)
    except requests.RequestException:
        return None
    if not bsr.ok:
        return None

    bssoup = bs4.BeautifulSoup(bsr.text, 'xml')
    # Keep only <d> elements that carry a "p" attribute.
    bs_list = [d_tag.text for d_tag in bssoup.find_all('d', p=True)]
    return (vname, category, aid, cid, bs_list)

In [2]:
def get_rank(filename):
    '''
    Get a list of videos in the rank.
    Input: filename - the name of the json file as a string
    Example:
        'all-3-0.json'
    Return:
        a list of aid (as int) of videos in the ranking; for each entry of
        rank['rank']['list'] its own aid comes first, followed by the aids
        listed under its optional 'others' key.
    '''
    # Use a context manager so the file handle is always closed, and decode
    # explicitly as UTF-8 instead of relying on the platform default.
    with open(filename, encoding='utf-8') as rank_file:
        rank = json.load(rank_file)

    aid_lst = []
    for item in rank['rank']['list']:
        aid_lst.append(int(item['aid']))
        # 'others' is optional; treat a missing or empty value as no extras.
        aid_lst.extend(int(other['aid']) for other in (item.get('others') or ()))
    return aid_lst

In [3]:
# Aids of every video listed in the ranking file; consumed by the scraping loop.
l = get_rank(filename='all-3-0.json')

In [4]:
# Column-oriented accumulator: one list per output column, appended in lockstep.
parsDict = {'video_title': [], 'video_url': [], 'category': [], 'BS_text': []}
for n in l:
    # Pause between videos to avoid hammering the site.
    time.sleep(2)
    video_url = 'https://www.bilibili.com/video/av{}/'.format(n)
    # Scrape each page exactly once and reuse the result; calling the scraper
    # a second time would double the network traffic and bypass the delay.
    result = getBLVData(video_url)
    if result:
        print('Success: {}'.format(video_url))
        # result is (vname, category, aid, cid, bs_list).
        parsDict['video_title'].append(result[0])
        parsDict['video_url'].append(video_url)
        parsDict['category'].append(result[1])
        parsDict['BS_text'].append(result[4])
    else:
        print('Fail:{}'.format(n))

effective url: https://www.bilibili.com/video/av18942582/
Success: https://www.bilibili.com/video/av18942582/
effective url: https://www.bilibili.com/video/av18942582/
effective url: https://www.bilibili.com/video/av19113672/
Success: https://www.bilibili.com/video/av19113672/
effective url: https://www.bilibili.com/video/av19113672/
effective url: https://www.bilibili.com/video/av18947985/
title not none
Success: https://www.bilibili.com/video/av18947985/
effective url: https://www.bilibili.com/video/av18947985/
title not none
effective url: https://www.bilibili.com/video/av19054339/
title not none
Success: https://www.bilibili.com/video/av19054339/
effective url: https://www.bilibili.com/video/av19054339/
title not none
effective url: https://www.bilibili.com/video/av19036374/
title not none
Success: https://www.bilibili.com/video/av19036374/
effective url: https://www.bilibili.com/video/av19036374/
title not none
effective url: https://www.bilibili.com/video/av19013887/
title not no

effective url: https://www.bilibili.com/video/av19113087/
title not none
Success: https://www.bilibili.com/video/av19113087/
effective url: https://www.bilibili.com/video/av19113087/
title not none
effective url: https://www.bilibili.com/video/av18750357/
title not none
Success: https://www.bilibili.com/video/av18750357/
effective url: https://www.bilibili.com/video/av18750357/
title not none
effective url: https://www.bilibili.com/video/av19001077/
title not none
Success: https://www.bilibili.com/video/av19001077/
effective url: https://www.bilibili.com/video/av19001077/
title not none
effective url: https://www.bilibili.com/video/av18941192/
Success: https://www.bilibili.com/video/av18941192/
effective url: https://www.bilibili.com/video/av18941192/
effective url: https://www.bilibili.com/video/av19017819/
title not none
Success: https://www.bilibili.com/video/av19017819/
effective url: https://www.bilibili.com/video/av19017819/
title not none
effective url: https://www.bilibili.com/

Success: https://www.bilibili.com/video/av19111816/
effective url: https://www.bilibili.com/video/av19111816/
title not none
effective url: https://www.bilibili.com/video/av19054940/
title not none
Success: https://www.bilibili.com/video/av19054940/
effective url: https://www.bilibili.com/video/av19054940/
title not none
effective url: https://www.bilibili.com/video/av19023607/
title not none
Success: https://www.bilibili.com/video/av19023607/
effective url: https://www.bilibili.com/video/av19023607/
title not none
effective url: https://www.bilibili.com/video/av18842862/
Success: https://www.bilibili.com/video/av18842862/
effective url: https://www.bilibili.com/video/av18842862/
effective url: https://www.bilibili.com/video/av19053714/
title not none
Success: https://www.bilibili.com/video/av19053714/
effective url: https://www.bilibili.com/video/av19053714/
title not none
effective url: https://www.bilibili.com/video/av18760765/
title not none
Success: https://www.bilibili.com/video/

In [7]:
# Turn the per-column lists gathered by the scraping loop into one table.
BLDF = pd.DataFrame(data=parsDict)
BLDF

Unnamed: 0,BS_text,category,video_title,video_url
0,"[哇..我大金木, 有人吗, 哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈【中二病也要谈恋爱】, p5通宵...",/游戏/网络游戏/高级弹幕,当你在倒下的对手面前念中二台词会发生什么,https://www.bilibili.com/video/av18942582/
1,"[233333333, 233333333333333333333, 23333333333...",/游戏/网络游戏/高级弹幕,当你在倒下的敌人面前念中二台词#2,https://www.bilibili.com/video/av19113672/
2,"[Ahhhhh, 厉害, 我们北方人家里温暖如春, 233, 火钳刘明, 厉害了, PG T...",/鬼畜/鬼畜调教/高级弹幕,央视主播朱广权：是我的段子冷还是天气冷,https://www.bilibili.com/video/av18947985/
3,"[@万磁王, @李博, 快去冰岛，瘟疫公司最难进的地方, 小蜘蛛这次很强, 狼叔一米六, 大...",/动画/综合/高级弹幕,金刚狼大战丧尸！漫威食人魔宇宙 第一章,https://www.bilibili.com/video/av19054339/
4,"[hhh, 无语, 李维嘉那个, 墨家耻辱, 青蛙不是会游泳吗？, 屁几万哈哈哈哈哈, 前方...",/游戏/单机游戏/高级弹幕,【老E】这是我玩过最智障的中华题材游戏.....Emmmm.....,https://www.bilibili.com/video/av19036374/
5,"[666666666666, 〇AVI, 哈哈哈哈哈最后一个是大队长吗, 这男的还有点小帅是...",/娱乐/综艺/高级弹幕,【暴走大事件第五季】45 旅行青蛙外出真相感动网友，赶尸创业揭秘致富骗局,https://www.bilibili.com/video/av19013887/
6,"[0., 弹幕呢？？？？, 纸巾已经秃到必须用帽子和眼睛遮掩的地步了好可怕, 没想到吧, 没...",/影视/影视杂谈/高级弹幕,【暴走看啥片儿第三季】124 前任3：就此永别吧前任；影视剧里的扮猪吃老虎，太爽了皇上,https://www.bilibili.com/video/av19056673/
7,"[..., 一小时前！, 66666, 特别鸣谢扎心。, 弗利沙大王，笑死我了, 所以越来越...",/游戏/单机游戏/高级弹幕,【暴走玩啥游戏第二季】73 中二洗剪吹横行怪猎世界 老夫要传火蹦跶蜡烛人,https://www.bilibili.com/video/av19096103/
8,"[第六次, 这叫二胡吧，你认不认识古筝的, 我又回来辣, 1.5倍速 原曲, 6批6批, ...",/音乐/三次元音乐/高级弹幕,《极乐净土》官方中文版,https://www.bilibili.com/video/av18857413/
9,"[独臂少年2333, 哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈, 我系古天乐, 23333, ...",/游戏/单机游戏/高级弹幕,【老番茄】和非洲猛男的那一夜,https://www.bilibili.com/video/av19025451/


In [8]:
def cleanBStext(BStext):
    """
    Drop everything from the first 'var names =' marker onwards.

    Input: BStext - the text to clean as a string
    Return:
        the part of BStext before the marker, or BStext unchanged when the
        marker does not occur.
    """
    loc = BStext.find('var names =')
    # str.find returns -1 when the marker is absent and 0 when the text starts
    # with it, so compare against -1 instead of testing truthiness.
    if loc != -1:
        BStext = BStext[:loc]
    return BStext

# Data cleaning
# Category strings look like '/main/sub/...'; splitting on '/' puts the main
# category at index 1 and the sub category at index 2.
category_parts = BLDF['category'].str.split('/')
BLDF['main category'] = category_parts.map(lambda parts: parts[1])
BLDF['sub category'] = category_parts.map(lambda parts: parts[2])
# Concatenate each video's list of comment texts into one string, then clean it.
BLDF['BS_text'] = BLDF['BS_text'].apply(''.join).apply(cleanBStext)
BLDF

Unnamed: 0,BS_text,category,video_title,video_url,main category,sub category
0,哇..我大金木有人吗哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈【中二病也要谈恋爱】p5通宵第一66666...,/游戏/网络游戏/高级弹幕,当你在倒下的对手面前念中二台词会发生什么,https://www.bilibili.com/video/av18942582/,游戏,网络游戏
1,2333333332333333333333333333332333333333333333...,/游戏/网络游戏/高级弹幕,当你在倒下的敌人面前念中二台词#2,https://www.bilibili.com/video/av19113672/,游戏,网络游戏
2,Ahhhhh厉害我们北方人家里温暖如春233火钳刘明厉害了PG THREE!😂牛逼火钳刘名臊...,/鬼畜/鬼畜调教/高级弹幕,央视主播朱广权：是我的段子冷还是天气冷,https://www.bilibili.com/video/av18947985/,鬼畜,鬼畜调教
3,@万磁王@李博快去冰岛，瘟疫公司最难进的地方小蜘蛛这次很强狼叔一米六大结局去格林兰岛，然后封...,/动画/综合/高级弹幕,金刚狼大战丧尸！漫威食人魔宇宙 第一章,https://www.bilibili.com/video/av19054339/,动画,综合
4,hhh无语李维嘉那个墨家耻辱青蛙不是会游泳吗？屁几万哈哈哈哈哈前方出现老实人，嘘让他自己尴尬...,/游戏/单机游戏/高级弹幕,【老E】这是我玩过最智障的中华题材游戏.....Emmmm.....,https://www.bilibili.com/video/av19036374/,游戏,单机游戏
5,666666666666〇AVI哈哈哈哈哈最后一个是大队长吗这男的还有点小帅是我的错觉么好听...,/娱乐/综艺/高级弹幕,【暴走大事件第五季】45 旅行青蛙外出真相感动网友，赶尸创业揭秘致富骗局,https://www.bilibili.com/video/av19013887/,娱乐,综艺
6,0.弹幕呢？？？？纸巾已经秃到必须用帽子和眼睛遮掩的地步了好可怕没想到吧没有弹幕？期待上线全...,/影视/影视杂谈/高级弹幕,【暴走看啥片儿第三季】124 前任3：就此永别吧前任；影视剧里的扮猪吃老虎，太爽了皇上,https://www.bilibili.com/video/av19056673/,影视,影视杂谈
7,...一小时前！66666特别鸣谢扎心。弗利沙大王，笑死我了所以越来越多“学好汉语只为骂人”...,/游戏/单机游戏/高级弹幕,【暴走玩啥游戏第二季】73 中二洗剪吹横行怪猎世界 老夫要传火蹦跶蜡烛人,https://www.bilibili.com/video/av19096103/,游戏,单机游戏
8,第六次这叫二胡吧，你认不认识古筝的我又回来辣1.5倍速 原曲6批6批#1.5哈哈哈哈啊15...,/音乐/三次元音乐/高级弹幕,《极乐净土》官方中文版,https://www.bilibili.com/video/av18857413/,音乐,三次元音乐
9,独臂少年2333哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈我系古天乐233332333333达芬...,/游戏/单机游戏/高级弹幕,【老番茄】和非洲猛男的那一夜,https://www.bilibili.com/video/av19025451/,游戏,单机游戏


In [11]:
# Lookup tables for building ranking urls.
# A ranking url is determined by 4 parameters, for example:
# https://www.bilibili.com/ranking?spm_id_from=333.334.banner_link.1#!/all/0/0/3/
#   'all'        -> "全站榜"   (ranking type)
#   the first 0  -> "全站"     (category)
#   the second 0 -> "全部投稿" (all or recent submissions)
#   3            -> "近期投稿" according to the original note; in the `time`
#                   table below "3" is the value for "三日排行" — TODO confirm
rank_type_dict = {
    "全站榜": "all",
    "原创榜": "origin",
    "新番榜": "bangumi",
    "影视榜": "cinema",
    "新人榜": "rookie",
}
category_dict = {
    "全站": "0",
    "动画": "1",
    "国创相关": "168",
    "音乐": "3",
    "舞蹈": "129",
    "游戏": "4",
    "科技": "36",
    "生活": "160",
    "鬼畜": "119",
    "时尚": "155",
    "娱乐": "5",
    "影视": "181",
    "番剧": "13",
    "国产动画": "167",
    "纪录片": "177",
    "电影": "23",
    "电视剧": "11",
}
all_or_recent = {
    "全部投稿": "0",
    "近期投稿": "1",
}
# NOTE(review): this rebinds the global name `time`, shadowing the `time`
# module imported at the top; any cell calling time.sleep() after this one
# has run will fail until the import cell is re-executed.
time = {
    "三日排行": "3",
    "日排行": "1",
    "周排行": "7",
    "月排行": "30",
}