In [15]:
import pandas as pd
import numpy as np
import requests
import re
import time as tm
import json

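###############   URL loading function    #########################
# Purpose: fetch a page with the `requests` library, sending browser-like headers
# Input:   url - the address to request
# Returns: the response body as text (r.text); None if the request failed (the error is printed)
##########################################################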
# Wall-clock start of the run; the end of the script subtracts it from T2 to report the total running time.
T1 = tm.time()
def fetchURL(url, timeout=10):
    """Download ``url`` with browser-like headers and return the response body.

    Args:
        url: Address to request with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            connection would block the crawl forever.

    Returns:
        The response text (``r.text``) on success, or ``None`` when the
        request fails. Failures are printed, not raised.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)  # simulate a browser request
        r.raise_for_status()        # turn 4xx/5xx responses into requests.HTTPError
        return r.text
    except requests.HTTPError as e:
        print(e)
        print("HTTPError")
    except requests.RequestException as e:
        # Connection errors, timeouts, invalid URLs, ...
        print(e)
    except Exception:
        # Deliberately not a bare ``except:`` so that Ctrl-C / kernel
        # interrupt (KeyboardInterrupt) can still stop a long crawl.
        print("Unknown Error !")
    return None
        
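###############   Page parsing function    #########################
# Purpose: parse the JSON text returned by the Bilibili game-comment API and extract, per comment:
#          content, grade, publish time, up/down votes, user name, user level
# Input:   the response text of one page
# Returns: commentlist - seven parallel lists (one per field above) holding the comments found on the page (the crawler requests 10 per page)
##########################################################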
def parserHtml(html):
    """Extract the comment fields from one page of the comment API response.

    Args:
        html: JSON text of one response page (despite the name it is not
            HTML), or ``None`` when the download failed.

    Returns:
        A list of seven parallel lists, in this order: content, grade,
        publish_time, up_count, down_count, user_name, user_level.
        The seven lists always have the same length; they are empty when
        the page is missing, is not valid JSON, or holds no usable comments.
    """
    fields = ('content', 'grade', 'publish_time', 'up_count',
              'down_count', 'user_name', 'user_level')
    commentlist = [[] for _ in fields]

    if html is None:
        # The download failed and was already reported by the fetcher.
        return commentlist

    try:
        s = json.loads(html)  # parse the JSON text
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError: the body was not JSON
        print('Comment page is not valid JSON: %s' % e)
        return commentlist

    try:
        comments = s['data']['list']
    except (KeyError, TypeError, IndexError):
        return commentlist
    if not isinstance(comments, list):
        return commentlist

    # Walk every record actually present rather than assuming exactly 10.
    for comment in comments:
        try:
            # Read all fields first, so a record with a missing field adds
            # nothing at all and the seven columns stay aligned.
            row = [comment[field] for field in fields]
        except (KeyError, TypeError, IndexError):
            continue  # skip the malformed record, keep the rest of the page
        for column, value in zip(commentlist, row):
            column.append(value)

    return commentlist      # the comments of this page, one list per field

#*********** Main program ************
# Step 1: describe the Bilibili game whose comments are crawled

app_id="200"    # Bilibili game ID; '200' is Honor of Kings (every game has its own ID)
page_total=50          # total number of pages to crawl

# Accumulators: one flat list per output column, filled page by page
all_content=[]
all_grade=[]
all_publish_time=[]
all_up_count=[]
all_down_count=[]
all_user_name=[]
all_user_level=[]
# Same order as the lists returned by parserHtml, so the two can be zipped
all_columns=[all_content, all_grade, all_publish_time, all_up_count,
             all_down_count, all_user_name, all_user_level]

print('*************************************************')
print('Grapping comment data of app:{0} from BiliBili...'.format(app_id))
print('*************************************************\n')

# Step 2: crawl page by page; each request asks for 10 comments
for pagenum in range(1,page_total+1):
    t1=tm.time()  # per-page timing, for judging crawl speed
    url="https://line1-h5-pc-api.biligame.com/game/comment/page?game_base_id={0}&rank_type=2&page_num={1}&page_size=10".format(app_id, pagenum)
    comment_page = fetchURL(url)  # download the page

    parsered_comment = None
    if comment_page is None:
        # fetchURL prints the error and returns None on failure; handing None
        # to the parser would raise and throw away everything crawled so far.
        print('Page %d skipped: download failed' % pagenum)
    else:
        try:
            parsered_comment = parserHtml(comment_page)  # parse one page
        except ValueError as e:
            # e.g. the response body was not valid JSON
            print('Page %d skipped: %s' % (pagenum, e))

    if parsered_comment is not None:
        # Only take as many entries as the shortest column holds, so a
        # partially parsed record can never leave the columns with different
        # lengths (which would make the DataFrame construction below fail).
        complete = min(len(column) for column in parsered_comment)
        for target, column in zip(all_columns, parsered_comment):
            target.extend(column[:complete])
        t2=tm.time()
        timing=t2-t1
        print('Page %d grapped, %5.2f seconds used' % (pagenum,timing))  # progress report

    if pagenum%10 == 0:
        # Pause after every 10th page to keep the request rate low
        print('Taking a break... To avoid being detected')
        tm.sleep(5)

# Step 3: crawl finished; assemble the table and export it
result={"content" : all_content,
    "grade" : all_grade,
    "publish_time" : all_publish_time,
    "up_count" : all_up_count,
    "down_count" : all_down_count,
    "user_name" : all_user_name,
    "user_level" : all_user_level,
    }

resultpd=pd.DataFrame(result)
print('\n*************************************************')
print('Comment grapping finished. %d comments grapped in total' % (len(resultpd['content'])))
resultpd.to_excel('Bili_comment_appid{}.xlsx'.format(app_id))
print('Written to 【Bili_comment_appid{}.xlsx】'.format(app_id))
print('*************************************************')
T2 = tm.time()
print('程序运行时间:%s秒' % ((T2 - T1)))


*************************************************
Grapping comment data of app:200 from BiliBili...
*************************************************

Page 1 grapped,  1.58 seconds used
Page 2 grapped,  1.14 seconds used
Page 3 grapped,  0.87 seconds used
Page 4 grapped,  0.92 seconds used
Page 5 grapped,  1.21 seconds used
Page 6 grapped,  0.89 seconds used
Page 7 grapped,  0.92 seconds used
Page 8 grapped,  0.91 seconds used
Page 9 grapped,  1.25 seconds used
Page 10 grapped,  1.11 seconds used
Taking a break... To avoid being detected
Page 11 grapped,  1.21 seconds used
Page 12 grapped,  1.39 seconds used
Page 13 grapped,  1.89 seconds used
Page 14 grapped,  0.78 seconds used
Page 15 grapped,  1.77 seconds used
Page 16 grapped,  1.23 seconds used
Page 17 grapped,  1.24 seconds used
Page 18 grapped,  0.80 seconds used
Page 19 grapped,  0.55 seconds used
Page 20 grapped,  0.72 seconds used
Taking a break... To avoid being detected
Page 21 grapped,  0.43 seconds used
Page 22 grapped,  

In [16]:
# Preview the first 50 rows of the scraped comment table.
resultpd.head(50)

Unnamed: 0,content,grade,publish_time,up_count,down_count,user_name,user_level
0,虽然经常都在玩王者可是这个游戏真的太糟糕了根本不让我赢\n如果不是充了钱我玩都不玩,2,2021-12-12 21:44:36,0,0,硬币够了再改名,4
1,无话可说,2,2021-12-12 21:11:28,0,0,幻梦无枉,3
2,(◦˙▽˙◦),10,2021-12-12 21:09:55,0,0,一笑小白,3
3,*扎不多德勒！,2,2021-12-12 20:48:35,0,0,乌鱼子酱拌饭,5
4,垃圾游戏，我被安排了，给个一星不过分吧,2,2021-12-12 20:32:44,0,0,hwbbsuunw,5
5,非常好！,10,2021-12-12 20:27:38,0,0,卖个萌给爷看,4
6,垃圾游戏，一星都不想给！,2,2021-12-12 20:11:21,1,1,火石岩,4
7,我去(⇀‸↼‶)三次五连就出了荣耀水晶了，我近期是不是能吃就多吃点了，不然没机会了？？！,10,2021-12-12 19:28:42,6,0,Sunny李心,4
8,嫦娥的翻牌都得翻全了才能到下一关，该不会只有我是这么倒霉的吧，我不信！！,8,2021-12-12 19:14:20,7,0,欣新sunn,4
9,快过年了，是不是该得给我们瑶瑶公主来件新衣呢，不要多华丽的，类似嫦娥的那套也行。,10,2021-12-12 19:02:24,8,0,冬梅n花,4
