In [3]:
# import package
import html
import json
import os
import pickle
import socket

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import numpy as np
import pandas as pd
import requests
import socks
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.errors import HttpError

In [4]:
def use_credentials(credentials,client_secrets_file):
    """Return usable Google OAuth credentials, caching them in ``token.pickle``.

    The client secrets file downloaded from the Google API console is used to
    run the interactive OAuth flow once; the resulting credentials are pickled
    to ``token.pickle`` in the current working directory so that later runs of
    the scraper do not need manual authorisation again.

    Args:
        credentials: Credentials object to start from, or ``None``. It is
            replaced by the cached credentials whenever ``token.pickle`` exists.
        client_secrets_file: Path to the OAuth client secrets JSON file. Only
            read when a new interactive authorisation is required.

    Returns:
        The cached, refreshed, or newly obtained credentials object.
    """
    scopes = ["https://www.googleapis.com/auth/youtube.force-ssl"]
    token_path = 'token.pickle'

    if os.path.exists(token_path):
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file. Keep token.pickle somewhere untrusted users cannot write.
        with open(token_path, 'rb') as token:
            credentials = pickle.load(token)

    # Cached (or caller-supplied) credentials are still good: nothing to do.
    if credentials and credentials.valid:
        return credentials

    if credentials and credentials.expired and credentials.refresh_token:
        # Expired but refreshable: renew without user interaction.
        credentials.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(client_secrets_file, scopes)
        if hasattr(flow, 'run_console'):
            credentials = flow.run_console()
        else:
            # Newer google-auth-oauthlib releases no longer provide
            # run_console(); use the local-server flow instead of crashing
            # with AttributeError.
            credentials = flow.run_local_server(port=0)

    # Save the credentials for the next run
    with open(token_path, 'wb') as token:
        pickle.dump(credentials, token)
    return credentials

In [5]:
def _iter_pages(list_method, params):
    """Yield every response page of a YouTube ``list`` endpoint.

    The first request is executed without error handling, so a failure there
    propagates to the caller. Later pages are fetched by following
    ``nextPageToken``; an ``HttpError`` on a later page is printed and ends the
    iteration, so pages fetched so far are kept and the failing request is not
    retried endlessly.
    """
    response = list_method(**params).execute()
    while True:
        yield response
        next_token = response.get("nextPageToken")
        if not next_token:
            return
        print('..')
        try:
            response = list_method(pageToken=next_token, **params).execute()
        except HttpError as e:
            print("An HTTP error %d occurred:\n%s" % (e.resp.status, e.content))
            return


def _comment_row(snippet):
    """Build one output row (author, publish time, like count, text) from a comment snippet."""
    return [
        snippet["authorDisplayName"],
        snippet["publishedAt"],
        snippet["likeCount"],
        snippet["textDisplay"],
    ]


def get_comments(video_Id,credentials,api_service_name,api_version,searchTerms=None):
    """Download all top-level comments and their replies for one YouTube video.

    Args:
        video_Id: YouTube video id.
        credentials: Google API credentials, e.g. the value returned by
            ``use_credentials()``.
        api_service_name: API service name passed to ``googleapiclient.discovery.build``.
        api_version: API version passed to ``googleapiclient.discovery.build``.
        searchTerms: Optional keyword filter; when given, it is sent with every
            ``commentThreads.list`` request. When omitted, no filter is applied.

    Returns:
        A ``pandas.DataFrame`` with the columns ``author``, ``publishtime``,
        ``likeCount`` and ``comment``. Each top-level comment is followed by
        its replies. Non-empty results are built through ``np.array``, so every
        value (including ``likeCount``) ends up as a string.

    Raises:
        googleapiclient.errors.HttpError: If the first page of comment threads,
            or the first page of replies for a thread, cannot be fetched.
    """
    import googleapiclient.discovery
    youtube = googleapiclient.discovery.build(api_service_name, api_version, credentials=credentials)

    thread_params = {
        "part": "snippet,replies",
        "videoId": video_Id,
        "maxResults": 100,
        "textFormat": "html",
    }
    if searchTerms:
        thread_params["searchTerms"] = searchTerms

    count = 0
    comments = []
    for page in _iter_pages(youtube.commentThreads().list, thread_params):
        count += int(page['pageInfo']['totalResults'])
        for item in page["items"]:
            # Only a subset of the available fields is kept; extend
            # _comment_row() if more data is needed.
            comments.append(_comment_row(item["snippet"]["topLevelComment"]["snippet"]))

            if int(item['snippet']['totalReplyCount']) > 0:
                # Fetch the complete reply list for this thread. The same text
                # format is used for every page so rows are consistent.
                reply_params = {
                    "part": "snippet",
                    "parentId": item['id'],
                    "maxResults": 100,
                    "textFormat": "html",
                }
                for reply_page in _iter_pages(youtube.comments().list, reply_params):
                    for reply in reply_page["items"]:
                        comments.append(_comment_row(reply["snippet"]))

    print('get comment count: ', str(count))
    # An empty list would become a 1-D array that cannot be given four column
    # names, so build an explicit 0x4 array in that case.
    data = np.array(comments) if comments else np.empty((0, 4), dtype=object)
    print('total comments and replies: ',data.shape[0])
    df = pd.DataFrame(data, columns=['author', 'publishtime', 'likeCount', 'comment',])
    return df
    

In [6]:
def improve_format(df, max_length=50):
    """Clean up the ``comment`` column and drop over-long comments.

    HTML line breaks (``<br>``, ``<br/>``, ``<br />``) are replaced by a single
    space and HTML entities such as ``&#39;``, ``&quot;`` and ``&amp;`` are
    decoded. The input frame is not modified; a new frame is returned.

    Args:
        df: DataFrame with a string ``comment`` column.
        max_length: Rows whose cleaned comment is longer than this many
            characters are dropped. Pass ``None`` to keep every row.

    Returns:
        A new DataFrame with the cleaned ``comment`` column, filtered by
        ``max_length``.
    """
    cleaned_text = (
        df['comment']
        # Strip tags before decoding entities so that a literal "&lt;br&gt;"
        # typed by a user is not mistaken for a line break.
        .str.replace(r'<br\s*/?>', ' ', regex=True)
        .map(html.unescape, na_action='ignore')
    )
    cleaned = df.assign(comment=cleaned_text)
    if max_length is None:
        return cleaned
    return cleaned[cleaned['comment'].str.len() <= max_length]

In [None]:
def scrap(videoId_list):
    """Scrape the comments of every listed video into ``<video_id>_comments.csv``.

    For each entry the comments are fetched with ``get_comments()``, cleaned
    with ``improve_format()`` and written as a CSV file (UTF-8 with BOM, no
    index column) in the current working directory.

    Args:
        videoId_list: Iterable of YouTube video ids. Anything after a ``&``
            (for example a ``&t=312s`` fragment copied from a URL) is ignored,
            because it is not part of the video id.
    """
    # 1. Configuration.
    # NOTE(review): this tells oauthlib to accept non-HTTPS transport; it is
    # meant for local development only - confirm it is still required.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "cici_client_secret.json"    # downloaded from the Google API console

    # 2. Load cached credentials or run the OAuth flow once.
    credentials = use_credentials(None,client_secrets_file)

    # 3. Fetch, clean and store the comments of each video.
    for raw_id in videoId_list:
        # Drop URL parameters that were pasted together with the id.
        video_id = raw_id.strip().split('&', 1)[0]
        df = get_comments(video_id,credentials,api_service_name,api_version)
        df = improve_format(df)
        output_filename = video_id + '_comments.csv'
        df.to_csv(output_filename, index=False, encoding='utf_8_sig')


In [14]:
# Bare YouTube video ids only - URL parameters such as "&t=312s" are not part of the id.
video_list = ['PlESWcaAHKk','vkWH92FhT3M','8NEl21QYDdI','AFw6MqXAOJ8','B0_G9_MBRk4','ozf26RgCm1A','scnUloWsz2g','ZEJUb4cThlM']

In [13]:
# Scrape every video in video_list; scrap() writes one '<video_id>_comments.csv' file per video.
scrap(video_list)

get comment count:  4
total comments and replies:  4
get comment count:  42
total comments and replies:  42
get comment count:  4
total comments and replies:  4
get comment count:  0
total comments and replies:  0
get comment count:  15
total comments and replies:  15
get comment count:  4
total comments and replies:  4
get comment count:  11
total comments and replies:  11
get comment count:  0
total comments and replies:  0
get comment count:  42
total comments and replies:  42
get comment count:  28
total comments and replies:  28
get comment count:  21
total comments and replies:  21
get comment count:  97
total comments and replies:  97
get comment count:  73
total comments and replies:  73
get comment count:  2
total comments and replies:  2
get comment count:  1
total comments and replies:  1
get comment count:  0
total comments and replies:  0
get comment count:  4
total comments and replies:  4
get comment count:  4
total comments and replies:  4
get comment count:  1
total co