In [45]:
# 导入操作系统接口模块，用于执行与操作系统相关的操作
import os

# 导入请求模块，用于发送HTTP请求
import requests

# 导入时间模块，用于处理时间相关的操作
import time

# 导入系统模块，用于访问系统特定的参数和函数
import sys

# 导入JSON处理模块，用于解析和处理JSON数据
import json

# 从Google API客户端库中导入构建函数，用于创建API服务对象
from googleapiclient.discovery import build

# 从Google API客户端库中导入HTTP错误处理模块，用于处理API调用时可能发生的HTTP错误
from googleapiclient.errors import HttpError

# 保存日志
import logging

In [14]:
# api
API_KEY = 'AIzaSyDFphGybwWRO5xx2c6W669sGT4UCrwwiPA'

# 使用YouTube Data API构建一个YouTube服务对象
# 这里的'youtube'是API服务名称，'v3'是API版本，developerKey是开发者密钥
youtube = build('youtube', 'v3', developerKey=API_KEY)

# 设置代理（用的不是真正的VPN的话，CMD ping不通google.com）
# 见：https://www.v2ex.com/t/877527
# 代理软件是不通 icmp 的，也就是 ping 不通的。
# 需要在Python中显示地配置代理
os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'

In [29]:
# 搜索视频
# 当调用 youtube.search().list() 方法时，API 返回一个包含搜索结果的 JSON 响应。

def search_videos(query,video_durations=['medium'],max_results=50):
    # query -> main()
    # max_results 此处不需要设置
    all_video_ids = []
    
    for duration in video_durations:
        request = youtube.search().list(
            part='snippet',  # 返回的数据部分，包含基本信息
            q=query,
            type='video',
            videoDuration=duration,  # 分别搜索短、中、长视频
            maxResults=max_results,  # 需要前100个视频，而单次请求最多只能返回50个，因此需要多次请求。
            order='relevance'  # 按相关性排序
        )
        response = request.execute()
        video_ids = [item['id']['videoId'] for item in response.get('items', [])]
        all_video_ids.extend(video_ids)
    
    return list(set(all_video_ids))  # 去重并返回视频唯一标识符列表


In [17]:
# 测试代码
def test_search_videos():
    query = 'black myth wukong'
    duration = 'medium'
    max_results = 1
    
    print(f"Searching for videos with query: '{query}', duration: '{duration}', max_results: {max_results}")
    
    request = youtube.search().list(
        part='snippet',
        q=query,
        type='video',
        videoDuration=duration,
        maxResults=max_results,
        order='relevance'
    )
    
    response = request.execute()
    
    # 打印完整的JSON响应
    print("Full JSON Response:")
    print(json.dumps(response, ensure_ascii=False, indent=4))
    
    # 提取并打印视频ID列表
    video_ids = [item['id']['videoId'] for item in response.get('items', [])]
    print("\nExtracted Video IDs:")
    print(video_ids)

test_search_videos()

Searching for videos with query: 'black myth wukong', duration: 'medium', max_results: 1
Full JSON Response:
{
    "kind": "youtube#searchListResponse",
    "etag": "W17ZIJ8ZulMIQJgkIbpP9iBRB0o",
    "nextPageToken": "CAEQAA",
    "regionCode": "SG",
    "pageInfo": {
        "totalResults": 670096,
        "resultsPerPage": 1
    },
    "items": [
        {
            "kind": "youtube#searchResult",
            "etag": "3_ZmpM2DeAavg0wFwepv3kdwHW0",
            "id": {
                "kind": "youtube#video",
                "videoId": "333otzqZq3w"
            },
            "snippet": {
                "publishedAt": "2024-12-18T23:49:30Z",
                "channelId": "UCzF5oxzeidHOZzy4KK5nxCQ",
                "title": "Black Myth Wukong Just Got The Biggest Update Ever...",
                "description": "Black Myth Wukong update (Black Myth Wukong Patch while we wait for black myth wukong DLC) Like the video? Subscribe now: ...",
                "thumbnails": {
                

In [44]:

def get_video_comments(video_id, max_comments=200, max_replies=20):
    comments = []
    request = youtube.commentThreads().list(
        part='snippet,replies', # 获取评论的基本信息和回复。
        videoId=video_id,
        textFormat='plainText', # 选择 'plainText' 可以避免处理HTML标签。但缺点是爬不了emoji，可能会对情感分析有些许影响（也不一定）
        maxResults=min(50,max_comments), # 获取请求返回的最大结果数量。最大返回数为50，如果要求的max_comments比这个还小，就设定为max_comments。
        order='relevance'  # 可以是 'time' 或 'relevance'。选择 'time' 按时间顺序排列，选择 'relevance' 按相关性排列。
    )
    while len(comments) < max_comments:
        response = request.execute()     # 执行请求并获取响应
        for item in response.get('items', []):  # 遍历每个评论线程
            comment_snippet = item['snippet']['topLevelComment']['snippet']
            if comment_snippet['likeCount'] > -1:  # 忽略0赞评论 -> 不忽略了，设置为-1
                comment_data = {
                    'comment_id': comment_snippet['authorChannelId']['value'],  # 发布者ID
                    'text': comment_snippet['textDisplay'],                     # 评论文本
                    'likes': comment_snippet['likeCount'],                      # 获赞数
                    'replies': []
                }
                if 'replies' in item and len(item['replies']['comments']) > 0:  # 处理回复
                    replies = item['replies']['comments'][:max_replies]  # 限制回复数量为20条
                    for reply_item in replies:
                        reply_snippet = reply_item['snippet']
                        # 回复的话0赞也要
                        comment_data['replies'].append({
                            'reply_id': reply_snippet['authorChannelId']['value'],  # 回复发布者ID
                            'reply_text': reply_snippet['textDisplay'],             # 回复文本
                            'reply_likes': reply_snippet['likeCount']              # 回复获赞数
                        })
                comments.append(comment_data)
        if 'nextPageToken' in response:  # 检查是否有更多页
            request = youtube.commentThreads().list_next(request, response)  # 请求下一页
        else:
            break
    return comments[:max_comments]


In [20]:
# 测试代码
def test_get_video_comments():
    video_id = '333otzqZq3w'
    print(f"Fetching comments for video ID: {video_id}")
    
    try:
        response = get_video_comments(video_id)
        
        # 打印完整的JSON响应
        print("Full JSON Response:")
        print(json.dumps(response, ensure_ascii=False, indent=4))
    
    except Exception as e:
        print(f"An error occurred: {e}")

test_get_video_comments()

Fetching comments for video ID: 333otzqZq3w
Full JSON Response:
[
    {
        "comment_id": "UCtlZVpaJuul3oF7YoqW3fYQ",
        "text": "This is one of those games i wish i could replay for the first time again 😢 Its so damn good! WE NEED A BIG DLC LIKE ER: SOTET!",
        "likes": 28,
        "replies": [
            {
                "reply_id": "UCpce7MP8fJmnTLOUwSO6V8Q",
                "reply_text": "guarantee were gonna get one",
                "reply_likes": 0
            },
            {
                "reply_id": "UCWJEKqwq1reXuvPaDmB0PzQ",
                "reply_text": "Try elden ring, dark souls, sekiro, god of war",
                "reply_likes": 0
            }
        ]
    },
    {
        "comment_id": "UCd4KEX4C4aV7ATKuvoWFYFw",
        "text": "They added a lot of stuff sheesh",
        "likes": 10,
        "replies": []
    },
    {
        "comment_id": "UCw8-QrfRfQ0KIvBmSIc5jcg",
        "text": "Most deserved GOTY.",
        "likes": 31,
        "replies": []

In [31]:
# 数据存储
def save_to_json(data, file_path): # 保存数据到json文件
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [33]:
# 构建本地路径测试
data_dir = os.path.join(os.getcwd(), 'data')
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [43]:
# 主程序
def main():
    # 设置查询参数
    query = 'black myth wukong'
    max_videos = 100    # 视频爬取最大数量
    max_comments = 200  # 每条视频评论爬取最大数量
    
    # 搜索视频
    videos = []
    video_duration = ['medium']

    print(f"Searching for videos with query: {query}")
    while len(videos) < max_videos:
        new_videos = search_videos(query, video_durations=video_duration, max_results=min(50, max_videos - len(videos)))
        if not new_videos:
            break
        videos.extend(new_videos)
    print(f"Found {len(videos)} videos.")

    # 获取评论
    all_comments = []
    comment_index = 1

    print(f"Fetching comments.")
    for video_id in videos:
        print(f"Fetching comments for video ID: {video_id}")
        comments = get_video_comments(video_id, max_comments=max_comments)
        print(f'{len(comments)} comments fetched.')
        for comment in comments:
            all_comments.append({
                'index': comment_index,
                'video_id': video_id,
                'comment_id': comment['comment_id'],
                'text': comment['text'],
                'likes': comment['likes'],
                'replies': comment['replies']
            })
            comment_index += 1
    print("Comments fetched.")

    # 构建数据目录的路径，基于当前工作目录
    print("Saving data to disk...")
    data_dir = os.path.join(os.getcwd(), 'data')
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # 将 all_comments 数据保存到 data 目录下的 comments.json 文件中
    save_to_json(all_comments, os.path.join(data_dir, 'comments.json'))
    print("Data saved.")

In [48]:

if __name__ == '__main__':
    try:
        main()
    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred:\n{e.content}")


Searching for videos with query: black myth wukong
Found 100 videos.
Fetching comments.
Fetching comments for video ID: 2PHqki02vUI
200 comments fetched.
Fetching comments for video ID: yc5BE-TzK5k
200 comments fetched.
Fetching comments for video ID: qOsDHymg_Yo
200 comments fetched.
Fetching comments for video ID: LRkf0R6VDRA
200 comments fetched.
Fetching comments for video ID: 3mgNkc0XlyY
200 comments fetched.
Fetching comments for video ID: kyp-nCNGkz8
200 comments fetched.
Fetching comments for video ID: 35DvC36BZYI
200 comments fetched.
Fetching comments for video ID: JPmC0R_hnjs
200 comments fetched.
Fetching comments for video ID: lUVI2KIJw2c
200 comments fetched.
Fetching comments for video ID: qLBy5WJ4FI4
46 comments fetched.
Fetching comments for video ID: Ky3ZPaL31H0
200 comments fetched.
Fetching comments for video ID: GYXbNX5_2QE
10 comments fetched.
Fetching comments for video ID: fY4JCD8guR8
200 comments fetched.
Fetching comments for video ID: YpF3ZRRo0n4
80 comments 