In [28]:
from typing import Generator, List, Dict, Any
from dataclasses import dataclass, asdict
import json
from googleapiclient.discovery import build
import time
import os

In [19]:
# Read the YouTube Data API key from a JSON file stored next to the project.
# The context manager closes the file handle as soon as the key has been parsed.
with open('../keys/youtube_key.json') as key_file:
    DEVELOPER_KEY = json.load(key_file)['api_key']

In [20]:
@dataclass
class Comment:
    """A single YouTube comment; used for both top-level comments and replies."""
    author: str  # display name of the comment's author
    content: str  # text of the comment
    date: str  # publication timestamp, stored as the string it was received as
    like_count: int  # number of likes on the comment


@dataclass
class Thread:
    """A comment thread of one video: a top-level comment together with its replies."""
    video_id: str  # id of the video the thread belongs to
    reply_count: int  # reply total as reported for the thread; may differ from len(replies)
    top_level_comment: Comment  # the comment that started the thread
    replies: List[Comment]  # replies collected for the top-level comment


In [21]:
class YoutubeCommentsScraper:
    """Streams the comment threads of a YouTube video through the YouTube Data API.

    Each thread is delivered as a ``Thread`` that bundles the top-level comment
    with the replies fetched for it.
    """

    # Pause, in seconds, after every replies request (presumably to throttle
    # the request rate). Override on an instance or subclass to tune it.
    REPLY_REQUEST_DELAY = 0.1

    def __init__(
            self,
            api_key: str,
            api_service_name: str = "youtube",
            api_version: str = "v3"
    ) -> None:
        """Build the API client.

        Args:
            api_key: Developer key passed to the client as ``developerKey``.
            api_service_name: Name of the Google API service to build.
            api_version: Version of that service.
        """
        self.API_KEY = api_key

        self.youtube = build(
            api_service_name,
            api_version,
            developerKey=self.API_KEY)

    def fetch_threads(
            self,
            video_id: str
    ) -> Generator[Thread, None, None]:
        """Lazily yield every comment thread of ``video_id``.

        Threads are requested 100 per page, ordered by relevance, as plain
        text. Replies are only requested for threads whose ``totalReplyCount``
        is non-zero, so reply-less threads cost no extra API call.

        Args:
            video_id: Id of the video whose comment threads are fetched.

        Yields:
            One ``Thread`` per comment thread, replies included.
        """
        request = self.youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            textFormat="plainText",
            order="relevance",
        )
        # list_next() supplies the request for the following page; the loop
        # ends once it returns a falsy value.
        while request:
            response = request.execute()
            # .get() mirrors _fetch_reply_comments: a page without "items" is
            # treated as empty instead of raising KeyError.
            for thread in response.get("items", []):
                snippet = thread["snippet"]
                reply_count = snippet["totalReplyCount"]
                main_comment_data = snippet["topLevelComment"]
                if reply_count:
                    replies = list(
                        self._fetch_reply_comments(main_comment_data["id"])
                    )
                else:
                    # Nothing to fetch: skip the request (and its delay).
                    replies = []
                yield Thread(
                    video_id=video_id,
                    reply_count=reply_count,
                    top_level_comment=self._comment_from_resource(main_comment_data),
                    replies=replies
                )
            request = self.youtube.commentThreads().list_next(request, response)

    def _fetch_reply_comments(
            self,
            parent_comment_id: str
    ) -> Generator[Comment, None, None]:
        """Lazily yield every reply to the comment ``parent_comment_id``.

        Replies are requested 100 per page as plain text, and the scraper
        pauses for ``REPLY_REQUEST_DELAY`` seconds after each request.

        Args:
            parent_comment_id: Id of the top-level comment.

        Yields:
            One ``Comment`` per reply.
        """
        request = self.youtube.comments().list(
            part="snippet",
            parentId=parent_comment_id,
            maxResults=100,
            textFormat="plainText"
        )
        while request:
            response = request.execute()
            time.sleep(self.REPLY_REQUEST_DELAY)
            for comment in response.get("items", []):
                yield self._comment_from_resource(comment)
            request = self.youtube.comments().list_next(request, response)

    @staticmethod
    def _comment_from_resource(
            resource: Dict[str, Any]
    ) -> Comment:
        """Convert a comment resource dict from the API into a ``Comment``.

        Args:
            resource: Dict with a ``"snippet"`` entry holding the keys
                ``authorDisplayName``, ``textDisplay``, ``publishedAt`` and
                ``likeCount``.

        Returns:
            The ``Comment`` built from those four snippet values.

        Raises:
            KeyError: If ``"snippet"`` or one of the four keys is missing.
        """
        snippet = resource["snippet"]
        return Comment(
            author=snippet["authorDisplayName"],
            content=snippet["textDisplay"],
            date=snippet["publishedAt"],
            like_count=snippet["likeCount"],
        )


In [29]:
class NDJSONThreadSaver:
    """Appends threads to newline-delimited JSON (NDJSON) files, one file per video."""

    def __init__(self, data_dir: str = 'data'):
        """Create ``data_dir`` if it does not exist and use it as the output directory.

        Args:
            data_dir: Directory that receives the ``<video_id>.ndjson`` files.
        """
        os.makedirs(data_dir, exist_ok=True)
        self.data_dir = data_dir

    def save(self, thread: Thread):
        """Append ``thread`` as one JSON line to ``<data_dir>/<video_id>.ndjson``.

        Args:
            thread: Dataclass instance to serialize; its ``video_id`` picks the file.
        """
        file_path = os.path.join(self.data_dir, f"{thread.video_id}.ndjson")

        # Serialize before touching the file and write the record in a single
        # call, so a failure while encoding cannot leave a truncated line behind.
        line = json.dumps(asdict(thread), ensure_ascii=False)

        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(line + "\n")

In [30]:
# Testing: stream the comment threads of one video into an NDJSON file

scraper = YoutubeCommentsScraper(api_key=DEVELOPER_KEY)
ndjson_saver = NDJSONThreadSaver()

# fetch_threads is a generator, so every thread is saved as soon as it arrives.
threads = scraper.fetch_threads(video_id="qCbfTN-caFI")
for idx, thread in enumerate(threads):
    ndjson_saver.save(thread)
    print(f"[{idx}] Fetched thread of author {thread.top_level_comment.author} with {thread.reply_count} replies")

[0] Fetched thread of author @lexfridman with 667 replies
[1] Fetched thread of author @乂 with 722 replies
[2] Fetched thread of author @HeavensLegacy with 734 replies


KeyboardInterrupt: 