In [1]:
from typing import Generator, List, Dict, Any
from dataclasses import dataclass, asdict
import json
from googleapiclient.discovery import build
import time
import sqlite3
import os
import pandas as pd

In [2]:
# Load the YouTube Data API key from a JSON file kept outside the notebook.
# The context manager closes the file handle as soon as the key has been read.
with open('../keys/youtube_key.json', encoding='utf-8') as key_file:
    DEVELOPER_KEY = json.load(key_file)['api_key']

# YouTube video IDs that the cells below pass to fetch_500_threads / intersect_data.
KAMALA_CALL_HER_DADDY = "_KCRsjPCiCI"
KAMALA_WIRED = "u9-cjwpthz4"
KAMALA_ALL_THE_SMOKE = "bzThwqnQJDY"

TRUMP_LEX_FRIEDMAN = "qCbfTN-caFI"
TRUMP_JOE_ROGAN = "hBMoPUAeLnY"
TRUMP_THEO_VON = "vC5cHjcgt5g"


In [3]:
@dataclass
class Comment:
    """A single YouTube comment, either a top-level comment or a reply."""
    id: str  # the comment resource's "id"
    author_id: str  # "value" of the snippet's "authorChannelId"
    author_name: str  # snippet "authorDisplayName"
    content: str  # snippet "textDisplay"
    date: str  # snippet "publishedAt", stored as the raw string from the API
    like_count: int  # snippet "likeCount"


@dataclass
class Thread:
    """A top-level comment on a video together with the replies fetched for it."""
    video_id: str  # ID of the video the thread was fetched for
    reply_count: int  # "totalReplyCount" reported in the thread snippet
    top_level_comment: Comment  # the comment that starts the thread
    replies: List[Comment]  # replies collected by the scraper; not cross-checked against reply_count


In [4]:
class YoutubeCommentsScraper:
    """Fetches comment threads (top-level comments plus their replies) for a
    video through a YouTube Data API client created with ``build``."""

    def __init__(
            self,
            api_key: str,
            api_service_name: str = "youtube",
            api_version: str = "v3",
            reply_request_delay: float = 0.1,
    ) -> None:
        """Create the API client.

        Args:
            api_key: Developer key passed to ``build``.
            api_service_name: Name of the Google API service to build.
            api_version: Version of that service.
            reply_request_delay: Seconds to sleep after each replies-page
                request. Defaults to the previously hard-coded 0.1.
        """
        self.API_KEY = api_key
        self.reply_request_delay = reply_request_delay

        self.youtube = build(
            api_service_name,
            api_version,
            developerKey=self.API_KEY)

    def fetch_threads(
            self,
            video_id: str
    ) -> Generator[Thread, None, None]:
        """Yield every comment thread of ``video_id``, ordered by relevance.

        Threads are requested 100 per page; the replies of each thread are
        fetched eagerly before the thread is yielded.
        """
        request = self.youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            textFormat="plainText",
            order="relevance",
        )
        while request:
            response = request.execute()
            # Tolerate a page without an "items" key, like the replies loop does.
            for thread in response.get("items", []):
                snippet = thread["snippet"]
                reply_count = snippet["totalReplyCount"]
                main_comment_data = snippet["topLevelComment"]
                top_level_comment = self._comment_from_resource(main_comment_data)
                # A thread that reports no replies needs no extra request
                # (and no extra sleep): skipping it saves API quota.
                if reply_count:
                    replies = list(self._fetch_reply_comments(main_comment_data["id"]))
                else:
                    replies = []
                yield Thread(
                    video_id=video_id,
                    reply_count=reply_count,
                    top_level_comment=top_level_comment,
                    replies=replies
                )
            request = self.youtube.commentThreads().list_next(request, response)

    def _fetch_reply_comments(
            self,
            parent_comment_id: str
    ) -> Generator[Comment, None, None]:
        """Yield all replies to the comment ``parent_comment_id``, page by page."""
        request = self.youtube.comments().list(
            part="snippet",
            parentId=parent_comment_id,
            maxResults=100,
            textFormat="plainText"
        )
        while request:
            response = request.execute()
            # Pause between reply requests to throttle the request rate.
            time.sleep(self.reply_request_delay)
            for comment in response.get("items", []):
                yield self._comment_from_resource(comment)
            request = self.youtube.comments().list_next(request, response)

    @staticmethod
    def _comment_from_resource(
            resource: Dict[str, Any]
    ) -> Comment:
        """Convert a comment resource dict from the API into a ``Comment``.

        ``author_id`` is ``None`` when the snippet carries no
        "authorChannelId"; the API does not guarantee that field, and a
        missing one used to abort the whole scrape with ``KeyError``.
        """
        snippet = resource["snippet"]
        author_channel = snippet.get("authorChannelId") or {}
        return Comment(
            id=resource["id"],
            author_id=author_channel.get("value"),
            author_name=snippet["authorDisplayName"],
            content=snippet["textDisplay"],
            date=snippet["publishedAt"],
            like_count=snippet["likeCount"],
        )


In [5]:
class NDJSONThreadSaver:  # NDJSON = newline-delimited JSON
    """Appends threads to per-video ``<video_id>.ndjson`` files, one JSON
    object per line."""

    def __init__(self, data_dir: str = 'data'):
        """Store the output directory, creating it when it does not exist."""
        os.makedirs(data_dir, exist_ok=True)
        self.data_dir = data_dir

    def save(self, thread: Thread):
        """Append ``thread`` as a single JSON line to its video's file."""
        target = os.path.join(self.data_dir, f"{thread.video_id}.ndjson")

        with open(target, 'a', encoding='utf-8') as out:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            out.write(json.dumps(asdict(thread), ensure_ascii=False))
            out.write("\n")

In [6]:
class SQLiteThreadSaver:
    """Persists threads into a SQLite ``Comments`` table.

    Each comment becomes one row; replies point at the row of their
    top-level comment through ``parent_id``.
    """

    def __init__(self, db_name: str = 'threads.db'):
        """Open (or create) the database file and make sure the table exists."""
        self.db_name = db_name
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
        self._create_table()

    def _create_table(self):
        """Create the ``Comments`` table if it is not there yet."""
        self.cursor.execute('''
        CREATE TABLE IF NOT EXISTS Comments (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            yt_id TEXT UNIQUE,
            author_id TEXT,
            author_name TEXT,
            content TEXT,
            date TEXT,
            like_count INTEGER,
            parent_id INTEGER,
            video_id TEXT,
            FOREIGN KEY (parent_id) REFERENCES Comments(id)
        )''')
        self.conn.commit()

    def save(self, thread: Thread):
        """Store a thread: its top-level comment first, then every reply.

        A thread whose top-level comment is already stored is skipped as a
        whole. Replies that are already stored are skipped individually.
        If anything else goes wrong, the rows written for this thread are
        rolled back and the error is re-raised, so a later ``save`` cannot
        commit a half-written thread.
        """
        try:
            top_level_comment_id = self._save_comment(
                thread.top_level_comment,
                video_id=thread.video_id,
            )

            if top_level_comment_id is None:
                print("Duplicated id. Skipped thread")
                return

            for reply in thread.replies:
                self._save_comment(
                    reply,
                    parent_id=top_level_comment_id,
                    video_id=thread.video_id,
                )
        except BaseException:
            self.conn.rollback()
            raise

        self.conn.commit()

    def _save_comment(
            self,
            comment: Comment,
            video_id : str,
            parent_id : int | None = None
    ) -> int | None:
        """Insert one comment row.

        Returns:
            The row id of the inserted comment, or ``None`` when the insert
            violates a table constraint (the ``UNIQUE`` constraint on
            ``yt_id``, i.e. the comment is already stored).

        Raises:
            sqlite3.Error: for any failure other than a constraint
                violation; these are no longer reported as duplicates.
        """
        try:
            self.cursor.execute(
                '''
                    INSERT INTO Comments (yt_id, author_id, author_name, content, date, like_count, parent_id, video_id)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''',
                (comment.id, comment.author_id, comment.author_name, comment.content, comment.date, comment.like_count, parent_id, video_id)
            )
        except sqlite3.IntegrityError:
            return None

        return self.cursor.lastrowid  # id of the row that was just inserted

    def close(self):
        """Close the database connection."""
        self.conn.close()

In [13]:
def fetch_500_threads(video_id : str, max_threads : int = 500):
    """Scrape up to ``max_threads`` threads of a video and persist each one.

    Every thread is appended to the NDJSON file of the video and inserted
    into the SQLite database. Relies on the module-level ``scraper``.

    Args:
        video_id: ID of the video whose comment threads are fetched.
        max_threads: Maximum number of threads to save (500 by default).
    """
    if max_threads <= 0:
        return

    ndjson_saver = NDJSONThreadSaver()
    sqlite_saver = SQLiteThreadSaver()
    try:
        for idx, thread in enumerate(scraper.fetch_threads(video_id)):
            ndjson_saver.save(thread)
            sqlite_saver.save(thread)
            print(f"[{idx}] Fetched thread of author {thread.top_level_comment.author_name} with {thread.reply_count} replies")
            # Stop right after the last wanted thread. Checking at the top of
            # the next iteration would first make the generator fetch one more
            # thread and all of its replies, only to discard them.
            if idx + 1 >= max_threads:
                break
    finally:
        # Release the database connection even when scraping fails midway.
        sqlite_saver.close()

def intersect_data(video_id_1 : str, video_id_2 : str, db_name : str = 'threads.db') -> pd.DataFrame:
    """Return the stored comments of authors who commented on both videos.

    The author set is the intersection of the non-NULL ``author_id`` values
    found for ``video_id_1`` and ``video_id_2``. The outer query is not
    restricted by video, so every stored comment of those authors is
    returned, including comments on other videos.

    Args:
        video_id_1: First video ID.
        video_id_2: Second video ID.
        db_name: Path of the SQLite database to read.

    Returns:
        A DataFrame with one row per matching row of ``Comments``.
    """
    # Video IDs are bound as parameters rather than interpolated into the
    # SQL text, so quotes in an ID cannot break or alter the query.
    query = """
    SELECT *
    FROM Comments
    WHERE author_id IN (
        SELECT author_id
        FROM Comments
        WHERE video_id = ? AND author_id IS NOT NULL
        INTERSECT
        SELECT author_id
        FROM Comments
        WHERE video_id = ? AND author_id IS NOT NULL
    )
    """

    con = sqlite3.connect(db_name)
    try:
        return pd.read_sql_query(query, con, params=(video_id_1, video_id_2))
    finally:
        # Close the connection even when the query raises.
        con.close()

In [8]:
# Module-level API client; fetch_500_threads looks this name up when it runs.
scraper = YoutubeCommentsScraper(api_key=DEVELOPER_KEY)

In [None]:
# Scrape one video after the other.
for video_id in (TRUMP_JOE_ROGAN, KAMALA_CALL_HER_DADDY):
    fetch_500_threads(video_id)

In [15]:
# Scrape the four remaining videos one after the other.
for video_id in (
        TRUMP_LEX_FRIEDMAN,
        TRUMP_THEO_VON,
        KAMALA_ALL_THE_SMOKE,
        KAMALA_WIRED,
):
    fetch_500_threads(video_id)

[0] Fetched thread of author @lexfridman with 666 replies
[1] Fetched thread of author @乂 with 722 replies
[2] Fetched thread of author @HeavensLegacy with 734 replies
[3] Fetched thread of author @bernardvezina-gagnon24 with 723 replies
[4] Fetched thread of author @phunkyfill74 with 1 replies
[5] Fetched thread of author @joseph7988 with 744 replies
[6] Fetched thread of author @RunawayVet with 146 replies
[7] Fetched thread of author @TheJoshheart90 with 568 replies
[8] Fetched thread of author @realtwanieone with 0 replies
[9] Fetched thread of author @holtzlander with 282 replies
[10] Fetched thread of author @A_r78 with 578 replies
[11] Fetched thread of author @airbornranger8534 with 31 replies
[12] Fetched thread of author @GreekAekaras with 0 replies
[13] Fetched thread of author @tlucero6994 with 472 replies
[14] Fetched thread of author @johnyjoseph357 with 217 replies
[15] Fetched thread of author @urbanlegendwillevans with 98 replies
[16] Fetched thread of author @TheNWeez

In [21]:
# Comments by authors who commented on both videos. The outer SELECT in
# intersect_data is not filtered by video, so rows from other videos written
# by the same authors show up in the result as well.
df = intersect_data(TRUMP_THEO_VON, KAMALA_WIRED)
df

Unnamed: 0,id,yt_id,author_id,author_name,content,date,like_count,parent_id,video_id
0,3159,UgyjlYeTgGzIXo6WNDd4AaABAg.AA1wpWufy7jAA26pqsNLgS,UCMlPpk8WyoYwxX2XT8_m01A,@denisn8336,Accurate,2024-10-26T03:19:15Z,2,3085,hBMoPUAeLnY
1,3174,UgyjlYeTgGzIXo6WNDd4AaABAg.AA1wpWufy7jAA286pDlfI-,UCMlPpk8WyoYwxX2XT8_m01A,@denisn8336,@@Gotit4thaaL0 oh cry me a river and cope,2024-10-26T03:30:27Z,0,3085,hBMoPUAeLnY
2,5150,UgxwTlPCgMq9kjtAeqh4AaABAg.AA1yuIj8apfAA2FenW9Qg3,UCYyZgOXCxYphMpscM6akNLQ,@Pierre-LucBeauregard,Would be the prank of the century if all this ...,2024-10-26T04:36:23Z,0,5016,hBMoPUAeLnY
3,8270,UgzuaFTrXo8Xqx1EKC54AaABAg.AA3WnpuL23dAA5SNL5H8AU,UCMlPpk8WyoYwxX2XT8_m01A,@denisn8336,Awesome,2024-10-27T10:25:11Z,0,7718,hBMoPUAeLnY
4,21640,Ugy_sasClLMRpu8vbbt4AaABAg.A9MVfxBbmtyA9NIG5tuRba,UCpYgvVY3GgT6qSqZQfCXFiw,@sneakytacos773,Barstool boys have their firsts balled 😂🫵,2024-10-09T12:11:45Z,0,21637,_KCRsjPCiCI
...,...,...,...,...,...,...,...,...,...
64,53368,Ugzv7w6DFlCjA7SSCc94AaABAg.A8bXwFBse_AA8c7LhycPBo,UCMlPpk8WyoYwxX2XT8_m01A,@denisn8336,@@samurphyyour mistaking trump for Biden and K...,2024-09-20T19:12:51Z,1,53361,u9-cjwpthz4
65,53369,Ugzv7w6DFlCjA7SSCc94AaABAg.A8bXwFBse_AA8c7Nd1Lh26,UCMlPpk8WyoYwxX2XT8_m01A,@denisn8336,She’s a reptile,2024-09-20T19:13:07Z,0,53361,u9-cjwpthz4
66,53371,Ugzv7w6DFlCjA7SSCc94AaABAg.A8bXwFBse_AA8d6mLGcJ30,UCMlPpk8WyoYwxX2XT8_m01A,@denisn8336,@@friedhotwings nope I’m right\nCope,2024-09-21T04:27:07Z,0,53361,u9-cjwpthz4
67,53689,UgwQNeoPRcjVFxCyD214AaABAg.A8bezSg-Z8QA8btBFMmAlq,UCQmjnOHBtz0J3Nm_YU62nPQ,@effyiew7318,Lol Imagine thinking this. The fact that you ...,2024-09-20T17:00:21Z,0,53684,u9-cjwpthz4


In [None]:
# Next steps:
# 1. Add a second table to the database that holds the edges: for every comment, determine whether it is a reply and to whom (check parent_id as well as the comment's content).
# 2. Read that table, build a graph from it, and visualize the graph to inspect its structure.

