# Dependencies

In [146]:
# !pip install google-api-python-client spacy pandas tqdm
# !python -m spacy download en_core_web_sm

# Imports

In [147]:
import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd
import spacy
import re
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs
import os
from dotenv import load_dotenv

nlp = spacy.load("en_core_web_sm")

load_dotenv()
DEVELOPER_KEY = os.getenv("API_KEY")

youtube = googleapiclient.discovery.build(
    "youtube", "v3", developerKey=DEVELOPER_KEY
)

# Load and preprocess

In [148]:
def extract_video_id(raw: str) -> str | None:
    raw = raw.strip()

    if raw.startswith("http://") or raw.startswith("https://"):
        url = urlparse(raw)

        if "youtu.be" in url.hostname:
            vid = url.path.lstrip("/")
        else:
            qs = parse_qs(url.query)
            vid = qs.get("v", [None])[0]

        if vid and "&" in vid:
            vid = vid.split("&")[0]
        return vid

    if "&" in raw:
        raw = raw.split("&")[0]

    if "watch?v=" in raw:
        raw = raw.split("watch?v=")[1]

    return raw

In [149]:
raw_lines = [line.strip() for line in open("data/data_november2025-24.txt") if line.strip()]
video_ids = []

for line in raw_lines:
    vid = extract_video_id(line)
    if vid and vid not in video_ids:
        video_ids.append(vid)

print(video_ids)

['ljMvbmmOigw', '9OmBd0GNpbk', '###########1', 'PbgVumaLCl0', 'KeB0zpP-yQ0', 'xjtYNvwsYGg', 'dmaDI8V_aFE', '##########2', 'OatwxrKxz_Y', '_w_2bbZ5FF0', '3f52N1pJ9as', '##########3', 'an4NbhFV9JQ', 'JhzLlCZs3ys', 'Id5XsIRneaE', '###########4', 'FP1YbhJ3jqY', 'hqZ4tbZjNrA', 'KNkjveGaFj8', 'Gus0SfUSGyM', '###########5', '8wIk-FMsXb8', 'MNyjCdH3JMY', '0js0eitf3hI', '##########6', '3Y2nyBD0yLU', '6mdaIbkX4UZWjVpD', 'lqdDjg2_knS2nulV', '###########7', 'r9ARwvDDBg1tV5Np', 'zKT16IZSNlcJlU2Z', 'v23uecZmzeelfq5D', '###########8', 'CZHES0yRnP2s0vBB', 'XoEAxDuDMwTUoS-D', 'moXT1HQM-4egP9d8', 'UKhWqvzylCHIKqR9', '###########9', 'M0vlSKbR1nnQk7Wh', '30Uuzh1ZH3ZndWWY', '08FrwnW3L5iqo1Te', '###########10', 'AthO1orK9AcyqiKL', 'srtTKcmWYwiL38-Q', '###########11', 'Ftafhme8jDpsc2NF', 'Oy-dGo2ICnOSBiVE', 'uLl5GnsHsgrF0SA-', '###########12', 'a_qswLd7gr9GPXiw', 'GLcEHOmWDWh5Xasb', '6S9mfL7SOOMVut88']


In [150]:
def clean_text(t):
    t = re.sub(r"http\S+", "", t)
    t = re.sub(r"[^a-zA-Z\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

def lemmatize(text):
    doc = nlp(text)
    return " ".join([tok.lemma_ for tok in doc if tok.is_alpha])

In [151]:
def get_video_metadata(video_id):
    """
    Returns (title, published_at)
    """
    try:
        request = youtube.videos().list(
            part="snippet",
            id=video_id
        )
        response = request.execute()
        items = response.get("items", [])
        if not items:
            return "UNKNOWN TITLE", None

        snip = items[0]["snippet"]
        title = snip["title"]
        published = snip.get("publishedAt")

        return title, published

    except Exception:
        return "UNKNOWN TITLE", None

In [152]:
def get_all_comments(video_id):
    """
    Returns list of dicts:
    [
      {"text": "...", "publishedAt": "..."},
      ...
    ]
    """
    comments = []
    next_page = None

    while True:
        try:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,
                pageToken=next_page,
                textFormat="plainText",
                order="time"
            )
            response = request.execute()
        except Exception:
            break

        for item in response.get("items", []):
            snip = item["snippet"]["topLevelComment"]["snippet"]
            comments.append({
                "text": snip["textDisplay"],
                "publishedAt": snip.get("publishedAt")
            })

        next_page = response.get("nextPageToken")
        if not next_page:
            break

    return comments

In [153]:
def normalize_date(date_str):
    from datetime import datetime
    dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    return dt.date().isoformat()

In [154]:
rows = []

for vid in tqdm(video_ids):
    title, video_published_at = get_video_metadata(vid)
    comment_items = get_all_comments(vid)

    for c in comment_items:
        comment = c["text"]

        cleaned = clean_text(comment)
        # lemma = lemmatize(cleaned)

        rows.append({
            "id": vid,
            "video_title": title,
            "video_published_at": normalize_date(video_published_at),
            "comment": cleaned,
            # "lemma_comment": lemma
        })

  0%|          | 0/52 [00:00<?, ?it/s]

100%|██████████| 52/52 [00:25<00:00,  2.05it/s]


In [155]:
df = pd.DataFrame(rows)
df = df.sort_values("video_published_at", ascending=True).reset_index(drop=True)

In [156]:
df = df[df["comment"].notna()]
df = df[df["comment"].str.strip() != ""]

In [157]:
df.to_csv("data/youtube_comments_with_dates.csv", index=False)
df.head()

Unnamed: 0,id,video_title,video_published_at,comment
0,3Y2nyBD0yLU,Trump warns of 'decisions' that may need to be...,2025-05-07,Without a shadow of a doubt Donald J Trump wil...
1,3Y2nyBD0yLU,Trump warns of 'decisions' that may need to be...,2025-05-07,These things are out of the hands of any polit...
2,3Y2nyBD0yLU,Trump warns of 'decisions' that may need to be...,2025-05-07,Great president Trump The ultimate mr DumDum
3,3Y2nyBD0yLU,Trump warns of 'decisions' that may need to be...,2025-05-07,Putin owns traitor tRUMP and so this is all an...
4,3Y2nyBD0yLU,Trump warns of 'decisions' that may need to be...,2025-05-07,Trumps physical condition looks to be worsenin...


In [158]:
df.shape

(13796, 4)