# Dependencies

In [41]:
import sys

# %pip (unlike !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
# TODO: pin exact versions once the working set is known.
%pip install -q google-api-python-client spacy pandas tqdm

# Download the spaCy model with the kernel's own interpreter; a bare
# `python` on PATH may belong to a different environment.
!{sys.executable} -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Imports

In [42]:
import os
import re
import warnings
from urllib.parse import urlparse, parse_qs

import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd
import spacy
from tqdm import tqdm

# Small English spaCy pipeline, shared by the text-processing helpers.
nlp = spacy.load("en_core_web_sm")

# The API key is a secret and must not live in the notebook: anyone who can
# read the file (or its git history) can spend the project's quota.
# Read it from the environment instead. Any key that was previously committed
# here should be revoked in the Google Cloud console.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY")
if not DEVELOPER_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 "
        "key before running this notebook."
    )

# Shared YouTube Data API v3 client used by the fetch helpers.
youtube = googleapiclient.discovery.build(
    "youtube", "v3", developerKey=DEVELOPER_KEY
)

# Load and preprocess

In [43]:
def extract_video_id(raw: str) -> str | None:
    raw = raw.strip()

    if raw.startswith("http://") or raw.startswith("https://"):
        url = urlparse(raw)

        if "youtu.be" in url.hostname:
            vid = url.path.lstrip("/")
        else:
            qs = parse_qs(url.query)
            vid = qs.get("v", [None])[0]

        if vid and "&" in vid:
            vid = vid.split("&")[0]
        return vid

    if "&" in raw:
        raw = raw.split("&")[0]

    if "watch?v=" in raw:
        raw = raw.split("watch?v=")[1]

    return raw

In [44]:
# Read the non-blank lines of the input list; the context manager closes the
# file handle as soon as the lines are read.
with open("data_november2025-24.txt", encoding="utf-8") as fh:
    raw_lines = [line.strip() for line in fh if line.strip()]

# dict.fromkeys drops duplicate IDs in linear time and keeps first-seen order.
# Lines that yield no ID are skipped.
video_ids = list(dict.fromkeys(
    vid for vid in map(extract_video_id, raw_lines) if vid
))

print(video_ids)

['ljMvbmmOigw', '9OmBd0GNpbk', 'PbgVumaLCl0', 'KeB0zpP-yQ0', 'xjtYNvwsYGg', 'dmaDI8V_aFE', 'OatwxrKxz_Y', '_w_2bbZ5FF0', '3f52N1pJ9as', 'an4NbhFV9JQ', 'JhzLlCZs3ys', 'Id5XsIRneaE', 'FP1YbhJ3jqY', 'hqZ4tbZjNrA', 'KNkjveGaFj8', 'Gus0SfUSGyM', '8wIk-FMsXb8', 'MNyjCdH3JMY', '0js0eitf3hI', '3Y2nyBD0yLU', '6mdaIbkX4UZWjVpD', 'lqdDjg2_knS2nulV', 'r9ARwvDDBg1tV5Np', 'zKT16IZSNlcJlU2Z', 'v23uecZmzeelfq5D', 'CZHES0yRnP2s0vBB', 'XoEAxDuDMwTUoS-D', 'moXT1HQM-4egP9d8', 'UKhWqvzylCHIKqR9', 'M0vlSKbR1nnQk7Wh', '30Uuzh1ZH3ZndWWY', '08FrwnW3L5iqo1Te', 'AthO1orK9AcyqiKL', 'srtTKcmWYwiL38-Q', 'Ftafhme8jDpsc2NF', 'Oy-dGo2ICnOSBiVE', 'uLl5GnsHsgrF0SA-', 'a_qswLd7gr9GPXiw', 'GLcEHOmWDWh5Xasb', '6S9mfL7SOOMVut88']


In [45]:
def clean_text(text):
    """
    Normalise a comment to lowercase ASCII letters separated by single spaces.

    The text is lowercased, anything starting with ``http`` up to the next
    whitespace is removed, every character that is not ``a``-``z`` or
    whitespace becomes a space, and runs of whitespace are collapsed.
    Leading and trailing whitespace is stripped from the result.
    """
    # Applied in order: each step works on the output of the previous one.
    substitutions = (
        (r"http\S+", ""),
        (r"[^a-z\s]", " "),
        (r"\s+", " "),
    )

    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

def lemmatize(text):
    """
    Return the lemmas of the alphabetic tokens in ``text``, space-separated.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    whose ``is_alpha`` flag is false are left out.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_alpha:
            lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [46]:
def get_video_metadata(video_id):
    """
    Look up a video's title and publication timestamp.

    Queries ``videos().list(part="snippet")`` on the module-level ``youtube``
    client.

    Parameters
    ----------
    video_id : str
        Value passed as the ``id`` filter of the request.

    Returns
    -------
    tuple
        ``(title, published_at)``. ``published_at`` is the snippet's
        ``publishedAt`` value, or None when the field is absent.
        ``("UNKNOWN TITLE", None)`` is returned when the API reports no
        matching video or the lookup fails.

    Notes
    -----
    Failures are reported as ``RuntimeWarning`` instead of being raised, so a
    single bad video does not abort a batch run. They are no longer silent,
    because an invalid key or exhausted quota would otherwise label every
    video "UNKNOWN TITLE" without any hint.
    """
    fallback = ("UNKNOWN TITLE", None)

    try:
        response = youtube.videos().list(
            part="snippet",
            id=video_id
        ).execute()

        items = response.get("items", [])
        if not items:
            warnings.warn(
                f"No video found for ID {video_id!r}",
                RuntimeWarning,
                stacklevel=2,
            )
            return fallback

        snip = items[0]["snippet"]
        return snip["title"], snip.get("publishedAt")

    except Exception as exc:
        # Best-effort by design: report the failure and keep going.
        warnings.warn(
            f"Could not fetch metadata for video {video_id!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return fallback

In [47]:
def get_all_comments(video_id):
    """
    Collect the top-level comments of a video, following pagination.

    Requests ``commentThreads().list(part="snippet", textFormat="plainText")``
    on the module-level ``youtube`` client and keeps asking for the next page
    until the response carries no ``nextPageToken``. Only each thread's
    ``topLevelComment`` is read; replies are not collected.

    Parameters
    ----------
    video_id : str
        Value passed as ``videoId`` to the request.

    Returns
    -------
    list of dict
        One ``{"text": ..., "publishedAt": ...}`` entry per comment, in the
        order the API returned them. ``publishedAt`` is None when the field
        is missing.

    Notes
    -----
    If a request fails, pagination stops and the comments gathered so far are
    returned. The failure is reported as a ``RuntimeWarning`` so a truncated
    or empty result can be told apart from a video that has few comments.
    """
    comments = []
    next_page = None

    while True:
        try:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,  # largest page size the API accepts
                pageToken=next_page,
                textFormat="plainText"
            ).execute()
        except Exception as exc:
            # Best-effort by design: keep what we have, but say why we stopped.
            warnings.warn(
                f"Stopped fetching comments for video {video_id!r} after "
                f"{len(comments)} comment(s): {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        for item in response.get("items", []):
            snip = item["snippet"]["topLevelComment"]["snippet"]
            comments.append({
                "text": snip["textDisplay"],
                "publishedAt": snip.get("publishedAt")
            })

        next_page = response.get("nextPageToken")
        if not next_page:
            break

    return comments

In [48]:
# One record per comment; video-level fields are repeated on every record.
rows = []

for vid in tqdm(video_ids):
    title, video_published_at = get_video_metadata(vid)
    comment_items = get_all_comments(vid)
    url = f"https://www.youtube.com/watch?v={vid}"

    # Key order here fixes the column order of the resulting table.
    rows.extend(
        {
            "id": vid,
            "video_title": title,
            "url": url,
            "video_published_at": video_published_at,
            "comment": item["text"],
            "comment_published_at": item["publishedAt"],
            # Lemmas are computed on the cleaned text; the raw comment is kept as-is.
            "lemma_comment": lemmatize(clean_text(item["text"])),
        }
        for item in comment_items
    )

100%|██████████| 40/40 [02:01<00:00,  3.04s/it]


In [49]:
# Pin the schema explicitly so the frame and the CSV header have the same
# columns even when no comments were collected (an empty `rows` would
# otherwise produce a column-less frame and a header-less file).
df = pd.DataFrame(
    rows,
    columns=[
        "id",
        "video_title",
        "url",
        "video_published_at",
        "comment",
        "comment_published_at",
        "lemma_comment",
    ],
)
df.to_csv("youtube_comments_with_dates.csv", index=False)
df.head()

Unnamed: 0,id,video_title,url,video_published_at,comment,comment_published_at,lemma_comment
0,ljMvbmmOigw,Military expert reacts to intensified fighting...,https://www.youtube.com/watch?v=ljMvbmmOigw,2025-11-03T15:15:07Z,Not much if an expert- he lies toi much and is...,2025-11-24T04:03:13Z,not much if an expert he lie toi much and be a...
1,ljMvbmmOigw,Military expert reacts to intensified fighting...,https://www.youtube.com/watch?v=ljMvbmmOigw,2025-11-03T15:15:07Z,Russia liberated 240 sq km of territory in Sep...,2025-11-24T03:55:20Z,russia liberate sq km of territory in sept sq ...
2,ljMvbmmOigw,Military expert reacts to intensified fighting...,https://www.youtube.com/watch?v=ljMvbmmOigw,2025-11-03T15:15:07Z,"Forget Trump, move on without him. Grab a toma...",2025-11-22T03:51:39Z,forget trump move on without he grab a tomahaw...
3,ljMvbmmOigw,Military expert reacts to intensified fighting...,https://www.youtube.com/watch?v=ljMvbmmOigw,2025-11-03T15:15:07Z,"Hij is misschien militair deskundige, maar zek...",2025-11-14T11:16:51Z,hij be misschien militair deskundige maar zeke...
4,ljMvbmmOigw,Military expert reacts to intensified fighting...,https://www.youtube.com/watch?v=ljMvbmmOigw,2025-11-03T15:15:07Z,worthless news !!,2025-11-14T04:19:47Z,worthless news


In [50]:
# (rows, columns) of the exported frame: one row per collected comment,
# one column per key of the records in `rows`.
df.shape

(14109, 7)