# Dependencies

In [1]:
# !pip install google-api-python-client spacy pandas tqdm python-dotenv transformers
# !python -m spacy download en_core_web_sm

# Imports

In [2]:
import googleapiclient.discovery
import googleapiclient.errors
import pandas as pd
import re
from tqdm import tqdm
from urllib.parse import urlparse, parse_qs
import os
from dotenv import load_dotenv
from datetime import datetime
from tqdm import tqdm
from transformers import AutoTokenizer

ModuleNotFoundError: No module named 'googleapiclient'

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# API_KEY_RESERVED takes precedence (it is the key the client was effectively
# built with); API_KEY is used as a fallback when the reserved key is not set.
DEVELOPER_KEY_1 = os.getenv("API_KEY_RESERVED") or os.getenv("API_KEY")
if not DEVELOPER_KEY_1:
    # Fail here rather than letting every later request fail and be swallowed
    # by the best-effort error handling in the fetch helpers.
    raise RuntimeError(
        "No YouTube API key found: set API_KEY_RESERVED or API_KEY "
        "in the environment or in the .env file."
    )

# YouTube Data API v3 client shared by the fetch helpers below.
youtube = googleapiclient.discovery.build(
    "youtube", "v3", developerKey=DEVELOPER_KEY_1
)


In [None]:
# NOTE(review): MAX_COMMENT_TOKENS is not referenced by any other cell visible
# in this notebook; kept in case external code relies on it.
MAX_COMMENT_TOKENS = 300

# Loaded without trust_remote_code: the same checkpoint is loaded that way
# further down in this notebook, so no repository-supplied code needs to run.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

# Load and preprocess

In [None]:
def make_full_prompt(comment: str) -> str:
    """Render the stance-classification prompt for ``comment`` as a chat string.

    The instructions, label guidelines and few-shot examples are placed in a
    single user message with ``comment`` interpolated near the end. That message
    is rendered (not tokenized) through the module-level ``tokenizer``'s chat
    template with the generation prompt appended.
    """
    user_content = f"""Classify the stance of this comment about Ukraine-Russia conflict.

Return ONLY one word: prorussian, neutral, or proukrainian

Guidelines:
- prorussian: supports Russia, criticizes Ukraine/Zelenskyy, justifies invasion
- proukrainian: supports Ukraine, criticizes Russia/Putin, condemns invasion
- neutral: balanced view, no clear stance, or discusses both sides equally

Consider sarcasm and irony when present.

Examples:
"Slava Ukraini!" → proukrainian
"Putin is a great leader" → prorussian
"'Great strategist' Putin lost again lol" → proukrainian
"War hurts everyone" → neutral
"Zelenskyy sells Ukraine to NATO" → prorussian

Comment: "{comment}"

Classification:"""
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": user_content}],
        tokenize=False,
        add_generation_prompt=True,
    )

# Token budget for the whole rendered prompt (instructions + comment).
MAX_TOTAL_TOKENS = 512

def truncate_for_prompt(comment: str) -> str:
    """Return the longest prefix of ``comment`` whose full prompt fits the budget.

    The prompt is built with ``make_full_prompt`` and measured with the
    module-level ``tokenizer``; it fits when it encodes to at most
    ``MAX_TOTAL_TOKENS`` tokens.

    Returns ``comment`` unchanged when it already fits, otherwise a character
    prefix found by binary search. If not even the empty prefix fits, the empty
    string is returned, so an over-budget comment is never passed through
    untruncated.
    """

    def fits(candidate: str) -> bool:
        n_tokens = len(tokenizer.encode(make_full_prompt(candidate)))
        return n_tokens <= MAX_TOTAL_TOKENS

    # Fast path: a comment that already fits needs a single tokenization
    # instead of a full binary search.
    if fits(comment):
        return comment

    # Binary search on the prefix length in characters. This assumes the token
    # count does not shrink as the prefix grows (approximately true for BPE).
    # The full length is already known not to fit, so start one below it.
    best = ""
    low, high = 0, len(comment) - 1
    while low <= high:
        mid = (low + high) // 2
        sub = comment[:mid]
        if fits(sub):
            best = sub
            low = mid + 1
        else:
            high = mid - 1
    return best

In [None]:
def extract_video_id(raw: str) -> str | None:
    raw = raw.strip()

    if raw.startswith("http://") or raw.startswith("https://"):
        url = urlparse(raw)

        if "youtu.be" in url.hostname:
            vid = url.path.lstrip("/")
        else:
            qs = parse_qs(url.query)
            vid = qs.get("v", [None])[0]

        if vid and "&" in vid:
            vid = vid.split("&")[0]
        return vid

    if "&" in raw:
        raw = raw.split("&")[0]

    if "watch?v=" in raw:
        raw = raw.split("watch?v=")[1]

    return raw

In [None]:
# Swap in "data/video_ids_2.txt" to process the second list of videos.
VIDEO_IDS_PATH = "data/video_ids_1.txt"

# Read inside a context manager so the file handle is closed deterministically.
with open(VIDEO_IDS_PATH, encoding="utf-8") as id_file:
    raw_lines = [line.strip() for line in id_file if line.strip()]

# dict.fromkeys drops repeated IDs while keeping first-seen order, replacing
# the quadratic "not in list" check; falsy extraction results are skipped.
video_ids = list(dict.fromkeys(
    vid for vid in map(extract_video_id, raw_lines) if vid
))

print(video_ids)

['rXcJZAdcvjw', 'pn4qmgywvtQ', '_J83Lt6jyUc', 'ZrqnVTWS7mc', 'Miv8sjZ4mYc', 'qmHh-NWf6ws', 'ufAWgVGS_lc', '8nzTet1ggZQ', 'QIAS-4crwgU', 'Tmz4rnD68wY', 'JwlVmvRp4Go', 'SNKltLQ-Zv8', 'TltZQDB-Ql8', '3g7VNsFGNNw', '_UvknbtULms', 'GP9RvaOR_Lg', 'bMjfG_OMa3A', 'Vr4fGL-YUVQ', 'PxA-l5rS4GQ', '-3vfVf5DrqI', 'Ka-HJFRMXsc', '1XgaslZBSCM', '2hxNHnXpahg', 'zK1Kboq2_00', '4xcyiZJvfno', '3657IsNuuI0', 'DcPODJ3woAs', 'oF4i7x2hyFQ', 'myIdliK1YMI', 'Z9wLQJS2GJs', 'Ai1xu62yBAE', 'H-NIBHEdhMA', 'WtIiNXw5tXM', 'cY5krhFMJCU', '0LpyyCT_FCU', 'X-x5C2Iajuk', '5R6lZwRVgA8', '5PEIlusom9I', 'MV4Kz8TDP5I', 'QIsq1_0JOug', 'MDkO4crYVHE', 'r89mwXdCS8o', 'bYWtmCS9fLU', 'ljMvbmmOigw', '-S4wbbSOyCI', 't9ZgPWg8bfA', 'o0u8-hTZMss', 'OaZQcUseI3E', 'RWJaUAjSWS4', 'VjEEs4RTec8', 'MRx-KaRjw1s', 'Wb7WpyaYtsU', 'eIz7M9CAdiU', '0-fnvdSsXcc', 'hSmnq9G9m38', 'w-dc4kyfkBY', 'gPlWn17cseQ', 'KeB0zpP-yQ0', 'X-gaWuutg8s', 'oqceQyUPts8', 'awq0gvbNWsA', 'Cv8_KzTNJ30', '47XITR7MLXU', '18IXagJA4Qg', 'xjtYNvwsYGg', 'kmzKY6r7e2I', 'LmzpQo79

In [None]:
def clean_text(t):
    """Reduce ``t`` to space-separated ASCII letters.

    Applied in order: drop every ``http...`` token, replace each character that
    is neither an ASCII letter nor whitespace with a space, collapse whitespace
    runs to a single space, then trim both ends.
    """
    substitutions = (
        (r"http\S+", ""),
        (r"[^a-zA-Z\s]", " "),
        (r"\s+", " "),
    )
    cleaned = t
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


In [None]:
def get_video_metadata(video_id):
    """
    Look up a video's snippet via ``videos().list`` on the module-level
    ``youtube`` client.

    Returns (title, published_at)

    Best-effort: when the API returns no item, or anything in the lookup
    raises, ("UNKNOWN TITLE", None) is returned. Failures are logged as
    warnings so that problems are visible instead of silently turning into
    placeholder rows.
    """
    import logging

    try:
        response = youtube.videos().list(part="snippet", id=video_id).execute()
        items = response.get("items", [])
        if not items:
            return "UNKNOWN TITLE", None

        snippet = items[0]["snippet"]
        return snippet["title"], snippet.get("publishedAt")

    except Exception as exc:
        # Keep the scrape running, but surface the cause of the failure.
        logging.getLogger(__name__).warning(
            "Metadata lookup failed for video %s: %r", video_id, exc
        )
        return "UNKNOWN TITLE", None

In [None]:
def get_all_comments(video_id):
    """
    Page through ``commentThreads().list`` (100 threads per page, plain text,
    order="time") on the module-level ``youtube`` client and collect the
    top-level comment of every thread.

    Returns list of dicts:
    [
      {"text": "...", "publishedAt": "..."},
      ...
    ]

    Best-effort: if a page request raises, paging stops and the comments
    collected so far are returned. The interruption is logged as a warning so a
    truncated result is distinguishable from a complete one.
    """
    import logging

    comments = []
    next_page = None

    while True:
        try:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,
                pageToken=next_page,
                textFormat="plainText",
                order="time"
            )
            response = request.execute()
        except Exception as exc:
            logging.getLogger(__name__).warning(
                "Comment fetch stopped for video %s after %d comments: %r",
                video_id, len(comments), exc,
            )
            break

        for item in response.get("items", []):
            snip = item["snippet"]["topLevelComment"]["snippet"]
            comments.append({
                "text": snip["textDisplay"],
                "publishedAt": snip.get("publishedAt")
            })

        # No token on the response means this was the last page.
        next_page = response.get("nextPageToken")
        if not next_page:
            break

    return comments

In [None]:
def normalize_date(date_str):
    """Convert an ISO-8601 timestamp string to a ``YYYY-MM-DD`` date string.

    "Z" is rewritten as "+00:00" before parsing with
    ``datetime.fromisoformat``. Returns "UNKNOWN" for ``None``, for non-string
    values, and for strings that cannot be parsed.
    """
    if date_str is None:
        return "UNKNOWN"
    try:
        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # Non-string input (no usable .replace) or a malformed timestamp.
        # Deliberately not a bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        return "UNKNOWN"
    return dt.date().isoformat()

In [None]:
# One output row per top-level comment, tagged with its video's metadata.
rows = []

for vid in tqdm(video_ids):
    title, video_published_at = get_video_metadata(vid)
    # Comments are truncated to the prompt budget as they are collected.
    rows.extend(
        {
            "id": vid,
            "video_title": title,
            "video_published_at": video_published_at,
            "comment": truncate_for_prompt(item["text"]),
        }
        for item in get_all_comments(vid)
    )

  3%|▎         | 25/775 [00:53<26:57,  2.16s/it]


KeyboardInterrupt: 

In [None]:
# Build the comment table: one row per dict accumulated in `rows`
# (columns: id, video_title, video_published_at, comment).
df = pd.DataFrame(rows)

In [None]:
# Keep only rows whose comment is present and non-blank after stripping.
df = (
    df.loc[lambda frame: frame["comment"].notna()]
    .loc[lambda frame: frame["comment"].str.strip() != ""]
)

In [None]:
# Persist the scraped comments without the index column.
# NOTE(review): the "_1" suffix presumably pairs with data/video_ids_1.txt — confirm
# and adjust the suffix when scraping the second ID list.
df.to_csv("data/youtube_comments_1.csv", index=False)

In [None]:
df.shape

(760432, 4)

In [None]:
# NOTE(review): this repeats the normalize_date definition from the scraping
# section earlier in this notebook; consider keeping a single copy.
def normalize_date(date_str):
    """Convert an ISO-8601 timestamp string to a ``YYYY-MM-DD`` date string.

    "Z" is rewritten as "+00:00" before parsing with
    ``datetime.fromisoformat``. Returns "UNKNOWN" for ``None``, for non-string
    values, and for strings that cannot be parsed.
    """
    if date_str is None:
        return "UNKNOWN"
    try:
        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # Non-string input (no usable .replace) or a malformed timestamp.
        # Deliberately not a bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        return "UNKNOWN"
    return dt.date().isoformat()

In [None]:
df.describe()

Unnamed: 0,id,video_title,video_published_at,comment
count,1554327,1554327,1554327,1554327
unique,1505,1496,657,1553304
top,a9-Dz7RgeuQ,Putin comments on Trump charges,2023-09-13,>>>>>>>> Increase Your Satisfaction By Underst...
freq,31707,31707,33470,14


In [None]:
# df = pd.concat([df1, df2], ignore_index=True)
# Order rows by the raw publication timestamp first, then reduce that column
# to a plain date string.
df = (
    df.sort_values("video_published_at", ascending=True)
    .reset_index(drop=True)
    .assign(
        video_published_at=lambda frame: frame["video_published_at"].apply(normalize_date)
    )
)

In [None]:
df.head()

Unnamed: 0,id,video_title,video_published_at,comment
0,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,Bro russia isn t the bad guy bro I don t belie...
1,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,Dam wars
2,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,Well I think Russia and China are gonna target...
3,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,I remember another autocratic leader that thou...
4,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,Russia always planned on taking Moldova after ...


In [None]:
df.shape

(1615470, 4)

In [None]:
# Drop rows with placeholder or missing metadata and blank comments, then keep
# the first occurrence of each distinct comment text and renumber the index.
df = (
    df.loc[lambda frame: frame["video_published_at"] != "UNKNOWN"]
    .loc[lambda frame: frame["video_title"] != "UNKNOWN TITLE"]
    .dropna(subset=["video_published_at", "video_title", "comment"])
    .loc[lambda frame: frame["comment"].str.strip() != ""]
    .drop_duplicates(subset=["comment"])
    .reset_index(drop=True)
)

In [None]:
df.shape

(1494017, 4)

In [None]:
# Renumber rows 0..n-1 (rebinding instead of mutating in place), then preview.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,video_title,video_published_at,comment
0,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,Bro russia isn t the bad guy bro I don t belie...
1,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,Well I think Russia and China are gonna target...
2,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,I remember another autocratic leader that thou...
3,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,Russia always planned on taking Moldova after ...
4,JmuWnsRdTjI,Worry spreads about country Putin may target next,2023-02-19,I hope Putin invades one or several more count...


In [None]:
# Persist the filtered, de-duplicated table without the index column.
df.to_csv('data/youtube_comments.csv', index=False)

In [None]:
df.tail()

Unnamed: 0,id,video_title,video_published_at,comment
1494012,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,UKRAINE CRIMEA
1494013,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,The arrogance of Putin to boldly claim that Ru...
1494014,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,India loves Russia a true friend
1494015,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,where are all the idiots who claimed russia in...
1494016,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,And then Putin will stop attacking and murderi...


In [36]:
import pandas as pd
# Reload the saved table and keep a seeded random sample of floor(len / 2) rows.
# NOTE(review): a later cell writes the result back to this same path
# ('data/youtube_comments.csv'), so re-running from here samples half of the
# already-halved file — confirm this is intended or write to a separate file.
df = pd.read_csv('data/youtube_comments.csv')
df = df.sample(n=len(df)//2, random_state=42)

In [37]:
df.describe()

Unnamed: 0,id,video_title,video_published_at,comment
count,747008,747008,747008,747008
unique,1511,1502,657,747008
top,a9-Dz7RgeuQ,Putin comments on Trump charges,2023-09-13,Don t Blame a religion of Billions for the act...
freq,14945,14945,15783,1


In [38]:
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm

# Tokenizer used below to measure the prompt overhead and to truncate comments.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

In [39]:
def get_prompt_overhead():
    """Count the tokens of the classification prompt with an empty comment slot.

    The template is wrapped as a single user message, rendered through the
    module-level ``tokenizer``'s chat template (generation prompt included),
    and the length of its encoding is returned.

    NOTE(review): this template differs from the one in ``make_full_prompt``;
    confirm it matches the prompt actually sent to the classifier.
    """
    template_without_comment = """Classify stance: prorussian, proukrainian, neutral

Examples:
"Slava Ukraini!" → proukrainian
"Russia is liberating Donbas" → prorussian
"War is terrible for both sides" → neutral

Comment:

Answer:"""
    rendered = tokenizer.apply_chat_template(
        [{"role": "user", "content": template_without_comment}],
        tokenize=False,
        add_generation_prompt=True,
    )
    token_ids = tokenizer.encode(rendered)
    return len(token_ids)


In [40]:
# Total token budget per prompt, and the share of it left for the comment
# once the fixed prompt text is accounted for.
MAX_TOKENS = 512
prompt_overhead = get_prompt_overhead()
max_comment_tokens = MAX_TOKENS - prompt_overhead

print(
    f"Max tokens: {MAX_TOKENS}",
    f"Prompt overhead: {prompt_overhead}",
    f"Available for comment: {max_comment_tokens}",
    sep="\n",
)

Max tokens: 512
Prompt overhead: 88
Available for comment: 424


In [41]:
def truncate_comment(comment):
    """Limit ``comment`` to at most ``max_comment_tokens`` tokenizer tokens.

    Missing values (per ``pd.isna``) become an empty string. Anything else is
    converted with ``str``, encoded without special tokens by the module-level
    ``tokenizer``, cut to the first ``max_comment_tokens`` tokens when longer,
    and decoded back to text with special tokens skipped.
    """
    if pd.isna(comment):
        return ""

    comment_tokens = tokenizer.encode(str(comment), add_special_tokens=False)
    was_truncated = len(comment_tokens) > max_comment_tokens
    if was_truncated:
        comment_tokens = comment_tokens[:max_comment_tokens]

    text = tokenizer.decode(comment_tokens, skip_special_tokens=True)
    if was_truncated:
        # With a byte-level BPE vocabulary the cut can land inside a multi-byte
        # character, which decodes to U+FFFD; drop such a dangling marker.
        text = text.rstrip("\ufffd")
    return text

In [42]:
# Truncate every comment to the per-comment token budget.
df['comment'] = df['comment'].apply(truncate_comment)

In [43]:
df.head()

Unnamed: 0,id,video_title,video_published_at,comment
909178,biZ4Ybjp77Q,Officials: ‘Militants’ attack synagogues and c...,2024-06-23,Don t Blame a religion of Billions for the act...
263234,DrMoBUlIHqI,See the alleged drone attack video Putin is ca...,2023-05-03,Sounds like Leo is trying to give out a few ti...
1110791,gLFDvU44B-s,Zelensky postpones trip to Saudi Arabia amid U...,2025-02-18,Well you don t want to tell him to act too ups...
272324,mpFOnFOFSio,What Wagner leader’s message may tell us about...,2023-05-06,My opinion is just take Russia off earth delet...
738280,2OCnNshhqZo,Wife of jailed Putin critic responds to Tucker...,2024-02-08,Karmurza is not a journalist he is a traitor w...


In [44]:
# Sampling shuffled the rows; restore ascending order by publication date and
# renumber the index.
df = df.sort_values(by="video_published_at").reset_index(drop=True)

In [45]:
# NOTE(review): df_reset is not used by any other cell visible in this
# notebook, and the displayed frame is `df`, not `df_reset` — confirm whether
# this copy is still needed.
df_reset = df.reset_index(drop=True)
df.tail()

Unnamed: 0,id,video_title,video_published_at,comment
747003,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,I predict a nuclear strike to make the land in...
747004,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,By any SACRIFICE translation from ruSSian krem...
747005,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,He must be destroyed
747006,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,Count Drakul in the thumbnail
747007,rXcJZAdcvjw,Putin vows Russia will seize Donbas region by ...,2025-12-04,free Palestine stop genocide fuck CNN


In [46]:
# Persist the sampled, truncated table without the index column.
# NOTE(review): this overwrites the same file that was read before sampling
# ("data/youtube_comments.csv"), replacing the full table with the half-size
# sample — confirm this is intended or write to a separate file.
df.to_csv("data/youtube_comments.csv", index=False)