## Getting API Keys

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

REDDIT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")

## 📦 Install required libraries

In [2]:
# 📚 Imports
import praw
import pandas as pd
import re
import random
import time
from datetime import datetime, timedelta
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 🔑 Reddit API Setup (Read-only mode)

In [3]:
reddit = praw.Reddit(
    client_id=REDDIT_ID,
    client_secret=REDDIT_SECRET,
    user_agent="youtube_to_reddit_sentiment"
)

## 📥 Load video titles


In [4]:
youtube_df = pd.read_csv("../data/youtube_data.csv")

# 🕰️ Optional: Calculate Reddit 'time_filter' based on YouTube’s oldest timestamp
# We'll mock it as 'month' or 'week' here (PRAW only allows fixed ranges)
time_filter = "month"

## 🧹 Clean text

In [5]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.lower()

## 📊 Sentiment analysis


In [6]:
analyzer = SentimentIntensityAnalyzer()
def analyze_sentiment(text):
    return analyzer.polarity_scores(text)["compound"]

In [7]:
# 😃 Convert score to emoji
def sentiment_emoji(score):
    if score >= 0.5:
        return "😃"
    elif score <= -0.5:
        return "😠"
    else:
        return "😐"

## 🧠 Reddit Post + Comment collector


In [8]:
%%time
all_data = []

for topic in youtube_df["video_title"].unique():
    posts = reddit.subreddit("all").search(query=topic, limit=5, time_filter=time_filter)

    for post in posts:
        post_data = {
            "youtube_title": topic,
            "reddit_post_title": post.title,
            "post_score": post.score,
            "post_url": post.url,
            "post_created": pd.to_datetime(post.created_utc, unit="s"),
            "post_sentiment": analyze_sentiment(clean_text(post.title)),
            "post_sentiment_emoji": sentiment_emoji(analyze_sentiment(clean_text(post.title))),
        }

        # ⛓️ Add top 3 comments per post
        post.comments.replace_more(limit=0)
        comments = post.comments[:3]
        for comment in comments:
            cleaned = clean_text(comment.body)
            all_data.append({
                **post_data,
                "comment": comment.body,
                "cleaned_comment": cleaned,
                "comment_sentiment": analyze_sentiment(cleaned),
                "comment_sentiment_emoji": sentiment_emoji(analyze_sentiment(cleaned)),
                "comment_author": str(comment.author),
                "comment_score": comment.score
            })

CPU times: user 311 ms, sys: 16.2 ms, total: 327 ms
Wall time: 14.9 s


## Analyzing the DataFrame

In [9]:
df = pd.DataFrame(all_data)

In [10]:
df

Unnamed: 0,youtube_title,reddit_post_title,post_score,post_url,post_created,post_sentiment,post_sentiment_emoji,comment,cleaned_comment,comment_sentiment,comment_sentiment_emoji,comment_author,comment_score
0,Brawl Talk: A NEW BRAWLER RARITY?!,A NEW BRAWLER RARITY?! Brawl Talk is tomorrow!,2813,https://i.redd.it/qie7r1pu3lve1.jpeg,2025-04-18 12:08:41,0.0,😐,General reminder for subreddit members: Simple...,general reminder for subreddit members simple ...,0.5859,😃,AutoModerator,1
1,Brawl Talk: A NEW BRAWLER RARITY?!,A NEW BRAWLER RARITY?! Brawl Talk is tomorrow!,2813,https://i.redd.it/qie7r1pu3lve1.jpeg,2025-04-18 12:08:41,0.0,😐,RYAN IS BACK BABY,ryan is back baby,0.0000,😐,Exciting-Year-2343,1071
2,Brawl Talk: A NEW BRAWLER RARITY?!,A NEW BRAWLER RARITY?! Brawl Talk is tomorrow!,2813,https://i.redd.it/qie7r1pu3lve1.jpeg,2025-04-18 12:08:41,0.0,😐,the silhuette of the prawler is probably in en...,the silhuette of the prawler is probably in en...,0.7783,😃,gamer_withnolife,614
3,Brawl Talk: A NEW BRAWLER RARITY?!,BRAWL TALK LIVE DICUSSION: Talk about the new ...,11,https://www.reddit.com/gallery/1k2y8ba,2025-04-19 15:01:59,0.0,😐,ultra legendary. jesus. \*frick.\*\n\nEDIT: I'...,ultra legendary jesus frick\n\nedit im scared ...,-0.6697,😠,Capital-Ad3018,12
4,Brawl Talk: A NEW BRAWLER RARITY?!,BRAWL TALK LIVE DICUSSION: Talk about the new ...,11,https://www.reddit.com/gallery/1k2y8ba,2025-04-19 15:01:59,0.0,😐,They fucked up with the new rarity,they fucked up with the new rarity,-0.6597,😠,Alive-Skeleton,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,"Remontada épica. Abajo por 2 goles, el Barcelo...",[Match Thread] La Liga: Barcelona x Celta,2,https://www.reddit.com/r/futebol/comments/1k2y...,2025-04-19 15:11:37,0.0,😐,Pedri tentando fazer uma única partida ruim(im...,pedri tentando fazer uma nica partida ruimimpo...,0.0000,😐,LE__guardian,2
57,"Remontada épica. Abajo por 2 goles, el Barcelo...",Resumen de la jornada 25 de la ACB,1,https://www.reddit.com/r/NBAenEspanol/comments...,2025-03-31 03:41:26,0.0,😐,El Leyma Coruña creo que necesitaría acumular ...,el leyma corua creo que necesitara acumular ta...,-0.5267,😠,Galego_nativo,2
58,"Remontada épica. Abajo por 2 goles, el Barcelo...",Resumen de la jornada 25 de la ACB,1,https://www.reddit.com/r/NBAenEspanol/comments...,2025-03-31 03:41:26,0.0,😐,"Victoria del Madrid q pudo rotar, igualado ha...",victoria del madrid q pudo rotar igualado has...,-0.7003,😠,Fun-Equipment-575,2
59,"Remontada épica. Abajo por 2 goles, el Barcelo...",Resumen de la semana en la Euroliga (jornadas ...,0,https://www.reddit.com/r/NBAenEspanol/comments...,2025-03-29 00:40:52,0.0,😐,Parece que Usman Garuba ha recuperado su mejor...,parece que usman garuba ha recuperado su mejor...,-0.2960,😐,Galego_nativo,1


## Checking Downvoted Comments

In [11]:
df[df["comment_score"] < 0][["comment", "comment_score"]]

for comment, comment_score in zip(df["comment"], df["comment_score"]):
    if comment_score < 0:
        print(f"Comment:\n{comment}", f"Comment Score:\n{comment_score}", sep="\n\n")

## 💾 Save to CSV

In [12]:
df.to_csv("../data/reddit_data.csv", index=False)