## Getting API Keys

In [19]:
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the process
# environment, so the API key never has to be written into the notebook.
load_dotenv()

# Evaluates to None when YOUTUBE_API_KEY is not set.
YT_KEY = os.environ.get("YOUTUBE_API_KEY")

## Importing Necessary Libraries

In [20]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

## 🔑 Authenticate with YouTube Data API v3

In [21]:
# Module-level clients shared by the helper functions and the collection cell.
YOUTUBE_API_KEY = YT_KEY
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=YOUTUBE_API_KEY,
)
# VADER analyzer; its polarity_scores() output is read for the 'compound' score.
analyzer = SentimentIntensityAnalyzer()

## 🔥 Get Trending Videos (from YouTube)

In [22]:
def get_trending_videos(region_code='US', max_results=50):
    """Return up to ``max_results`` videos from YouTube's "mostPopular" chart.

    Uses the module-level ``youtube`` client.

    Args:
        region_code: Value sent to the API as ``regionCode``.
        max_results: Maximum number of videos to return. A single request is
            limited to 50 results, so larger values are fetched page by page.
            Values <= 0 return an empty list without calling the API.

    Returns:
        A list of dicts with the keys ``video_id``, ``title``, ``channel``
        and ``description``, in the order the API returned them.

    Raises:
        Whatever ``request.execute()`` raises (for the real client this is
        ``googleapiclient.errors.HttpError`` on a rejected request).
    """
    page_limit = 50  # videos.list rejects maxResults outside 1-50
    videos = []
    page_token = None

    while len(videos) < max_results:
        params = {
            # Only snippet fields are read below, so no other part is requested.
            'part': "snippet",
            'chart': "mostPopular",
            'regionCode': region_code,
            'maxResults': min(page_limit, max_results - len(videos)),
        }
        if page_token:
            params['pageToken'] = page_token

        response = youtube.videos().list(**params).execute()
        items = response.get('items', [])
        videos.extend(
            {
                'video_id': item['id'],
                'title': item['snippet']['title'],
                'channel': item['snippet']['channelTitle'],
                'description': item['snippet']['description'],
            }
            for item in items
        )

        page_token = response.get('nextPageToken')
        # Stop when the chart is exhausted (no further page, or an empty page).
        if not page_token or not items:
            break

    # Defensive: never hand back more than the caller asked for.
    return videos[:max_results]

## 🧠 Extract Transcripts

In [23]:
def get_transcript(video_id):
    """Return a video's transcript as a single space-joined string.

    Args:
        video_id: ID passed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``'text'`` of every transcript entry joined with spaces, or ``""``
        when the transcript could not be retrieved.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(entry['text'] for entry in transcript)
    except Exception as exc:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still stop the notebook, and report the failure so a
        # missing transcript is visible instead of silently becoming "".
        # NOTE(review): newer youtube-transcript-api releases appear to replace
        # the static get_transcript() with an instance fetch() - confirm the
        # installed version if this warning fires for every video.
        print(f"⚠️ No transcript for video {video_id}: {type(exc).__name__}")
        return ""

## 💬 Fetch Comments

In [24]:
def get_comments(video_id, max_comments=5):
    """Fetch the plain text of a video's top-level comments.

    Uses the module-level ``youtube`` client and asks for at most
    ``max_comments`` comment threads in a single request.

    Args:
        video_id: ID of the video whose comment threads are requested.
        max_comments: Value sent to the API as ``maxResults``.

    Returns:
        A list with the ``textDisplay`` of each returned top-level comment.
        An empty list is returned when the response has no ``items`` or when
        the request raises ``HttpError`` (a message is printed in that case).
    """
    try:
        payload = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_comments,
            textFormat="plainText",
        ).execute()
        texts = []
        for thread in payload.get('items', []):
            texts.append(thread['snippet']['topLevelComment']['snippet']['textDisplay'])
        return texts
    except HttpError as err:
        # Every 403 is reported as "comments disabled".
        # NOTE(review): the API may answer 403 for other refusals too - confirm.
        if err.resp.status != 403:
            print(f"⚠️ Error for video {video_id}: {err}")
        else:
            print(f"🚫 Comments disabled for video: {video_id}")
        return []

## 🧪 Put It Together

In [25]:
%%time
desired_video_count = 5
collected = 0
data = []

all_videos = get_trending_videos(max_results=50)

for vid in all_videos:
    if collected >= desired_video_count:
        break

    video_id = vid['video_id']
    comments = get_comments(video_id, max_comments=5)

    if not comments:
        continue  # Skip video if no usable comments

    transcript = get_transcript(video_id)
    transcript_sentiment = analyzer.polarity_scores(transcript)['compound']

    video_url = f"https://www.youtube.com/watch?v={video_id}"

    for comment in comments:
        data.append({
            'video_title': vid['title'],
            'channel': vid['channel'],
            'comment': comment,
            'comment_sentiment': analyzer.polarity_scores(comment)['compound'],
            'transcript_sentiment': transcript_sentiment,
            'video_url': video_url  # ✅ added here
        })

    collected += 1

df = pd.DataFrame(data)
df.to_csv("data/youtube_data.csv", index=False)
print("✅ Finished collecting data.")


✅ Finished collecting data.
CPU times: user 100 ms, sys: 8.27 ms, total: 108 ms
Wall time: 3.73 s


In [26]:
# Preview the first rows of the collected data (one row per comment).
df.head()

Unnamed: 0,video_title,channel,comment,comment_sentiment,transcript_sentiment,video_url
0,Brawl Talk: A NEW BRAWLER RARITY?!,Brawl Stars,Is the new brawler legendery?,0.0,0.0,https://www.youtube.com/watch?v=6D1yFt1Uook
1,Brawl Talk: A NEW BRAWLER RARITY?!,Brawl Stars,I want to get the giveaway😢❤,0.34,0.0,https://www.youtube.com/watch?v=6D1yFt1Uook
2,Brawl Talk: A NEW BRAWLER RARITY?!,Brawl Stars,"Came for the thumbnail, stayed for the vibes",0.0,0.0,https://www.youtube.com/watch?v=6D1yFt1Uook
3,Brawl Talk: A NEW BRAWLER RARITY?!,Brawl Stars,11:10,0.0,0.0,https://www.youtube.com/watch?v=6D1yFt1Uook
4,Brawl Talk: A NEW BRAWLER RARITY?!,Brawl Stars,"This was very informative, appreciate it.",0.4549,0.0,https://www.youtube.com/watch?v=6D1yFt1Uook


In [27]:
# Summarise the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   video_title           25 non-null     object 
 1   channel               25 non-null     object 
 2   comment               25 non-null     object 
 3   comment_sentiment     25 non-null     float64
 4   transcript_sentiment  25 non-null     float64
 5   video_url             25 non-null     object 
dtypes: float64(2), object(4)
memory usage: 1.3+ KB


In [28]:
# Count rows per video title; each row is one collected comment.
df["video_title"].value_counts()

video_title
Brawl Talk: A NEW BRAWLER RARITY?!                                                                      5
$1 vs $25,000 Build Challenge                                                                           5
Doechii - Anxiety (Official Video)                                                                      5
THE GASLIGHT DISTRICT: PILOT                                                                            5
Remontada épica. Abajo por 2 goles, el Barcelona ganó 4-3 a Celta de Vigo. Raphinha, héroe | La Liga    5
Name: count, dtype: int64

## 💾 Save to CSV

In [29]:
# Create the output folder if it does not exist yet, then write the
# DataFrame to CSV without the index column.
os.makedirs("data", exist_ok=True)
df.to_csv("data/youtube_data.csv", index=False)

### 🔮 Suggested Combined Analysis for Master Notebook
**Once Reddit and YouTube data are merged:**

✅ Sentiment Comparison  
Compare Reddit vs. YouTube sentiment for the same topic.

Highlight divergence between video (transcript) sentiment and comment sentiment.

✅ Engagement Analysis  
Plot word count vs. sentiment.  

Comment likes vs. sentiment score (for YouTube).  

✅ Timeline Tracking  
If the data is timestamped, track how sentiment evolves over time across platforms.  

✅ Word Cloud or Topic Modeling  
Extract common themes using LDA or nltk.FreqDist.