In [19]:
import asyncio
import csv
import json
import os
import re
import time

import aiohttp
import google.generativeai as genai
import pandas as pd

In [None]:
# --- API credentials and endpoints ---------------------------------------
# - Multiple API keys are used to enhance performance by working around the
#   rate limit of Gemini's API.
# - Each API key has a rate limit of 10 requests per minute. By rotating
#   between multiple keys, more requests can be made in a given time period.
# - This lets the pipeline keep processing without hitting the rate limit
#   too frequently.
# - A single API key each for Gemini and YouTube still works, although you
#   may run into rate limiting when the volume of requests is high.
#
# Keys are read from environment variables so real credentials never have to
# be typed into (and saved with) the notebook. When a variable is not set the
# original placeholder string is used, so the names below are always defined.
gemini_api = os.environ.get('GEMINI_API_KEY', 'your_gemini_api')
gemini_api_2 = os.environ.get('GEMINI_API_KEY_2', 'optional_gemini_api')
gemini_api_3 = os.environ.get('GEMINI_API_KEY_3', 'optional_gemini_api')
gemini_api_4 = os.environ.get('GEMINI_API_KEY_4', 'optional_gemini_api')
gemini_api_5 = os.environ.get('GEMINI_API_KEY_5', 'optional_gemini_api')
youtube_api = os.environ.get('YOUTUBE_API_KEY', 'your_youtube_api')
youtube_api_2 = os.environ.get('YOUTUBE_API_KEY_2', 'optional_youtube_api')
youtube_api_3 = os.environ.get('YOUTUBE_API_KEY_3', 'optional_youtube_api')

# Root of the YouTube Data API v3; every fetch_* helper appends its endpoint.
BASE_URL = "https://www.googleapis.com/youtube/v3"

In [4]:
async def fetch_videos(region_code, category_id, max_results=10, sort_by='relevance'):
    """Fetch one page of the "most popular" chart for a region and category.

    Args:
        region_code: Region code sent as ``regionCode``.
        category_id: Video category id sent as ``videoCategoryId``.
        max_results: Page size sent as ``maxResults``.
        sort_by: Value sent as ``order``.

    Returns:
        The decoded JSON response when the API answers with HTTP 200,
        otherwise None.
    """
    query = {
        # `part` is a required parameter of videos.list; the same parts as
        # fetch_video_details are requested so items have the same shape.
        "part": "snippet,statistics,contentDetails",
        "chart": "mostPopular",
        "regionCode": region_code,
        "videoCategoryId": category_id,
        "key": youtube_api,
        "maxResults": max_results,
        # NOTE(review): `order` does not appear among the documented
        # videos.list parameters; it is still sent so the request is otherwise
        # unchanged -- confirm whether it has any effect.
        "order": sort_by,
    }
    # Stringify values so ints (or anything else) are accepted by aiohttp,
    # which then takes care of URL-encoding them.
    params = {name: str(value) for name, value in query.items()}
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{BASE_URL}/videos", params=params) as response:
            if response.status == 200:
                return await response.json()
            return None

In [5]:
async def fetch_videos_pagination(region_code, category_id, max_results=50, sort_by='viewCount', total_results=100):
    """Collect up to `total_results` items from the "most popular" chart.

    Pages are requested one after another, following ``nextPageToken``, until
    enough items were collected, the API reports no further page, or a request
    fails.

    Args:
        region_code: Region code sent as ``regionCode``.
        category_id: Video category id sent as ``videoCategoryId``.
        max_results: Page size sent as ``maxResults``.
        sort_by: Value sent as ``order``.
        total_results: Upper bound on the number of items returned.

    Returns:
        A list with at most `total_results` video items (possibly empty). A
        failed request prints the status and returns what was collected so far.
    """
    if max_results <= 0 or total_results <= 0:
        return []

    base_query = {
        # `part` is a required parameter of videos.list; the same parts as
        # fetch_video_details are requested so items have the same shape.
        "part": "snippet,statistics,contentDetails",
        "chart": "mostPopular",
        "regionCode": str(region_code),
        "videoCategoryId": str(category_id),
        "key": str(youtube_api),
        "maxResults": str(max_results),
        # NOTE(review): `order` does not appear among the documented
        # videos.list parameters; kept so the request is otherwise unchanged.
        "order": str(sort_by),
    }
    all_videos = []
    next_page_token = None
    # Ceiling division: a partially filled last page must still be requested
    # (floor division asked for zero pages when total_results < max_results).
    total_pages = -(-total_results // max_results)

    # One session is reused for every page instead of reconnecting each time.
    async with aiohttp.ClientSession() as session:
        for page in range(total_pages):
            params = dict(base_query)
            if next_page_token:
                params["pageToken"] = str(next_page_token)
            async with session.get(f"{BASE_URL}/videos", params=params) as response:
                if response.status != 200:
                    print(f"Error fetching page {page + 1}: {response.status}")
                    break
                data = await response.json()
            all_videos.extend(data.get('items', []))
            next_page_token = data.get('nextPageToken')
            if not next_page_token:
                break
    # The last page may overshoot the requested total.
    return all_videos[:total_results]

In [6]:
async def fetch_videos_channel(channel_id, max_results=10, sort_by='date'):
    """Run a single search.list request for the videos of one channel.

    Returns the decoded JSON body when the API answers with HTTP 200 and
    None for any other status.
    """
    endpoint = f"{BASE_URL}/search?part=snippet&channelId={channel_id}&type=video"
    endpoint += f"&maxResults={max_results}&order={sort_by}&key={youtube_api}"
    async with aiohttp.ClientSession() as http:
        async with http.get(endpoint) as reply:
            payload = await reply.json() if reply.status == 200 else None
    return payload

In [7]:
async def fetch_videos_channel_pag(channel_id, max_results=100, sort_by='date', total_videos=100):
    """Page through search.list results for one channel.

    Requests are repeated, following ``nextPageToken``, until `total_videos`
    items were collected, the API reports no further page, a page comes back
    empty, or a request fails.

    Args:
        channel_id: Channel whose videos are searched.
        max_results: Page size sent as ``maxResults``.
            NOTE(review): search.list documents 0-50 for this parameter; the
            default of 100 is presumably capped server-side -- confirm.
        sort_by: Value sent as ``order``.
        total_videos: Upper bound on the number of items returned.

    Returns:
        A list with at most `total_videos` search result items. A failed
        request prints the status and returns what was collected so far.
    """
    url = (f"{BASE_URL}/search?part=snippet&channelId={channel_id}&type=video"
           f"&maxResults={max_results}&order={sort_by}&key={youtube_api}")
    collected = []
    next_page_token = None
    async with aiohttp.ClientSession() as session:
        while len(collected) < total_videos:
            paginated_url = url + (f"&pageToken={next_page_token}" if next_page_token else "")
            async with session.get(paginated_url) as response:
                if response.status != 200:
                    print(f"Failed to fetch videos. Status: {response.status}")
                    break
                data = await response.json()
            page_items = data.get('items', [])
            collected.extend(page_items)
            next_page_token = data.get('nextPageToken')
            # Stop on the last page, and also on an empty page that still
            # carries a token -- otherwise the loop could never make progress.
            if not next_page_token or not page_items:
                break
    # The final page may overshoot the requested total.
    return collected[:total_videos]

In [9]:
async def fetch_video_details(video_id):
    """Look up snippet, statistics and contentDetails for a single video.

    Returns the first item of the response, but only when its reported
    ``commentCount`` is greater than zero. Every other outcome (non-200
    status, empty ``items``, missing or zero comment count) yields None.
    """
    endpoint = f"{BASE_URL}/videos?part=snippet,statistics,contentDetails&id={video_id}&key={youtube_api}"
    async with aiohttp.ClientSession() as http:
        async with http.get(endpoint) as reply:
            if reply.status != 200:
                return None
            payload = await reply.json()
            if not payload["items"]:
                return None
            first_item = payload["items"][0]
            # A video without a commentCount entry is treated as having none.
            reported = first_item["statistics"].get("commentCount", 0)
            if int(reported) <= 0:
                return None
            return first_item

In [8]:
async def fetch_comments_data(video_id, api, max_results, order='relevance'):
    """Request one page of comment threads for a video.

    `api` is the YouTube API key used for this request. The decoded JSON
    body is returned for an HTTP 200 answer; any other status gives None.
    """
    endpoint = f"{BASE_URL}/commentThreads?part=snippet&videoId={video_id}&key={api}&maxResults={max_results}&order={order}"
    async with aiohttp.ClientSession() as http:
        async with http.get(endpoint) as reply:
            payload = await reply.json() if reply.status == 200 else None
    return payload

In [46]:
# Smoke test: fetch a single page of search results for one channel
# (the response below shows it is freeCodeCamp.org) and display the raw JSON.
videos = await fetch_videos_channel('UC8butISFwT-Wl7EV0hUK0BQ', sort_by='relevance')
videos

{'kind': 'youtube#searchListResponse',
 'etag': 't6Rru_HSr7DWHHiWlenLqMMp_Pg',
 'nextPageToken': 'CAoQAA',
 'regionCode': 'US',
 'pageInfo': {'totalResults': 454727, 'resultsPerPage': 10},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'dd6zDS5gG7qH7IKYgGpQmKzVCsg',
   'id': {'kind': 'youtube#video', 'videoId': 'VrSJhTGMM90'},
   'snippet': {'publishedAt': '2023-03-14T16:33:52Z',
    'channelId': 'UC8butISFwT-Wl7EV0hUK0BQ',
    'title': 'What is Python used for?',
    'description': 'Learn Python with this course: https://www.youtube.com/watch?v=eWRfhZUzrAc.',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/VrSJhTGMM90/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/VrSJhTGMM90/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/VrSJhTGMM90/hqdefault.jpg',
      'width': 480,
      'height': 360}},
    'channelTitle': 'freeCodeCamp.org',
    'liveBroadcastConte

In [39]:
# Running count of Gemini requests; used to pick the next API key round-robin.
toggle = 0


async def analyze_comments(title, comments):
    """Ask Gemini to label each comment as Positive, Neutral or Negative.

    Args:
        title: Title of the video the comments belong to; embedded in the prompt.
        comments: Non-empty list of comment strings (the callers in this
            notebook pass them pre-numbered, e.g. "1. Nice video").

    Returns:
        A list of labels with exactly one entry per comment, or None when the
        input is empty / not a list, the request fails, no JSON list can be
        found in the answer, the JSON is invalid, or the number of labels does
        not match the number of comments. Every failure is reported via print().

    Side effects:
        Increments the module-level `toggle` counter and reconfigures the
        `genai` client with the API key selected for this request.
    """
    global toggle

    if not comments or not isinstance(comments, list):
        print("Invalid or empty comments list provided.")
        return None

    prompt = f'''
    You are a sentiment analysis model specialized in news and politics-related content. Your task is to classify the sentiment of each YouTube comment in the following list as "positive", "neutral", or "negative". These are YouTube comments on news or political videos, so consider the tone, emotions, slang, and language commonly used in discussions about current events, opinions, and political issues.

    Only provide the sentiment labels as a list in the **exact same order** as the input comments. Each sentiment should align with how well the comment relates to the video's title, topic, or political stance presented.

    Video Title: "{title}"

    Sentiment classification rules:
    - "Positive": The comment supports, agrees with, or positively aligns with the video title, topic, or political stance presented in the video (e.g., "Great analysis," "This is spot on," "I totally agree").
    - "Negative": The comment disagrees with, criticizes, or negatively aligns with the video title, topic, or political stance presented in the video (e.g., "This is biased," "Completely wrong," "Terrible take").
    - "Neutral": The comment neither supports nor disagrees with the title or content, asks questions, provides unrelated feedback, or is ambiguous (e.g., "What happened next?" "I don't understand this," "Why is this even news?").

    Each comment is numbered for clarity. Follow the exact numbering in the output.
    Input format (comments):
    {comments}

    Response format should be exactly like the following (horizontal format):
    ["Positive", "Neutral", "Negative", ...]

    Not like the following (vertical format):
    ["Positive",
     "Neutral",
     "Negative",
     ...
    ]
    '''

    # Key rotation: several API keys are cycled to stay below Gemini's
    # per-key rate limit. Optional slots that still hold the
    # 'optional_gemini_api' placeholder are skipped, so a setup with a single
    # real key keeps using that key instead of sending most requests with an
    # invalid one. If nothing is left after filtering, the full list is used.
    candidate_keys = [gemini_api, gemini_api_2, gemini_api_4, gemini_api_5, gemini_api_3]
    api_keys = [key for key in candidate_keys if key and key != 'optional_gemini_api'] or candidate_keys
    api_key = api_keys[toggle % len(api_keys)]
    toggle += 1

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")
    try:
        # Synchronous call: the event loop is blocked until Gemini answers.
        response = model.generate_content(prompt)
        if not (response and response.text):
            print("Empty or invalid response from Gemini.")
            return None
        raw_output = response.text.strip()

        # DOTALL lets the match span line breaks, so a list returned in the
        # "vertical" layout (one label per line) is still recognised.
        list_match = re.search(r'\[.*\]', raw_output, re.DOTALL)
        if not list_match:
            print(raw_output)
            print("No sentiment labels found in the response.")
            return None
        try:
            sentiment_labels = json.loads(list_match.group(0))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON response: {e}\nRaw output: {raw_output}")
            return None

        if len(sentiment_labels) == len(comments):
            print(f'Number of comments Analyzed: {len(comments)}')
            return sentiment_labels
        print(f"Mismatch between number of comments and sentiments. len of comments: {len(comments)}, len of labels: {len(sentiment_labels)}\nRaw output: {raw_output}\n senti label: {sentiment_labels}")
        return None

    except Exception as e:
        # Best effort: any API/client error is reported and turned into None.
        print(f"Error during sentiment analysis: {e}")
        return None

In [14]:
# Fetch 25 comment threads (default "relevance" order) for one sample video.
vid = await fetch_comments_data('R4v_7hh4Yys', youtube_api, 25)

In [15]:
# Display the raw commentThreads response for inspection.
vid

{'kind': 'youtube#commentThreadListResponse',
 'etag': 'tm-uLe9IqW1OzWMMtcetGiba7zU',
 'nextPageToken': 'Z2V0X3JhbmtlZF9zdHJlYW1zLS1Dc1FCQ0lBRUZSZTMwVGdhdVFFS3RBRUkyRjhRZ0FRWUJ5S3BBVHRNYmZvSWdSZkRnWEFVdG9MM3VhZWdVcnZ6WENON2czeXFQVy15Z01EOUxvZ3B4LVB1bFMycF9rTjRqQWhqS2s0SjBURGJXZDJnMEluZG9qUjdCazhMdko1cGVZYUdsOTVpTUVCUVFBRVBneW1YTm93eGRGWTJzNU9VRVRXd0RwZUdXdHNiOFBOSFBxWFFYa0RJc1FpV0dtcFRJZE55TFk2b3FTRVVwZ29pRGFMWDFLZVNLUjBERk9VMTBLZ0JDNWFqRlVncWptR1FVNlNjbVFnSGlPb2dKUkFGbFBMYURXd1FHUklGQ0tnZ0dBQVNCUWlJSUJnQUVnVUlpU0FZQUJJSENJY2dFQUVZQUJJSENJVWdFQmtZQVE=',
 'pageInfo': {'totalResults': 25, 'resultsPerPage': 25},
 'items': [{'kind': 'youtube#commentThread',
   'etag': 'QS_mhMGjZvNbyMq4iSEWpm9yTYM',
   'id': 'Ugw6fWSLOwC7bCW4rGN4AaABAg',
   'snippet': {'channelId': 'UC8butISFwT-Wl7EV0hUK0BQ',
    'videoId': 'R4v_7hh4Yys',
    'topLevelComment': {'kind': 'youtube#comment',
     'etag': 'oUdR2-l9FSNYrC_HH0qiLjN0j-w',
     'id': 'Ugw6fWSLOwC7bCW4rGN4AaABAg',
     'snippet': {'channelId': 'UC8b

In [16]:
# Hand-entered title for the sample video, plus the plain text of every
# top-level comment in the response held in `vid`.
title = "How to Create a Website – WordPress Tutorial for Beginners 2025"
comments = [item["snippet"]["topLevelComment"]["snippet"]["textOriginal"] for item in vid["items"]]

In [17]:
# Prefix every comment with its 1-based position ("1. ...", "2. ...") so the
# model's labels can be matched back to the input order.
comment_texts = [
    f"{position}. {text}"
    for position, text in enumerate(comments, start=1)
]
comment_texts

['1. Good luck on creating your website!',
 "2. Man you're actually reading my mind 😊",
 "3. Thank you for the (for me) update. Understanding which site does what has helped me a lot!\nIt's been nice to take time and review.",
 '4. 2/4/25 Viewing; very good tutorial with added information. Love it!',
 '5. Great. Your video is clear, well explained and well thought out. Thank you. Though I know to code and have also written  a website in html, I am starting a new website and figured it is time I actually use wordpress. I am doing a patreon integration that works with wordpress and was debating between using their REST API or maybe just finally getting set up with wordpress. I have worried how it might go as I wonder how much customization I will lose, but either way I think it is time I learn wordpress.',
 '6. The FCC GOAT! Thanks for this forreal.',
 '7. 👋Hello from South Africa 🎉',
 '8. Right now when i needed it ❤',
 '9. Thank you for the tutorial. Can this work for an e-commerce sit

In [20]:
# Label the numbered sample comments; the result is one label per comment,
# or None when analyze_comments could not produce a matching list.
com = await analyze_comments(title, comment_texts)
com

len of comments: 25, len of labels: 25


['Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Neutral',
 'Negative',
 'Positive',
 'Positive',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Negative',
 'Negative',
 'Neutral',
 'Positive',
 'Neutral',
 'Positive',
 'Positive',
 'Neutral',
 'Neutral']

In [21]:
# Region codes, listed together with the country each one stands for.
region_codes = [
    code
    for code, _country in (
        ("US", "United States"),
        ("GB", "United Kingdom"),
        ("CA", "Canada"),
        ("AU", "Australia"),
        ("IE", "Ireland"),
        ("NZ", "New Zealand"),
        ("ZA", "South Africa"),
        ("PH", "Philippines"),
    )
]

# YouTube video categories as {"category": <name>, "id": <unique id>} records.
video_categories = [
    {"category": label, "id": category_id}
    for label, category_id in (
        ("People & Blogs", 22),
        ("Entertainment", 24),
        ("Sports", 17),
        ("How-to & Style", 26),
        ("Autos & Vehicles", 2),
        ("Science & Technology", 28),
        ("News & Politics", 25),
        ("Music", 10),
        ("Pets & Animals", 15),
        ("Travel & Events", 19),
        ("Gaming", 20),
        ("Comedy", 23),
        ("Education", 27),
        ("Nonprofits & Activism", 29),
    )
]

In [22]:
def has_multiple_timestamps(comment):
    """Tell whether `comment` contains more than three timestamp-like tokens.

    A token is one or two digits followed by one to three ``:dd`` groups,
    e.g. ``1:23`` or ``0:01:38``.
    """
    timestamp_pattern = r'\d{1,2}(:\d{2}){1,3}'
    hits = sum(1 for _ in re.finditer(timestamp_pattern, comment))
    return hits > 3

In [23]:
def remove_links(comment):
    """Drop ``http(s)://...`` and ``www....`` tokens, then trim both ends."""
    link_pattern = re.compile(r'https?://\S+|www\.\S+')
    without_links = link_pattern.sub('', comment)
    return without_links.strip()

In [27]:
# Page through the channel's videos ordered by view count (defaults: up to 100).
videos = await fetch_videos_channel_pag('UC8butISFwT-Wl7EV0hUK0BQ', sort_by='viewCount')

In [28]:
# Number of search results collected by the paginated fetch.
len(videos)

100

In [29]:
# Count distinct video IDs; a value below len(videos) would mean duplicates.
len({video["id"]['videoId'] for video in videos})

100

In [30]:
video_ids = [video["id"]['videoId'] for video in videos]
video_with_none = [await fetch_video_details(video_id) for video_id in video_ids]
video_detailed = [video for video in video_with_none if video is not None]

In [31]:
# How many of the detailed videos report at least 200 comments.
sum(1 for details in video_detailed if int(details['statistics']['commentCount']) >= 200)

100

In [32]:
# Spot-check one detailed video record.
video_detailed[4]

{'kind': 'youtube#video',
 'etag': 'RvnrVZquI5E9d7zMugmgiCvPeNY',
 'id': 'vLnPwxZdW4Y',
 'snippet': {'publishedAt': '2018-08-24T17:11:35Z',
  'channelId': 'UC8butISFwT-Wl7EV0hUK0BQ',
  'title': 'C++ Tutorial for Beginners - Full Course',
  'description': "This course will give you a full introduction into all of the core concepts in C++. \n\nWant more from Mike? He's starting a coding RPG/Bootcamp - https://simulator.dev/\n\n⭐️ Contents ⭐\n⌨️ (0:00:00) Introduction\n⌨️ (0:01:38) Windows Installation\n⌨️ (0:04:54) Mac Installation\n⌨️ (0:08:44) Setup & Hello World\n⌨️ (0:12:29) Drawing a Shape\n⌨️ (0:19:55) Variables\n⌨️ (0:31:43) Data Types\n⌨️ (0:39:15) Working With Strings\n⌨️ (0:49:00) Working With Numbers\n⌨️ (0:59:41) Getting User Input\n⌨️ (1:05:32) Building a Calculator\n⌨️ (1:09:28) Building a Mad Libs\n⌨️ (1:13:45) Arrays\n⌨️ (1:20:03) Functions\n⌨️ (1:29:47) Return Statement\n⌨️ (1:35:22) If Statements\n⌨️ (1:47:15) If Statements (con't)\n⌨️ (1:55:58) Building a Better Calcul

In [33]:
# Number of videos kept after dropping the None results of fetch_video_details.
len(video_detailed)

100

In [43]:
# Select the videos from index 50 onward for this run. `video_details` is the
# notebook-level name that sentiment_analysis_batch reads its input from.
video_details = video_detailed[50:]

In [44]:
async def sentiment_analysis_batch(region_code, cat_id, sort_by):
    """Fetch, label and export the comments of every video in `video_details`.

    For each video, one "relevance" request is made and, for videos with more
    than 100 comments, an additional "time" request (at most 100 threads per
    request). Comments for which `has_multiple_timestamps` is true are
    skipped, links are stripped from the rest, and the remaining comments are
    sent to `analyze_comments` in batches of 25. Labelled rows are written to
    one CSV per video and, at the end, to one combined CSV.

    Args:
        region_code: Label stored in the "RegionCode" column and used in the
            CSV file names; it is not sent to any API by this function.
        cat_id: Label stored in the "CategoryId" column and used in the CSV
            file names.
        sort_by: Label used in the CSV file names only.

    Returns:
        None in every case; progress and failures are reported via print().

    Notes:
        Reads the notebook-level globals `videos` and `video_details` and the
        keys `youtube_api_2` / `youtube_api_3`; writes CSV files into the
        current working directory.
        NOTE(review): the "relevance" and "time" requests may return
        overlapping threads; rows are not de-duplicated.
    """
    batch_size = 25           # comments sent to Gemini per request
    requests_per_window = 45  # Gemini requests made before pausing
    window_seconds = 90       # minimum duration of one request window
    min_pause_seconds = 10    # shortest pause once a window is used up

    try:
        if videos is None:
            print("Error fetching videos")
            return None
        if not video_details:
            print("No valid video details found")
            return None
        failed_videos = []
        skip_com = 0
        tot_vid_fetch = len(video_details)
        print(f'Total videos fetched: {tot_vid_fetch}')

        combined_output = []
        header = [
            "ComId", "Vid", "VideoTitle", "AuthorName", "AuthorCid", "Comment", "Sentiment",
            "LikeCount", "ReplyCount", "PublishedAt", "RegionCode", "CategoryId"
        ]

        tot_comments_fetch, tot_comments_analyzed = 0, 0
        request_count = 0
        start_time = time.time()
        for i, video in enumerate(video_details):
            failed_vid_appended = False
            # Pre-bound so the error handler below can always name the video,
            # even when reading video["id"] is what failed.
            video_id = None
            try:
                video_id = video["id"]
                video_title = video["snippet"]["title"]
                com_cnt = int(video['statistics']['commentCount'])
                print(f'-------------------------- VIDEO NO. {i + 1} --------------------------')
                output_rows = []
                print(f'Fetching Comments for video {i + 1}. {video_title}')
                print(f'Video consist of {com_cnt} comments.')

                # Different API keys are used for the two orderings.
                data_sources = []
                if com_cnt > 200:
                    data_sources.append(await fetch_comments_data(video_id, youtube_api_2, 100, "relevance"))
                    data_sources.append(await fetch_comments_data(video_id, youtube_api_3, 100, "time"))
                elif com_cnt > 100:
                    data_sources.append(await fetch_comments_data(video_id, youtube_api_2, 100, "relevance"))
                    remaining_comments = com_cnt - 100
                    data_sources.append(await fetch_comments_data(video_id, youtube_api_3, remaining_comments, "time"))
                else:
                    data_sources.append(await fetch_comments_data(video_id, youtube_api_2, com_cnt, "relevance"))

                # The list itself is never empty (failed fetches are stored as
                # None), so check whether any request actually delivered items.
                if not any(source and source.get("items") for source in data_sources):
                    print(f"No Comments Extracted for video {video_id}")
                    failed_videos.append(video)
                    failed_vid_appended = True
                    continue

                comments_fetched = sum(len(source["items"]) for source in data_sources if source and "items" in source)
                tot_comments_fetch += comments_fetched
                print(f'Comment Extracted: {comments_fetched}')
                for data in data_sources:
                    if not data or "items" not in data:
                        continue
                    comments = []
                    cur_com_skip = 0
                    for item in data["items"]:
                        comment_thread = item["snippet"]["topLevelComment"]["snippet"]
                        # A missing text is treated as empty instead of
                        # failing the whole video inside the regex helpers.
                        comment_text = comment_thread.get("textOriginal") or ""
                        if has_multiple_timestamps(comment_text):
                            skip_com += 1
                            cur_com_skip += 1
                            continue
                        comments.append({
                            "ComId": item.get("id"),
                            "Vid": video_id,
                            "VideoTitle": video_title,
                            "AuthorName": comment_thread.get("authorDisplayName"),
                            "AuthorCid": comment_thread.get("authorChannelId", {}).get("value", ""),
                            "Comment": remove_links(comment_text),
                            "LikeCount": comment_thread.get("likeCount", 0),
                            "ReplyCount": item["snippet"].get("totalReplyCount", 0),
                            "PublishedAt": comment_thread.get("publishedAt"),
                            "RegionCode": region_code,
                            "CategoryId": cat_id
                        })
                    if cur_com_skip > 0:
                        print(f'Comments Skipped: {cur_com_skip}')

                    for batch_start in range(0, len(comments), batch_size):
                        comment_batch = comments[batch_start:batch_start + batch_size]
                        comment_texts = [
                            f"{position}. {comment['Comment']}"
                            for position, comment in enumerate(comment_batch, start=1)
                        ]
                        if request_count == requests_per_window:
                            # Pause for the remainder of the window, but never
                            # for less than the minimum pause.
                            elapsed_time = time.time() - start_time
                            sleep_time = max(min_pause_seconds, window_seconds - elapsed_time)
                            print(f"Rate limit reached. Sleeping for {sleep_time} seconds.")
                            # asyncio.sleep yields to the event loop;
                            # time.sleep would freeze it for the whole pause.
                            await asyncio.sleep(sleep_time)
                            request_count = 0
                            start_time = time.time()
                        try:
                            sentiment_labels = await analyze_comments(video_title, comment_texts)
                            request_count += 1
                            if sentiment_labels is None:
                                # The batch is dropped; the video is recorded
                                # as failed once.
                                if not failed_vid_appended:
                                    failed_videos.append(video)
                                    failed_vid_appended = True
                                continue
                            for comment, sentiment in zip(comment_batch, sentiment_labels):
                                comment["Sentiment"] = sentiment
                                output_rows.append(comment)
                        except Exception as e:
                            print(f"Error analyzing comments: {e}")

                tot_comments_analyzed += len(output_rows)
                print(f'Comments analyzed: {len(output_rows)}')
                combined_output.extend(output_rows)
                print(f"Total comments fetched: {tot_comments_fetch}, Total comments analyzed: {tot_comments_analyzed}")
                try:
                    if output_rows:
                        csv_filename = f"sentiments-{region_code}-{cat_id}-{sort_by}-{video_id}.csv"
                        with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
                            writer = csv.DictWriter(file, fieldnames=header)
                            writer.writeheader()
                            writer.writerows(output_rows)
                        print(f"File created: {csv_filename}")
                except Exception as e:
                    print(f"Error writing to CSV: {e}")

            except Exception as e:
                print(f"Error processing video {video_id}: {e}")
                if not failed_vid_appended:
                    failed_videos.append(video)
                    failed_vid_appended = True
                continue

        try:
            if combined_output:
                csv_filename = f"combined_sentiments-{region_code}-{cat_id}-{sort_by}.csv"
                with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
                    writer = csv.DictWriter(file, fieldnames=header)
                    writer.writeheader()
                    writer.writerows(combined_output)
                print(f"Combined file created: {csv_filename}")
        except Exception as e:
            print(f"Error writing combined CSV: {e}")

        print("Sentiment analysis completed.")
        print(f"Total videos fetched: {tot_vid_fetch}")
        print(f"Total comments fetched: {tot_comments_fetch}")
        print(f"Total comments analyzed: {tot_comments_analyzed}")
        print(f"Total comments skipped: {skip_com}")
        if failed_videos:
            print(f"Failed videos: {len(failed_videos)}")
        # The list of failed videos is deliberately not returned; the
        # function's result is always None.
        return None

    except Exception as e:
        print(f"Error in sentiment_analysis_batch: {e}")

In [45]:
# Run the batch over `video_details`. The three arguments only label the CSV
# rows / file names. The function reports via print() and returns None, so
# `sentiments` is None.
sentiments = await sentiment_analysis_batch('freeCodeCamp', 27, 'viewCount')
sentiments

Total videos fetched: 50
-------------------------- VIDEO NO. 1 --------------------------
Fetching Comments for video 1. Docker Tutorial for Beginners - A Full DevOps Course on How to Run Applications in Containers
Video consist of 1151 comments.
Comment Extracted: 200
Comments Skipped: 1
Number of comments Analyzed: 25
Number of comments Analyzed: 25
Number of comments Analyzed: 25
Number of comments Analyzed: 24
Comments Skipped: 1
Number of comments Analyzed: 25
Number of comments Analyzed: 25
Number of comments Analyzed: 25
Number of comments Analyzed: 24
Comments analyzed: 198
Total comments fetched: 200, Total comments analyzed: 198
File created: sentiments-freeCodeCamp-27-viewCount-fqMOX6JJhGo.csv
-------------------------- VIDEO NO. 2 --------------------------
Fetching Comments for video 2. Learn PostgreSQL Tutorial - Full Course for Beginners
Video consist of 2086 comments.
Comment Extracted: 200
Comments Skipped: 3
Number of comments Analyzed: 25
Number of comments Analyzed