In [1]:
import requests
import csv
import os
import asyncio
import aiohttp
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Suffix that selects this run's output file; it is interpolated into the
# CSV_FILE path in the constants cell.
comment_file = "5"

In [3]:
def _read_stripped_lines(path):
    """Return every line of ``path`` with surrounding whitespace removed."""
    with open(path, 'r') as handle:
        return [raw.strip() for raw in handle]


# Every video ID that was collected.
total_videos = _read_stripped_lines('all_video_ids.txt')
print("Total Videos: " + str(len(total_videos)))

# The subset of IDs this notebook works through.
all_videos = _read_stripped_lines('video_ids.txt')
print("Total Correct Videos: " + str(len(all_videos)))

# IDs recorded as finished in the completed-IDs file.
completed_videos = _read_stripped_lines('completed_ids.txt')
print("Completed Videos: " + str(len(completed_videos)))

Total Videos: 787974
Total Correct Videos: 440772
Completed Videos: 67009


In [4]:
# Videos left - slow iterative method, kept for reference only: every
# `not in completed_videos` test scans the whole completed list.
# video_ids = [video_id for video_id in all_videos if video_id not in completed_videos]
# print("Left Videos: " + str(len(video_ids)))

In [5]:
# Videos left - fast, order-preserving difference.
# A set gives O(1) membership tests for the completed IDs.
completed_lookup = set(completed_videos)
# dict.fromkeys() de-duplicates while keeping file order, so the result has the
# same members (and count) as set(all_videos) - set(completed_videos) but a
# stable order across kernel restarts. A later cell slices the head of this
# list, so a stable order makes the selected batch reproducible.
video_ids = [
    video_id
    for video_id in dict.fromkeys(all_videos)
    if video_id not in completed_lookup
]
print("Left Videos: " + str(len(video_ids)))

Left Videos: 376591


In [6]:
# Constants
# Prefer the key from the environment so a real credential never has to be
# saved inside the notebook; the literal is only a placeholder fallback.
API_KEY = os.environ.get("YT_API_KEY", "your_yt_api_key")
# Number of pending videos handled in one run of the notebook.
BATCH_SIZE = 500
VIDEO_IDS = video_ids[:BATCH_SIZE]
# Output file for this run; `comment_file` picks the numeric suffix.
CSV_FILE = f"comment_data/comment_data{comment_file}.csv"
# One video ID per line; appended to as videos finish or are rejected.
COMPLETED_IDS_FILE = "completed_ids.txt"
INVALID_IDS_FILE = "invalid_video_ids.txt"
# Column order of every row written to CSV_FILE.
CSV_FIELDS = [
    "video_id",
    "comment_id",
    "parent_id",
    "comment_text",
    "author_name",
    "author_channel_id",
    "author_url",
    "published_at",
    "can_rate",
    "can_reply",
    "like_count",
    "reply_count",
    "is_public"
]

In [7]:
# Define functions
def _top_level_row(video_id, item):
    """Flatten one comment-thread item into the row for its top-level comment."""
    thread = item.get("snippet", {})
    top = thread.get("topLevelComment", {}).get("snippet", {})
    return {
        "video_id": video_id,
        "comment_id": item.get("id", ""),
        "parent_id": "",
        "comment_text": top.get("textDisplay", ""),
        "author_name": top.get("authorDisplayName", ""),
        "author_channel_id": top.get("authorChannelId", {}).get("value", ""),
        "author_url": top.get("authorChannelUrl", ""),
        "published_at": top.get("publishedAt", ""),
        "can_rate": top.get("canRate", False),
        "can_reply": thread.get("canReply", False),
        "like_count": top.get("likeCount", 0),
        "reply_count": thread.get("totalReplyCount", 0),
        "is_public": thread.get("isPublic", True),
    }


def _reply_row(video_id, parent_id, reply):
    """Flatten one reply into a row linked to ``parent_id``."""
    snippet = reply.get("snippet", {})
    return {
        "video_id": video_id,
        "comment_id": reply.get("id", ""),
        "parent_id": parent_id,
        "comment_text": snippet.get("textDisplay", ""),
        "author_name": snippet.get("authorDisplayName", ""),
        "author_channel_id": snippet.get("authorChannelId", {}).get("value", ""),
        "author_url": snippet.get("authorChannelUrl", ""),
        "published_at": snippet.get("publishedAt", ""),
        "can_rate": snippet.get("canRate", False),
        "can_reply": False,
        "like_count": snippet.get("likeCount", 0),
        "reply_count": 0,
        "is_public": snippet.get("isPublic", True),
    }


async def fetch_comments_async(video_id, session):
    """Fetch every comment thread (and inline replies) for one video.

    Pages through the commentThreads endpoint until no ``nextPageToken`` is
    returned. On success the rows are appended to the CSV file and the video
    is recorded as completed. If a request fails, or a response carries no
    ``items`` key, the video is recorded as invalid and nothing is written.

    Args:
        video_id: ID of the video whose comments are requested.
        session: An open ``aiohttp.ClientSession`` used for all requests.

    Returns:
        The list of row dicts collected so far (complete on success, partial
        on the failure paths).
    """
    base_url = "https://www.googleapis.com/youtube/v3/commentThreads"
    # Passed as `params` so the client URL-encodes each value.
    params = {
        "key": API_KEY,
        "videoId": video_id,
        "part": "snippet,replies",
        "maxResults": "100",
    }
    comments = []

    while True:
        try:
            # `async with` releases the connection back to the pool even when
            # decoding the body fails.
            async with session.get(base_url, params=params) as response:
                data = await response.json()
        except (aiohttp.ClientError, asyncio.TimeoutError) as err:
            # aiohttp raises its own exception hierarchy; without this handler
            # one failed request would propagate out of the task.
            print(f"An error occurred while fetching comments for video ID {video_id}: {err}")
            save_invalid_id(video_id)
            return comments

        if "items" not in data:
            # NOTE(review): presumably API error payloads (quota, disabled
            # comments) also land here and mark the video invalid - confirm
            # that this is intended.
            print(f"Video ID {video_id} has no comments.")
            save_invalid_id(video_id)
            return comments

        for item in data["items"]:
            row = _top_level_row(video_id, item)
            comments.append(row)

            # NOTE(review): the row is appended regardless of this check, so
            # the in-memory set only records IDs; it does not filter rows.
            if not is_comment_in_memory(row):
                append_comment_to_memory(row)

            for reply in item.get("replies", {}).get("comments", []):
                reply_row = _reply_row(video_id, row["comment_id"], reply)
                comments.append(reply_row)
                append_comment_to_memory(reply_row)

        next_token = data.get("nextPageToken")
        if not next_token:
            break
        params["pageToken"] = next_token

    append_comments_to_csv(comments)
    save_completed_id(video_id)
    print(f"Fetched {len(comments)} comments for video ID {video_id}")
    return comments

def is_comment_in_memory(comment):
    """Return True when the comment's ID is already in the global ``comment_ids`` set."""
    identifier = comment.get("comment_id")
    return identifier in comment_ids

def append_comment_to_memory(comment):
    """Record the comment's ID in the global ``comment_ids`` set."""
    identifier = comment.get("comment_id")
    comment_ids.add(identifier)

def append_comments_to_csv(comments, csv_file=None, fields=None):
    """Append a batch of comment rows to a CSV file.

    No header row is written; rows are appended in the column order given by
    ``fields``. Keys missing from a row are written as empty cells and keys
    that are not in ``fields`` are ignored.

    Args:
        comments: Iterable of dicts, one per comment.
        csv_file: Destination path. Defaults to the module-level ``CSV_FILE``.
        fields: Column names in output order. Defaults to ``CSV_FIELDS``.
    """
    target = CSV_FILE if csv_file is None else csv_file
    columns = CSV_FIELDS if fields is None else fields

    # Create the output directory on first use instead of failing on open().
    parent_dir = os.path.dirname(target)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8: comment text routinely contains characters that a
    # platform-default legacy encoding cannot represent.
    with open(target, "a", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=columns, extrasaction="ignore")
        writer.writerows(comments)

def is_id_invalid(video_id):
    """Return True when ``video_id`` is listed in the invalid-IDs file.

    A missing file means nothing has been marked invalid yet.
    """
    if os.path.exists(INVALID_IDS_FILE):
        with open(INVALID_IDS_FILE, "r") as handle:
            known_invalid = handle.read().splitlines()
        return video_id in known_invalid
    return False

def is_id_completed(video_id):
    """Return True when ``video_id`` is listed in the completed-IDs file.

    A missing file means no video has been completed yet.
    """
    if os.path.exists(COMPLETED_IDS_FILE):
        with open(COMPLETED_IDS_FILE, "r") as handle:
            already_done = handle.read().splitlines()
        return video_id in already_done
    return False

def save_invalid_id(video_id):
    """Append ``video_id`` as a new line to the invalid-IDs file."""
    entry = f"{video_id}\n"
    with open(INVALID_IDS_FILE, "a") as handle:
        handle.write(entry)

def save_completed_id(video_id):
    """Append ``video_id`` as a new line to the completed-IDs file."""
    entry = f"{video_id}\n"
    with open(COMPLETED_IDS_FILE, "a") as handle:
        handle.write(entry)

def _read_id_set(path):
    """Return the IDs stored one per line in ``path``; empty set if it is missing."""
    if not os.path.exists(path):
        return set()
    with open(path, "r") as file:
        return set(file.read().splitlines())


async def main_async():
    """Fetch comments for every pending ID in ``VIDEO_IDS`` concurrently.

    IDs already listed in the invalid or completed files are skipped. One
    task per remaining video is scheduled on a shared HTTP session and all
    tasks are awaited before returning.
    """
    # Load both bookkeeping files once instead of re-reading them per video.
    # No task runs before the gather below, so the files cannot change while
    # the pending list is being built.
    skip_ids = _read_id_set(INVALID_IDS_FILE) | _read_id_set(COMPLETED_IDS_FILE)

    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.ensure_future(fetch_comments_async(video_id, session))
            for video_id in VIDEO_IDS
            if video_id not in skip_ids
        ]

        await asyncio.gather(*tasks)

    print("All comments fetched and saved.")


In [8]:
if __name__ == "__main__":
    # Initialize a set to store comment IDs in memory
    # (a module-level global read by is_comment_in_memory and
    # append_comment_to_memory, so it must exist before main_async() runs).
    comment_ids = set()

    # Run the whole fetch pipeline to completion.
    asyncio.run(main_async())

Video ID fZUGLGLVHoM has no comments.
Video ID 5IUOVxJ3Vxw has no comments.
Video ID XNcvDJDmhM4 has no comments.
Fetched 0 comments for video ID DEujwAJZFoE
Fetched 0 comments for video ID WowvZdZFDtY
Fetched 0 comments for video ID PfwX64AkoMg
Fetched 0 comments for video ID ol74emIa6zQ
Fetched 0 comments for video ID j6Qvla4P9dM
Fetched 0 comments for video ID d6f_AXIC3Gc
Fetched 0 comments for video ID OPcPZS4AE4Q
Fetched 0 comments for video ID DOYhHsqtuXo
Fetched 0 comments for video ID 4vtJCszn_DE
Video ID 3x-D9TsF0vE has no comments.
Fetched 0 comments for video ID tJsXarQ7csU
Video ID 2tXwhBBUgYk has no comments.
Fetched 0 comments for video ID _xxmS4r6uXc
Fetched 1 comments for video ID iTSbG-AnbLg
Fetched 0 comments for video ID _SyC13vuJHU
Fetched 0 comments for video ID G40OhPt0DNU
Fetched 0 comments for video ID VnvYjU08kro
Fetched 0 comments for video ID DOPGzTmCeSQ
Fetched 0 comments for video ID Q4COuYgoFjw
Fetched 0 comments for video ID Ls1Ujuc9_cI
Fetched 0 comments