# YouTube Comment Scraper

## Imports and Setup

In [None]:
import requests
import json
from googleapiclient.discovery import build

In [None]:
# Simple timecode remover
import re

def remove_timecodes(text):
    """Return *text* with timestamp-like tokens deleted.

    A token is removed when it looks like ``h:mm:ss`` or ``mm:ss`` (one or
    two groups of 1-2 digits, each followed by ``:`` or ``.``, then exactly
    two digits), optionally preceded by ``@`` and whitespace. The token must
    not touch a word character on either side.

    Note that plain decimals such as ``3.14`` have the same shape and are
    removed as well. Surrounding whitespace is left untouched, so a removal
    in the middle of a sentence leaves two adjacent spaces.
    """
    timecode_matcher = re.compile(
        r'(?<!\w)(?:@\s*)?(?:\d{1,2}[:.]){1,2}\d{2}(?!\w)'
    )
    return timecode_matcher.sub('', text)

## Channel Search

In [None]:
import os

# API key for the YouTube Data API. Paste it between the quotes, or leave it
# blank to read it from the YOUTUBE_API_KEY environment variable instead, so
# the secret does not have to be stored in the notebook itself.
api_key = ''
if not api_key:
    api_key = os.environ.get('YOUTUBE_API_KEY', '')

# Username of the channel whose comments are scraped; also used as the name
# of the exported JSON file.
channel_username = 'JonMalliaPodcast'

In [None]:
# Ask the YouTube Data API search endpoint for channels matching the username.
response = requests.get(
    'https://www.googleapis.com/youtube/v3/search',
    params={
        'part': 'snippet',
        'q': channel_username,  # Search for the channel name
        'type': 'channel',
        'key': api_key
    },
    # requests applies no timeout by default; without one a stalled
    # connection would block this cell forever.
    timeout=30,
)

response_json = response.json()

# Use the first search hit as the channel; raise an error if nothing came back
# (error payloads from the API carry no 'items' entry either).
if response_json.get('items'):
    channel_id = response_json['items'][0]['id']['channelId']
else:
    print("Error: Unable to find channel.")
    print(response_json)  # Debugging
    raise ValueError("Invalid channel username or API response.")

## Video ID and Comment Extraction Functions

In [None]:
def get_all_video_ids(channel_id, api_key):
    """Return the IDs of every video in a channel's uploads playlist.

    Args:
        channel_id: ID of the channel to inspect.
        api_key: Developer key used to build the YouTube API client.

    Returns:
        A list of video ID strings, in the order the playlist pages
        return them.
    """
    client = build('youtube', 'v3', developerKey=api_key)

    # The channel resource names the playlist that holds all of its uploads.
    channel_info = client.channels().list(
        part="contentDetails",
        id=channel_id,
    ).execute()
    related = channel_info['items'][0]['contentDetails']['relatedPlaylists']
    uploads_id = related['uploads']

    # Page through the playlist, 50 entries at a time, until no further
    # page token is handed back.
    collected = []
    page_token = None
    while True:
        page = client.playlistItems().list(
            part="contentDetails",
            playlistId=uploads_id,
            maxResults=50,
            pageToken=page_token,
        ).execute()

        collected.extend(
            entry['contentDetails']['videoId'] for entry in page['items']
        )

        page_token = page.get('nextPageToken')
        if not page_token:
            return collected

In [None]:
def get_comments(video_id, api_key, max_comments=100):
    """Collect up to *max_comments* top-level comments for one video.

    Each comment is requested as plain text, passed through
    ``remove_timecodes`` and stored as ``{"content": <text>}``.

    Args:
        video_id: ID of the video whose comment threads are fetched.
        api_key: Developer key used to build the YouTube API client.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of ``{"content": str}`` dicts. If a page request fails, the
        error is printed and whatever was collected up to that point is
        returned (which is empty only when the first request fails).
    """
    service = build('youtube', 'v3', developerKey=api_key)
    collected = []
    page_token = None

    while len(collected) < max_comments:
        page_request = service.commentThreads().list(
            part="snippet",
            videoId=video_id,
            textFormat="plainText",
            maxResults=100,  # ask for 100 threads per page
            pageToken=page_token,
        )
        try:
            page = page_request.execute()
        except Exception as e:
            print(f"Error fetching comments for video {video_id}: {e}")
            break  # give back what has been gathered so far

        for thread in page['items']:
            raw_text = thread['snippet']['topLevelComment']['snippet']['textDisplay']
            collected.append({"content": remove_timecodes(raw_text)})
            if len(collected) >= max_comments:
                break

        # No token means this was the last page.
        page_token = page.get('nextPageToken')
        if not page_token:
            break

    return collected

## Comment Scraping Loop

In [None]:
# Gather the ID of every upload on the channel
video_ids = get_all_video_ids(channel_id, api_key)

# Walk the videos one at a time and pool their comments into one flat list
all_comments = []
for video_id in video_ids:
    print(f"Fetching comments for video: {video_id}")
    try:
        comments = get_comments(video_id, api_key, max_comments=1000)
        all_comments.extend(comments)
    except Exception as e:
        # Report the failure; the loop then carries on with the next video
        print(f"Error processing video {video_id}: {e}")


## Saving the results

In [None]:
# Save the collected comments as a JSON file named after the channel.
output_path = f'{channel_username}.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    # ensure_ascii=False keeps non-ASCII comment text readable in the file
    json.dump(all_comments, json_file, indent=4, ensure_ascii=False)

# Report the path that was actually written
print(f"Comments exported to {output_path}")