In [1]:
import googleapiclient.discovery
import pandas as pd
import json
from googleapiclient.errors import HttpError  # Import HttpError to handle HTTP exceptions

# Define your API key
API_KEY = 'AIzaSyDgT6rFB2QXbfllRk-MVAZAMYAFz4DmorM'  
# API_KEY = 'AIzaSyCsIWUkBsJSaKg9ziO5H-Hs78MwHIK8LLM'

# Create a YouTube API client
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

def get_video_details_by_hashtag(hashtag, max_results=50):
    """
    Get details of videos matching the hashtag. Default max_results is set to 50 (maximum allowed).
    This function also handles pagination to retrieve all videos matching the hashtag.
    """
    videos = []
    next_page_token = None

    while True:
        # Request to search videos using the hashtag
        request = youtube.search().list(
            q=hashtag,  # Use the hashtag as the search query
            part="snippet",
            type="video",
            maxResults=max_results,  # Set maxResults to the maximum (50)
            pageToken=next_page_token  # Handle pagination with nextPageToken
        )
        response = request.execute()
        
        for item in response.get("items", []):
            video_data = {
                "video_id": item["id"]["videoId"],
                "author": item["snippet"]["channelTitle"],
                "title": item["snippet"]["title"],
                "description": item["snippet"]["description"],
                "publish_time": item["snippet"]["publishedAt"]
            }
            videos.append(video_data)

        # Check if there is another page of results
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
    
    return videos

def get_video_stats(video_id):
    """
    Get statistics (like count, view count, comment count) for a given video.
    """
    try:
        request = youtube.videos().list(
            part="statistics",
            id=video_id
        )
        response = request.execute()
        stats = response["items"][0]["statistics"]
        video_stats = {
            "view_count": int(stats.get("viewCount", 0)),
            "like_count": int(stats.get("likeCount", 0)),
            "comment_count": int(stats.get("commentCount", 0))
        }
        return video_stats
    except HttpError as e:
        print(f"An HTTP error occurred: {e}")
        return {}

def get_comments(video_id, max_results=100):
    """
    Get comments for a given video ID. Default max_results is set to 100 (maximum allowed).
    This function also handles pagination to retrieve all comments for a video.
    """
    comments = []
    next_page_token = None

    while True:
        try:
            # Get comments for a given video ID
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id,
                maxResults=max_results,  # Set maxResults to the maximum (100)
                textFormat="plainText",
                pageToken=next_page_token  # Handle pagination with nextPageToken
            )
            response = request.execute()
            
            for item in response.get("items", []):
                comment_data = item["snippet"]["topLevelComment"]["snippet"]

                # Capture top-level comment information
                comment = {
                    "text": comment_data["textDisplay"],
                    "like_count": comment_data.get("likeCount", 0),
                    "author": comment_data["authorDisplayName"],
                    "published_at": comment_data["publishedAt"],
                    "replies": []  # Initialize an empty list for replies
                }
                
                # Check if there are replies
                if "replies" in item:
                    for reply in item["replies"]["comments"]:
                        reply_data = reply["snippet"]
                        comment["replies"].append({
                            "text": reply_data["textDisplay"],
                            "author": reply_data["authorDisplayName"],
                            "published_at": reply_data["publishedAt"],
                            "like_count": reply_data.get("likeCount", 0)
                        })
                
                comments.append(comment)
            
            # Check if there is another page of comments
            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break

        except HttpError as e:
            error_code = e.resp.status
            if error_code == 403:
                # Handle case where comments are disabled
                error_reason = json.loads(e.content).get("error", {}).get("errors", [{}])[0].get("reason", "")
                if error_reason == "commentsDisabled":
                    print(f"Comments are disabled for video ID: {video_id}")
                else:
                    print(f"An HTTP 403 error occurred: {e}")
            elif error_code == 400:
                # Handle case where the request is malformed
                print(f"An HTTP 400 error occurred. Possible reasons could be an invalid video ID or incorrect parameters. Error details: {e}")
            else:
                # Handle other HTTP errors
                print(f"An HTTP error occurred: {e}")
            break

    return comments


# Function to get hashtags from video description
def extract_hashtags(description):
    hashtags = [word for word in description.split() if word.startswith('#')]
    return hashtags


In [3]:
import json

# Initialize an empty list to store results
data = []
# Set a limit for the total number of records to be collected per hashtag
HASHTAG_LIMIT = 250

# Define your list of hashtags

hashtags = ["#paris2024", "#tokyo2020", "#rio2016"]
# Iterate through each hashtag to collect video data
for hashtag in hashtags:
    hashtag_data = []  # List to store data for each individual hashtag
    print(f"Collecting data for hashtag: {hashtag}")

    # Retrieve video details using the hashtag with a maximum of 50 results per request and handle pagination
    video_details = get_video_details_by_hashtag(hashtag, max_results=50)
    
    for video in video_details:
        # Check if the number of records for this hashtag has reached the limit
        if len(hashtag_data) >= HASHTAG_LIMIT:
            break  # Exit the loop when the limit is reached for this hashtag

        # Get video statistics (like count, view count, comment count)
        video_stats = get_video_stats(video["video_id"])
        
        # Get comments, replies, and likes for comments with a maximum of 100 results per request and handle pagination
        comments = get_comments(video["video_id"], max_results=100)
        
        # Extract hashtags from the description
        extracted_hashtags = extract_hashtags(video["description"])
        
        # Combine all information into one data structure and add it to the hashtag-specific list
        hashtag_data.append({
            "Hashtag": hashtag,
            "Video Title": video["title"],
            "Video Tuthor": video["author"],
            "Description": video["description"],
            "Extracted Hashtags": extracted_hashtags,
            "View Count": video_stats.get("view_count", 0),
            "Like Count": video_stats.get("like_count", 0),
            "Comment Count": video_stats.get("comment_count", 0),
            "Comments": comments,
            "Publish Time": video["publish_time"]
        })

    # Add the data collected for the hashtag to the main data list
    data.extend(hashtag_data)
    print(f"Total records collected for hashtag '{hashtag}': {len(hashtag_data)}")

# Print the total number of records collected for all hashtags
print(f"Total records collected for all hashtags: {len(data)}")

# Save data to a JSON file
with open("youtube_olympics_data_limited_1000_per_hashtag.json", "w") as json_file:
    json.dump(data, json_file, indent=4)



Collecting data for hashtag: #paris2024
Comments are disabled for video ID: TSkbzGvCyEc
Comments are disabled for video ID: xAooUNe_tBw
Comments are disabled for video ID: hbPY28j-zT4
Total records collected for hashtag '#paris2024': 250
Collecting data for hashtag: #tokyo2020
Comments are disabled for video ID: 0Wmwc8u98lI
Comments are disabled for video ID: ixE1a6_l7e4
Comments are disabled for video ID: bzdxwj4422Q
Comments are disabled for video ID: LqeB__O5B7k
Comments are disabled for video ID: MmdtZMi4mQ0
Comments are disabled for video ID: TM4CYbTNzGc
Comments are disabled for video ID: ZCxiPW-QMRY
Comments are disabled for video ID: QJG6duYxpMY
Comments are disabled for video ID: QJG6duYxpMY
Comments are disabled for video ID: n0RxOm0dhGg
Comments are disabled for video ID: _e_dnSMEPQw
Comments are disabled for video ID: GfIzR4RXFag
Comments are disabled for video ID: Y31qV7mK1L4
Comments are disabled for video ID: pCK2pRbwnUk
Comments are disabled for video ID: c-zzmf-1NE0
Co

In [4]:
len(data)

750