In [1]:
import json
import os
import re
from urllib.parse import unquote

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Service name and version passed to googleapiclient's build().
api_service_name = "youtube"
api_version = "v3"
# SECURITY: API keys must not live in source control (this key is also echoed
# in the request URLs that HttpError messages print). Prefer the
# YOUTUBE_API_KEY environment variable; the literal fallback only keeps
# existing runs working and should be deleted once the key has been rotated.
DEVELOPER_KEY = os.environ.get(
    "YOUTUBE_API_KEY", "AIzaSyBCkoiCjVlWEqKXxPXyRN_IjweChVmlOTA"
)
# JSON watch-history export that the script analyses.
file_path = "watch-history.json"
# Shared sentiment analyzer used for both comment and description scoring.
sia = SentimentIntensityAnalyzer()

def extract_video_id(link):
    """Return the video ID carried by the ``v`` query parameter of *link*.

    Args:
        link: URL string, or a falsy value such as ``""`` or ``None``.

    Returns:
        The ID (letters, digits, ``_`` and ``-``), or ``None`` when *link* is
        empty or contains no ``v`` parameter.
    """
    if not link:
        return None
    # Anchor on a parameter boundary so that parameters merely ending in "v"
    # (e.g. "rev=...") are not mistaken for the video ID.
    match = re.search(r"(?:^|[?&])v=([a-zA-Z0-9_-]+)", link)
    return match.group(1) if match else None

def read_video_data_from_file(file_path):
    """Load watch-history entries from a JSON file and normalise their keys.

    Args:
        file_path: Path to a JSON file containing a list of entry objects.

    Returns:
        A list with one dict per entry. Every dict has the keys ``header``,
        ``title``, ``titleUrl``, ``subtitles``, ``time``, ``products`` and
        ``activityControls``; absent text fields default to ``""`` and absent
        list fields to a fresh ``[]``.
    """
    # Read as UTF-8 explicitly: relying on the platform default encoding
    # breaks non-ASCII titles on systems whose locale is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        watch_history = json.load(file)

    return [
        {
            "header": entry.get("header", ""),
            "title": entry.get("title", ""),
            "titleUrl": entry.get("titleUrl", ""),
            "subtitles": entry.get("subtitles", []),
            "time": entry.get("time", ""),
            "products": entry.get("products", []),
            "activityControls": entry.get("activityControls", []),
        }
        for entry in watch_history
    ]

def get_video_comments(video_id):
    """Fetch the display text of up to ten top-level comments for a video.

    Args:
        video_id: ID of the video whose comment threads are requested.

    Returns:
        A list with the ``textDisplay`` of each returned comment thread, or
        an empty list when the request raises ``HttpError`` (the error is
        printed).
    """
    client = build(api_service_name, api_version, developerKey=DEVELOPER_KEY)
    try:
        payload = client.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=10
        ).execute()
    except HttpError as e:
        print(f"Error retrieving comments for video ID {video_id}: {e}")
        return []

    texts = []
    for thread in payload.get('items', []):
        top_level = thread['snippet']['topLevelComment']
        texts.append(top_level['snippet']['textDisplay'])
    return texts

def get_video_description(video_id):
    """Fetch the description text of a video.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The ``description`` from the first returned item's snippet (``''`` if
        the snippet has none). Also ``''`` when the response has no items or
        the request raises ``HttpError``; both cases print a message.
    """
    client = build(api_service_name, api_version, developerKey=DEVELOPER_KEY)
    try:
        payload = client.videos().list(part="snippet", id=video_id).execute()
    except HttpError as e:
        print(f"Error retrieving description for video ID {video_id}: {e}")
        return ''

    items = payload.get('items')
    if not items:
        print(f"No valid items found for video ID {video_id}")
        return ''
    return items[0]['snippet'].get('description', '')

def get_video_tags(video_id):
    """Fetch the tag list of a video.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The ``tags`` from the first returned item's snippet (``[]`` if the
        snippet has none). Also ``[]`` when the response has no items or the
        request raises ``HttpError``; both cases print a message.
    """
    client = build(api_service_name, api_version, developerKey=DEVELOPER_KEY)
    try:
        payload = client.videos().list(part="snippet", id=video_id).execute()
    except HttpError as e:
        print(f"Error retrieving tags for video ID {video_id}: {e}")
        return []

    items = payload.get('items')
    if not items:
        print(f"No valid items found for video ID {video_id}")
        return []
    return items[0]['snippet'].get('tags', [])

video_data_list = read_video_data_from_file(file_path)

# Upper bound on how many watch-history entries are analysed.
MAX_HISTORY_ENTRIES = 500

# Extract video IDs and group them by creator in a single pass.
# Each ID must be paired with the entry it came from: building a filtered ID
# list first and zipping it against the unfiltered history shifts every ID
# after the first entry without a video link onto the wrong creator.
video_ids = []
video_ids_by_creator = {}
for video_data in video_data_list[:MAX_HISTORY_ENTRIES]:
    video_id = extract_video_id(unquote(video_data.get("titleUrl", "")))
    if not video_id:
        # Entry has no watch URL with a video ID; skip it.
        continue
    video_ids.append(video_id)

    subtitles = video_data["subtitles"]
    # The first subtitle entry is used as the creator; fall back to "Unknown"
    # when it is missing.
    creator_name = subtitles[0].get("name", "Unknown") if subtitles else "Unknown"
    video_ids_by_creator.setdefault(creator_name, []).append(video_id)

# Per-creator results, keyed by creator name:
#   average compound sentiment over all fetched comments,
#   average compound sentiment of video descriptions (per video),
#   average cosine similarity between a video's tags and the creator name.
creators_sentiment_comments = {}
creators_sentiment_description = {}
creators_similarity_tags = {}

for creator, creator_video_ids in video_ids_by_creator.items():
    total_sentiment_comments = 0
    total_comments = 0
    total_sentiment_description = 0
    total_similarity_tags = 0
    total_videos = len(creator_video_ids)

    for video_id in creator_video_ids:
        # Sentiment analysis for comments
        comments = get_video_comments(video_id)
        for comment in comments:
            sentiment_score = sia.polarity_scores(comment)
            total_sentiment_comments += sentiment_score['compound']
            total_comments += 1

        # Sentiment analysis for video description
        description = get_video_description(video_id)
        sentiment_score_description = sia.polarity_scores(description)
        total_sentiment_description += sentiment_score_description['compound']

        # Similarity calculation for video tags
        tags = get_video_tags(video_id)
        if tags:
            tags_str = ' '.join(tags)
            vectorizer = TfidfVectorizer()
            try:
                tfidf_matrix = vectorizer.fit_transform([tags_str, creator])
            except ValueError:
                # fit_transform raises ValueError when neither text yields a
                # token (empty vocabulary). Count that video as similarity 0
                # instead of aborting the whole run.
                pass
            else:
                similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
                total_similarity_tags += similarity[0][0]

    # Comment sentiment is averaged per comment; the other two per video.
    average_sentiment_comments = total_sentiment_comments / total_comments if total_comments > 0 else 0
    creators_sentiment_comments[creator] = average_sentiment_comments
    average_sentiment_description = total_sentiment_description / total_videos if total_videos > 0 else 0
    creators_sentiment_description[creator] = average_sentiment_description
    average_similarity_tags = total_similarity_tags / total_videos if total_videos > 0 else 0
    creators_similarity_tags[creator] = average_similarity_tags

# Rank creators by each metric (highest first) and print, per metric, the top
# creator followed by the next ten.
# NOTE(review): the printed headings call the runners-up "most similar to" the
# top creator, but the lists are simply ranks 2-11 of the same ordering.
sorted_creators_comments = sorted(creators_sentiment_comments.items(), key=lambda x: x[1], reverse=True)
sorted_creators_description = sorted(creators_sentiment_description.items(), key=lambda x: x[1], reverse=True)
sorted_creators_similarity_tags = sorted(creators_similarity_tags.items(), key=lambda x: x[1], reverse=True)

if sorted_creators_comments and sorted_creators_description and sorted_creators_similarity_tags:
    print(f"1. {sorted_creators_comments[0][0]} with an average comment sentiment of {sorted_creators_comments[0][1]}")

    print("Top 10 creators most similar to sentiment comment score of top 1:")
    for rank, (creator, sentiment) in enumerate(sorted_creators_comments[1:11], start=1):
        print(f"{rank}. {creator} with an average sentiment of {sentiment}")
    print('\n')

    print(f"1. {sorted_creators_description[0][0]} with an average description sentiment of {sorted_creators_description[0][1]}")

    print("\nTop 10 creators most similar to top 1 sentiment description score:")
    for rank, (creator, sentiment) in enumerate(sorted_creators_description[1:11], start=1):
        print(f"{rank}. {creator} with an average sentiment of {sentiment}")
    print('\n')

    print(f"1. {sorted_creators_similarity_tags[0][0]} with an average tags similarity score of {sorted_creators_similarity_tags[0][1]}")

    print("\nTop 10 creators most similar to top 1 tags score:")
    for rank, (creator, similarity) in enumerate(sorted_creators_similarity_tags[1:11], start=1):
        print(f"{rank}. {creator} with an average similarity of {similarity}")
else:
    # Without any creators the [0] lookups above would raise IndexError.
    print("No creators found in the watch history; nothing to rank.")


Error retrieving comments for video ID 9QWEbkeT-ag: <HttpError 404 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=9QWEbkeT-ag&maxResults=10&key=AIzaSyBCkoiCjVlWEqKXxPXyRN_IjweChVmlOTA&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter could not be found.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter could not be found.', 'domain': 'youtube.commentThread', 'reason': 'videoNotFound', 'location': 'videoId', 'locationType': 'parameter'}]">
No valid items found for video ID 9QWEbkeT-ag
No valid items found for video ID 9QWEbkeT-ag
Error retrieving comments for video ID Ko19LE0F9no: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=Ko19LE0F9no&maxResults=10&key=AIzaSyBCkoiCjVlWEqKXxPXyRN_IjweChVmlOTA&alt=js