## __YouTube Platform: Same-sex marriage__

### __General__

#### Libraries

In [1]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import sys, json, time, os, csv
from typing import List, Dict, Any
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

sys.path.append(os.path.abspath('..'))
from utils.helpers import unique_posts_videos 

#### Functions

In [None]:
def search_youtube_videos(
    youtube_object,
    keywords: List[str],
    region_code: str = 'GR',
    language: str = 'el',
    max_results: int = 5,
    published_after: str = None,
    delay: float = 0
    ) -> List[Dict]:
    """
    Search YouTube for each keyword and return metadata for the videos found.

    For every keyword one ``search().list`` request is issued, followed (when
    it returned any video IDs) by one ``videos().list`` request that resolves
    those IDs to snippet and statistics data. A keyword whose requests raise
    is reported on stdout and skipped, so a single failing query does not
    abort the remaining ones.

    Args:
        youtube_object: Authenticated YouTube API object.
        keywords (List[str]): List of keyword queries.
        region_code (str): Region code to localize the results.
        language (str): Language code to prefer results in.
        max_results (int): Max results per keyword.
        published_after (str): ISO 8601 date (e.g. '2024-01-01T00:00:00Z') to
            filter on; forwarded unchanged to the search request.
        delay (float): Delay in seconds after each keyword (rate limiting).
            Applied once per keyword, including keywords that returned no
            results or failed, because those consumed API calls as well.

    Returns:
        List[Dict]: One dictionary per video with the keys ``keyword``,
        ``video_id``, ``title``, ``published_at``, ``channel_id``,
        ``channel_title``, ``view_count``, ``like_count`` and
        ``comment_count``. The same video can appear more than once when it
        matches several keywords.
    """

    all_videos = []

    for query in keywords:
        try:
            # search for videos
            search_response = youtube_object.search().list(
                part='snippet',
                q=query,
                type='video',
                regionCode=region_code,
                relevanceLanguage=language,
                maxResults=max_results,
                publishedAfter=published_after
            ).execute()
            video_ids = [item['id']['videoId'] for item in search_response.get('items', [])]

            if video_ids:
                # fetch metadata for all hits of this keyword in one request
                details_response = youtube_object.videos().list(
                    part='snippet,statistics',
                    id=','.join(video_ids)
                ).execute()

                for item in details_response.get('items', []):
                    snippet = item['snippet']
                    # statistics fields may be absent; missing counters count as 0
                    stats = item.get('statistics', {})
                    all_videos.append({
                        'keyword': query,
                        'video_id': item['id'],
                        'title': snippet.get('title'),
                        'published_at': snippet.get('publishedAt'),
                        'channel_id': snippet.get('channelId'),
                        'channel_title': snippet.get('channelTitle'),
                        'view_count': int(stats.get('viewCount', 0)),
                        'like_count': int(stats.get('likeCount', 0)),
                        'comment_count': int(stats.get('commentCount', 0))
                    })

        except Exception as e:
            # best effort: report the failing keyword and move on to the next one
            print(f"Error while processing keyword '{query}': {e}")

        # throttle after every keyword, not only after the successful ones
        if delay > 0:
            time.sleep(delay)

    return all_videos

def fetch_youtube_comments_forest(youtube_object, video_ids):
    """
    Fetch all comments from YouTube videos, structured like Reddit comment forests.

    For every video all comment threads are paged through first. Each
    top-level comment is then emitted, immediately followed by its replies,
    which are retrieved in full through ``comments().list``. Replies are only
    requested for top-level comments that report replies: the YouTube Data
    API supports replies on top-level comments only, so querying replies of a
    reply (or of a thread with ``totalReplyCount == 0``) would spend quota
    without returning anything.

    Args:
        youtube_object: Authenticated YouTube API object.
        video_ids (List[str]): List of YouTube video IDs.

    Returns:
        List[Dict]: One ``{"video_id", "comments"}`` dictionary per video.
        ``comments`` is a flat list of dictionaries with the keys
        ``hier_id`` ("1", "2", ... for top-level comments, "1.1", "1.2", ...
        for their replies), ``comment_id``, ``body``, ``like_count``,
        ``parent_id`` (``"yt_<video_id>"`` for top-level comments, the parent
        comment ID for replies), ``depth`` (0 or 1), ``author`` and
        ``published_at``.

    Raises:
        Any exception raised by the API client propagates to the caller.
    """
    forests = []

    def fetch_all_replies(parent_id):
        """Page through ``comments().list`` and return every reply to *parent_id*."""
        replies = []
        next_page_token = None
        while True:
            response = youtube_object.comments().list(
                part="snippet",
                parentId=parent_id,
                maxResults=100,
                pageToken=next_page_token
            ).execute()
            replies.extend(response.get("items", []))
            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break
            # pause only between pages of the same reply listing
            time.sleep(2)
        return replies

    def to_record(comment, hier_id, parent_id, depth):
        """Flatten one API comment resource into the output dictionary."""
        snippet = comment["snippet"]
        return {
            "hier_id": hier_id,
            "comment_id": comment["id"],
            "body": snippet["textDisplay"],
            "like_count": snippet.get("likeCount", 0),
            "parent_id": parent_id,
            "depth": depth,
            "author": snippet.get("authorDisplayName"),
            "published_at": snippet.get("publishedAt"),
        }

    def thread_has_replies(thread):
        """Tell whether a reply listing is worth requesting for this thread."""
        total = thread["snippet"].get("totalReplyCount")
        if total is None:
            # count not reported: stay on the safe side and ask the API
            return True
        return total > 0 or bool(thread.get("replies", {}).get("comments"))

    for video_id in video_ids:

        # collect every comment thread of the video
        threads = []
        next_page_token = None

        while True:
            response = youtube_object.commentThreads().list(
                part="snippet,replies",
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token
            ).execute()

            threads.extend(response.get("items", []))

            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break
            time.sleep(2)

        # flatten and assign hier_ids
        indexed_comments = []
        for position, thread in enumerate(threads, start=1):
            top_comment = thread["snippet"]["topLevelComment"]
            top_hier_id = str(position)
            indexed_comments.append(
                to_record(top_comment, top_hier_id, f"yt_{video_id}", 0)
            )

            if not thread_has_replies(thread):
                continue

            # the inline "replies" of a thread may be truncated, so the full
            # listing is always taken from comments().list
            reply_items = fetch_all_replies(top_comment["id"])
            for reply_position, reply in enumerate(reply_items, start=1):
                indexed_comments.append(to_record(
                    reply,
                    f"{top_hier_id}.{reply_position}",
                    top_comment["id"],
                    1
                ))

        forests.append({
            "video_id": video_id,
            "comments": indexed_comments
        })

    return forests

def fetch_all_youtube_comments(youtube_object, videos, log_path="youtube_comment_log.csv", out_dir="youtube_comments"):
    """
    Download the comments of every video and keep a resumable CSV log.

    Each video's comment forest is written to ``<out_dir>/<video_id>.json``
    and one row per attempt is appended to the CSV log. Videos that already
    have a row with ``success == "True"`` in the log are skipped, so the
    function can be re-run after an interruption.

    Args:
        youtube_object: Authenticated YouTube API object.
        videos: Iterable of dictionaries with a ``video_id`` key and an
            optional ``comment_count`` key (missing or empty counts as 0).
        log_path (str): CSV log with the columns ``video_id``, ``success``,
            ``fetched_count`` and ``expected_count``. Created, together with
            its parent directory, when missing.
        out_dir (str): Directory for the per-video JSON files; created when
            missing.

    Returns:
        None. Results are only available through the files written.

    Notes:
        A fetch counts as successful when at least ``comment_count`` comments
        were retrieved. Errors raised while fetching or saving one video are
        printed, logged as a failed row and do not stop the remaining videos.
    """
    os.makedirs(out_dir, exist_ok=True)
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # load existing log
    completed_videos = set()
    if os.path.exists(log_path):
        with open(log_path, newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                if row.get("success") == "True":
                    completed_videos.add(row["video_id"])

    # decide on the header before opening, independent of the append position
    needs_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0

    # the context manager closes the log even if an unexpected error escapes
    with open(log_path, "a", newline='', encoding="utf-8") as log_file:
        log_writer = csv.DictWriter(log_file, fieldnames=["video_id", "success", "fetched_count", "expected_count"])
        if needs_header:
            log_writer.writeheader()

        def log_result(video_id, success, fetched_count, expected_count):
            # flush after every row so an interrupted run loses no progress
            log_writer.writerow({
                "video_id": video_id,
                "success": success,
                "fetched_count": fetched_count,
                "expected_count": expected_count
            })
            log_file.flush()

        for i, video in enumerate(videos):
            video_id = video["video_id"]
            expected_count = int(video.get("comment_count") or 0)

            # checked first so that re-runs do not append duplicate log rows
            if video_id in completed_videos:
                print(f"[SKIP] Video {video_id} already processed.")
                continue

            if expected_count == 0:
                print(f"[SKIP] Video {video_id} has 0 expected comments.")
                log_result(video_id, True, 0, 0)
                completed_videos.add(video_id)
                continue

            print(f"[{i+1}/{len(videos)}] Fetching comments for: {video_id}...")

            try:
                result = fetch_youtube_comments_forest(youtube_object, video_ids=[video_id])
                fetched_count = len(result[0]["comments"]) if result else 0

                # save
                with open(os.path.join(out_dir, f"{video_id}.json"), "w", encoding="utf-8") as jf:
                    json.dump(result, jf, ensure_ascii=False, indent=2)

                # log result
                success = fetched_count >= expected_count
                log_result(video_id, success, fetched_count, expected_count)
                if success:
                    completed_videos.add(video_id)

                print(f"Done. Fetched {fetched_count} / {expected_count} comments.")

            except Exception as e:
                print(f"Failed to fetch for {video_id}: {e}")
                log_result(video_id, False, 0, expected_count)
                # short back-off before moving on to the next video
                time.sleep(2)


#### Initialization

In [3]:
# Read the variables of a local .env file into the process environment.
load_dotenv()

# The YouTube Data API key is expected under YT_API_KEY; stop early without it.
api_key = os.getenv("YT_API_KEY")
if not api_key:
    raise ValueError("API key not found in .env file")

# Client for version 3 of the YouTube API, authenticated with the developer key.
youtube_object = build(serviceName='youtube', version='v3', developerKey=api_key)

### __Search videos__

#### Videos

In [18]:
# Greek search queries about same-sex marriage (English gloss on the right).
greek_keywords = [
    "ομόφυλα ζευγάρια",            # same-sex couples
    "ομόφυλα τεκνοθεσία",          # same-sex adoption
    "ισότητα στο πολιτικό γάμο",   # equality in civil marriage
    "γάμος ομόφυλων",              # same-sex marriage
    "γάμος ομόφυλων ζευγαριών",    # marriage of same-sex couples
]

In [19]:
# Run every Greek keyword through the YouTube search, localized to Greece and
# preferring Greek-language results, asking for 50 results per keyword.
video_search_results = search_youtube_videos(
    youtube_object,
    greek_keywords,
    region_code="GR",
    language="el",
    max_results=50,
)

In [20]:
# Split the raw search hits into de-duplicated entries and the repeats, keyed on "video_id".
# NOTE(review): semantics inferred from the names; unique_posts_videos is defined in utils.helpers — confirm there.
clean_data, duplicates = unique_posts_videos(video_search_results, id_key="video_id")

In [None]:
# Notebook display: sizes of the raw search results, of clean_data and of duplicates.
len(video_search_results), len(clean_data), len(duplicates)

#### Save

In [22]:
# The notebook name is resolved against the current working directory; the
# project root is taken to be two directories above that directory.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("youtube_api.ipynb"))))

# Build the path with os.path.join so the separators match the operating system.
save_path = os.path.join(project_root, "outputs", "api_queried", "youtube_api", "youtube_scraped_videos.json")

# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Keep the Greek text readable in the file instead of escaping it to ASCII.
with open(save_path, "w", encoding="utf-8") as f:
    json.dump(clean_data, f, ensure_ascii=False, indent=4)

### __Search comments__

In [None]:
# The notebook name is resolved against the current working directory; the
# project root is taken to be two directories above that directory.
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("youtube_api.ipynb"))))

# Build both locations with os.path.join so the separators match the operating system.
yt_path = os.path.join(project_root, "outputs", "api_queried", "youtube_api")
log_path = os.path.join(project_root, "outputs", "logs", "youtube_comment_log.csv")

# Notebook display of the resolved log location.
log_path

In [5]:
# Load the saved video metadata; os.path.join keeps the path valid on every operating system.
with open(os.path.join(yt_path, "youtube_scraped_videos.json"), "r", encoding="utf-8") as f:
    videos = json.load(f)

In [6]:
# IDs of all loaded videos, in file order (not referenced by the other cells visible here).
video_ids_iter = [video["video_id"] for video in videos]

In [None]:
# Download the comments of every loaded video: one JSON file per video goes to
# yt_path and one row per attempt is appended to the CSV log at log_path.
# NOTE(review): fetch_all_youtube_comments has no return statement, so
# `comments` is bound to None; the results live in the files written.
comments = fetch_all_youtube_comments(youtube_object, 
                                      videos=videos, 
                                      out_dir=yt_path,
                                      log_path=log_path) 