In [None]:
from googleapiclient.discovery import build
from datetime import datetime, timezone, timedelta
import os
from models import VideoMetadata, ChannelVideos
from isodate import parse_duration
API_KEY = os.getenv("YOUTUBE_API_KEY")

def fetch_channel_to_create_channelvideos(channel_id) -> ChannelVideos:

    youtube = build("youtube", "v3", developerKey=API_KEY)

    thirty_days_ago = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat(timespec='seconds')
    
    channel_videos = ChannelVideos(channel_id)

    ch_resp = youtube.channels().list(
        part="contentDetails",
        id=channel_id
    ).execute()

    uploads_playlist_id = ch_resp["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    video_ids = []
    next_token = None
    while True:
        pl_resp = youtube.playlistItems().list(
            part="contentDetails",
            playlistId=uploads_playlist_id,
            maxResults=50,
            pageToken=next_token
        ).execute()

        video_ids.extend(
            item["contentDetails"]["videoId"]
            for item in pl_resp.get("items", [])
        )

        next_token = pl_resp.get("nextPageToken")
        if not next_token:
            break
        
    print(f"Found {len(video_ids)} total videos in channel.")

    videos = []
    skip_shorts = skip_live = skip_new = 0

    for i in range(0, len(video_ids), 50):
        batch = video_ids[i : i + 50]
        vid_resp = youtube.videos().list(
            part="snippet,contentDetails,statistics,liveStreamingDetails",
            id=",".join(batch)
        ).execute()

        for item in vid_resp.get("items", []):
            # filter out shorts (< 5 minutes = 300 sec)
            dur = parse_duration(item["contentDetails"]["duration"]).total_seconds()
            if dur <= 300:
                skip_shorts += 1
                continue

            # filter out any live broadcasts
            if item.get("liveStreamingDetails"):
                skip_live += 1
                continue
            
            if item["snippet"]["publishedAt"] > thirty_days_ago:
                # if the video is older than 30 days, skip it
                skip_new += 1
                continue

            # keep it
            channel_videos.add(VideoMetadata.from_api(item))

    print(
        f"Processed {len(video_ids)} IDs → "
        f"{len(videos)} videos\n"
        f"Skipped shorts: {skip_shorts}, live: {skip_live}, new: {skip_new}"
    )
    return channel_videos

In [19]:
from collections import defaultdict
from models import ChannelVideos
buckets = defaultdict(lambda: None)

channel_ids = ["UCFH3gPaFetVFPHOiU5tUj8g"]

for channel_id in channel_ids:
    buckets[channel_id] = fetch_channel_to_create_channelvideos(channel_id)

for cid, bucket in buckets.items():
    print(f"{cid}: {bucket.total_videos} videos, avg views {bucket.average_view_count:.1f}")
    bucket.export_to_csv(f"../../data/raw/total.csv")


Found 960 total videos in channel.
Processed 960 IDs → 0 videos
Skipped shorts: 407, live: 129, new: 6
UCFH3gPaFetVFPHOiU5tUj8g: 418 videos, avg views 132393.1
