# OctoPod Transcript Fetcher

This notebook fetches YouTube transcripts for videos that need them and commits the results to GitHub.

**Setup Required:**
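1. Set `GITHUB_TOKEN` in Colab secrets (a GitHub token that can push to the OctoPod repository; the notebook uses it to clone and push)
2. Run all cells in order, from top to bottom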

In [None]:
# Install dependencies
# youtube-transcript-api is the only package installed here; the fetch cell
# imports YouTubeTranscriptApi and its error classes from it.
# -q keeps pip's output quiet.
!pip install -q youtube-transcript-api

In [None]:
# Clone repository
import os
from google.colab import userdata

GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
REPO_URL = f"https://{GITHUB_TOKEN}@github.com/CoaseToCoase/OctoPod.git"

# Remove existing clone if present
!rm -rf OctoPod

# Clone the repo
!git clone {REPO_URL}
os.chdir('OctoPod')

# Configure git
!git config user.name "octopod-colab"
!git config user.email "octopod-colab@users.noreply.github.com"

print("✓ Repository cloned")

In [None]:
# Fetch transcripts
import json
from pathlib import Path
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
)

def fetch_transcript(video_id):
    """Fetch the transcript of a YouTube video as one space-joined string.

    Args:
        video_id: Video identifier passed to ``YouTubeTranscriptApi().fetch``.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the transcript and
        ``error`` is ``None``. On failure ``text`` is ``None`` and ``error`` is
        a short human-readable reason. No exception is propagated to the caller.
    """
    try:
        api = YouTubeTranscriptApi()
        transcript_data = api.fetch(video_id)
        formatted_text = " ".join(snippet.text for snippet in transcript_data)
    except TranscriptsDisabled:
        return None, "Transcripts disabled"
    except VideoUnavailable:
        return None, "Video unavailable"
    except NoTranscriptFound:
        return None, "No transcript found"
    except Exception as e:
        # Best-effort: any other failure is reported, not raised. Some exceptions
        # have an empty message, so fall back to the exception type name.
        return None, str(e) or type(e).__name__

    if not formatted_text.strip():
        # A transcript without any text is reported as a failure with a reason,
        # instead of returning a falsy text together with error=None.
        return None, "Transcript is empty"
    return formatted_text, None

# Process all categories
# For each category, load data/<category>/videos.json, fetch a transcript for every
# entry that does not have one yet, and write the file back if anything was added.
# total_fetched / total_failed accumulate across categories for the summary.
categories = ["FPL Draft", "FPL Main", "Betting", "Politics"]
total_fetched = 0
total_failed = 0

for category in categories:
    print(f"\n{'='*50}")
    print(f"Processing: {category}")
    print('='*50)

    # Create safe category name for path (lowercase with underscores)
    safe_category = category.replace(" ", "_").lower()
    data_dir = Path(f"data/{safe_category}")
    videos_file = data_dir / "videos.json"

    if not videos_file.exists():
        print(f"⚠️  No videos.json found for {category}")
        continue

    # Read videos (stored as dict with video_id as key)
    with open(videos_file, encoding="utf-8") as f:
        videos_dict = json.load(f)

    # Find videos without transcripts. Keep the dict keys rather than the entries:
    # the key is what the update below indexes with, so an entry that lacks an
    # 'id' field (or whose 'id' differs from its key) cannot raise KeyError.
    pending_ids = [
        video_id for video_id, video in videos_dict.items()
        if not video.get('transcript')
    ]

    if not pending_ids:
        print("✓ All videos have transcripts")
        continue

    print(f"Found {len(pending_ids)} videos needing transcripts")

    # Fetch transcripts
    category_fetched = 0
    category_failed = 0

    for video_id in pending_ids:
        print(f"  Fetching {video_id}...", end=" ")

        transcript, error = fetch_transcript(video_id)

        if transcript:
            # Update video with transcript
            videos_dict[video_id]['transcript'] = transcript
            print("✓")
            category_fetched += 1
        else:
            # An empty transcript can come back without an error message.
            print(f"✗ ({error or 'empty transcript'})")
            category_failed += 1

    # Save updated videos (only rewrite the file when something changed)
    if category_fetched > 0:
        with open(videos_file, 'w', encoding="utf-8") as f:
            json.dump(videos_dict, f, indent=2)
        print(f"✓ Saved {category_fetched} transcripts")

    total_fetched += category_fetched
    total_failed += category_failed

print(f"\n{'='*50}")
print(f"Summary: {total_fetched} fetched, {total_failed} failed")
print('='*50)

In [None]:
# Commit and push to GitHub
from datetime import datetime

if total_fetched > 0:
    commit_message = f"Add {total_fetched} transcripts via Colab - {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}"
    
    !git add data/
    !git commit -m "{commit_message}"
    !git push
    
    print(f"\n✓ Committed and pushed to GitHub")
    print(f"Message: {commit_message}")
else:
    print("\n⚠️  No changes to commit")