In [None]:
# Install into the running kernel's environment (%pip, not !pip) and pin the
# version so a fresh run resolves the same release every time.
%pip install -q youtube-transcript-api==1.2.3


Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.2.3-py3-none-any.whl.metadata (24 kB)
Downloading youtube_transcript_api-1.2.3-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.1/485.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.2.3


In [None]:
# Install Whisper, yt-dlp (to download video/audio) and ffmpeg.
# %pip targets the kernel's own environment; apt-get is used instead of apt
# because apt's command-line interface is not meant for scripted use.
%pip install -q git+https://github.com/openai/whisper.git
%pip install -q yt-dlp
!sudo apt-get update -qq && sudo apt-get install -y -qq ffmpeg


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.9/175.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 https://cli.github.com/packages stable/main amd64 Packages [3

In [None]:
import concurrent.futures
import csv
import os

import requests

# SearchAPI credential, read from the SEARCHAPI_API_KEY environment variable so
# the secret is never stored in the notebook or its version history.
# Set it before running, e.g. `%env SEARCHAPI_API_KEY=...` in a private cell.
# When the variable is unset the key is empty and the API will reject requests.
# NOTE: any key that was previously committed in this cell should be treated as
# compromised and rotated.
API_KEY = os.environ.get("SEARCHAPI_API_KEY", "")

# CSV of videos to process; must contain an "id" column.
INPUT_CSV = "ProgrammingWithHarry.csv"
# Destination CSV with "video_id" and "transcript" columns.
OUTPUT_CSV = "output_transcripts.csv"

def fetch_transcript(video_id, timeout=30):
    """Fetch the transcript of one video from the SearchAPI transcripts engine.

    Args:
        video_id: Video id sent as the ``video_id`` query parameter.
        timeout: Seconds to wait on the HTTP request before giving up. Without
            a timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        A ``(video_id, text)`` tuple. On success ``text`` is every segment's
        ``"text"`` joined with single spaces and stripped. Otherwise ``text``
        is a status message: the HTTP status code for a non-200 response, a
        "no transcript" notice when the payload has no segments, or the
        exception text for any error raised while requesting or parsing.
    """
    url = "https://www.searchapi.io/api/v1/search"
    headers = {"Authorization": f"Bearer {API_KEY}"}
    params = {
        "engine": "youtube_transcripts",
        "video_id": video_id,
        "lang": "en",
        "transcript_type": "auto"
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if response.status_code != 200:
            return video_id, f"❌ Error {response.status_code}"
        data = response.json()

        if "transcripts" not in data or not data["transcripts"]:
            return video_id, "⚠️ No transcript available"

        transcript_text = " ".join(seg["text"] for seg in data["transcripts"])
        return video_id, transcript_text.strip()

    except Exception as e:
        # Deliberately broad: a failure for one video (network error, timeout,
        # malformed payload) is reported in its row instead of aborting a batch.
        return video_id, f"🚨 Error: {e}"

def process_videos(input_csv=None, output_csv=None, max_workers=10):
    """Fetch transcripts for every video id in a CSV and save them to a CSV.

    Args:
        input_csv: Path of the CSV to read; it must have an ``id`` column.
            Defaults to ``INPUT_CSV``.
        output_csv: Path of the CSV to write with ``video_id`` and
            ``transcript`` columns. Defaults to ``OUTPUT_CSV``.
        max_workers: Number of threads fetching transcripts concurrently.

    Raises:
        KeyError: If the input CSV has rows but no ``id`` column.

    Rows whose ``id`` cell is missing or blank are skipped. Output rows are
    written in the same order as the input rows, regardless of the order in
    which the requests finish.
    """
    input_csv = INPUT_CSV if input_csv is None else input_csv
    output_csv = OUTPUT_CSV if output_csv is None else output_csv

    video_ids = []
    with open(input_csv, "r", encoding="utf-8") as infile:
        for row in csv.DictReader(infile):
            # DictReader yields None for cells missing from a short row.
            video_id = (row["id"] or "").strip()
            if video_id:
                video_ids.append(video_id)

    print(f"🚀 Fetching transcripts for {len(video_ids)} videos in parallel...")

    # One slot per input row so results land in input order even though the
    # futures complete in arbitrary order.
    results = [None] * len(video_ids)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        positions = {
            executor.submit(fetch_transcript, vid): index
            for index, vid in enumerate(video_ids)
        }
        for future in concurrent.futures.as_completed(positions):
            video_id, transcript = future.result()
            results[positions[future]] = {"video_id": video_id, "transcript": transcript}
            print(f"✅ {video_id} processed")

    with open(output_csv, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=["video_id", "transcript"])
        writer.writeheader()
        writer.writerows(results)

    print("\n🎉 All transcripts saved successfully to:", output_csv)

# Run the whole batch when this code is executed directly (as a script or as a
# notebook cell, where __name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    process_videos()


🚀 Fetching transcripts for 113 videos in parallel...
✅ oAYuGx8BINI processed
✅ jaq1KWi2_DU processed
✅ 2qXziTfLTvQ processed
✅ FIPOaXnj5Ho processed
✅ UaQZAr2cTV8 processed
✅ XKqCWD_JoxQ processed
✅ s1MUcxO-x8Q processed
✅ bj_WGitydcc processed
✅ ZxQ5ERe43YE processed
✅ NU-laLCNpjI processed
✅ lXalx9RLMlQ processed
✅ Zcspn-B3V-E processed
✅ fpWfFghWVp4 processed
✅ PBqwAJgrw3k processed
✅ S5uLkxNUk-M processed
✅ wRWdRwAb9DM processed
✅ zbz_q7uUPss processed
✅ Z51jMk6N1zY processed
✅ fl2wY8XnNP4 processed
✅ JF6c8jvS3c8 processed
✅ agXU_IjTy0Y processed
✅ -DWYKXE0qy0 processed
✅ hraAvEwZ3o4 processed
✅ DW7OaXN9wMs processed
✅ pF3KXOnxqr4 processed
✅ 6o8SL23h_xY processed
✅ OAVOl-aSOJc processed
✅ zJNgK7VXp54 processed
✅ qHB2jUvAlGo processed
✅ _MF8L7ZxwRE processed
✅ bhIK5gB5BDQ processed
✅ 7Z_ZTsop7qQ processed
✅ EyEqWFvLDT8 processed
✅ zg1PacxR4r8 processed
✅ VdYnH9z6CeY processed
✅ yvyFzj0C60Y processed
✅ al0tAGjpMO4 processed
✅ JO58rTZI3ak processed
✅ kkfXDu8jc9I processed
✅ IdCs-an5Q