In [None]:
pip install youtube-transcript-api pandas

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.2.3-py3-none-any.whl.metadata (24 kB)
Downloading youtube_transcript_api-1.2.3-py3-none-any.whl (485 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.1/485.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.2.3


In [None]:
import pandas as pd
import time
import os
from youtube_transcript_api._api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
import csv


# --- Run configuration ---
# Throttling knobs: how many pending videos a single run handles, and the
# pause in seconds (handed to time.sleep) between consecutive requests.
BATCH_SIZE: int = 10
DELAY: int = 10

# Source dataset, and the CSV that serves as both checkpoint and final output.
INPUT_FILE: str = "/content/drive/MyDrive/queryTubeVideoDataset.csv"
OUTPUT_FILE: str = "transcripts.csv"

# LOAD EXISTING PROGRESS
# Resume from the checkpoint written by a previous run when one exists;
# otherwise start from the raw dataset with an empty "transcript" column.
if os.path.exists(OUTPUT_FILE):
    df = pd.read_csv(OUTPUT_FILE)
    print(f"Loaded existing progress: {len(df)} videos")
else:
    df = pd.read_csv(INPUT_FILE)
    df["transcript"] = None

# A checkpoint whose transcript cells are all empty is parsed as float64
# (all NaN). Writing strings into such a column triggers pandas'
# incompatible-dtype deprecation, so keep the column object-typed.
df["transcript"] = df["transcript"].astype("object")

# Filter videos that don't have transcripts yet
pending_videos = df[df["transcript"].isna()]
print(f"Videos remaining: {len(pending_videos)}")

# Process only up to the batch limit. Ids are de-duplicated because each result
# is written to every row sharing that id, so one request per id is enough.
video_ids = pending_videos["id"].drop_duplicates().tolist()[:BATCH_SIZE]
total = len(video_ids)  # may be smaller than BATCH_SIZE on the final run

ytt_api = YouTubeTranscriptApi()

# The try/finally guarantees the checkpoint is written even when the cell is
# interrupted (e.g. during one of the sleeps) or an unexpected error escapes.
try:
    for i, vid in enumerate(video_ids, start=1):
        print(f"\nProcessing {i}/{total}: {vid}")
        try:
            transcript = ytt_api.fetch(vid)
            transcript_text = " ".join(snippet.text for snippet in transcript)
            df.loc[df["id"] == vid, "transcript"] = transcript_text
            print(f"Transcript fetched for {vid}")
        except NoTranscriptFound:
            # Store a marker so this video is not retried on the next run.
            print(f"No transcript found for {vid}")
            df.loc[df["id"] == vid, "transcript"] = "No transcript found"
        except TranscriptsDisabled:
            # Same idea: record the outcome so the video leaves the pending set.
            print(f"Transcripts disabled for {vid}")
            df.loc[df["id"] == vid, "transcript"] = "Transcripts disabled"
        except Exception as e:
            # NOTE(review): nothing is recorded for this video, so it stays
            # pending and will be the first one retried on the next run.
            print(f"Error fetching transcript for {vid}: {e}")
            break  # stop here to prevent IP ban

        # Delay to avoid IP blocks; no need to wait after the last video.
        if i < total:
            time.sleep(DELAY)
finally:
    # Save progress
    df.to_csv(
        OUTPUT_FILE,
        index=False,
        encoding="utf-8",
        quoting=csv.QUOTE_ALL,
    )
    # Report the file that was actually written.
    print(f"\n Progress saved in '{OUTPUT_FILE}'")
    print(" Next time you run the script, it will continue from where it stopped!")


✅ Loaded existing progress: 50 videos
📺 Videos remaining: 10

🎬 Processing 1/10: XMoK33jsgcs
✅ Transcript fetched for XMoK33jsgcs

🎬 Processing 2/10: 2NfTEi1CyYs
✅ Transcript fetched for 2NfTEi1CyYs

🎬 Processing 3/10: M-DzoNzmJI0
✅ Transcript fetched for M-DzoNzmJI0

🎬 Processing 4/10: Fmw1i1cBYq4
✅ Transcript fetched for Fmw1i1cBYq4

🎬 Processing 5/10: 2TWCEdezzqI
✅ Transcript fetched for 2TWCEdezzqI

🎬 Processing 6/10: Rf9jbnzt2UU
✅ Transcript fetched for Rf9jbnzt2UU

🎬 Processing 7/10: 02dzgC_Ba70
✅ Transcript fetched for 02dzgC_Ba70

🎬 Processing 8/10: 304i_BWgXNc
✅ Transcript fetched for 304i_BWgXNc

🎬 Processing 9/10: cp0_xWxLGaI
✅ Transcript fetched for cp0_xWxLGaI

🎬 Processing 10/10: e8aWaQAb574
✅ Transcript fetched for e8aWaQAb574

💾 Progress saved in 'videos_with_transcripts.csv'
🔁 Next time you run the script, it will continue from where it stopped!
