In [1]:
!pip install youtube-transcript-api transformers tqdm torch datasets


Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.3-py3-none-any.whl.metadata (17 kB)
Downloading youtube_transcript_api-0.6.3-py3-none-any.whl (622 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.3/622.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.3


In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
from tqdm import tqdm
from datasets import Dataset
import json
import torch


In [3]:
# IDs of the YouTube videos whose transcripts will be downloaded
VIDEO_IDS = [
    "Gg25GfA456o",
    "idQb2pB-h2Q",
    "c6Bxbq8UdaI",
    "laWn7_cj434",
]

# Destinations for the raw transcripts and for the generated Q/A records
TRANSCRIPTS_FILE = "/kaggle/working/youtube_transcripts.json"
LABELED_DATA_FILE = "/kaggle/working/labeled_data.json"

# Run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"📊 Using device: {DEVICE}")


📊 Using device: cuda


In [5]:
def fetch_youtube_transcripts(video_ids, output_file=TRANSCRIPTS_FILE):
    """Download the transcript of every video and store them in one JSON file.

    Each successfully fetched transcript is recorded as
    ``{"video_id": ..., "transcript": ...}``. A video whose fetch raises is
    reported on stdout and skipped, so the output file may contain fewer
    entries than ``video_ids``.

    Args:
        video_ids: Iterable of YouTube video IDs.
        output_file: Path of the JSON file to (over)write.
    """
    fetched = []

    for video_id in video_ids:
        try:
            segments = YouTubeTranscriptApi.get_transcript(video_id)
            entry = {"video_id": video_id, "transcript": segments}
            fetched.append(entry)
            print(f"✅ Transcript fetched for video {video_id}")
        except Exception as e:
            # Best effort: one failing video must not abort the whole run.
            print(f"❌ Error fetching transcript for video {video_id}: {e}")

    # Persist whatever was collected, even if some videos failed
    with open(output_file, "w") as out:
        json.dump(fetched, out, indent=4)
    print(f"📁 Transcripts saved to {output_file}")


In [6]:
def generate_labeled_data_batch(
    input_file=TRANSCRIPTS_FILE,
    output_file=LABELED_DATA_FILE,
    batch_size=16,
    model_name="facebook/bart-large-cnn",
):
    """Turn saved transcript segments into question/answer records.

    Loads the transcripts JSON, prompts a ``text2text-generation`` pipeline
    once per transcript segment, and writes a JSON list of
    ``{"question", "answer", "video_id"}`` records. The segment text itself
    is stored as the answer; the model output is stored as the question.

    Args:
        input_file: Path to a JSON list of ``{"video_id", "transcript"}``
            entries, where every transcript item carries a ``"text"`` field.
        output_file: Path of the JSON file the records are written to.
        batch_size: Number of segments per ``Dataset.map`` batch and per
            pipeline batch.
        model_name: Hugging Face checkpoint loaded into the pipeline.
            NOTE(review): the default is a summarization checkpoint, so the
            generated "question" may read more like a summary of the prompt
            than an actual question -- confirm the model choice.

    Returns:
        None. The result is written to ``output_file``.
    """
    # Load transcripts
    with open(input_file, "r") as f:
        transcripts = json.load(f)

    # Flatten transcripts into a list of text segments
    all_segments = [
        {"text": segment["text"], "video_id": video["video_id"]}
        for video in transcripts
        for segment in video["transcript"]
    ]

    print(f"📊 Processing {len(all_segments)} transcript segments...")

    labeled_data = []

    # Only load the (large) model when there is something to process.
    if all_segments:
        # Initialize Hugging Face pipeline, on the GPU when available
        qa_generator = pipeline(
            "text2text-generation",
            model=model_name,
            device=0 if DEVICE == "cuda" else -1
        )

        # Create Hugging Face Dataset
        dataset = Dataset.from_list(all_segments)

        def generate_qas(batch):
            """Generate one question per segment text in ``batch``."""
            questions = qa_generator(
                [f"Generate a question from: {text}" for text in batch["text"]],
                batch_size=batch_size,
            )
            return {
                "question": [q["generated_text"] for q in questions],
                "answer": batch["text"],
                "video_id": batch["video_id"],
            }

        # Process dataset in batches
        results = dataset.map(generate_qas, batched=True, batch_size=batch_size)

        # Walk the rows once instead of indexing the dataset three times per
        # row; the original "text" column is dropped from the saved records.
        labeled_data = [
            {
                "question": row["question"],
                "answer": row["answer"],
                "video_id": row["video_id"],
            }
            for row in results
        ]

    # Save labeled data
    with open(output_file, "w") as f:
        json.dump(labeled_data, f, indent=4)

    print(f"📁 Labeled data saved to {output_file}")


In [7]:
# Download the transcripts for the configured videos and save them to disk
fetch_youtube_transcripts(video_ids=VIDEO_IDS)


✅ Transcript fetched for video Gg25GfA456o
✅ Transcript fetched for video idQb2pB-h2Q
✅ Transcript fetched for video c6Bxbq8UdaI
✅ Transcript fetched for video laWn7_cj434
📁 Transcripts saved to /kaggle/working/youtube_transcripts.json


In [9]:
# Build the question/answer records from the saved transcripts
generate_labeled_data_batch(
    input_file=TRANSCRIPTS_FILE,
    output_file=LABELED_DATA_FILE,
)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

📊 Processing 6432 transcript segments...


Map:   0%|          | 0/6432 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


📁 Labeled data saved to /kaggle/working/labeled_data.json
