In [1]:
!pip install faiss-cpu faster_whisper yt_dlp google-genai

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting faster_whisper
  Downloading faster_whisper-1.2.1-py3-none-any.whl.metadata (16 kB)
Collecting yt_dlp
  Downloading yt_dlp-2025.10.22-py3-none-any.whl.metadata (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.0/176.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting ctranslate2<5,>=4.0 (from faster_whisper)
  Downloading ctranslate2-4.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster_whisper)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting av>=11 (from faster_whisper)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster_whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.me

In [3]:
import os
import shutil
from yt_dlp import YoutubeDL
from pydub import AudioSegment
from faster_whisper import WhisperModel
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from google import generativeai
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

#  CONFIG
# The Gemini API key is read from the GOOGLE_API_KEY environment variable so
# that no credential is stored in the notebook or its history.
# NOTE(review): a key was previously committed in this cell; treat it as
# compromised and revoke/rotate it.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "export it before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

# Shared model handles used by the pipeline functions below.
whisper = WhisperModel("small", device="cuda" if torch.cuda.is_available() else "cpu")
gemini = genai.GenerativeModel("gemini-2.0-flash")
embedder = SentenceTransformer("all-MiniLM-L6-v2")


#  SUMMARIZATION FUNCTION
def summarize_text(text):
    """Summarize a transcript with the module-level Gemini model.

    Args:
        text: Transcript text to summarize.

    Returns:
        The model's summary with surrounding whitespace stripped. An empty
        string is returned, without contacting the API, when ``text`` is
        empty, ``None`` or only whitespace (for example a silent audio chunk).
    """
    # Nothing to summarize: avoid a pointless (and billable) request.
    if not text or not text.strip():
        return ""

    prompt = f"Provide a detailed, clear summary of the following transcript:\n\n{text}"
    response = gemini.generate_content(prompt)
    return response.text.strip()


#  STEP 1: DOWNLOAD AUDIO
def download_audio(youtube_url, output_file="./audio/audio_file"):
    """Download a video's audio track with yt-dlp and extract it as MP3.

    Args:
        youtube_url: URL handed to ``YoutubeDL.extract_info``.
        output_file: Output template (without extension) passed to yt-dlp as
            ``outtmpl``.

    Returns:
        ``output_file`` with ``".mp3"`` appended, i.e. the path the
        FFmpegExtractAudio post-processor is configured to produce.
    """
    # Post-processor that converts the downloaded stream to MP3.
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    options = dict(
        format='bestaudio/best',
        outtmpl=output_file,
        postprocessors=[mp3_extractor],
        noplaylist=True,
        quiet=False,
        continuedl=True,
    )

    with YoutubeDL(options) as downloader:
        video_info = downloader.extract_info(youtube_url, download=True)
        duration = video_info.get("duration")
        # Whole minutes when the duration is known and non-zero.
        minutes = duration // 60 if duration else 'Unknown'
        print(f"Audio downloaded — Duration: {minutes} minutes")

    return output_file + ".mp3"


#  STEP 2: SPLIT LONG AUDIO
def split_audio(file_path, chunk_length_ms=5 * 60 * 1000):  # 5 minutes
    """Cut an audio file into consecutive pieces and export each one as MP3.

    Args:
        file_path: Path of the audio file loaded with ``AudioSegment.from_file``.
        chunk_length_ms: Slice width used when stepping through the audio.

    Returns:
        List of the exported paths, ``chunks/chunk_<i>.mp3``, in order.
    """
    print("Splitting long audio into chunks...")
    recording = AudioSegment.from_file(file_path)

    # Slice the whole recording first, then write the pieces out.
    pieces = []
    for start in range(0, len(recording), chunk_length_ms):
        pieces.append(recording[start:start + chunk_length_ms])

    os.makedirs("chunks", exist_ok=True)

    chunk_files = []
    for number, piece in enumerate(pieces):
        destination = f"chunks/chunk_{number}.mp3"
        piece.export(destination, format="mp3")
        chunk_files.append(destination)

    print(f"Split into {len(chunk_files)} chunks.")
    return chunk_files


#  STEP 3: TRANSCRIBE AND SUMMARIZE EACH CHUNK
def transcribe_and_summarize_chunks(chunk_files):
    """Transcribe every audio chunk and summarize each transcript.

    Args:
        chunk_files: Sequence of audio file paths, processed in order.

    Returns:
        A tuple ``(full_transcription, final_summary, text_chunks)``: the
        space-joined transcript of all chunks, the per-chunk summaries joined
        by newlines, and the list of individual chunk transcripts.
    """
    print("\nTranscribing and summarizing audio chunks...\n")
    total = len(chunk_files)
    text_chunks = []
    summaries = []

    for number, chunk_path in enumerate(chunk_files, start=1):
        print(f"Processing chunk {number}/{total} ...")
        segments, _info = whisper.transcribe(chunk_path, beam_size=3)
        chunk_text = " ".join(segment.text for segment in segments).strip()
        text_chunks.append(chunk_text)
        print(f"Chunk {number} transcription done. Length: {len(chunk_text)} chars.")

        chunk_summary = summarize_text(chunk_text)
        summaries.append(f"--- Summary for chunk {number} ---\n{chunk_summary}\n")

    # Joining once is equivalent to appending "<text> " per chunk and stripping.
    full_transcription = " ".join(text_chunks).strip()
    final_summary = "\n".join(summaries).strip()

    print("\n=== FINAL CONCATENATED SUMMARY ===\n")
    print(final_summary)
    return full_transcription, final_summary, text_chunks


#  STEP 4: BUILD FAISS INDEX
def build_faiss_index(text_chunks):
    """Embed the transcript chunks and store them in a flat L2 FAISS index.

    Args:
        text_chunks: Non-empty sequence of transcript strings. Position ``i``
            in this sequence corresponds to vector ``i`` in the index.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per chunk.

    Raises:
        ValueError: If ``text_chunks`` is empty; there would be no embedding
            dimension to size the index with.
    """
    if len(text_chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of text chunks.")

    print("\nBuilding FAISS index...")
    embeddings = embedder.encode(text_chunks, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix; coerce defensively.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    print(f"Indexed {len(text_chunks)} chunks.")
    return index


#  STEP 5: RETRIEVE RELEVANT CONTEXT
def retrieve_relevant_chunks(query, index, text_chunks, top_k=3):
    """Return the transcript chunks closest to ``query`` in the FAISS index.

    Args:
        query: Text to embed and search for.
        index: FAISS index whose vector ``i`` corresponds to ``text_chunks[i]``.
        text_chunks: Chunk texts in the same order they were indexed.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk strings, nearest first. Returns an empty list
        when there are no chunks or ``top_k`` is not positive.
    """
    available = len(text_chunks)
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with label -1, which would silently index the *last* chunk.
    k = min(top_k, available)
    if k <= 0:
        return []

    query_emb = embedder.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_emb, k)
    # Drop padding/out-of-range labels in case the index and list disagree.
    return [text_chunks[i] for i in indices[0] if 0 <= i < available]


#  STEP 6: GEMINI Q&A
def chat_with_gemini(index, text_chunks, history_messages=4):
    """Run an interactive question/answer loop over the video transcript.

    Each turn retrieves the most relevant transcript chunks for the user's
    message, builds a prompt from that context plus recent chat history, and
    prints the Gemini reply. The loop ends when the user types ``exit``
    (case-insensitive, surrounding whitespace ignored) or input is closed or
    interrupted.

    Args:
        index: FAISS index built over ``text_chunks``.
        text_chunks: Transcript chunks in the order they were indexed.
        history_messages: Number of most recent history lines (user and
            assistant messages each count as one) included in the prompt.
            The default of 4 corresponds to the last two exchanges.

    Returns:
        None.
    """
    print("\nConversational mode activated! (type 'exit' to quit)\n")
    conversation_history = []

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel: leave the loop cleanly so
            # the caller can continue (e.g. to clean up temporary files).
            print("\nConversation ended.")
            break

        if user_input.lower() == "exit":
            print("Conversation ended.")
            break
        if not user_input:
            # Nothing to ask; do not spend a retrieval and an API call on it.
            continue

        # Retrieve relevant context for this turn
        context_snippets = retrieve_relevant_chunks(user_input, index, text_chunks)
        context = "\n".join(context_snippets)

        # Maintain a short conversation history. A slice of [-0:] would return
        # the whole list, so a non-positive window is handled explicitly.
        if history_messages > 0:
            recent_history = "\n".join(conversation_history[-history_messages:])
        else:
            recent_history = ""

        # Build prompt
        prompt = f"""You are an educational conversational assistant.
        Use the context below from a video transcript and prior chat history to reply naturally.

        Context:
        {context}

        Chat History:
        {recent_history}

        User: {user_input}
        Assistant:"""

        try:
            response = gemini.generate_content(prompt)
            reply = response.text.strip()
        except Exception as e:
            # One failed turn (request error, or a response without usable
            # text) should not end the whole session; report it and continue.
            print(f"Assistant error: {e}\n")
            continue

        print(f"Assistant: {reply}\n")

        conversation_history.append(f"User: {user_input}")
        conversation_history.append(f"Assistant: {reply}")


#  STEP 7: CLEANUP
def cleanup_files(audio_path="./audio/audio_file.mp3", chunks_dir="chunks"):
    """Delete the downloaded audio file and the files inside the chunk folder.

    Only regular files directly inside ``chunks_dir`` are removed; the folder
    itself and any sub-folders are kept. If ``chunks_dir`` does not exist it is
    created. Any exception is caught and reported with a printed message.

    Args:
        audio_path: Path of the audio file to delete if it exists.
        chunks_dir: Directory whose files are deleted.
    """
    try:
        if os.path.exists(audio_path):
            os.remove(audio_path)

        if not os.path.exists(chunks_dir):
            os.makedirs(chunks_dir, exist_ok=True)
        else:
            for entry in os.listdir(chunks_dir):
                candidate = os.path.join(chunks_dir, entry)
                # Leave sub-directories untouched.
                if not os.path.isfile(candidate):
                    continue
                os.remove(candidate)

        print("Cleanup complete.")
    except Exception as err:
        print(f"Cleanup error: {err}")


#  MAIN PIPELINE
if __name__ == "__main__":
    # Strip stray whitespace that often comes along with a pasted URL.
    url = input("Enter YouTube URL: ").strip()
    audio_path = download_audio(url)

    try:
        chunk_files = split_audio(audio_path)
        transcribed_text, summarized_text, text_chunks = transcribe_and_summarize_chunks(chunk_files)

        index = build_faiss_index(text_chunks)

        chat_with_gemini(index, text_chunks)
    finally:
        # Remove the downloaded audio and chunk files even when a stage above
        # raises, so failed runs do not leave temporary files behind.
        cleanup_files(audio_path, "chunks")

Enter YouTube URL: https://www.youtube.com/watch?v=ry9SYnV3svc
[youtube] Extracting URL: https://www.youtube.com/watch?v=ry9SYnV3svc
[youtube] ry9SYnV3svc: Downloading webpage
[youtube] ry9SYnV3svc: Downloading android sdkless player API JSON
[youtube] ry9SYnV3svc: Downloading tv client config
[youtube] ry9SYnV3svc: Downloading tv player API JSON
[youtube] ry9SYnV3svc: Downloading web safari player API JSON
[youtube] ry9SYnV3svc: Downloading player e237d4c5-main


         player = https://www.youtube.com/s/player/e237d4c5/player_ias.vflset/en_US/base.js
         n = 5yY62RU_IGIYd_i1IBb ; player = https://www.youtube.com/s/player/e237d4c5/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


[youtube] ry9SYnV3svc: Downloading m3u8 information
[info] ry9SYnV3svc: Downloading 1 format(s): 251-11
[download] Destination: ./audio/audio_file
[download] 100% of    1.58MiB in 00:00:00 at 10.17MiB/s  
[ExtractAudio] Destination: ./audio/audio_file.mp3
Deleting original file ./audio/audio_file (pass -k to keep)
Audio downloaded — Duration: 2 minutes
Splitting long audio into chunks...
Split into 1 chunks.

Transcribing and summarizing audio chunks...

Processing chunk 1/1 ...
Chunk 1 transcription done. Length: 1113 chars.

=== FINAL CONCATENATED SUMMARY ===

--- Summary for chunk 1 ---
Mark is discussing his new job and expressing his positive experiences. He enjoys the friendly and helpful coworkers, the energetic and fun atmosphere, and his humorous and flexible boss. He highlights the freedom to set his own hours, arrive and leave work when he chooses, and the relaxed dress code. The other person expresses envy over the casual attire, as they dislike wearing a suit daily. The co