In [49]:
import os
import whisper
from transformers import pipeline
import yt_dlp as youtube_dl
from bertopic import BERTopic
from tqdm import tqdm
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
# Function to download the audio using yt-dlp
def download_audio(video_link, output_audio_path):
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav", "preferredquality": "192"}],
        "postprocessor_args": ["-ar", "16000"],
        "prefer_ffmpeg": True,
        "keepvideo": False,
        "outtmpl": output_audio_path + ".%(ext)s",
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_link])

In [51]:
# Function to transcribe audio and extract segments using Whisper
def transcribe_audio_with_timestamps(audio_path, model_name="base"):
    model = whisper.load_model(model_name)
    print("Transcribing audio...")
    result = model.transcribe(audio_path)
    transcription = result["text"]
    segments = result["segments"]  # Contains timestamps per segment
    return transcription, segments

In [52]:
# Summarization using transformer-based summarization (e.g., BART)
def summarize_text_in_chunks(text, chunk_size=1024):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

    # Split text into chunks
    text_chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
    summaries = []

    print("Summarizing text in chunks...")
    for chunk in tqdm(text_chunks, desc="Summarizing"):
        summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
        summaries.append(summary)

    return " ".join(summaries)

In [53]:
# Function to dynamically segment the transcription into topics using BERTopic
def topic_segmentation(segments):
    texts = [segment["text"] for segment in segments]
    topic_model = BERTopic()  # Initialize BERTopic model

    print("Segmenting topics...")
    topics, _ = topic_model.fit_transform(texts)

    # Group segments by topics with timeframes
    topic_segments = {}
    for idx, topic in enumerate(topics):
        if topic not in topic_segments:
            topic_segments[topic] = {
                "text": [],
                "start_time": segments[idx]["start"],
                "end_time": segments[idx]["end"],
            }
        topic_segments[topic]["text"].append(segments[idx]["text"])
        topic_segments[topic]["end_time"] = segments[idx]["end"]  # Update the end time

    return topic_segments, topic_model

In [54]:
# Generate summaries for each segmented topic
def summarize_topics(topic_segments, chunk_size=1024):
    topic_summaries = {}
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

    print("Summarizing topics...")
    for topic, data in tqdm(topic_segments.items(), desc="Summarizing each topic"):
        full_text = " ".join(data["text"])  # Combine all texts for this topic

        # Split the full text into chunks to handle large inputs
        text_chunks = [full_text[i : i + chunk_size] for i in range(0, len(full_text), chunk_size)]
        summaries = []

        for chunk in text_chunks:
            summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)[0]["summary_text"]
            summaries.append(summary)

        # Combine chunk summaries and store them
        topic_summaries[topic] = {
            "summary": " ".join(summaries),
            "start_time": data["start_time"],
            "end_time": data["end_time"],
            "full_text": full_text,  # Added for RAG retrieval
        }

    return topic_summaries

In [55]:
# Main function to process video, transcribe, segment by topic, summarize, and enable Q&A with ChatGPT
def process_lecture_video(video_link, audio_output_path):
    # Step 1: Download the audio from the video
    print("STEP 1: Downloading audio...")
    download_audio(video_link, audio_output_path)

    # Step 2: Transcribe the audio and get timestamps for each segment
    print("\nSTEP 2: Transcribing audio...")
    transcription, segments = transcribe_audio_with_timestamps(audio_output_path + ".wav")

    # Step 3: Generate overall summary of the lecture
    print("\nSTEP 3: Generating overall summary...")
    overall_summary = summarize_text_in_chunks(transcription, chunk_size=1024)

    # Step 4: Segment the transcription by topics dynamically
    print("\nSTEP 4: Segmenting transcription into topics...")
    topic_segments, topic_model = topic_segmentation(segments)

    # Step 5: Summarize each segmented topic
    print("\nSTEP 5: Summarizing each topic...")
    topic_summaries = summarize_topics(topic_segments)

    return overall_summary, topic_summaries, topic_model

In [56]:
# Set up OpenAI API Key for ChatGPT
client = OpenAI(
    api_key="OPENAI_API_KEY",
    # api_key=os.environ.get("OPENAI_API_KEY"),  # Ensure your API key is set in the environment variable
)


# Function to create embeddings for segments
def create_embeddings(topic_summaries):
    print("Creating embeddings for topic summaries...")
    embeddings = {}
    for topic, data in tqdm(topic_summaries.items(), desc="Embedding topics"):
        response = client.embeddings.create(
            input=data["full_text"],
            model="text-embedding-ada-002",
        )
        embeddings[topic] = {
            "embedding": response["data"][0]["embedding"],
            "start_time": data["start_time"],
            "end_time": data["end_time"],
            "text": data["full_text"],
            "summary": data["summary"],
        }
    return embeddings


# Function to handle queries using RAG
def query_with_rag(question, embeddings):
    print("Processing query with RAG...")
    # Create embedding for the question
    question_embedding_response = client.embeddings.create(
        input=question,
        model="text-embedding-ada-002",
    )
    question_embedding = question_embedding_response["data"][0]["embedding"]

    # Compute similarities
    similarities = []
    for topic, data in embeddings.items():
        sim = cosine_similarity(
            [question_embedding],
            [data["embedding"]],
        )[
            0
        ][0]
        similarities.append((sim, topic))

    # Get the most relevant topics
    similarities.sort(reverse=True)
    top_topics = [topic for _, topic in similarities[:3]]  # Get top 3 relevant topics

    # Combine the texts of the most relevant topics
    context = " ".join([embeddings[topic]["text"] for topic in top_topics])

    # Use the context to answer the question
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context: {context}"},
            {"role": "user", "content": f"Question: {question}"},
        ],
        model="gpt-4o",
    )

    answer = response["choices"][0]["message"]["content"].strip()

    # Get relevant timeframes
    relevant_timeframes = [(embeddings[topic]["start_time"], embeddings[topic]["end_time"]) for topic in top_topics]

    return answer, relevant_timeframes

In [57]:
# Example Usage:
video_link = "https://www.youtube.com/watch?v=AhyznRSDjw8"
audio_output_path = "downloaded_audio"

In [58]:
# # Process the lecture video to get summaries and topics
# overall_summary, topic_summaries, topic_model = process_lecture_video(video_link, audio_output_path)

In [59]:
# Step 1: Download the audio from the video
print("STEP 1:....")
download_audio(video_link, audio_output_path)

STEP 1:....
[youtube] Extracting URL: https://www.youtube.com/watch?v=AhyznRSDjw8
[youtube] AhyznRSDjw8: Downloading webpage
[youtube] AhyznRSDjw8: Downloading ios player API JSON
[youtube] AhyznRSDjw8: Downloading web creator player API JSON
[youtube] AhyznRSDjw8: Downloading m3u8 information
[info] AhyznRSDjw8: Downloading 1 format(s): 251
[download] Destination: downloaded_audio.webm
[download] 100% of   45.54MiB in 00:00:06 at 7.10MiB/s     
[ExtractAudio] Destination: downloaded_audio.wav
Deleting original file downloaded_audio.webm (pass -k to keep)


In [60]:
# Step 2: Transcribe the audio and get timestamps for each segment
print("STEP 2:....")
transcription, segments = transcribe_audio_with_timestamps(audio_output_path + ".wav")

STEP 2:....
Transcribing audio...


In [61]:
# Step 3: Generate overall summary of the lecture
print("STEP 3:....")
overall_summary = summarize_text_in_chunks(transcription, chunk_size=1024)

STEP 3:....
Summarizing text in chunks...


Summarizing: 100%|██████████| 54/54 [04:41<00:00,  5.20s/it]


In [62]:
# Step 4: Segment the transcription by topics dynamically
print("STEP 4:....")
topic_segments, topic_model = topic_segmentation(segments)

STEP 4:....
Segmenting topics...


In [63]:
# Step 5: Summarize each segmented topic
print("STEP 5:....")
topic_summaries = summarize_topics(topic_segments)

STEP 5:....
Summarizing topics...


Summarizing each topic:   5%|▌         | 1/20 [00:03<01:09,  3.65s/it]Your max_length is set to 150, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Summarizing each topic:  20%|██        | 4/20 [00:30<02:05,  7.82s/it]Your max_length is set to 150, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Summarizing each topic:  30%|███       | 6/20 [01:51<06:51, 29.38s/it]Your max_length is set to 150, but your input_length is only 93. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)
Summarizing each topic:  35%|███▌      | 7/

In [64]:
# Output the overall summary and topic summaries
print("Overall Summary of the lecture:", overall_summary)
for topic, summary in topic_summaries.items():
    print(f"Topic {topic}: {summary['summary']} (Time: {summary['start_time']} to {summary['end_time']})")

Overall Summary of the lecture: Two lectures today are really exciting because they start to move beyond a lot of what we've talked about in the class so far, which is focusing a lot on really static data sets. In today, in this lecture right now, I'm going to start to talk about how we can learn about this very long-standing field of reinforcement learning. In the real world, you have your deep learning model actually deployed together with the data. This is the key motivation of reinforcement learning. You're going to try and learn through reinforcement, making mistakes in your world, and then collecting data on those mistakes to learn how to improve. "I'm hoping for a 5-0 not to lose any games, but I think the realistic goal would be 4-1 in my favour" "I wasn't expecting that good. Everything that he did was proper. It was calculated and it was done well" We've really covered two different types of learning in this course to date. Supervised learning is in this domain where we're gi

In [65]:
# Create embeddings for RAG
embeddings = create_embeddings(topic_summaries)
    
# Example: Ask a query and get answer with timeframe
question = "What is reward function?"
answer, timeframe = query_with_rag(question, embeddings)
print(f"Answer: {answer}, Relevant Timeframe: {timeframe}")

Creating embeddings for topic summaries...


Embedding topics:   0%|          | 0/20 [00:00<?, ?it/s]


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: OPENAI_A**_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}