In [57]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the file's path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# PROJECT DESCRIPTION

Provide a YouTube video link, and this notebook will generate a summary of the video and let you ask questions about its content.

> For a test run, click the "**Run All**" button, enter the URL below, and you will find the results in the "**Results**" section of the notebook.

In [58]:
# Prompt for the video URL; the value is stored in `yt_url` and later passed to
# download_audio() in the Results section.
yt_url = input("Input YouTube URL")
# Example: https://youtu.be/ad79nYk2keg

Input YouTube URL https://youtu.be/ad79nYk2keg


# Importing initial packages

**Packages required:**

In [59]:
# Install the packages this notebook needs; docarray and transformers are pinned to exact versions.
!pip install langchain docarray==0.38.0 yt_dlp openai-whisper transformers==4.44.0

Collecting pydantic<2.0.0,>=1.10.2 (from docarray==0.38.0)
  Using cached pydantic-1.10.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (152 kB)
INFO: pip is looking at multiple versions of langchain to determine which version is compatible with other requirements. This could take a while.
Collecting langchain
  Using cached langchain-0.3.8-py3-none-any.whl.metadata (7.1 kB)
  Using cached langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
  Using cached langchain-0.3.6-py3-none-any.whl.metadata (7.1 kB)
  Using cached langchain-0.3.5-py3-none-any.whl.metadata (7.1 kB)
  Using cached langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
  Using cached langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
  Using cached langchain-0.3.2-py3-none-any.whl.metadata (7.1 kB)
INFO: pip is still looking at multiple versions of langchain to determine which version is compatible with other requirements. This could take a while.
  Using cached langchain-0.3.1-py3-none-any.whl.met

In [60]:
# Upgrade langchain-community (-U) to the newest available release.
# NOTE(review): the captured output shows this step installing pydantic 2.x, while the
# earlier install collected pydantic<2.0.0 for docarray==0.38.0 -- confirm the two are compatible.
!pip install -U langchain-community

Collecting langchain<0.4.0,>=0.3.8 (from langchain-community)
  Using cached langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain-community)
  Using cached langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain<0.4.0,>=0.3.8->langchain-community)
  Using cached langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain<0.4.0,>=0.3.8->langchain-community)
  Using cached pydantic-2.10.2-py3-none-any.whl.metadata (170 kB)
Using cached langchain-0.3.9-py3-none-any.whl (1.0 MB)
Using cached langchain_core-0.3.21-py3-none-any.whl (409 kB)
Using cached langchain_text_splitters-0.3.2-py3-none-any.whl (25 kB)
Using cached pydantic-2.10.2-py3-none-any.whl (456 kB)
Installing collected packages: pydantic, langchain-core, langchain-text-splitters, langchain
  Attempting uninstall: pydantic
    Found existing installation

In [61]:
import os
import glob
from pathlib import Path
import yt_dlp 
import whisper

# Download Audio

In [62]:
def download_audio(yt_url, output_dir="/kaggle/working/files/audio"):
    """Download the audio track of a video with yt-dlp and convert it to WAV.

    The file is saved inside ``output_dir`` and named after the video title.
    Download errors are reported with a printed message instead of being
    raised, so the function always returns ``None``.

    Args:
        yt_url: URL of the video to download.
        output_dir: Directory that receives the audio file. It is created
            (including parents) when it does not exist yet. Defaults to the
            location the rest of the notebook reads from.
    """
    # Create the directory that will hold the downloaded audio files
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # yt-dlp configuration
    ydl_config = {
        "format": "bestaudio/best",  # Best audio quality available
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",  # Use FFmpeg for audio extraction
                "preferredcodec": "wav",     # Save as wav format
                "preferredquality": "192",   # Optional, relevant for codecs like mp3
            }
        ],
        "outtmpl": os.path.join(output_dir, "%(title)s.%(ext)s"),  # Save file with title as name
        # Enables yt-dlp's [debug] output (see the cell output: it includes the
        # full media URL), which is more than plain progress reporting.
        "verbose": True,
    }

    print(f"Downloading audio from {yt_url}")

    # Attempt to download the audio
    try:
        with yt_dlp.YoutubeDL(ydl_config) as ydl:
            ydl.download([yt_url])
    except Exception as e:
        # Best-effort: report the failure and let the notebook keep running
        print(f"Error downloading audio: {e}")
    else:
        # Only announce success once the downloader has been closed without error
        print("Downloading successful!")

# Transcription of downloaded audio file

In [63]:
def get_audiofile_path(audio_dir="/kaggle/working/files/audio"):
    """Return the path of the most recently modified ``.wav`` file in ``audio_dir``.

    Args:
        audio_dir: Directory to search. Defaults to the directory that
            ``download_audio`` writes to by default.

    Returns:
        The path (as returned by ``glob``) of the newest ``.wav`` file.

    Raises:
        FileNotFoundError: If ``audio_dir`` contains no ``.wav`` files.
    """
    # Escape the directory so characters such as "[" or "*" in its name are
    # matched literally; only the "*.wav" part acts as a wildcard.
    pattern = os.path.join(glob.escape(os.fspath(audio_dir)), "*.wav")
    wav_files = glob.glob(pattern)

    if not wav_files:
        raise FileNotFoundError(f"No .wav files found in {audio_dir!r}")

    # glob gives no ordering guarantee, so pick the newest file explicitly
    return max(wav_files, key=os.path.getmtime)

# Whisper to transcribe

In [64]:
# Silence noisy library warnings so they do not clutter the cell outputs
import warnings
# Ignore UserWarning messages attributed to the whisper module
warnings.filterwarnings("ignore", category=UserWarning, module="whisper")
# Ignore FutureWarning messages attributed to the torch module
warnings.filterwarnings("ignore", category=FutureWarning, module="torch")

In [65]:
def transcribe_with_whisper(audio_path):
    """Transcribe the audio file at ``audio_path`` and return the transcript text.

    The Whisper "small" model is loaded on each call; it trades accuracy for
    speed compared with the "medium" and "large" variants.
    """
    transcription = whisper.load_model("small").transcribe(audio_path)
    return transcription['text']

In [66]:
def save_text(text, output_dir):
    """Write ``text`` to a file as UTF-8, replacing any existing content.

    Args:
        text: The string to store.
        output_dir: Path of the output *file* (despite the parameter name).
            Missing parent directories are created.
    """
    # Make sure the folder exists so the write cannot fail on a missing directory
    parent = os.path.dirname(output_dir)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Explicit UTF-8 so the file does not depend on the platform's default
    # encoding; qa() in this notebook reads it back as UTF-8.
    with open(output_dir, "w", encoding="utf-8") as file:
        file.write(text)

    print(f"Text successfully saved to {output_dir}")

# Summary Generation

TextLoader (langchain) -> tokenisation -> summary pipeline -> output

In [67]:
from langchain.document_loaders import TextLoader
from transformers import pipeline, T5Tokenizer
import tiktoken  # OpenAI's tokenizer for token count
from math import ceil

# Load the Hugging Face summarization pipeline with the "t5-small" checkpoint;
# generate_summary() calls it once per text chunk.
summarizer = pipeline("summarization", model="t5-small")

# Load the T5 tokenizer for the same checkpoint; split_text_into_chunks() uses it
# to encode the text into tokens and decode each slice back to text.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Function to split text into chunks based on token count using T5's tokenizer
def split_text_into_chunks(text, max_tokens=512, text_tokenizer=None):
    """Split ``text`` into pieces of at most ``max_tokens`` tokens each.

    The text is encoded to token ids, the id sequence is cut into consecutive
    slices of ``max_tokens`` ids, and every slice is decoded back to a string.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        text_tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            and ``decode(ids, skip_special_tokens=...)``. When omitted, the
            notebook-level ``tokenizer`` is used.

    Returns:
        A list of decoded text chunks in their original order; empty when the
        text produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not a positive number.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")

    active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    # Encode without special tokens: an end-of-sequence id appended to the ids
    # would be counted as content and could form a trailing chunk that decodes
    # to an empty string.
    token_ids = active_tokenizer.encode(text, add_special_tokens=False)

    # Stepping by max_tokens already caps every slice at max_tokens ids, so no
    # further truncation is required.
    return [
        active_tokenizer.decode(token_ids[start:start + max_tokens], skip_special_tokens=True)
        for start in range(0, len(token_ids), max_tokens)
    ]

def generate_summary(file_path):
    """Summarize the text file at ``file_path`` and return the summary string.

    The file is read through TextLoader, cut into token-limited chunks, each
    non-blank chunk is summarized separately, and the partial summaries are
    joined with single spaces.
    """
    # TextLoader returns a list of documents; the text lives in the first one
    transcript = TextLoader(file_path).load()[0].page_content

    # 505 tokens per chunk (adjust max_tokens to the summarization model's limit)
    pieces = split_text_into_chunks(transcript, max_tokens=505)

    # Summarize every chunk that still has content after stripping whitespace
    partial_summaries = [
        summarizer(piece)[0]['summary_text']
        for piece in pieces
        if piece.strip()
    ]

    final_summary = " ".join(partial_summaries)

    print("Summary generated succesfully!")
    return final_summary

**Creating document search**

In [80]:
# Loaded question-answering pipelines, keyed by model name, so repeated qa()
# calls reuse the model instead of loading it again for every question.
_QA_PIPELINE_CACHE = {}


def _get_qa_pipeline(model_name="distilbert-base-uncased-distilled-squad"):
    """Return the question-answering pipeline for ``model_name``, loading it once."""
    if model_name not in _QA_PIPELINE_CACHE:
        _QA_PIPELINE_CACHE[model_name] = pipeline("question-answering", model=model_name)
    return _QA_PIPELINE_CACHE[model_name]


def qa(file_path, question):
    """Answer ``question`` using the text stored in ``file_path`` as context.

    Args:
        file_path: Path of a UTF-8 text file that serves as the context.
        question: The question to answer.

    Returns:
        The string "Answer:" followed by the answer text returned by the
        question-answering pipeline.
    """
    # Install the filter before the pipeline is created so it also applies
    # while the model is being loaded.
    warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

    # Standard QA model for plain text (SQuAD fine-tuned), loaded on first use only
    qa_pipeline = _get_qa_pipeline()

    # The whole file is passed to the pipeline as the context
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    answer = qa_pipeline(question=question, context=text)

    return "Answer:" + answer['answer']

# Results

In [70]:
# Download the audio track for the URL entered at the top of the notebook
download_audio(yt_url)

[debug] Encodings: locale UTF-8, fs utf-8, pref UTF-8, out UTF-8 (No ANSI), error UTF-8 (No ANSI), screen UTF-8 (No ANSI)
[debug] yt-dlp version stable@2024.11.18 from yt-dlp/yt-dlp [7ea278792] (pip) API
[debug] params: {'format': 'bestaudio/best', 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav', 'preferredquality': '192'}], 'outtmpl': '/kaggle/working/files/audio/%(title)s.%(ext)s', 'verbose': True, 'compat_opts': set(), 'http_headers': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-us,en;q=0.5', 'Sec-Fetch-Mode': 'navigate'}}
[debug] Python 3.10.14 (CPython x86_64 64bit) - Linux-6.6.56+-x86_64-with-glibc2.31 (OpenSSL 3.3.2 3 Sep 2024, glibc 2.31)
[debug] exe versions: ffmpeg 4.2.7, ffprobe 4.2.7
[debug] Optional libraries: brotli-None, certifi-2024.08.30, pycrypto-3.20.0, requ

Downloading audio from https://youtu.be/ad79nYk2keg
[youtube] Extracting URL: https://youtu.be/ad79nYk2keg
[youtube] ad79nYk2keg: Downloading webpage
[youtube] ad79nYk2keg: Downloading ios player API JSON
[youtube] ad79nYk2keg: Downloading mweb player API JSON


[debug] Loading youtube-nsig.b46bb280 from cache
[debug] [youtube] Decrypted nsig 8tg3Tlv-8MZGSQp_xiS => ItvNEIjqSKmsMg


[youtube] ad79nYk2keg: Downloading m3u8 information


[debug] Sort order given by extractor: quality, res, fps, hdr:12, source, vcodec, channels, acodec, lang, proto
[debug] Formats sorted by: hasvid, ie_pref, quality, res, fps, hdr:12(7), source, vcodec, channels, acodec, lang, proto, size, br, asr, vext, aext, hasaud, id


[info] ad79nYk2keg: Downloading 1 format(s): 251


[debug] Invoking http downloader on "https://rr4---sn-un57enez.googlevideo.com/videoplayback?expire=1733193527&ei=1xpOZ_j-COXgs8IPhNLtsAo&ip=34.80.255.247&id=o-ADVTXnOiDI1Bx1bQXMRpFcKV4iOvKfbT7E9JcNJ5TJTb&itag=251&source=youtube&requiressl=yes&xpc=EgVo2aDSNQ%3D%3D&met=1733171927%2C&mh=zv&mm=31%2C26&mn=sn-un57enez%2Csn-a5meknzs&ms=au%2Conr&mv=u&mvi=4&pl=25&rms=au%2Cau&bui=AQn3pFTWYB0yB2EKENneO7anp09xS_TKm_2b90GyYRgBIJ1ApQOXNAA0x2tsAIK36D97m1FeuksY5qIQ&spc=qtApAT8Np6yvRTCG48Xbb9fFx6mXfrL7CC_0N0jQ53OVogY&vprv=1&svpuc=1&mime=audio%2Fwebm&rqh=1&gir=yes&clen=5664807&dur=327.821&lmt=1712140764252870&mt=1733171383&fvip=1&keepalive=yes&fexp=51326932%2C51335594&c=IOS&txp=4532434&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cxpc%2Cbui%2Cspc%2Cvprv%2Csvpuc%2Cmime%2Crqh%2Cgir%2Cclen%2Cdur%2Clmt&sig=AJfQdSswRQIhAK0gudk57QfYWRKDBRGJptJhOzYAi8uZsYed41G5mS_0AiAmT9wQznbPeSth0Y7ncoIclNvUBc-q1g_a1oZgDeo-sg%3D%3D&lsparams=met%2Cmh%2Cmm%2Cmn%2Cms%2Cmv%2Cmvi%2Cpl%2Crms&lsig=AGluJ3MwRQIhAKEMw-2

[download] Destination: /kaggle/working/files/audio/What Is AI？ ｜ Artificial Intelligence ｜ What is Artificial Intelligence？ ｜ AI In 5 Mins ｜Simplilearn.webm
[download] 100% of    5.40MiB in 00:00:00 at 27.79MiB/s  


[debug] ffmpeg command line: ffprobe -show_streams 'file:/kaggle/working/files/audio/What Is AI？ ｜ Artificial Intelligence ｜ What is Artificial Intelligence？ ｜ AI In 5 Mins ｜Simplilearn.webm'


[ExtractAudio] Destination: /kaggle/working/files/audio/What Is AI？ ｜ Artificial Intelligence ｜ What is Artificial Intelligence？ ｜ AI In 5 Mins ｜Simplilearn.wav


[debug] ffmpeg command line: ffmpeg -y -loglevel repeat+info -i 'file:/kaggle/working/files/audio/What Is AI？ ｜ Artificial Intelligence ｜ What is Artificial Intelligence？ ｜ AI In 5 Mins ｜Simplilearn.webm' -vn -b:a 192.0k -movflags +faststart 'file:/kaggle/working/files/audio/What Is AI？ ｜ Artificial Intelligence ｜ What is Artificial Intelligence？ ｜ AI In 5 Mins ｜Simplilearn.wav'


Deleting original file /kaggle/working/files/audio/What Is AI？ ｜ Artificial Intelligence ｜ What is Artificial Intelligence？ ｜ AI In 5 Mins ｜Simplilearn.webm (pass -k to keep)
Downloading successful!


In [71]:
## Processing time depends on the length of the YouTube video you provide

# Locate the downloaded .wav file, then transcribe it with Whisper
audio_filepath = get_audiofile_path()
transcribed_text = transcribe_with_whisper(audio_filepath)

  checkpoint = torch.load(fp, map_location=device)


In [72]:
# Preview the first tenth of the transcript (measured in characters)
transcribed_text[:len(transcribed_text)//10]

" Picture this, a machine that could organize your cupboard just as you like it, or serve every member of the house a customized cup of coffee. Makes your day easier, doesn't it? These are the products of artificial intelligence. But why use the term artificial intelligence? Well, these machines are artificially incorporated with human-like intelligence to perform tasks as we do. This intelligence is built using complex algorithm"

In [73]:
# Save the transcribed text to a file in the working directory.
# Note: despite its name, `output_dir` holds the path of the output text file.
output_dir = "/kaggle/working/files/output.txt"
text = transcribed_text
save_text(text, output_dir)

Text successfully saved to /kaggle/working/files/output.txt


In [74]:
# Summarize the transcript file; `file_path` is also reused by the question loop below
file_path = output_dir
summary = generate_summary(file_path)

Token indices sequence length is longer than the specified maximum sequence length for this model (1006 > 512). Running this sequence through the model will result in indexing errors


Summary generated succesfully!


In [75]:
# Show the combined summary text
print(summary)

artificial intelligence is used in smartphones, cars, social media feeds, video games, banking, surveillance, and many other aspects of our daily life . the robot is now at a crossroad, one that is paved, and the other, rocky . this portrays the robot's reasoning ability . AI is a subset of machine learning and deep learning . deep learning allows a machine to learn from data and experience through algorithms . this means, through deep learning, data and patterns can be better perceived . three lucky winners will receive Amazon gift vouchers .


In [85]:
# Ask a fixed number of questions about the transcript stored at `file_path`
num_of_questiones = 3
while num_of_questiones:
    question = input("What is your question: ")
    # Example: What is the topic?
    num_of_questiones -= 1  # one fewer question left; the loop stops at 0
    print(qa(file_path, question))

# Answers are taken only from the transcript of the provided video, so if you ask
# about something the video never mentions, you will likely get a wrong answer.

# For example, the run below asks two questions covered by the video and one that is not;
# notice that it cannot recognise the cons, as they were not mentioned in the video.

What is your question:  What is the topic of the video


Answer:artificial intelligence


What is your question:  What are the pros of AI


Answer:AI provides machines with the capability to adapt, reason, and provide solutions


What is your question:  What are the cons of AI


Answer:AI provides machines with the capability to adapt, reason, and provide solutions


# Further scope of improvements

- Build a customised question-answering model from multiple videos: given a topic name, automatically fetch the top 10 videos about that topic, train on them, and save the model to answer any queries related to the topic
- The model can be further improved with reinforcement learning