### 1. Web scraping phase

In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Configuration cell: publish the OpenAI key to the environment and create the shared client.
from google.colab import userdata
# Copy the Colab secret into os.environ so it can be read back with os.getenv below.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
# NOTE(review): override=True presumably lets a local .env file replace the key set on the
# previous line — confirm this precedence is intended.
load_dotenv(override=True)
# NOTE(review): api_key is not referenced by any of the cells visible in this notebook.
api_key = os.getenv('OPENAI_API_KEY')

# Chat model name passed to the completion calls in get_links, create_brochure and stream_brochure.
MODEL = 'gpt-4o-mini'
# NOTE(review): a later cell runs `import openai`, which rebinds this name from the client
# instance to the module — consider a distinct name for the client.
openai = OpenAI()

In [3]:
# Browser-like User-Agent sent with every scrape request made by Website.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A scraped web page: its title, visible body text and the hrefs of its links.

    Attributes:
        url: the address that was requested.
        body: raw response bytes.
        title: text of the <title> tag, or "No title found".
        text: body text with script/style/img/input elements removed ("" if no <body>).
        links: every non-empty href found on <a> tags, exactly as written in the page
            (so relative links stay relative).
    """

    def __init__(self, url, timeout=30):
        """
        Fetch and parse ``url``.

        Args:
            url: page address to download.
            timeout: seconds to wait for the server before requests raises; without
                a timeout a stalled server would block the notebook indefinitely.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or contains nested markup, so
        # fall back in that case as well as when the tag is missing altogether.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        self.title = title_text if title_text else "No title found"

        if soup.body:
            # Drop elements that contribute no readable text to the brochure.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as a prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/',
 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-

In [6]:
# display(Markdown(ed.get_contents()))

In [4]:
# System prompt for the link-selection call: describes the task and shows the
# JSON shape the model must answer with.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    "\n"
    "{\n"
    '    "links": [\n'
    '        {"type": "about page", "url": "https://full.url/goes/here/about"},\n'
    '        {"type": "careers page", "url": "https://another.full.url/careers"}\n'
    "    ]\n"
    "}\n"
)

In [11]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [5]:
def get_links_user_prompt(website):
    """
    Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: object exposing ``url`` (str) and ``links`` (list of href strings).

    Returns:
        The instruction text followed by one link per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    request = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    listing_header = "Links (some might be relative links):\n"
    listing = "\n".join(website.links)
    return "".join([intro, request, listing_header, listing])

In [None]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddo

In [6]:
def get_links(url):
    """
    Scrape ``url`` and ask the model which of its links belong in a brochure.

    Returns:
        The model's reply parsed from JSON (the system prompt requests an object
        with a "links" list of {"type", "url"} entries).
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        # Ask the API for a JSON object so the reply can be parsed directly.
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

In [14]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/microsoft/VibeVoice-1.5B',
 '/openbmb/MiniCPM-V-4_5',
 '/tencent/Hunyuan-MT-7B',
 '/meituan-longcat/LongCat-Flash-Chat',
 '/tencent/HunyuanWorld-Voyager',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/apple/fastvlm-webgpu',
 '/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster',
 '/spaces/bytedance-research/USO',
 '/spaces/multimodalart/Qwen-Image-Edit-Fast',
 '/spaces',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/openai/healthbench',
 '/datasets/syncora/developer-productivity-simulated-behavioral-data',
 '/datasets/data-agents/jupyter-agent-dataset',
 '/datasets/facebook/recycling_the_web',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/mic

In [15]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

In [7]:
def get_all_details(url):
    """
    Collect the landing page plus every model-selected page into one text blob.

    Args:
        url: landing page of the company site.

    Returns:
        "Landing page:" followed by the landing page contents, then one section
        per selected link, each headed by the link's type.

    A selected page that cannot be fetched is reported and skipped rather than
    aborting the whole brochure; a failure on the landing page still raises.
    """
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    # The model's reply is not guaranteed to contain a "links" key.
    for link in links.get("links", []):
        link_url = link.get("url")
        if not link_url:
            continue
        # The prompt warns that links may be relative; resolve them against the
        # landing page. Absolute URLs pass through urljoin unchanged.
        target = urljoin(url, link_url)
        try:
            page = Website(target)
        except requests.RequestException as error:
            print(f"Skipping {target}: {error}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

In [17]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'contact page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'docs page', 'url': 'https://huggingface.co/docs'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
microsoft/VibeVoice-1.5B
Updated
2 days ago
•
150k
•
1.31k
openbmb/MiniCPM-V-4_5
Updated
about 7 hours ago
•
16.2k
•
841
tencent/Hunyuan-MT-7B
Updated
about 1 hour ago
•
1.4k
•
407
meituan-longcat/LongCat-Flash-Chat
Updated
3 days ago


In [8]:
# System prompt for the brochure-writing call. Each fragment ends with a space so
# the sentences do not run together when the literals are concatenated.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)


# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."

In [9]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message for the brochure request.

    Args:
        company_name: name shown to the model as the company being described.
        url: landing page; it and the selected sub-pages are scraped via get_all_details.
        max_chars: maximum length of the returned prompt; longer text is cut off
            at this many characters. Defaults to the original 5,000.

    Returns:
        The instruction header followed by the scraped page contents, truncated
        to ``max_chars`` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    return user_prompt[:max_chars]

In [None]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'community discussion page', 'url': 'https://discuss.huggingface.co'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nblack-forest-labs/FLUX.1-Kontext-dev\nUpdated\nabout 13 hours ago\n•\n12.9k\n•\n807\ntencent/Hunyuan-A13B-Instruct\nUpdated\nabout 8 hours ago\n•\n338\ngoogle/magenta-realtime\nUpdated\n5 days ago\n•\n388\nnanonets/Nanonets-OCR-s\nUpdated\n8 days ago\n•\n202k\n•\n1.22k\ngoogle/gemma-3n-E4B-it\nUpdated\n1 day ago\n•\n5.55k\n•\n215\nBrowse 1M+ models\nSpaces\nRunning\n8.74k\n8.74k\nDeepSite 

In [10]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company in one request and render it as Markdown.

    Args:
        company_name: name of the company, passed into the user prompt.
        url: landing page to scrape.
    """
    prompt_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=prompt_messages)
    brochure_text = completion.choices[0].message.content
    display(Markdown(brochure_text))

In [31]:
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community discussion page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}]}


```markdown
# Hugging Face Brochure

## Company Overview
### The AI Community Building the Future
Hugging Face is at the forefront of the machine learning community, providing a collaborative platform where individuals and organizations come together to develop and share models, datasets, and applications. With over a million models and thousands of datasets, Hugging Face is a hub for innovation and creativity in artificial intelligence.

## Key Offerings
- **Models**: Access and collaborate on a vast library of over 1 million models, including state-of-the-art solutions for text, image, and audio processing.
- **Datasets**: Utilize and contribute to a repository of over 250,000 datasets tailored for various ML tasks.
- **Spaces**: Create and host applications on the platform, ensuring real-time collaboration and feedback from the community.
- **Enterprise Solutions**: Tailored services for organizational needs, including security, support, and dedicated resources.

## Core Values
- **Collaboration**: Hugging Face believes in the power of community, where users can come together to enhance their skills and knowledge in machine learning.
- **Accessibility**: The platform champions open-source initiatives, ensuring a wide range of tools and resources are available to everyone.
- **Innovation**: Continuous development of cutting-edge tools and environments to support evolving technologies in AI.

## Customer Base
More than 50,000 organizations trust Hugging Face, including leading names like:
- **Google**
- **Amazon**
- **Meta**
- **Microsoft**
- **Intel**

These partnerships demonstrate our commitment to providing top-notch support and resources to professionals in the field.

## Company Culture
At Hugging Face, we foster an inclusive and passionate workplace where every team member is encouraged to share their ideas and contribute to projects. Our culture emphasizes:
- **Learning and Growth**: Regular workshops and training sessions for professional development.
- **Community First**: A belief that collaboration drives innovation, making team and community contributions invaluable.
- **Support and Recognition**: A supportive environment where achievements are celebrated and contributions are recognized, enabling individuals to thrive.

## Careers at Hugging Face
We are always seeking talented individuals who share our vision of building the future of AI. Open positions span across various domains including:
- Engineering
- Product Management
- Community Engagement
We value diverse experiences, perspectives, and skills, and we invite you to explore opportunities to join our team.

## Join Us!
Whether you're a prospective customer, investor, or job seeker, Hugging Face welcomes you to explore our platform and engage with a community that is shaping the future of AI.

**Contact Us:**
[Hugging Face Website](https://huggingface.co)  
Connect with us via our social channels:
- [Twitter](https://twitter.com/huggingface)
- [LinkedIn](https://www.linkedin.com/company/huggingface)
- [Discord](https://discord.com/invite/huggingface)
```


## Finally - a minor improvement

With a small adjustment, we can change this so that the results stream back from OpenAI,
with the familiar typewriter animation

In [11]:
def stream_brochure(company_name, url):
    """
    Stream a brochure for the company, yielding the text accumulated so far.

    This is a generator: calling it only creates the generator object, and the
    request starts when it is iterated (Gradio does this when it is used as an
    interface function).

    Args:
        company_name: name of the company, passed into the user prompt.
        url: landing page to scrape.

    Yields:
        The brochure markdown received so far, with code-fence markers removed.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    raw = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Some stream chunks carry no choices; skip them instead of indexing [0].
        if not chunk.choices:
            continue
        raw += chunk.choices[0].delta.content or ''
        # Strip only the fence markers the model wraps its answer in. Cleaning is
        # derived from the raw text each time, so ordinary uses of the word
        # "markdown" in the brochure are left intact.
        cleaned = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)
        # Yield the cleaned text so consumers never see the fence.
        yield cleaned

In [33]:
stream_brochure("HuggingFace", "https://huggingface.co")

<generator object stream_brochure at 0x7db7896ff880>

In [26]:
import gradio as gr

In [40]:
# Stand-alone Gradio UI for the brochure generator: two text inputs (company name
# and URL) feed stream_brochure, whose yielded text is shown in a Markdown output.
view = gr.Interface(
    fn=stream_brochure,
    inputs=[gr.Textbox(label="Company Name"), gr.Textbox(label="Company URL")],
    outputs=[gr.Markdown(label="Response:")],
    flagging_mode="never"
)
view.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f03d23ec0200c6331e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




### 2. Video summary phase

In [None]:
# !pip install yt_dlp
# !pip install -U langchain-community
# !pip install faiss-cpu
# !pip install langchain_openai

Collecting yt_dlp
  Downloading yt_dlp-2025.8.27-py3-none-any.whl.metadata (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.0/176.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.8.27-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt_dlp
Successfully installed yt_dlp-2025.8.27
Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=804c3d829ee4629f54ef089ed841f59997328b202c29d163787d001d5176063d
  Stored in directory: /root/

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
^C


In [12]:
import os
import tempfile
import gradio as gr
import yt_dlp
import openai
import whisper
import numpy as np
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re
import glob
from typing import List, Tuple
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [13]:
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [26]:
class YouTubeProcessor:
    """
    Pipeline that turns a YouTube URL into a transcript, a summary and a chat session.

    The steps are: download the audio with yt_dlp, transcribe it through the
    OpenAI audio API, index the transcript in a FAISS vector store, and wire a
    conversational retrieval chain over that index.

    Attributes:
        embeddings: OpenAIEmbeddings instance, created lazily by load_models.
        vector_store: FAISS index of the most recently processed transcript.
        conversation_chain: retrieval chain used by chat_with_video.
        memory: chat history buffer backing the conversation chain.
        MODEL: chat model name used for the conversation chain.
        AUDIO_MODEL: model name used for transcription.
    """

    def __init__(self):
        self.embeddings = None
        self.vector_store = None
        self.conversation_chain = None
        self.memory = None
        self.MODEL = "gpt-4o-mini"
        self.AUDIO_MODEL = "whisper-1"

    def load_models(self):
        """Create the embeddings client on first use."""
        if self.embeddings is None:
            self.embeddings = OpenAIEmbeddings()

    def extract_video_id(self, url) -> str:
        """
        Return the video id from a watch, youtu.be or embed URL.

        Raises:
            ValueError: if the URL matches none of the supported forms.
        """
        patterns = [
            r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&\n?#]+)',
            r'(?:https?:\/\/)?(?:www\.)?youtu\.be\/([^&\n?#]+)',
            r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/embed\/([^&\n?#]+)',
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        raise ValueError("Invalid YouTube URL")

    def download_audio(self, youtube_url, progress_callback=None):
        """
        Download the video's audio as ``temp_audio_<video_id>.mp3`` in the working directory.

        Args:
            youtube_url: URL of the video.
            progress_callback: optional ``callback(fraction, message)``.

        Returns:
            Path of the downloaded audio file.

        Raises:
            ValueError: if the URL is not a recognised YouTube URL.
            FileNotFoundError: if no output file exists after the download.
            Exception: any download error is re-raised after partial files are removed.
        """
        if progress_callback:
            progress_callback(0.1, "Downloading audio from YouTube...")

        video_id = self.extract_video_id(youtube_url)
        permanent_path = f"temp_audio_{video_id}.mp3"
        output_template = f"temp_audio_{video_id}"
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': output_template + '.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])
            if os.path.exists(permanent_path):
                return permanent_path

            # The expected .mp3 is missing; fall back to whatever file was
            # written for this video and give it the expected name.
            matching_files = glob.glob(f"temp_audio_{video_id}.*")
            if not matching_files:
                raise FileNotFoundError(f"Downloaded audio file not found for video {video_id}")
            os.rename(matching_files[0], permanent_path)
            return permanent_path

        except Exception:
            # Best-effort removal of partial downloads before re-raising the
            # original error with its traceback intact.
            for leftover in glob.glob(f"temp_audio_{video_id}*"):
                try:
                    os.remove(leftover)
                except OSError:
                    pass
            raise

    def transcribe_audio(self, audio_path, progress_callback=None):
        """
        Transcribe the audio file and return the transcript text.

        The file is opened in a ``with`` block so the handle is closed even when
        the API call fails.
        """
        if progress_callback:
            progress_callback(0.4, "Transcribing audio with Whisper...")

        with open(audio_path, "rb") as audio_file:
            transcription = openai.audio.transcriptions.create(model=self.AUDIO_MODEL, file=audio_file, response_format="text")
        return transcription

    def create_vector_store(self, text, progress_callback=None):
        """
        Split ``text`` into overlapping chunks, index them in FAISS and rebuild the chat chain.

        Returns:
            The new vector store (also kept on ``self.vector_store``).
        """
        if progress_callback:
            progress_callback(0.7, "Creating vector embeddings...")

        self.load_models()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            length_function=len,
        )

        chunks = text_splitter.split_text(text)
        documents = [Document(page_content=chunk) for chunk in chunks]
        self.vector_store = FAISS.from_documents(documents, self.embeddings)
        self.setup_conversation_chain()

        return self.vector_store

    def setup_conversation_chain(self):
        """Create a fresh memory buffer and retrieval chain over the current vector store, if any."""
        if self.vector_store is not None:
            llm = ChatOpenAI(temperature=0.7, model_name=self.MODEL)
            self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
            retriever = self.vector_store.as_retriever()
            self.conversation_chain = ConversationalRetrievalChain.from_llm(
                llm=llm,
                retriever=retriever,
                memory=self.memory
            )

    def generate_summary(self, text: str, progress_callback=None) -> str:
        """
        Summarise the transcript in the transcript's own language.

        Only the first 4000 characters of ``text`` are sent to the model.
        """
        if progress_callback:
            progress_callback(0.9, "Generating summary with GPT...")
        client = openai.OpenAI()
        # NOTE(review): this model name is hard-coded and differs from self.MODEL — confirm intended.
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                    {
                        "role": "system",
                        "content": """ You are a helpful assistant that creates comprehensive summaries of video transcripts.

                        IMPORTANT INSTRUCTIONS:
                        1. Analyze the language of the provided transcript
                        2. Write the summary in the SAME LANGUAGE as the transcript
                        3. If the transcript is in Arabic, write the summary in Arabic
                        4. If the transcript is in English, write the summary in English
                        5. If the transcript is in Spanish, write the summary in Spanish
                        6. And so on for any other language

                        Create a well-structured summary that includes:
                        - Main topic and purpose of the video
                        - Key points discussed
                        - Important details and examples
                        - Conclusions or takeaways

                        Keep the summary comprehensive but concise, and maintain the same tone and formality level as the original content."""
                    },
                    {
                        "role": "user",
                        "content": f"Please provide a comprehensive summary using the same language of the following video transcript:\n\n{text[:4000]}..."
                    }
            ],
            max_tokens=300,
            temperature=0.7
            )
        return response.choices[0].message.content

    def chat_with_video(self, question: str):
        """Answer ``question`` from the processed transcript, or explain that no video is loaded."""
        if self.conversation_chain is None:
            return "No video processed yet. Please process a video first."

        response = self.conversation_chain({"question": question})
        return response['answer']

    def reset_conversation(self):
        """Clear the chat history if a conversation exists and return a status message."""
        if self.memory is not None:
            self.memory.clear()
            return "Conversation history cleared!"
        return "No conversation to reset."

    def process_video(self, youtube_url: str, progress_callback=None) -> Tuple[str, str, str]:
        """
        Run the full pipeline for one video.

        Args:
            youtube_url: URL of the video to process.
            progress_callback: optional ``callback(fraction, message)``.

        Returns:
            ``(transcript, summary, status_message)`` — returned whether or not a
            progress callback was supplied.

        Raises:
            ValueError: if the URL is not a recognised YouTube URL.
        """
        # Validate the URL up front so nothing is downloaded for a bad link.
        self.extract_video_id(youtube_url)
        audio_path = self.download_audio(youtube_url, progress_callback)
        try:
            transcript = self.transcribe_audio(audio_path, progress_callback)
            self.create_vector_store(transcript, progress_callback)
            summary = self.generate_summary(transcript, progress_callback)
        finally:
            # Remove the temporary audio even when a later step fails.
            if os.path.exists(audio_path):
                os.remove(audio_path)

        if progress_callback:
            progress_callback(1.0, "Processing complete!")
        return transcript, summary, "Processing completed successfully!"

In [27]:
processor = YouTubeProcessor()

In [28]:
def process_youtube_video(youtube_url, progress=gr.Progress()):
    """
    Gradio handler: run the video pipeline and report progress in the UI.

    Args:
        youtube_url: text from the URL box.
        progress: Gradio progress tracker, supplied through the default argument.

    Returns:
        ``(transcript, summary, status)``; empty strings plus a prompt when the
        URL box is blank.
    """
    if not youtube_url.strip():
        return "", "", "Please enter a YouTube URL"

    def report(fraction, description):
        # Adapt the processor's (value, message) callback to the Gradio tracker.
        progress(fraction, desc=description)

    transcript_text, summary_text, status_text = processor.process_video(youtube_url, report)
    return transcript_text, summary_text, status_text

def chat_with_video(message, history):
    """
    Gradio handler for the chat tab.

    Appends ``[message, answer]`` to ``history`` in place when the message is
    non-blank, and always returns ``(history, "")`` so the input box is cleared.
    """
    if message.strip():
        answer = processor.chat_with_video(message)
        history.append([message, answer])
    return history, ""

def reset_chat():
    """Clear the processor's conversation and return an empty chat plus the status text."""
    status_text = processor.reset_conversation()
    cleared_history = []
    return cleared_history, status_text

In [34]:
# Combined Gradio app with three tabs: video processing, chat over the processed
# video, and the company brochure generator. Event wiring is grouped after the
# layout, at the Blocks level, so no handler is registered inside an unrelated tab.
with gr.Blocks(title="AI Researcher & Educational") as interface:
    gr.Markdown("# 🤖 Multimodal AI Researcher & Educational")
    with gr.Tab("Process Video"):
        with gr.Row():
            with gr.Column(scale=2):
                url_input = gr.Textbox(
                    label="YouTube URL",
                    placeholder="https://www.youtube.com/watch?v=...",
                    lines=1
                )
                process_btn = gr.Button("Process Video", variant="primary")

            with gr.Column(scale=1):
                status_output = gr.Textbox(
                    label="Status",
                    lines=3,
                    interactive=False
                )
        with gr.Row():
            with gr.Column():
                transcript_output = gr.Textbox(
                    label="Full Transcript",
                    lines=15,
                    max_lines=20,
                    interactive=False
                )

            with gr.Column():
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False
                )
    with gr.Tab("Chat with Video"):
        gr.Markdown("### Have a conversation with the video content")

        with gr.Row():
            with gr.Column(scale=4):
                chatbot = gr.Chatbot(
                    label="Video Chat",
                    height=400,
                    show_label=True
                )

                with gr.Row():
                    chat_input = gr.Textbox(
                        label="Ask a question about the video",
                        placeholder="What is this video about?",
                        lines=1,
                        scale=4
                    )
                    chat_btn = gr.Button("Send", variant="primary", scale=1)

            with gr.Column(scale=1):
                reset_btn = gr.Button("Reset Conversation", variant="secondary")
                reset_status = gr.Textbox(
                    label="Reset Status",
                    lines=2,
                    interactive=False
                )
    with gr.Tab("Company Brochure Webscraping"):
        gr.Markdown("### Generate professional brochure content for any company")
        brochure_interface = gr.Interface(
            fn=stream_brochure,
            inputs=[
                gr.Textbox(label="Company Name", placeholder="Enter company name..."),
                gr.Textbox(label="Company URL", placeholder="Enter company website ...")
            ],
            outputs=[gr.Markdown(label="Response:")],
            flagging_mode="never",
        )

    # --- Event wiring ---
    process_btn.click(
        fn=process_youtube_video,
        inputs=[url_input],
        outputs=[transcript_output, summary_output, status_output]
    )

    # The Send button and pressing Enter in the box trigger the same handler.
    chat_btn.click(
        fn=chat_with_video,
        inputs=[chat_input, chatbot],
        outputs=[chatbot, chat_input]
    )

    chat_input.submit(
        fn=chat_with_video,
        inputs=[chat_input, chatbot],
        outputs=[chatbot, chat_input]
    )

    reset_btn.click(
        fn=reset_chat,
        outputs=[chatbot, reset_status]
    )

  chatbot = gr.Chatbot(


In [35]:
# Start the combined app. share=True requests a public URL (the cell output shows
# a gradio.live link); show_error=True is passed so handler errors are displayed.
interface.launch(
        share=True,
        show_error=True
    )

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://461b37d0f436bee596.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


