In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import json
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display


In [2]:
from bs4 import BeautifulSoup
import requests


# Standard headers to fetch a website.
# Sent with every requests.get call in the fetch_* helpers below; the
# User-Agent identifies the request as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def fetch_website_contents(url, timeout=30):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit

    The page is requested with the module-level `headers`. Any <script>,
    <style>, <img> and <input> elements are removed from the body before
    its text is extracted, one text fragment per line.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A string made of the page title, a blank line and the body text,
        cut to at most 2,000 characters. The title is "No title found" when
        the page has no usable <title>; the text is empty when the page has
        no <body>.

    Raises:
        Whatever `requests.get` raises, e.g. on a connection failure or
        when the timeout expires.
    """
    # Without a timeout a stalled server would hang the notebook cell forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # `.string` is None for an empty <title> or one containing nested tags;
    # fall back to the placeholder so the concatenation below cannot fail.
    title = (soup.title.string if soup.title else None) or "No title found"
    if soup.body:
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]


def fetch_website_links(url, timeout=30):
    """
    Return the links on the website at the given url
    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it!

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the `href` value of every <a> element that has a
        non-empty one, in document order. Values are returned as written in
        the page, so they may be relative and may contain duplicates.

    Raises:
        Whatever `requests.get` raises, e.g. on a connection failure or
        when the timeout expires.
    """
    # Without a timeout a stalled server would hang the notebook cell forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [link.get("href") for link in soup.find_all("a")]
    # Drop anchors that have no href attribute or an empty one.
    return [link for link in links if link]


In [3]:
# Initialize and constants
from openai import OpenAI
    
# Base URL of the OpenAI-compatible endpoint exposed by the local Ollama server.
OLLAMA_BASE_URL = "http://localhost:11434/v1"

# The OpenAI client is pointed at the local endpoint instead of api.openai.com.
# NOTE(review): api_key is a placeholder; presumably the local server ignores
# it and it is only there because the client wants a value - confirm.
ollama = OpenAI(base_url=OLLAMA_BASE_URL, api_key="anything")

In [4]:
links = fetch_website_links("https://google.com")
links

['https://about.google/?fg=1&utm_source=google-IN&utm_medium=referral&utm_campaign=hp-header',
 'https://store.google.com/IN?utm_source=hp_header&utm_medium=google_ooo&utm_campaign=GS100042&hl=en-IN',
 'https://mail.google.com/mail/&ogbl',
 'https://www.google.com/imghp?hl=en&ogbl',
 'https://www.google.co.in/intl/en/about/products',
 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=futura_exp_og_so_72776762_e',
 'https://www.google.com/setprefs?sig=0_FV4UyrATtooKJTSNhDnVYylFEjo%3D&hl=hi&source=homepage&sa=X&ved=0ahUKEwjAgd37vcSRAxVCJrkGHcAfK1MQ2ZgBCBc',
 'https://www.google.com/setprefs?sig=0_FV4UyrATtooKJTSNhDnVYylFEjo%3D&hl=bn&source=homepage&sa=X&ved=0ahUKEwjAgd37vcSRAxVCJrkGHcAfK1MQ2ZgBCBg',
 'https://www.google.com/setprefs?sig=0_FV4UyrATtooKJTSNhDnVYylFEjo%3D&hl=te&source=homepage&sa=X&ved=0ahUKEwjAgd37vcSRAxVCJrkGHcAfK1MQ2ZgBCBk',
 'https://www.google.com/setprefs?sig=0_FV4UyrATtooKJTSNhDnVYylFEjo%3D&hl=mr&source=homepage&sa=X&ved

In [5]:
# System prompt for the link-selection call (see select_relevant_links).
# It shows the model the exact JSON shape expected back: an object with a
# "links" list whose items carry a "type" and a "url".
link_system_prompt = """
You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
def get_links_user_prompt(url):
    """
    Build the user prompt that asks the model to pick brochure-worthy links.

    The prompt is a fixed set of instructions mentioning `url`, followed by
    every link found on that page (via fetch_website_links), one per line.
    """
    instructions = f"""
Here is the list of links on the website {url} -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

"""
    link_lines = "\n".join(fetch_website_links(url))
    return instructions + link_lines

In [7]:
print(get_links_user_prompt("https://edwarddonner.com"))


Here is the list of links on the website https://edwarddonner.com -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/11/11/ai-live-event/
https://edwarddonner.com/2025/11/11/ai-live-event/
https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/
htt

In [8]:
def select_relevant_links(url):
    """
    Ask the llama3.2 model which links on `url` belong in a company brochure.

    Sends the link-selection system prompt plus the user prompt built by
    get_links_user_prompt, requests a JSON object back and returns it parsed.
    """
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(url)},
    ]
    completion = ollama.chat.completions.create(
        model="llama3.2",
        messages=conversation,
        response_format={"type": "json_object"},
    )
    # The reply text is expected to be a JSON document; hand back the parsed form.
    return json.loads(completion.choices[0].message.content)
    

In [9]:
select_relevant_links("https://edwarddonner.com")

{'links': [{'type': 'about page',
   'url': 'https://edwarddonner.com/about-me-and-about-nebula/'},
  {'type': 'connection to Nebula',
   'url': 'https://nebula.io/?utm_source=ed&utm_medium=referral'},
  {'type': 'Company page', 'url': 'https://edwarddonner.com/'}]}

In [10]:
# Name of the model used by every chat-completion call from here on.
MODEL = "llama3.2"

In [11]:
def select_relevant_links(url):
    """
    Ask MODEL which links on `url` are relevant for a company brochure.

    Sends the link-selection system prompt plus the user prompt built by
    get_links_user_prompt, requesting a JSON object in return. Progress is
    printed before the call and the number of selected links after it.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The parsed JSON reply as a dict. It always has a "links" key holding
        a list (empty when the model returned nothing usable), so callers
        can iterate over result["links"] safely.

    Raises:
        json.JSONDecodeError: if the reply text is not valid JSON.
    """
    print(f"Selecting relevant links for {url} by calling {MODEL}")
    response = ollama.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    # The message content can be None; treat that as an empty JSON object
    # instead of letting json.loads fail with a TypeError.
    links = json.loads(result or "{}")
    # The model does not always follow the requested schema: normalise so
    # "links" is always a list and the lookups below cannot raise KeyError.
    if not isinstance(links, dict):
        links = {"links": []}
    elif not isinstance(links.get("links"), list):
        links["links"] = []
    print(f"Found {len(links['links'])} relevant links")
    return links

In [12]:
select_relevant_links("https://edwarddonner.com")

Selecting relevant links for https://edwarddonner.com by calling llama3.2
Found 3 relevant links


{'links': [{'type': 'about page',
   'url': 'https://edwarddonner.com/about-me-and-about-nebula/'},
  {'type': 'connect-four game',
   'url': 'https://edwarddonner.com/connect-four/'},
  {'type': 'outsmart game', 'url': 'https://edwarddonner.com/outsmart/'}]}

In [13]:
select_relevant_links("https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling llama3.2
Found 6 relevant links


{'links': [{'type': 'home page', 'url': 'https://huggingface.co/'},
  {'type': 'Company page ', 'url': 'https://discuss.huggingface.co/'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'Enterprise website', 'url': 'https://endpoints.huggingface.co'}]}

## Second step: make the brochure!

Assemble all the details into another prompt for the model (llama3.2, served locally by Ollama).

In [14]:
def fetch_page_and_all_relevant_links(url):
    """
    Collect the landing page text plus the text of every relevant linked page.

    The landing page is fetched with fetch_website_contents, the relevant
    links are chosen by select_relevant_links, and each chosen page is
    fetched in turn and appended under its own "### Link:" heading.

    Args:
        url: Address of the company's landing page.

    Returns:
        One markdown-formatted string: a "## Landing Page:" section followed
        by a "## Relevant Links:" section with one sub-section per page that
        could be fetched.

    Raises:
        Whatever fetch_website_contents or select_relevant_links raise for
        the landing page itself; failures on individual linked pages are
        reported with print and skipped.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    contents = fetch_website_contents(url)
    relevant_links = select_relevant_links(url)
    sections = [f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"]
    for link in relevant_links.get("links", []):
        # The model's JSON is not guaranteed to follow the schema; ignore
        # entries that are not objects or that carry no URL.
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # The model may hand back a relative link; resolve it against the
        # landing page. Absolute URLs pass through urljoin unchanged.
        link_url = urljoin(url, link["url"])
        try:
            page = fetch_website_contents(link_url)
        except requests.RequestException as error:
            # One dead or invented link should not abort the whole brochure.
            print(f"Skipping {link_url}: {error}")
            continue
        sections.append(f"\n\n### Link: {link.get('type', 'link')}\n")
        sections.append(page)
    return "".join(sections)

In [15]:
from bs4 import BeautifulSoup
import requests


# Standard headers to fetch a website.
# NOTE(review): this re-declares the `headers` dict already defined in an
# earlier cell with the same User-Agent value; one of the two is redundant.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

In [16]:
import requests
def fetch_website_contents(url, timeout=30):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit

    The page is requested with the module-level `headers`. Any <script>,
    <style>, <img> and <input> elements are removed from the body before
    its text is extracted, one text fragment per line.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A string made of the page title, a blank line and the body text,
        cut to at most 2,000 characters. The title is "No title found" when
        the page has no usable <title>; the text is empty when the page has
        no <body>.

    Raises:
        Whatever `requests.get` raises, e.g. on a connection failure or
        when the timeout expires.
    """
    # Without a timeout a stalled server would hang the notebook cell forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # `.string` is None for an empty <title> or one containing nested tags;
    # fall back to the placeholder so the concatenation below cannot fail.
    title = (soup.title.string if soup.title else None) or "No title found"
    if soup.body:
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]

In [17]:
print(fetch_page_and_all_relevant_links("https://huggingface.co"))

Selecting relevant links for https://huggingface.co by calling llama3.2
Found 3 relevant links
## Landing Page:

Hugging Face ‚Äì The AI community building the future.

Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
Tongyi-MAI/Z-Image-Turbo
Updated
9 days ago
‚Ä¢
297k
‚Ä¢
2.84k
microsoft/VibeVoice-Realtime-0.5B
Updated
5 days ago
‚Ä¢
159k
‚Ä¢
907
nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
Updated
1 day ago
‚Ä¢
10.5k
‚Ä¢
280
zai-org/AutoGLM-Phone-9B
Updated
8 days ago
‚Ä¢
51.6k
‚Ä¢
337
zai-org/GLM-TTS
Updated
about 5 hours ago
‚Ä¢
224
Browse 1M+ models
Spaces
Running
on
Zero
MCP
Featured
218
Chatterbox Turbo Demo
‚ö°
218
Chatterbox Turbo Demo
Running
on
Zero
MCP
Featured
193
Qwen Image to LoRA
üò≠
193
Generate LoRA from a single image
Running
on


In [18]:
# System prompt for the brochure-writing calls (create_brochure and
# stream_brochure): sets the audience and asks for markdown output.
brochure_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""



In [19]:
def get_brochure_user_prompt(company_name, url):
    """
    Build the user prompt for the brochure-writing call.

    The prompt names the company, then appends the landing page and relevant
    page contents gathered by fetch_page_and_all_relevant_links. The result
    is capped at 5,000 characters.
    """
    introduction = f"""
You are looking at a company called: {company_name}
Here are the contents of its landing page and other relevant pages;
use this information to build a short brochure of the company in markdown without code blocks.\n\n
"""
    full_prompt = introduction + fetch_page_and_all_relevant_links(url)
    # Truncate if more than 5,000 characters
    return full_prompt[:5_000]

In [24]:
get_brochure_user_prompt("Google", "https://google.com")

Selecting relevant links for https://google.com by calling llama3.2
Found 4 relevant links


"\nYou are looking at a company called: Google\nHere are the contents of its landing page and other relevant pages;\nuse this information to build a short brochure of the company in markdown without code blocks.\n\n\n## Landing Page:\n\nGoogle\n\nAbout\nStore\nGmail\nImages\nSign in\nUpload image\nUpload file\nAI Mode\nSee more\nDelete\nDelete\nReport inappropriate predictions\nCannot upload. Use a file in one of these formats: .avif, .bmp, .jpeg, .pdf, .png, .webp‚Äù\nGoogle offered in:\n‡§π‡§ø‡§®‡•ç‡§¶‡•Ä\n‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ\n‡∞§‡±Ü‡∞≤‡±Å‡∞ó‡±Å\n‡§Æ‡§∞‡§æ‡§†‡•Ä\n‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç\n‡™ó‡´Å‡™ú‡™∞‡™æ‡™§‡´Ä\n‡≤ï‡≤®‡≥ç‡≤®‡≤°\n‡¥Æ‡¥≤‡¥Ø‡¥æ‡¥≥‡¥Ç\n‡®™‡©∞‡®ú‡®æ‡®¨‡©Ä\nIndia\nAdvertising\nBusiness\nHow Search works\nPrivacy\nTerms\nSettings\nSearch settings\nAdvanced search\nYour data in Search\nSearch history\nSearch help\nSend feedback\nDark theme: Off\nGoogle apps\n## Relevant Links:\n\n\n### Link: about page\nAbout Google: Our products, technology and company information - About Google\n\nJump 

In [25]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company and render it as markdown.

    Builds the user prompt from the company's website, sends it to MODEL
    together with the brochure system prompt, and displays the reply in the
    notebook. Nothing is returned.
    """
    conversation = [
        {"role": "system", "content": brochure_system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = ollama.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [26]:
create_brochure("HuggingFace", "https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling llama3.2
Found 5 relevant links


**Welcome to Hugging Face: Building the Future of Machine Learning**

At Hugging Face, we are committed to empowering the machine learning community by providing a collaboration platform for sharing, exploring, and experimenting with open-source ML. Our mission is to build an open and ethical AI future together.

**Our Approach**

We believe that artificial intelligence has the potential to drive significant impact across various industries, but it requires a collaborative effort. Our platform provides a centralized space where researchers, engineers, and end-users can share their work, discover new models and techniques, and experiment with different ML approaches.

**What We Offer**

* **Unlimited Public Models, Datasets, and Applications**: Host and collaborate on unlimited public models, datasets, and applications, move faster with our open-source stack, and explore all modalities (text, image, video, audio, or 3D).
* **Explore AI Apps**: Browse over 1 million models and trending AI apps.
* **Community-driven**: Our community is growing rapidly, with thousands of researchers, engineers, and end-users contributing to the platform.

**Our Values**

We are dedicated to building an open and ethical AI future. We believe that:

* **Collaboration is key**: By working together, we can drive innovation and impact.
* **Open-source is essential**: Open-source ML libraries and frameworks enable transparency, reproducibility, and widespread adoption.
* ** Ethical AI is mandatory**: We prioritize fairness, accountability, and transparency in our development processes.

**Our Team**

Hugging Face is built by a diverse team of machine learning engineers, researchers, and enthusiasts. Our team is passionate about empowering the next generation of machine learning professionals to build an open and ethical AI future together.

**Join Us**

Want to be part of the Hugging Face community? Check out our **Careers & Jobs** page to explore opportunities that align with your skills and interests.

## Finally - a minor improvement

With a small adjustment, we can change this so that the results stream back from the model
(through Ollama's OpenAI-compatible API), with the familiar typewriter animation.

In [39]:
def stream_brochure(company_name, url):
    """
    Generate a brochure for the company and render it incrementally.

    Same request as create_brochure but with streaming enabled: the markdown
    display is refreshed after every chunk with the text received so far.
    Nothing is returned.
    """
    conversation = [
        {"role": "system", "content": brochure_system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    chunks = ollama.chat.completions.create(
        model=MODEL,
        messages=conversation,
        stream=True,
    )
    text_so_far = ""
    # Start with an empty display and keep its handle so it can be updated in place.
    handle = display(Markdown(""), display_id=True)
    for chunk in chunks:
        piece = chunk.choices[0].delta.content
        # A chunk may carry no text (content is None); only append real text.
        if piece:
            text_so_far += piece
        update_display(Markdown(text_so_far), display_id=handle.display_id)

In [40]:
stream_brochure("HuggingFace", "https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling llama3.2
Found 6 relevant links


# Hugging Face: Empowering the AI Community

[Image of a group of people collaborating]

Hugging Face is the leading platform for building and collaborations in machine learning. Our mission is to accelerate innovation in AI while providing enterprise-grade security, access controls, and dedicated support.

## About Us

Hugging Face is home to over 1 million publicly available models, datasets, and applications. Our platform offers a wide range of capabilities, including text, image, video, audio, and even 3D modalities. With our open-source stack, users can move faster and build their portfolios with ease.

## Community

We have a thriving community of ML practitioners, researcher, and developers who collaborate on public models, datasets, and applications. Our community-driven model hub features latest models updated every day, and we actively foster collaboration among users through regular blog posts, webinars and events.

Our space is designed to make it easy for you to connect with others who share your interests and goals - browse the spaces where you can start working on a project and get feedback from other community members.

## Enterprise Solutions

We offer flexible, contract-based solutions for organizations looking to scale their AI initiatives. With Hugging Face, your organization will gain access to cutting-edge security features, single sign-on integration, region management and dedicated support - all while staying top of line with industry standards.

## Careers & Jobs

Careers at Hugging Face: We are committed to fostering a diverse, inclusive, and culturally aware community. Our ideal candidate will be able to work collaboratively with our team, share ideas and collaborate on projects that help advance the field of AI. Check out our current job openings:

*   AI Engineer
*   Researcher
*   Data Scientist

[Link to apply]

## Pricing & Packages

We offer two major packages for Hugging Face: a starter plan at $20/user/month, making it easy for you and your team to begin working with our platform. But we also have an Enterprise package for companies seeking a customized solution that requires more security features.

Visit the following link for all package details:

[Link to pricing & packages]