In [1]:
# imports
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

ModuleNotFoundError: No module named 'requests'

In [None]:
MODEL = 'llama3.2'
OLLAMA_API = "http://localhost:11434/api/chat"

In [4]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
HEADERS = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=HEADERS)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [6]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [7]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [8]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [9]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2024/12/21/

In [None]:
from openai import OpenAI
import json

ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

def get_links(url):
    website = Website(url)
    response = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
             {"role": "system", "content": link_system_prompt},
             {"role": "user", "content": get_links_user_prompt(website)}
        ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [27]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/agentica-org/DeepCoder-14B-Preview',
 '/HiDream-ai/HiDream-I1-Full',
 '/moonshotai/Kimi-VL-A3B-Thinking',
 '/deepseek-ai/DeepSeek-V3-0324',
 '/moonshotai/Kimi-VL-A3B-Instruct',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/jamesliu1217/EasyControl_Ghibli',
 '/spaces/bytedance-research/UNO-FLUX',
 '/spaces/Efficient-Large-Model/SanaSprint',
 '/spaces/HiDream-ai/HiDream-I1-Dev',
 '/spaces',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/openai/mrcr',
 '/datasets/agentica-org/DeepCoder-Preview-Dataset',
 '/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset',
 '/datasets/divaroffical/real_estate_ads',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google'

In [None]:
get_links("https://huggingface.co")

NameError: name 'get_links' is not defined

In [31]:
def get_all_details(url):
    result = "Landing page:\n"
    website = Website(url)
    result += website.get_contents()
    
    links = get_links(url)
    print("Found links:", links)
    
    # Check if 'links' key exists and is a list
    if isinstance(links, dict) and "links" in links and isinstance(links["links"], list):
        for link in links["links"]:
            # Add error handling for missing keys or failed requests
            try:
                if "url" in link and "type" in link:
                    result += f"\n\n{link['type']}:\n"
                    link_website = Website(link["url"])
                    result += link_website.get_contents()
                else:
                    print(f"Warning: Link missing required fields: {link}")
            except Exception as e:
                print(f"Error fetching content for {link.get('url', 'unknown URL')}: {str(e)}")
    else:
        print("Warning: No valid links found in response")
        
    return result

In [32]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'home page', 'url': ''}, {'type': 'dataset hub', 'url': 'https://huggingface.co/datasets'}, {'type': 'hub', 'url': 'https://huggingface.co/hub'}, {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn company page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'Blog', 'url': 'https://blog.huggingface.co'}, {'type': 'Terms', ' url': ' https://terms_of_service.huggingface.co'}]}
Error fetching content for : Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error fetching content for https://blog.huggingface.co: HTTPSConnectionPool(host='blog.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000164626C9A50>: Failed to resolve 'blog.huggingface.co' ([Errno 11001] getaddrinfo failed)"))
Landing page:
Webpage Title:
Hugging Face – The A

In [33]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [34]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [35]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'About page', 'url': 'https://about.huggingface.co'}, {'type': 'Company page', 'url': 'https://huggingface.com/'}, {'type': 'Jobs/Careers', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'Blog', 'url': 'https://blog.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter handle', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
Error fetching content for https://about.huggingface.co: HTTPSConnectionPool(host='about.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000164623C2CD0>: Failed to resolve 'about.huggingface.co' ([Errno 11001] getaddrinfo failed)"))
Error fetching content for https://blog.huggingface.co: HTTPSConnectionPool(host='blog.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameR

'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nagentica-org/DeepCoder-14B-Preview\nUpdated\n6 days ago\n•\n12.7k\n•\n538\nHiDream-ai/HiDream-I1-Full\nUpdated\n3 days ago\n•\n16k\n•\n464\nmoonshotai/Kimi-VL-A3B-Thinking\nUpdated\n1 day ago\n•\n10.2k\n•\n314\ndeepseek-ai/DeepSeek-V3-0324\nUpdated\n20 days ago\n•\n221k\n•\n2.61k\nmoonshotai/Kimi-VL-A3B-Instruct\nUpdated\nabout 22 hours ago\n•\n9.71k\n•\n156\nBrowse 1M+ models\nSpaces\nRunning

In [38]:
def create_brochure(company_name, url):
    response = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [39]:
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': ['https://huggingface.co/datasets', 'https://huggingface.co/docs', 'https://ui.endpoints.huggingface.co/chat', 'https://huggingface.co/about', 'https://huggingface.co/company', 'https://huggingface.co/careers', 'https://huggingface.co/blog', 'https://discuss.huggingface.co/', 'https://github.com/huggingface', 'https://twitter.com/huggingface', 'https://www.linkedin.com/company/huggingface/']}


# Hugging Face Brochure
## Introduction

Welcome to Hugging Face, the leading platform for the machine learning community. We're building the future of AI together.

## Our Mission

To empower a community of innovators who collaborate on models, datasets, and applications that drive meaningful impact in the world.

## What We Offer

* **Hugging Face Hub**: A collaborative platform where developers can share, discover, and utilize pre-trained models, datasets, and code snippets.
* **Spaces**: A managed environment for building, deploying, and versioning machine learning projects.
* **Compute**: Scalable infrastructure for running ML workloads, including GPU acceleration and optimized inference endpoints.
* **Datasets**: Access to a vast repository of high-quality datasets for various AI tasks, from image classification to natural language processing.

## Our Impact

* Over 50,000 organizations worldwide trust Hugging Face to power their machine learning initiatives.
* Industry leaders like Meta, Amazon, Google, Intel, Microsoft, and Grammarly have adopted our platform to drive innovation in AI research and production.

## Community Engagement

* Join the conversation on our forum and connect with the world's most talented ML researchers and practitioners.
* Participate in hackathons, challenges, and open-source initiatives that push the boundaries of what's possible with AI.

## Careers at Hugging Face

* We're looking for exceptional talent to join our team! Explore current openings, from software engineers to product managers, on our website.
* Work alongside pioneers in ML research and development on projects that will shape the future of AI.

## Our Values

* **Collaboration**: We believe in building a strong community where innovators share knowledge, resources, and expertise.
* **Innovation**: We're committed to fostering an environment where creativity and experimentation thrive.
* **Transparency**: Our platform is designed with open-source values at its core, ensuring maximum transparency and fairness in AI decision-making.

## Stay Ahead of the Curve

Hugging Face is leading the way in AI innovation. Join us today and shape the future of machine learning!

## Finally - a minor improvement

With a small adjustment, we can change this so that the results stream back from OpenAI,
with the familiar typewriter animation

In [40]:
def stream_brochure(company_name, url):
    stream = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [41]:
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://www.huggingface.co/brand'}, {'type': 'products page', 'url': 'https://ui.endpoints.huggingface.co(chat)'}, {'type': 'research and development pages', 'url': 'https://status.huggingface.co/'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'products page', 'url': 'https:// docs.huggingface.io/'}]}
Error fetching content for https://ui.endpoints.huggingface.co(chat): HTTPSConnectionPool(host='ui.endpoints.huggingface.co(chat)', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001646253A290>: Failed to resolve 'ui.endpoints.huggingface.co(chat)' ([Errno 11001] getaddrinfo failed)"))
Error fetching content for https:// docs.huggingface.io/: HTTPSConnectionPool(host='%20docs.hugging

**Hugging Face Brochure**
==========================

**Introduction**
---------------

Hugging Face is the AI community building the future. Our platform provides a collaboration space for machine learning enthusiasts to work on models, datasets, and applications. We offer a wide range of tools and resources to help accelerate your ML journey.

**Company Culture**
------------------

Our company culture values **collaboration**, **innovation**, and **community**. We believe that open-source AI tooling is the foundation of ML progress. Our community-driven approach encourages collaboration, sharing, and innovation.

**Mission**
----------

To create a more accessible AI platform for everyone, enabling users to easily discover, integrate, and deploy models without extensive technical expertise.

**Services**
------------

* **Hugging Face Platform**: Explore 1M+ pre-trained models, browse datasets, and use our Spaces collaboration tool.
* **Compute**: Deploy on optimized inference endpoints or update your Spaces applications with GPU acceleration in a few clicks.
* **Enterprise**: Get access to enterprise-grade security, access controls, dedicated support, and prioritized support.

**Success Stories**
-----------------

Over 50,000 organizations worldwide are using Hugging Face AI solutions. Our customers include well-known enterprises like Meta, Amazon, Google, Intel, Microsoft, and Grammarly.

**Transformation**
----------------

Our Open Source initiatives:

*   **Transformers**: State-of-the-art ML for PyTorch, TensorFlow, JAX
*   **Diffusers**: State-of-the-art Diffusion models in PyTorch
*   **Safetensors**: Safe way to store/distribute neural network weights
*   **Hub**:
    *   Python client to interact with the Hugging Face Hub
    *   Fast tokenizers optimized for research & production

**Join Us**
-------------

If you're passionate about AI and want to build your portfolio, share your work, or contribute to open-source projects, we invite you to explore our [careers page](https://huggingface.co/jobs) or join the Hugging Face community at [Twitter](https://twitter.com/HuggingFace).

Visit [our website](https://huggingface.co/) now and start building with the future of AI!

In [42]:
# Try changing the system prompt to the humorous version when you make the Brochure for Hugging Face:

stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co/'}, {'type': 'Company page', 'url': 'https://github.com/huggingface'}, {'type': 'Blog', 'url': 'https://blog.huggingface.co/'}, {'type': 'FAQs', 'url': ''}, {'type': 'Careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'Pricing page', 'url': 'https://pricing.huggingface.co/'}, {'type': 'Enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'Support resources', 'url': ''}]}
Error fetching content for https://blog.huggingface.co/: HTTPSConnectionPool(host='blog.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000016462A4D710>: Failed to resolve 'blog.huggingface.co' ([Errno 11001] getaddrinfo failed)"))
Error fetching content for : Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error fetching content for https://pricing.huggingface.co/: HTTPSConnectionPool(host='

**Hugging Face: Empowering the Future of AI**

Welcome to Hugging Face, the premier platform for building, sharing, and collaborating on AI applications. Our mission is to bring machine learning closer to everyone, making it more accessible and intuitive.

**Our Story**

We believe that ML should be a force for good, driving innovation and positive impact worldwide. With a community-driven approach, we've created a robust ecosystem where developers can explore, create, and share their AI projects.

**Key Features**

* **1M+ Models**: Browse our vast library of pre-trained models for text, image, video, audio, or 3D applications.
* **Spaces**: Host, collaborate, and deploy AI models, datasets, and applications with ease.
* **Datasets**: Access and share datasets for any ML task, accelerating research and development.

**Community Highlights**

* **50000+ Organizations**: Leading enterprises, non-profits, and companies like Meta, Amazon, Google, Intel, Microsoft, and Grammarly rely on Hugging Face for AI innovation.
* **Leading Developers**: Follow your favorite developers, such as AI2, Enterprise, Non-PROfit, and top contributors to our ecosystem.

**Our Open Source**

We're committed to building the foundation of ML tooling with the community. Explore our open-source projects:
* **Transformers**: State-of-the-art ML for PyTorch, TensorFlow, JAX.
* **Diffusers**: Cutting-edge diffusion models in PyTorch.
* **Safetensors**: Secure way to store/distribute neural network weights.

**Join Our Journey**

Get started today with our free sign-up and dive into the world of AI, where ideas come alive. Visit our [GitHub](https://github.com/huggingface), [Twitter](https://twitter.com/huggingface), or [LinkedIn](https://www.linkedin.com/company/hugging-face) for more insights.

At Hugging Face, we're building a future that's driven by collaboration, creativity, and innovation. Join us on this exciting journey!