In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-

## First exercise: Multi-Shot prompting 

In [5]:
def get_system_prompt():
    link_system_prompt = "You are provided with a list of links found on a webpage. \
    You are able to decide which of the links would be most relevant to include in a brochure about the company, \
    such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    link_system_prompt += "You should respond in JSON as in this example:"
    link_system_prompt += """
    EXAMPLE 1
    {
        "links": [
            {"type": "about page", "url": "https://full.url/goes/here/about"},
            {"type": "careers page": "url": "https://another.full.url/careers"}
        ]
    }
    """
    link_system_prompt += """
    EXAMPLE 2
    {
        "links": [
            {"type": "product page", "url": "https://full.url/goes/here/product"},
            {"type": "contact page": "url": "https://full.url/goes/here/contact"}
        ]
    }
    """
    return link_system_prompt

In [6]:
print(get_system_prompt())

You are provided with a list of links found on a webpage.     You are able to decide which of the links would be most relevant to include in a brochure about the company,     such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
    EXAMPLE 1
    {
        "links": [
            {"type": "about page", "url": "https://full.url/goes/here/about"},
            {"type": "careers page": "url": "https://another.full.url/careers"}
        ]
    }
    
    EXAMPLE 2
    {
        "links": [
            {"type": "product page", "url": "https://full.url/goes/here/product"},
            {"type": "contact page": "url": "https://full.url/goes/here/contact"}
        ]
    }
    


In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [8]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/01/23/ll

In [9]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": get_system_prompt()},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [14]:
medium = Website("https://huggingface.co")
medium.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nari-labs/Dia-1.6B',
 '/sand-ai/MAGI-1',
 '/microsoft/bitnet-b1.58-2B-4T',
 '/ostris/Flex.2-preview',
 '/HiDream-ai/HiDream-I1-Full',
 '/models',
 '/spaces/nari-labs/Dia-1.6B',
 '/spaces/enzostvs/deepsite',
 '/spaces/InstantX/InstantCharacter',
 '/spaces/bytedance-research/UNO-FLUX',
 '/spaces/Kwai-Kolors/Kolors-Virtual-Try-On',
 '/spaces',
 '/datasets/Anthropic/values-in-the-wild',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/zwhe99/DeepMath-103K',
 '/datasets/OpenGVLab/InternVL-Data',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '

In [13]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'documentation page', 'url': 'https://huggingface.co/docs'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'status page', 'url': 'https://status.huggingface.co/'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'learn page', 'url': 'https://huggingface.co/learn'}]}

## Second exercise: Provide Brochure in a particular format, with specified sections

In [15]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [16]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'contact page', 'url': 'https://huggingface.co/chat'}, {'type': 'community discussion page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future

In [24]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in italic font and big characters.\
Include only information company, other things don't matter"

In [25]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [26]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'company page', 'url': 'https://twitter.com/huggingface'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nnari-labs/Dia-1.6B\nUpdated\nabout 23 hours ago\n•\n50.5k\n•\n1.15k\nsand-ai/MAGI-1\nUpdated\n1 day ago\n•\n387\nmicrosoft/bitnet-b1.58-2B-4T\nUpdated\n3 days ago\n•\n29k\n•\n794\nostris/Flex.2-preview\nUpdated\nabout 15 hours ago\n•\n3.19k\n•\n192\nHiDream-ai/HiDream-I1-Full\nUpdated\n4 days ago\n•\n30.5k\n•\n746\nBrowse 1M+ models\nSpaces\nRunning\non\nZero\n680\n680\nDia 1.6B\n👯\nGenerate realistic dia

In [30]:
def create_brochure(company_name, url, brochure):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": brochure}
          ],
    )
    result = response.choices[0].message.content
    #display(result)
    display(Markdown(result))

In [31]:
create_brochure("HuggingFace", "https://huggingface.co", get_brochure_user_prompt("HuggingFace", "https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'contact page', 'url': 'https://huggingface.co/discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


*HUGGING FACE*

*THE AI COMMUNITY BUILDING THE FUTURE*

*Hugging Face is at the forefront of the AI and machine learning revolution, offering a collaborative platform where developers and researchers can create, share, and build upon a wide array of machine learning models and datasets. With over 1 million models and 250,000 datasets, our community thrives on innovation and open-source computing.*

*KEY OFFERINGS:*

- *Models:* Access and collaborate on a diverse library of ML models.
  
- *Datasets:* Browse and utilize extensive datasets to refine your machine learning projects.
  
- *Spaces:* Discover applications running AI models in real-time, facilitating practical implementation.*
  
- *Enterprise Solutions:* Enhance your AI projects with enterprise-grade support, security, and dedicated tools.*

*THE POWER OF COMMUNITY:*

*Join a vibrant community of over 50,000 organizations, including tech giants like Google, Amazon, and Microsoft, dedicated to advancing AI technologies.*

*OUR TECHNOLOGIES INCLUDE:*

- *Transformers:* State-of-the-art ML for popular frameworks like PyTorch and TensorFlow.
  
- *Diffusers, Tokenizers, and Accelerate:* Tools to optimize model performance and training efficiency.*

*PARTNER WITH US:*

*Explore our paid compute offerings starting at $0.60/hour and enterprise solutions beginning at $20/user/month, designed to accelerate your AI journey.*

*FOLLOW US:*

*Stay connected through our social channels and contribute to the future of AI at Hugging Face.*

*Start building the future today—join the Hugging Face community!*

## Third Exercise: Translate the entire brochure to Spanish

In [32]:
def get_brochure_user_prompt_spanish(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in spanish.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [33]:
create_brochure("HuggingFace", "https://huggingface.co", get_brochure_user_prompt_spanish("HuggingFace", "https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


*HUGGING FACE*

*LA COMUNIDAD DE IA QUE CONSTRUYE EL FUTURO*

---

*Hugging Face es una plataforma innovadora que fomenta la colaboración dentro de la comunidad de aprendizaje automático. Aquí, los innovadores pueden explorar, compartir y desarrollar más de un millón de modelos, aplicaciones y conjuntos de datos para revolucionar el sector de la inteligencia artificial.*

---

*¿QUÉ OFRECEMOS?*

- **Modelos**: Acceso a más de 1 millón de modelos de aprendizaje automático para diversos usos.
  
- **Conjuntos de datos**: Una amplia colección de más de 250,000 conjuntos de datos para impulsar su investigación y proyectos.

- **Espacios**: Implementación de aplicaciones de inteligencia artificial que permiten a los usuarios experimentar y personalizar soluciones.

---

*SOLUCIONES PARA EMPRESAS*

- Hugging Face ofrece soluciones de *Compute* y opciones *Enterprise* diseñadas específicamente para ayudar a los equipos a construir inteligencia artificial con seguridad y soporte dedicados.
  
- Comenzando desde $20 por usuario al mes, nuestra plataforma avanzada proporciona controles de acceso y herramientas de colaboración que facilitan un entorno de trabajo eficaz para más de 50,000 organizaciones, incluidas empresas líderes como Google, Microsoft y Amazon.

---

*OPEN SOURCE*

- Con un fuerte compromiso hacia el código abierto, Hugging Face desarrolla herramientas fundamentales para el aprendizaje automático, incluyendo:
  - Transformers
  - Diffusers
  - Tokenizers

---

*ÚNETE A NUESTRA COMUNIDAD*

- Ya sea que seas un investigador, desarrollador o una organización, en Hugging Face encontrarás las herramientas y el espacio para impulsar tu innovación en inteligencia artificial.

---

*VISÍTANOS EN:*

*[https://huggingface.co](https://huggingface.co)* 

*¡Nosotros estamos construyendo el futuro de la IA, únete a nosotros!*