<a href="https://colab.research.google.com/github/Gabrielnkl/notebooks/blob/main/openai_brochure_prompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# from exercise: https://github.com/ed-donner/llm_engineering/blob/main/week1/day5.ipynb

In [2]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [3]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI


In [5]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")

MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [7]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpag"

In [8]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [9]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [10]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [11]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/mistralai/Magistral-Small-2506',
 '/openbmb/MiniCPM4-8B',
 '/deepseek-ai/DeepSeek-R1-0528',
 '/Qwen/Qwen3-Embedding-0.6B-GGUF',
 '/echo840/MonkeyOCR',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/ResembleAI/Chatterbox',
 '/spaces/multimodalart/wan2-1-fast',
 '/spaces/aisheets/sheets',
 '/spaces/NihalGazi/Text-To-Speech-Unlimited',
 '/spaces',
 '/datasets/nvidia/Nemotron-Personas',
 '/datasets/open-thoughts/OpenThoughts3-1.2M',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/a-m-team/AM-DeepSeek-R1-0528-Distilled',
 '/datasets/institutional/institutional-books-1.0',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',


In [12]:
get_links("https://huggingface.co")


{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'discussion page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}]}

In [14]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [15]:
print(get_all_details("https://huggingface.co"))


Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'docs page', 'url': 'https://huggingface.co/docs'}]}
Landing page:
Webpag

about page
Webpag

careers page
Webpag

enterprise page
Webpag

pricing page
Webpag

blog page
Webpag

docs page
Webpag


In [16]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."


In [17]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [18]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")


Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub', 'url': 'https://github.com/huggingface'}, {'type': 'LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface'}, {'type': 'Twitter', 'url': 'https://twitter.com/huggingface'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpag\n\nabout page\nWebpag\n\ncareers page\nWebpag\n\nenterprise page\nWebpag\n\npricing page\nWebpag\n\nblog\nWebpag\n\ndiscussion forum\nWebpag\n\nGitHub\nWebpag\n\nLinkedIn\nWebpag\n\nTwitter\nWebpag'

In [19]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [20]:
create_brochure("HuggingFace", "https://huggingface.co")


Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'docs page', 'url': 'https://huggingface.co/docs'}]}


# Hugging Face Brochure

## Welcome to Hugging Face

Hugging Face is at the forefront of the AI and machine learning revolution. Our mission is to democratize AI and make it accessible for everyone through cutting-edge technology, open-source projects, and a robust community of developers and researchers.

### Company Overview

- **Founded:** [Year]
- **Headquarters:** [Location]
- **Core Focus:** Natural Language Processing (NLP), Machine Learning Models, AI Tools

### Company Culture

At Hugging Face, we foster a culture of collaboration, inclusivity, and innovation. Our team is passionate about pushing the boundaries of what AI can achieve. Here are some cultural highlights:

- **Open Culture:** Encouragement of open communication and idea sharing.
- **Community First:** Collaboration with developers, researchers, and educators globally.
- **Continuous Learning:** We value personal and professional development, offering resources and support.
- **Diversity and Inclusion:** Hugging Face actively promotes a diverse workforce to drive creativity and innovation.

### Customers

Our users range from individual developers and researchers to large corporations looking to integrate AI solutions into their workflows. Hugging Face is trusted by:

- Startups looking to leverage AI for innovation
- Enterprises integrating AI for efficiency and scalability
- Researchers pioneering new technology in machine learning

### Careers at Hugging Face

Join our dynamic team and become part of a groundbreaking journey in AI! We are on the lookout for passionate individuals in various roles, including:

- **Software Engineers** specializing in NLP and machine learning
- **Product Managers** to spearhead the development of our AI products
- **Data Scientists** to analyze and interpret complex data
- **Community Managers** to engage and support our vibrant user community

**Why Work Here?**
- **Impactful Projects:** Work on projects that change the landscape of AI.
- **Flexible Work Environment:** We offer a hybrid and remote work policy to suit your lifestyle.
- **Great Benefits:** Comprehensive health plans, professional development funds, and more!

### Get Started 

Explore our diverse range of AI tools by checking out our [Enterprise Solutions](Link to enterprise page) or learn more about [Pricing](Link to pricing page) to find what fits your needs. Dive into a wealth of knowledge on our [Blog](Link to blog page) or explore in-depth topics in our [Docs](Link to docs page).

### Let's Connect!

Join us in revolutionizing the AI space! For inquiries, please visit our [Contact Page](Link to contact page).

---

For the latest updates, follow us on social media and subscribe to our newsletter. Thank you for considering Hugging Face as your partner in AI innovation!