In [9]:
import os
from dotenv import load_dotenv
from IPython.display import display, Markdown, update_display
from openai import OpenAI
import json
from typing import List
from bs4 import BeautifulSoup
import requests
from openai import OpenAI

In [10]:
# Pull settings from a local .env file; override=True lets the file's values
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# None when OPENAI_API_KEY is not defined.
api_key = os.environ.get("OPENAI_API_KEY")

In [25]:
# HTTP headers sent with every page request made by Website below.
# The User-Agent mimics a desktop Chrome browser — presumably so sites that
# reject the default python-requests agent still serve the page (TODO confirm).
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


class Website:
    """Snapshot of one web page: its title, visible body text and link targets.

    The page is downloaded and parsed once, when the object is constructed.
    """

    def __init__(self, url: str, timeout: float = 30):
        """Download ``url`` and extract title, text and hrefs from it.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds ``requests`` waits on the server before giving up,
                so a stalled host cannot block the notebook indefinitely.

        Raises:
            requests.RequestException: If the request cannot be completed
                (connection failure, timeout, malformed URL, ...).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        # Raw response bytes; BeautifulSoup works out the encoding itself.
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or contains nested tags, so
        # fall back to the placeholder in that case as well as when the tag
        # is missing altogether.
        page_title = soup.title.string if soup.title else None
        self.title = page_title or "No title found"

        if soup.body:
            # Remove elements that contribute no readable text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Every non-empty href on the page, exactly as written in the HTML
        # (relative paths are NOT resolved against ``url``).
        links = [link.get("href") for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and body text formatted as a labelled text block."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

        

In [13]:
# One-shot prompting: the system prompt shows the model a single worked
# example of the JSON structure it must reply with.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

In [14]:
def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of
            href strings), e.g. a ``Website`` instance.

    Returns:
        str: Instructions followed by the page's links, one per line. Repeated
        hrefs are listed once, in first-seen order, so navigation links that
        appear many times on a page do not inflate the prompt.
    """
    # dict.fromkeys drops repeats while preserving the original ordering.
    unique_links = list(dict.fromkeys(website.links))

    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(unique_links)
    return user_prompt

In [16]:
# Chat model used by every completion request in this notebook.
MODEL = "gpt-4o-mini"

# Shared API client. No key is passed explicitly, so the client presumably
# reads OPENAI_API_KEY from the environment — confirm against the SDK docs.
openai = OpenAI()

def get_links(url):
    """Ask the model which links on the page at ``url`` belong in a brochure.

    The page is fetched with ``Website`` and its links are sent to the model
    together with ``link_system_prompt``; the reply is requested as a JSON
    object.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        dict: The parsed reply. It always contains a ``"links"`` list, which
        is empty when the model returned no content or omitted the key, so
        callers can iterate over ``result["links"]`` unconditionally.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        response_format={"type": "json_object"},
    )
    raw_reply = response.choices[0].message.content

    # The message content can be empty/None; treat that as "no links" rather
    # than letting json.loads fail with a TypeError.
    parsed = json.loads(raw_reply) if raw_reply else {}
    if not isinstance(parsed, dict):
        parsed = {}
    # Normalise a missing or malformed "links" entry to an empty list.
    if not isinstance(parsed.get("links"), list):
        parsed["links"] = []
    return parsed

In [21]:
# Fetch the Hugging Face landing page and show every href collected from it
# (paths are reported as written in the HTML, so most are relative).
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/openai/gpt-oss-120b',
 '/openai/gpt-oss-20b',
 '/Qwen/Qwen-Image',
 '/zai-org/GLM-4.5V',
 '/rednote-hilab/dots.ocr',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Qwen/Qwen-Image',
 '/spaces/amd/gpt-oss-120b-chatbot',
 '/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster',
 '/spaces/black-forest-labs/FLUX.1-Krea-dev',
 '/spaces',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/jxm/gpt-oss20b-samples',
 '/datasets/nvidia/Llama-Nemotron-VLM-Dataset-v1',
 '/datasets/allenai/WildChat-4.8M',
 '/datasets/openai/BrowseCompLongContext',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/d

In [22]:
# Let the model pick the brochure-relevant links from the landing page.
get_links("https://huggingface.co")


{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'company page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'models page', 'url': 'https://huggingface.co/models'},
  {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'},
  {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}]}

In [33]:
def get_all_details(url):
    """Collect the text of a landing page plus its brochure-relevant pages.

    The landing page is fetched first, then ``get_links`` selects further
    pages, and each of those is fetched and appended under its link type.

    Args:
        url: Address of the company's landing page.

    Returns:
        str: ``"Landing page:"`` followed by the landing page contents, then
        one ``"<type>\\n<contents>"`` section per selected page that could be
        loaded. Pages that fail to download are skipped with a printed notice.
    """
    from urllib.parse import urljoin

    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_links(url)
    for link in links.get("links", []):
        target = link.get("url") if isinstance(link, dict) else None
        if not target:
            continue
        # The model is asked for full URLs but may still return a relative
        # path; resolve it against the landing page (absolute URLs pass
        # through urljoin unchanged).
        page_url = urljoin(url, target)
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            # One unreachable page should not discard everything gathered
            # so far; report it and carry on with the remaining links.
            print(f"Skipping {page_url}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'related page')}\n")
        sections.append(page.get_contents())
    return "".join(sections)

In [34]:
# Print the combined text of the landing page and every page the model selected.
print(get_all_details("https://huggingface.co"))

Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
openai/gpt-oss-120b
Updated
about 5 hours ago
•
490k
•
3.3k
openai/gpt-oss-20b
Updated
about 5 hours ago
•
2.37M
•
2.89k
Qwen/Qwen-Image
Updated
8 days ago
•
69.7k
•
1.57k
zai-org/GLM-4.5V
Updated
2 days ago
•
622
•
381
rednote-hilab/dots.ocr
Updated
2 days ago
•
17.9k
•
671
Browse 1M+ models
Spaces
Running
11.8k
11.8k
DeepSite v2
🐳
Generate any application with DeepSeek
Running
on
Zero
537
537
Qwen Image
🖼
Generate images from text prompts
Running
on
CPU Upgrade
166
166
GPT-OSS-120B on AMD MI300X
💻
gpt-oss-120b model running on AMD MI300 infrastructure.
Running
on
Zero
MCP
87
87
Wan2.2 14B Fast
🎥
gene

In [35]:
# System prompt for brochure generation.
# Built from adjacent literals with explicit trailing spaces: the previous
# backslash continuation glued two sentences together as "markdown.Include".
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message containing the scraped pages for one company.

    Args:
        company_name: Name of the company, inserted into the instructions.
        url: Landing page address passed on to ``get_all_details``.
        max_chars: Upper bound on the length of the returned prompt; anything
            beyond it is cut off. Pass ``None`` to disable truncation.

    Returns:
        str: Instructions followed by the scraped page contents, truncated to
        at most ``max_chars`` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None returns the whole string, so None means "no limit".
    return user_prompt[:max_chars]


In [36]:
# Inspect the assembled user prompt (cut off at 5,000 characters by the function).
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nopenai/gpt-oss-120b\nUpdated\nabout 5 hours ago\n•\n490k\n•\n3.3k\nopenai/gpt-oss-20b\nUpdated\nabout 5 hours ago\n•\n2.37M\n•\n2.89k\nQwen/Qwen-Image\nUpdated\n8 days ago\n•\n69.7k\n•\n1.57k\nzai-org/GLM-4.5V\nUpdated\n2 days ago\n•\n622\n•\n381\nrednote-hilab/dots.ocr\nUpdated\n2 days ago\n•\n17.9k\n•\n671\nBrowse 1M+ models\nSpaces\nRunning\n11.8k\n11.8k\nDeepSite v2\n🐳\nGenerate any ap

In [37]:
def create_brochure(company_name, url):
    """Generate a brochure for a company and render it as Markdown.

    Sends ``system_prompt`` plus the user prompt built from the company's
    pages to the chat model and displays the reply in the notebook.

    Args:
        company_name: Name of the company the brochure is about.
        url: Address of the company's landing page.

    Returns:
        None. The brochure is displayed, not returned.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [38]:
# Generate and render the brochure in one shot (the output appears once the full reply arrives).
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face: Building the Future of AI

### About Us
**Hugging Face** is at the forefront of the AI community, dedicated to fostering collaboration and innovation in machine learning. We provide a platform where enthusiasts, researchers, and organizations can come together to build and share models, datasets, and applications.

Our mission is to make AI accessible and impactful by creating tools and resources that empower the community. With an extensive library of over **1 million models** and **250,000 datasets**, we are transforming how machine learning is developed and utilized.

### Company Culture
At Hugging Face, we cherish a culture deeply rooted in community, openness, and collaboration. We view ourselves not just as a company, but as a vibrant ecosystem where contributors of all backgrounds can thrive. Our work environment encourages creativity, continuous learning, and the sharing of knowledge, driving us towards groundbreaking innovations.

### Our Customers
More than **50,000 organizations** have chosen Hugging Face as their AI partner, including industry leaders such as:
- **Meta**
- **Google**
- **Amazon**
- **Microsoft**
- **Grammarly**

These enterprises leverage our cutting-edge models and secure, scalable solutions to enhance their AI capabilities and deliver value to their customers.

### Careers at Hugging Face
We are always looking for passionate and talented individuals to join our team. If you want to contribute to the future of AI and be part of an inclusive community, consider exploring our **current openings**. Here are some perks of working with us:
- Flexible work arrangements
- Opportunities for professional growth
- A collaborative and supportive team environment
- A chance to work on state-of-the-art AI projects

### Collaboration & Resources
Hugging Face is more than just a platform; it’s a place for anyone interested in machine learning to contribute, explore, and grow. We offer:
- **Models**: Access to state-of-the-art models across various modalities including text, image, and audio.
- **Datasets**: A repository of datasets catering to diverse ML tasks.
- **Spaces**: An environment to run and showcase your ML applications easily.

### Get Started Today
Join us in shaping the future of AI! Explore our resources, contribute your work, and connect with a global community of innovators. Whether you’re a developer, a researcher, or someone curious about AI, **Hugging Face** welcomes you.

- [Visit Our Website](https://huggingface.co)
- [Explore Models](https://huggingface.co/models)
- [Join Our Community](https://huggingface.co/community)

Embrace the future of AI with Hugging Face!

In [43]:
# Streaming variant: the brochure is rendered incrementally as chunks arrive.
def _strip_markdown_fence(text):
    """Remove a ``` / ```markdown fence that wraps the whole reply, if any.

    Text that does not start with a fence is returned unchanged, so code
    fences and the word "markdown" inside the brochure body are preserved.
    """
    body = text.lstrip()
    if not body.startswith("```"):
        return text
    body = body[3:]
    # Drop the optional language tag on the opening fence line.
    if body.startswith("markdown"):
        body = body[len("markdown"):]
    # Only look for a closing fence when an opening one was found.
    trimmed = body.rstrip()
    if trimmed.endswith("```"):
        body = trimmed[:-3]
    return body


def stream_brochure(company_name, url):
    """Generate a brochure and render it progressively as Markdown.

    Same request as ``create_brochure`` but with ``stream=True``; a single
    display area is updated in place each time a new chunk arrives.

    Args:
        company_name: Name of the company the brochure is about.
        url: Address of the company's landing page.

    Returns:
        None. The brochure is displayed, not returned.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ],
        stream=True,
    )

    # Keep the raw text untouched and clean only what is displayed, so the
    # clean-up never eats legitimate content from the accumulated reply.
    raw_reply = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A chunk may arrive without any choices; there is nothing to append.
        if not chunk.choices:
            continue
        raw_reply += chunk.choices[0].delta.content or ""
        update_display(
            Markdown(_strip_markdown_fence(raw_reply)),
            display_id=display_handle.display_id,
        )

In [44]:
# Same brochure request, rendered incrementally while the reply streams in.
stream_brochure("HuggingFace", "https://huggingface.co")


# Hugging Face Brochure

## Who We Are
Welcome to **Hugging Face**, an AI community that is dedicated to building the future of machine learning. Our platform serves as a collaborative hub where developers, researchers, and enthusiasts can share models, datasets, and applications, making it a go-to place for anyone involved in machine learning.

## Mission
Our mission is to open-source the AI ecosystem and make machine learning accessible to everyone. With over **1 million models** and countless datasets, we empower the community to create, discover, and collaborate better on machine learning projects.

## Services We Offer
- **Models**: Access 1M+ models, including state-of-the-art architectures like GPT-OSS.
- **Datasets**: Browse through 250k+ datasets tailored for various machine learning tasks.
- **Spaces**: Host and collaborate on unlimited public applications, enabling rapid development and deployment.
- **Compute**: Leverage our optimized GPU solutions starting at just **$0.60/hour**.
- **Enterprise Solutions**: Tailored options for organizations, ensuring advanced security and dedicated support.

## Our Customers
More than **50,000 organizations** rely on Hugging Face, including enterprises like:
- **Meta**
- **Amazon**
- **Google**
- **Microsoft**
- **Intel**

## Community and Culture
At Hugging Face, we believe in **collaboration, transparency, and support**. Our vibrant community is driven by a shared passion for AI and machine learning, offering a welcoming environment for users of all skill levels. We value contributions from everyone and are committed to building an inclusive, innovative, and engaged community.

## Careers at Hugging Face
Join our growing team! We are always looking for talented individuals who are passionate about AI and machine learning. Check our [Jobs Page](https://huggingface.co/jobs) for current opportunities to shape the future of AI with us.

### Current Open Positions:
- AI Researchers
- Machine Learning Engineers
- Community Managers

## Join Us
Be part of the revolution in AI and machine learning. Whether you're a customer, investor, or potential recruit, **Hugging Face** welcomes you to explore the endless possibilities of technology and innovation.

**Sign Up Today**: [Hugging Face](https://huggingface.co)

---

For more information, visit our website or follow us on [GitHub](https://github.com/huggingface) and [Twitter](https://twitter.com/huggingface).

