In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Load environment variables and set up the shared OpenAI client

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Cheap sanity check only: a non-empty key with the expected prefix and a minimum length
key_looks_valid = bool(api_key) and api_key.startswith('sk-proj-') and len(api_key) > 10
print(
    "API key looks good so far"
    if key_looks_valid
    else "There might be a problem with your API key? Please visit the troubleshooting notebook!"
)

# Model name and client used by every chat-completion call below
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [3]:
# Request headers sent with every page fetch; the User-Agent value is a desktop Chrome string
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Constructing an instance performs the HTTP request and parses the page, so
    the attributes below are available as soon as ``__init__`` returns:

    - ``url``: the address that was requested
    - ``body``: the raw response bytes
    - ``title``: the page title, or "No title found"
    - ``text``: the text of ``<body>`` with script/style/img/input elements removed
    - ``links``: every non-empty ``href`` found on ``<a>`` tags, unmodified
      (so relative links stay relative)
    """

    def __init__(self, url, timeout=10):
        """
        Fetch ``url`` and extract its title, text and links.

        Args:
            url: Address of the page to download.
            timeout: Seconds passed to ``requests.get`` as its timeout. Without
                one, a server that never answers would block the notebook forever.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # A <title> tag can exist and still have no single string (e.g. it is empty),
        # in which case .string is None; fall back instead of storing None.
        page_title = soup.title.string if soup.title else None
        self.title = page_title if page_title else "No title found"
        if soup.body:
            # Drop elements that contribute no readable text before extracting it
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        # Anchors without an href (or with an empty one) are discarded
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and text formatted as a block for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
# System prompt for the link-selection call.
# The example below is what the model is told to imitate, so it must itself be
# valid JSON (each object needs a comma between its "type" and "url" members).
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [5]:
def get_links_user_prompt(website):
    """
    Build the user message that asks the model to pick brochure-relevant links.

    Reads ``website.url`` for the introduction and lists every entry of
    ``website.links``, one per line, at the end of the message.
    """
    sections = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(sections)

In [6]:
def get_links(url):
    """
    Scrape ``url`` and ask the model which of its links belong in a brochure.

    The request uses JSON response mode; the reply text is decoded with
    ``json.loads`` and the decoded object is returned.
    """
    page = Website(url)
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(page)},
        ],
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [8]:
def get_all_details(url):
    """
    Collect the text of the landing page plus every page the model selected.

    Args:
        url: Landing page of the company website.

    Returns:
        One string: the landing page contents followed by, for each selected
        link, its type label and the contents of the linked page.
    """
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    # The model's reply is not guaranteed to follow the example exactly, so a
    # missing "links" key or an entry without a URL is tolerated, not fatal.
    for link in links.get("links", []):
        target = link.get("url")
        if not target:
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        # The page may have listed relative links; resolve against the landing
        # page so the fetch gets an absolute URL. Absolute URLs pass through unchanged.
        result += Website(urljoin(url, target)).get_contents()
    return result

In [9]:
# System prompt for the brochure-writing call.
# Each fragment ends with an explicit space so sentences do not run together
# when the pieces are concatenated.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [10]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message for the brochure request.

    Args:
        company_name: Name of the company, quoted in the first line of the prompt.
        url: Landing page whose contents (and selected sub-pages) are appended.
        max_chars: Upper bound on the length of the returned prompt; anything
            beyond it is cut off. Pass ``None`` to disable truncation.

    Returns:
        The prompt text, at most ``max_chars`` characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None returns the whole string, so None means "no limit"
    return user_prompt[:max_chars]

In [12]:
def create_brochure(company_name, url):
    """
    Generate a brochure for ``company_name`` in a single (non-streaming) call
    and render the reply as Markdown in the notebook. Returns nothing.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    display(Markdown(completion.choices[0].message.content))

In [13]:
def stream_brochure(company_name, url):
    """
    Generate a brochure for ``company_name`` and render it live as it streams in.

    A Markdown display is created up front and refreshed after every received
    chunk with the text accumulated so far. Code-fence markers (``` and
    ```markdown) are removed from what is shown so the output renders as
    formatted Markdown rather than as a code block. Returns nothing.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )

    raw_response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Guard against chunks that carry no choices (nothing to append)
        if not chunk.choices:
            continue
        raw_response += chunk.choices[0].delta.content or ''
        # Clean a copy of the untouched accumulation: the fence and its language
        # tag can arrive in separate chunks, and stripping from the raw text lets
        # "```markdown" be matched as a whole. The word "markdown" elsewhere in
        # the brochure is left intact.
        cleaned = raw_response.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [14]:
# Demo run: stream a brochure for Hugging Face into the cell output
stream_brochure(company_name="HuggingFace", url="https://huggingface.co")

# Hugging Face Brochure

## Welcome to Hugging Face

**The AI Community Building the Future**  
Hugging Face is at the forefront of artificial intelligence and machine learning, operating as a dynamic platform where collaboration flourishes among developers, researchers, and organizations. From state-of-the-art models to expansive datasets, Hugging Face embraces innovation and community engagement to drive the future of AI.

---

## Our Offerings

### Models
- Access over **400k+ models** designed to meet various AI needs.
- Explore the latest updates, like **microsoft/phi-4** and **deepseek-ai/DeepSeek-V3**.

### Datasets
- Utilize a collection of **100k+ datasets** across diverse domains tailored for machine learning tasks.

### Spaces
- Discover innovative applications in our **Spaces** section, showcasing **150k+ applications** running continuously.

### Enterprise Solutions
- Dedicated support and robust enterprise-grade security for over **50,000 organizations**, including **Meta**, **Amazon**, and **Microsoft**.
- Affordable pricing starting at **$20/user/month**.

---

## Company Culture

At Hugging Face, we prioritize an open and collaborative environment. Our culture is built on the principles of inclusivity, innovation, and shared knowledge. By encouraging contributions from all members of our community, we create a vibrant space where ideas thrive and transformative projects come to life.

---

## Our Customers

We are proud to serve a wide array of organizations, including:

- **AI at Meta**
- **Amazon Web Services**
- **Google**
- **Intel**
- **Grammarly**

From non-profits to Fortune 500 companies, our versatile solutions cater to diverse needs within the AI and machine learning spectrum.

---

## Careers at Hugging Face

Join us in shaping the future of AI! We’re on the lookout for passionate individuals eager to contribute to a collaborative and innovative workspace. Our current job openings span various roles, and we value creativity, commitment, and the drive for excellence.

Explore career opportunities on our [Jobs Page](https://huggingface.co/jobs).

---

## Get Involved

**Join the Movement!**  
Whether you're looking to collaborate on models, access datasets, or engage with our community, Hugging Face is the place to be. 

- [Sign Up](https://huggingface.co/signup)
- **Follow us on social media:** [GitHub](https://github.com/huggingface) | [Twitter](https://twitter.com/huggingface) | [LinkedIn](https://www.linkedin.com/company/huggingface)

Let’s build the future of AI together! 🌟

--- 

For more information, visit us at [Hugging Face](https://huggingface.co).