In [1]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import requests
from typing import List
from bs4 import BeautifulSoup
from IPython.display import display,Markdown,update_display
from anthropic import Anthropic

In [2]:
# Configuration: read ANTHROPIC_API_KEY from a local .env file.
# override=True lets values in .env replace variables already set in the environment.
load_dotenv(override=True)
api_key = os.getenv('ANTHROPIC_API_KEY')

# Cheap sanity check on the key's prefix only; it does not contact the API.
print(
    "API key looks good so far"
    if api_key and api_key.startswith('sk-ant-')
    else "There might be a problem with your API key? Please visit the troubleshooting notebook!"
)

# Model used by every request in this notebook, and the shared API client.
MODEL = 'claude-3-7-sonnet-20250219'
client = Anthropic()

API key looks good so far


In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """A fetched web page reduced to its title, body text and link targets.

    Attributes set by __init__:
        url:   the address that was requested.
        body:  the raw response bytes.
        title: string of the <title> tag, or 'No title found' when absent/empty.
        text:  newline-separated text of <body> after removing script, style,
               img and input tags; "" when the page has no <body>.
        links: every non-empty href found on an <a> tag, in document order.
    """

    def __init__(self, url, timeout=30):
        """Download `url` and parse it.

        Args:
            url: address of the page to fetch.
            timeout: seconds to wait for the server before requests gives up;
                without it a stalled server would block the notebook forever.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or contains nested tags,
        # so fall back to the placeholder in that case too.
        title_tag = soup.title
        self.title = title_tag.string if title_tag and title_tag.string else 'No title found'

        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                # decompose must be *called*; a bare attribute access is a no-op
                irrelevant.decompose()
            self.text = soup.body.get_text(separator='\n', strip=True)
        else:
            self.text = ""

        # Anchors without an href (or with an empty one) are dropped.
        self.links = [href for href in (a.get("href") for a in soup.find_all('a')) if href]

    def get_contents(self):
        """Return the title and text formatted as a block for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
# Quick check of the scraper: fetch one page and show the hrefs it collected
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [5]:
# System prompt for the link-selection call. The example below is what the model
# imitates, so it must itself be valid JSON (each object: "type" and "url" keys).
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
# Show the assembled system prompt to check its wording and the JSON example
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    """Build the user message that lists a page's links for the model to filter.

    Args:
        website: object exposing `.url` (str) and `.links` (iterable of str).

    Returns:
        The instruction text followed by the links, one per line.
    """
    pieces = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(pieces)

In [8]:
# Preview the system and user prompts one after the other as plain text
print(link_system_prompt+get_links_user_prompt(ed))

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com

In [9]:
def get_links(url):
    """Ask the model which links on the page at `url` belong in a company brochure.

    Scrapes the page, sends its links together with link_system_prompt, and
    returns the text of the first content block of the reply exactly as
    received (it is not parsed or validated here).
    """
    page = Website(url)
    user_message = {"role": "user", "content": get_links_user_prompt(page)}
    reply = client.messages.create(
        model=MODEL,
        max_tokens=20000,
        system=link_system_prompt,
        messages=[user_message],
    )
    return reply.content[0].text

In [10]:
# Anthropic has made their site harder to scrape, so this example uses Hugging Face instead.

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/HiDream-ai/HiDream-I1-Full',
 '/microsoft/bitnet-b1.58-2B-4T',
 '/agentica-org/DeepCoder-14B-Preview',
 '/moonshotai/Kimi-VL-A3B-Thinking',
 '/deepseek-ai/DeepSeek-V3-0324',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/bytedance-research/UNO-FLUX',
 '/spaces/jamesliu1217/EasyControl_Ghibli',
 '/spaces/HiDream-ai/HiDream-I1-Dev',
 '/spaces/Efficient-Large-Model/SanaSprint',
 '/spaces',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/openai/mrcr',
 '/datasets/zwhe99/DeepMath-103K',
 '/datasets/divaroffical/real_estate_ads',
 '/datasets/openai/graphwalks',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/W

In [11]:
# Keep only the outermost {...} of the reply. That drops a surrounding Markdown
# code fence (```json ... ```) without touching the JSON itself; a blanket
# replace('json', '') would also delete "json" from inside any URL.
a = get_links("https://huggingface.co")
a = a[a.find("{"):a.rfind("}") + 1]

In [12]:
# Parse the cleaned-up reply; json.loads raises if the text is not valid JSON
json.loads(a)

{'links': [{'type': 'homepage', 'url': 'https://huggingface.co/'},
  {'type': 'about company', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'brand info', 'url': 'https://huggingface.co/brand'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'blog', 'url': 'https://huggingface.co/blog'},
  {'type': 'learn resources', 'url': 'https://huggingface.co/learn'},
  {'type': 'documentation', 'url': 'https://huggingface.co/docs'},
  {'type': 'github', 'url': 'https://github.com/huggingface'},
  {'type': 'twitter', 'url': 'https://twitter.com/huggingface'},
  {'type': 'linkedin',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

In [13]:
def get_all_details(url):
    """Collect the text of the landing page and of every link the model selected.

    Args:
        url: address of the company's landing page.

    Returns:
        One string: the landing page contents, then for each selected link its
        "type" label followed by that page's contents.

    Raises:
        json.JSONDecodeError: if the model's reply contains no parseable JSON.
    """
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()

    reply = get_links(url)
    # Take only the outermost {...}: this discards a surrounding Markdown fence
    # (```json ... ```) without deleting "json" from inside URLs, which the
    # previous blanket replace('json', '') did.
    start, end = reply.find("{"), reply.rfind("}")
    payload = reply[start:end + 1] if 0 <= start < end else reply
    print("Found links:\n", payload)

    links = json.loads(payload)
    for link in links.get('links', []):
        result += f"\n\n{link['type']}\n"
        # The prompt warns that some links may be relative; resolve them against
        # the landing page so requests always receives an absolute URL.
        result += Website(urljoin(url, link["url"])).get_contents()
    return result

In [14]:
# Run the full scrape for one site and print the combined text that will feed the brochure prompt
print(get_all_details("https://huggingface.co"))

Found links:
 
{
  "links": [
    {"type": "homepage", "url": "https://huggingface.co/"},
    {"type": "about page", "url": "https://huggingface.co/huggingface"},
    {"type": "pricing page", "url": "https://huggingface.co/pricing"},
    {"type": "enterprise page", "url": "https://huggingface.co/enterprise"},
    {"type": "careers page", "url": "https://apply.workable.com/huggingface/"},
    {"type": "blog", "url": "https://huggingface.co/blog"},
    {"type": "brand page", "url": "https://huggingface.co/brand"}
  ]
}

Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
HiDream-ai/HiDream-I1-Full
Updated
1 day ago
•
20.3k
•
553
microsoft/bitnet-b1.58-2B-4T
U

In [15]:
# System prompt for the brochure-writing call. Built from adjacent string
# literals so each sentence keeps its separating space (the earlier backslash
# continuation produced "markdown.Include").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [16]:
def get_brochure_user_prompt(company_name, url):
    """Build the user message for the brochure request.

    The message names the company, states the task, and appends the scraped
    page contents from get_all_details(url). The whole message is cut to its
    first 5,000 characters.
    """
    intro = f"You are looking at a company called: {company_name}\n"
    task = (
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
    )
    full_prompt = intro + task + get_all_details(url)
    # Truncate if more than 5,000 characters
    return full_prompt[:5_000]

In [17]:
# Preview the (truncated) user prompt that will be sent for the brochure
print(get_brochure_user_prompt("HuggingFace", "https://huggingface.co"))

Found links:
 
{
    "links": [
        {"type": "homepage", "url": "https://huggingface.co/"},
        {"type": "about page", "url": "https://huggingface.co/huggingface"},
        {"type": "pricing page", "url": "https://huggingface.co/pricing"},
        {"type": "enterprise page", "url": "https://huggingface.co/enterprise"},
        {"type": "careers page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "blog", "url": "https://huggingface.co/blog"},
        {"type": "brand page", "url": "https://huggingface.co/brand"}
    ]
}

You are looking at a company called: HuggingFace
Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine le

In [35]:
def create_brochure(company_name,url):
    """Stream a brochure for the company and render it live as Markdown.

    Args:
        company_name: name used in the prompt.
        url: landing page to scrape for source material.

    Returns:
        None; the brochure is shown in the notebook output as it arrives.
    """
    streamer = client.messages.create(
        model = MODEL,
        max_tokens = 64000,
        system = system_prompt,
        stream = True,
        messages = [{'role':'user','content':get_brochure_user_prompt(company_name,url)}]
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in streamer:
        # Not every stream event has `.delta.text`; reading it unconditionally
        # raised AttributeError on the first such event.
        text = getattr(getattr(chunk, 'delta', None), 'text', None)
        if not text:
            continue
        response += text
        # Clean a copy so a fence split across chunks is still recognised, and
        # remove only the fence marker rather than every "markdown" in the text.
        cleaned = response.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [41]:
def create_brochure(company_name, url):
    """Stream a brochure for the company and render it live as Markdown.

    Args:
        company_name: name used in the prompt.
        url: landing page to scrape for source material.

    Returns:
        None; the brochure is shown in the notebook output as it arrives.
    """
    streamer = client.messages.create(
        model=MODEL,
        max_tokens=64000,
        system=system_prompt,
        stream=True,
        messages=[{'role':'user','content':get_brochure_user_prompt(company_name, url)}]
    )

    response = ""
    display_handle = display(Markdown(""), display_id=True)

    for chunk in streamer:
        delta = getattr(chunk, 'delta', None)
        content_block = getattr(chunk, 'content_block', None)

        # Text can arrive on an event's `delta` or on its `content_block`;
        # events that carry neither are skipped.
        if delta is not None and hasattr(delta, 'text'):
            piece = delta.text
        elif content_block is not None and hasattr(content_block, 'text'):
            piece = content_block.text
        else:
            piece = None

        if not piece:
            continue  # nothing new to render for this event

        response += piece
        # Strip only the Markdown fence markers; the previous replace("markdown", "")
        # also deleted the word wherever it appeared in the brochure text.
        cleaned_response = response.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned_response), display_id=display_handle.display_id)

In [42]:
# Generate and stream the brochure for Hugging Face
create_brochure("HuggingFace", "https://huggingface.co")

Found links:
 
{
  "links": [
    {"type": "homepage", "url": "https://huggingface.co/"},
    {"type": "enterprise page", "url": "https://huggingface.co/enterprise"},
    {"type": "pricing page", "url": "https://huggingface.co/pricing"},
    {"type": "brand page", "url": "https://huggingface.co/brand"},
    {"type": "careers page", "url": "https://apply.workable.com/huggingface/"},
    {"type": "blog page", "url": "https://huggingface.co/blog"},
    {"type": "about page", "url": "https://huggingface.co/huggingface"}
  ]
}



# Hugging Face: Building the Future of AI Together

## The AI Community Platform

Hugging Face is the leading platform where the global machine learning community collaborates to build the future of AI. With over 1 million models, 250,000+ datasets, and 400,000+ applications, we're creating an ecosystem that empowers developers, researchers, and organizations to advance AI together.

## What We Offer

### For Developers & Researchers
- **Access to State-of-the-Art Models**: Browse and use over 1 million models across text, image, video, audio, and 3D modalities
- **Extensive Dataset Library**: Leverage 250,000+ datasets to train and fine-tune your models
- **Spaces**: Deploy and share AI applications with the community
- **Open Source Tools**: Our comprehensive suite of open-source libraries powers AI development worldwide:
  - Transformers (143,000+ stars)
  - Diffusers (28,600+ stars)
  - PEFT (18,100+ stars)
  - Datasets (19,900+ stars)
  - And many more

### For Organizations
- **Enterprise Solutions**: Enterprise-grade security, access controls, and dedicated support
- **Compute Infrastructure**: Deploy on optimized Inference Endpoints or upgrade Spaces applications to GPU in just a few clicks
- **Collaboration Tools**: Host and collaborate on unlimited public models, datasets, and applications

## Trusted by Leaders
More than 50,000 organizations rely on Hugging Face, including:
- AI at Meta
- Google
- Microsoft
- Amazon
- Intel
- Grammarly
- Writer
- And many more

## Pricing Options
- **Free Community Access**: Collaborate on unlimited public models, datasets, and applications
- **Compute Solutions**: Starting at $0.60/hour for GPU resources
- **Enterprise Plans**: Starting at $20/user/month with features like SSO, priority support, audit logs, and regional deployments

## Join the Community
Be part of the platform that's shaping the future of AI. Create, discover, and collaborate on machine learning projects. Build your portfolio by sharing your work with the world and establishing your ML profile.

---

*Hugging Face: The platform where the machine learning community collaborates on models, datasets, and applications.*