In [35]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import requests
from typing import List
from bs4 import BeautifulSoup
from IPython.display import display,Markdown,update_display
from anthropic import Anthropic
import gradio as gr

In [2]:
# Load environment variables (requires an Anthropic API key in .env or the environment)
load_dotenv(override=True)
api_key = os.getenv('ANTHROPIC_API_KEY')

# Sanity check only: the key must be present and start with the 'sk-ant-' prefix.
if not (api_key and api_key.startswith('sk-ant-')):
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Model name and client used by every Claude call below.
MODEL = 'claude-3-7-sonnet-20250219'
client = Anthropic()

API key looks good so far


In [21]:
# DeepSeek key: only its presence is checked, not its format.
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
if not deepseek_api_key:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

API key looks good so far


In [22]:
# Both providers are reached through the OpenAI client by overriding base_url.
deepseek_client = OpenAI(base_url = "https://api.deepseek.com",api_key = deepseek_api_key)
# Local endpoint on port 11434 (presumably an Ollama server, given the 'ollama' key value — confirm).
llama_client = OpenAI(base_url = 'http://localhost:11434/v1', api_key='ollama')

In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """A fetched web page reduced to its title, visible text and link targets.

    Attributes:
        url: The URL that was requested.
        body: Raw response bytes.
        title: Text of the <title> tag, or 'No title found' when it is missing or empty.
        text: Newline-separated text of <body> with script/style/img/input
            elements removed; "" when the page has no <body>.
        links: Every non-empty href found on <a> tags, in document order
            (may contain relative URLs and duplicates).
    """

    def __init__(self,url):
        """Fetch `url` with the module-level `headers` and parse it.

        Raises:
            requests.RequestException: if the HTTP request itself fails.
        """
        self.url = url
        response = requests.get(url,headers = headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # soup.title.string is None for an empty <title> or one with nested tags.
        self.title = soup.title.string if soup.title and soup.title.string else 'No title found'
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                # decompose must be *called*; a bare attribute reference is a no-op,
                # which would leave script/style content in the extracted text.
                irrelevant.decompose()
            self.text = soup.body.get_text(separator = '\n',strip = True)
        else:
            self.text = ""
        links = [link.get("href") for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the page title and text formatted as one prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
# Smoke test: fetch one page (live HTTP request) and show the hrefs collected from it.
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [5]:
# System prompt for the link-selection call. The example must itself be valid
# JSON, because the model is asked to reply "as in this example" and the reply
# is later passed to json.loads.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
# Show the assembled system prompt to check its wording and the JSON example.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    """Build the user prompt that lists a page's links for the link-selection call.

    Args:
        website: Object exposing `url` (str) and `links` (iterable of str).

    Returns:
        The instructions followed by the links, one per line.
    """
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
    )
    return instructions + "\n".join(website.links)

In [8]:
# Preview the system and user prompts together for the page fetched into `ed`.
print(link_system_prompt+get_links_user_prompt(ed))

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com

In [9]:
def get_links(url):
    """Ask Claude which links on `url` are relevant for a company brochure.

    Fetches the page, sends its links with `link_system_prompt`, and returns
    the text of the first content block of the reply without parsing it.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The model's raw text reply.
    """
    page = Website(url)
    user_turn = {"role": "user", "content": get_links_user_prompt(page)}
    reply = client.messages.create(
        model=MODEL,
        system=link_system_prompt,
        max_tokens=20000,
        messages=[user_turn],
    )
    return reply.content[0].text

In [10]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

# Fetch the landing page (live HTTP request) and show the hrefs collected from it.
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 'blog/inference-providers-cohere',
 '/spaces',
 '/models',
 '/microsoft/bitnet-b1.58-2B-4T',
 '/HiDream-ai/HiDream-I1-Full',
 '/agentica-org/DeepCoder-14B-Preview',
 '/moonshotai/Kimi-VL-A3B-Thinking',
 '/THUDM/GLM-4-32B-0414',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/bytedance-research/UNO-FLUX',
 '/spaces/jamesliu1217/EasyControl_Ghibli',
 '/spaces/HiDream-ai/HiDream-I1-Dev',
 '/spaces/Efficient-Large-Model/SanaSprint',
 '/spaces',
 '/datasets/openai/mrcr',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/zwhe99/DeepMath-103K',
 '/datasets/divaroffical/real_estate_ads',
 '/datasets/openai/graphwalks',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/mic

In [11]:
# Keep only the JSON object in the reply by slicing from the first "{" to the
# last "}". This drops any surrounding code fence without deleting the text
# "json" everywhere, which would also alter URLs that contain it.
a = get_links("https://huggingface.co")
a = a[a.find("{"):a.rfind("}") + 1]

In [12]:
# Parse the reply held in `a`; json.loads raises json.JSONDecodeError if it is not valid JSON.
json.loads(a)

{'links': [{'type': 'homepage', 'url': 'https://huggingface.co/'},
  {'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'brand page', 'url': 'https://huggingface.co/brand'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog', 'url': 'https://huggingface.co/blog'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'documentation', 'url': 'https://huggingface.co/docs'},
  {'type': 'learning resources', 'url': 'https://huggingface.co/learn'},
  {'type': 'github', 'url': 'https://github.com/huggingface'},
  {'type': 'twitter', 'url': 'https://twitter.com/huggingface'},
  {'type': 'linkedin', 'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'status page', 'url': 'https://status.huggingface.co/'}]}

In [13]:
def get_all_details(url):
    """Collect the text of a landing page plus every page the model selects.

    The landing page is fetched, `get_links` asks the model for relevant links,
    and each returned link is fetched and appended under its "type" heading.

    Args:
        url: Landing page URL; also the base for resolving relative links.

    Returns:
        One string: "Landing page:" section followed by a section per link.

    Raises:
        json.JSONDecodeError: if the model reply contains no valid JSON object.
        KeyError: if the JSON lacks "links", or an entry lacks "type"/"url".
    """
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    reply = get_links(url)
    # Slice out the JSON object instead of deleting the text "json" and the
    # fence markers: a blanket delete would also alter URLs containing "json".
    links_json = reply[reply.find("{"):reply.rfind("}") + 1]
    print("Found links:\n", links_json)
    links = json.loads(links_json)
    for link in links['links']:
        result += f"\n\n{link['type']}\n"
        # The model is asked for full URLs but may echo a relative href;
        # urljoin leaves absolute URLs unchanged and resolves relative ones.
        result += Website(urljoin(url, link["url"])).get_contents()
    return result

In [14]:
# End-to-end check of link selection and page scraping (live HTTP requests and one Claude call).
print(get_all_details("https://huggingface.co"))

Found links:
 
{
  "links": [
    {"type": "homepage", "url": "https://huggingface.co/"},
    {"type": "about company", "url": "https://huggingface.co/huggingface"},
    {"type": "products/services", "url": "https://huggingface.co/models"},
    {"type": "products/services", "url": "https://huggingface.co/datasets"},
    {"type": "products/services", "url": "https://huggingface.co/spaces"},
    {"type": "products/services", "url": "https://huggingface.co/enterprise"},
    {"type": "pricing", "url": "https://huggingface.co/pricing"},
    {"type": "documentation", "url": "https://huggingface.co/docs"},
    {"type": "blog", "url": "https://huggingface.co/blog"},
    {"type": "careers", "url": "https://apply.workable.com/huggingface/"},
    {"type": "brand information", "url": "https://huggingface.co/brand"},
    {"type": "social media", "url": "https://github.com/huggingface"},
    {"type": "social media", "url": "https://twitter.com/huggingface"},
    {"type": "social media", "url": "http

In [15]:
# System prompt for brochure generation. Each continued line ends with a space
# so the sentences do not run together when the backslash joins them.
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

In [16]:
def get_brochure_user_prompt(company_name, url):
    """Build the brochure-generation user prompt for a company.

    Args:
        company_name: Name shown to the model.
        url: Landing page whose contents (and selected sub-pages) are included.

    Returns:
        The prompt text, cut to at most 5,000 characters.
    """
    preamble = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    )
    full_prompt = preamble + get_all_details(url)
    # Truncate if more than 5,000 characters
    return full_prompt[:5_000]

In [17]:
# Preview the truncated brochure prompt (live HTTP requests and one Claude call).
print(get_brochure_user_prompt("HuggingFace", "https://huggingface.co"))

Found links:
 
{
  "links": [
    {"type": "homepage", "url": "https://huggingface.co/"},
    {"type": "about page", "url": "https://huggingface.co/huggingface"},
    {"type": "products page", "url": "https://huggingface.co/models"},
    {"type": "products page", "url": "https://huggingface.co/datasets"},
    {"type": "products page", "url": "https://huggingface.co/spaces"},
    {"type": "enterprise page", "url": "https://huggingface.co/enterprise"},
    {"type": "pricing page", "url": "https://huggingface.co/pricing"},
    {"type": "documentation", "url": "https://huggingface.co/docs"},
    {"type": "blog", "url": "https://huggingface.co/blog"},
    {"type": "careers page", "url": "https://apply.workable.com/huggingface/"},
    {"type": "brand page", "url": "https://huggingface.co/brand"},
    {"type": "social media", "url": "https://github.com/huggingface"},
    {"type": "social media", "url": "https://twitter.com/huggingface"},
    {"type": "social media", "url": "https://www.linked

In [18]:
def create_brochure(company_name,url):
    """Stream a brochure from Claude and render it live as Markdown.

    Note: a later cell defines `create_brochure` again; after a top-to-bottom
    run that later definition is the one in effect.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page URL used to gather the source material.

    Returns:
        None. The brochure is shown through an updating display handle.
    """
    streamer = client.messages.create(
        model = MODEL,
        max_tokens = 64000,
        system = system_prompt,
        stream = True,
        messages = [{'role':'user','content':get_brochure_user_prompt(company_name,url)}]
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in streamer:
        # Only text deltas carry brochure text. Other stream events have no
        # `delta.text`, so read it defensively instead of raising AttributeError.
        text = getattr(getattr(chunk, 'delta', None), 'text', None)
        if not text:
            continue
        response += text
        # Clean a copy of the accumulated text so a fence split across chunks is
        # still matched, and strip only the fence markers rather than every
        # occurrence of the word "markdown" in the prose.
        cleaned = response.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [19]:
def create_brochure(company_name, url):
    """Stream a brochure from Claude and render it live as Markdown.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page URL used to gather the source material.

    Returns:
        None. The brochure is shown through an updating display handle.
    """
    streamer = client.messages.create(
        model=MODEL,
        max_tokens=64000,
        system=system_prompt,
        stream=True,
        messages=[{'role':'user','content':get_brochure_user_prompt(company_name, url)}]
    )

    response = ""
    display_handle = display(Markdown(""), display_id=True)

    for chunk in streamer:
        # For RawContentBlockDeltaEvent
        if hasattr(chunk, 'delta') and hasattr(chunk.delta, 'text'):
            new_text = chunk.delta.text or ''
        # For RawContentBlockStartEvent
        elif hasattr(chunk, 'content_block') and hasattr(chunk.content_block, 'text'):
            new_text = chunk.content_block.text or ''
        # Skip other event types like RawMessageStartEvent
        else:
            new_text = ''

        # Update display only if content was added
        if not new_text:
            continue
        response += new_text
        # Strip only the fence markers; deleting every "markdown" substring
        # would also remove the word from the brochure's prose.
        cleaned_response = response.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned_response), display_id=display_handle.display_id)

In [20]:
# Generate and render the brochure (live HTTP requests and two Claude calls: link selection, then the brochure).
create_brochure("HuggingFace", "https://huggingface.co")

Found links:
 
{
    "links": [
        {"type": "homepage", "url": "https://huggingface.co/"},
        {"type": "about company", "url": "https://huggingface.co/huggingface"},
        {"type": "products", "url": "https://huggingface.co/models"},
        {"type": "products", "url": "https://huggingface.co/datasets"},
        {"type": "products", "url": "https://huggingface.co/spaces"},
        {"type": "enterprise", "url": "https://huggingface.co/enterprise"},
        {"type": "pricing", "url": "https://huggingface.co/pricing"},
        {"type": "blog", "url": "https://huggingface.co/blog"},
        {"type": "careers", "url": "https://apply.workable.com/huggingface/"},
        {"type": "brand", "url": "https://huggingface.co/brand"},
        {"type": "documentation", "url": "https://huggingface.co/docs"},
        {"type": "social media", "url": "https://github.com/huggingface"},
        {"type": "social media", "url": "https://twitter.com/huggingface"},
        {"type": "social media", 

# Hugging Face: Building the Future of AI Together

## The AI Community Platform

Hugging Face is the leading collaboration platform where the machine learning community builds, shares, and deploys AI models, datasets, and applications. With over 1 million models, 250,000+ datasets, and 400,000+ applications, we're creating an ecosystem that democratizes artificial intelligence.

![Hugging Face Logo](https://huggingface.co/front/assets/huggingface_logo.svg)

## What We Offer

### For Developers & Researchers
- **Open-Source Tools**: Access our extensive library of ML tools including Transformers, Diffusers, Tokenizers, and more
- **Collaborative Environment**: Host unlimited public models, datasets, and applications
- **Multi-Modal Support**: Work with text, image, video, audio, and 3D models
- **Build Your Portfolio**: Showcase your work and build your ML profile within the community

### For Enterprises
- **Enterprise Solutions**: Secure, scalable infrastructure with access controls
- **Compute Resources**: Deploy on optimized Inference Endpoints
- **GPU Acceleration**: Scale your Spaces applications with GPU support
- **Advanced Security**: Single Sign-On, audit logs, resource groups, and private datasets
- **Priority Support**: Dedicated assistance for your team's needs

## Trusted by Industry Leaders

Over 50,000 organizations rely on Hugging Face, including:
- Meta AI
- Google
- Microsoft
- Amazon
- Intel
- Grammarly
- Writer

## Our Open Source Foundation

We're committed to building the future of ML tooling with our community:

- **Transformers**: State-of-the-art ML for PyTorch, TensorFlow, and JAX (143,000+ stars)
- **Diffusers**: Advanced diffusion models in PyTorch (28,600+ stars)
- **PEFT**: Parameter-efficient fine-tuning for large language models (18,100+ stars)
- **Datasets**: Access & share datasets for any ML tasks (19,900+ stars)
- **Transformers.js**: ML running directly in your browser (13,400+ stars)

## Join Our Community

Whether you're a researcher, developer, enterprise, or AI enthusiast, there's a place for you in our community. Create, discover, and collaborate on machine learning projects better than ever before.

**Start Building the Future of AI Today**

[Sign Up for Free](https://huggingface.co/join) | [Explore Enterprise Solutions](https://huggingface.co/enterprise)

*Pricing starts at $0.60/hour for GPU Compute and $20/user/month for Enterprise features*

---

*For career opportunities, visit our [Jobs page](https://huggingface.co/jobs) and become part of the team shaping the future of AI.*

## Trying Gradio

In [48]:
def stream_llama(company_name, url):
    """Yield the brochure text generated by llama3.2 as it grows.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page URL used to gather the source material.

    Yields:
        The full text received so far, once per streamed chunk.
    """
    chunk_stream = llama_client.chat.completions.create(
        model='llama3.2',
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': get_brochure_user_prompt(company_name, url)},
        ],
        temperature=0,
        stream=True,
    )
    so_far = ""
    for piece in chunk_stream:
        fragment = piece.choices[0].delta.content
        # A chunk may carry no content; treat that as an empty fragment.
        so_far = so_far + (fragment or '')
        yield so_far

In [44]:
def stream_claude(company_name, url):
    """Yield the brochure text generated by Claude as it grows.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page URL used to gather the source material.

    Yields:
        The full text received so far, once per streamed text fragment.
    """
    user_turn = {'role': 'user', 'content': get_brochure_user_prompt(company_name, url)}
    stream_manager = client.messages.stream(
        model=MODEL,
        max_tokens=20000,
        messages=[user_turn],
        temperature=0,
        system=system_prompt,
    )
    so_far = ""
    # The context manager closes the stream when iteration ends or is abandoned.
    with stream_manager as stream:
        for fragment in stream.text_stream:
            so_far = so_far + (fragment or '')
            yield so_far

In [45]:
def stream_deepseek(company_name, url):
    """Yield the brochure text generated by deepseek-chat as it grows.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page URL used to gather the source material.

    Yields:
        The full text received so far, once per streamed chunk.
    """
    chunk_stream = deepseek_client.chat.completions.create(
        model='deepseek-chat',
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': get_brochure_user_prompt(company_name, url)},
        ],
        temperature=0,
        stream=True,
    )
    so_far = ""
    for piece in chunk_stream:
        fragment = piece.choices[0].delta.content
        # A chunk may carry no content; treat that as an empty fragment.
        so_far = so_far + (fragment or '')
        yield so_far

In [46]:
def stream_brochure(company_name, url, model):
    """Stream a brochure from the backend selected by `model`.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page URL used to gather the source material.
        model: One of "Deepseek", "Llama" or "Claude".

    Yields:
        Whatever the selected backend generator yields.

    Raises:
        ValueError: if `model` is not one of the supported names (raised when
            the generator is first advanced).
    """
    # Guard clause: reject unsupported names before picking a backend.
    if model not in ("Deepseek", "Llama", "Claude"):
        raise ValueError("Unknown model")

    if model == "Claude":
        partials = stream_claude(company_name, url)
    elif model == "Llama":
        partials = stream_llama(company_name, url)
    else:
        partials = stream_deepseek(company_name, url)
    yield from partials

In [49]:
# Gradio front end: three inputs map positionally onto
# stream_brochure(company_name, url, model); the streamed text is rendered as Markdown.
view = gr.Interface(
    fn=stream_brochure,
    inputs=[
        gr.Textbox(label="Company name:"),
        gr.Textbox(label="Landing page URL including http:// or https://"),
        gr.Dropdown(["Deepseek", "Llama","Claude"], label="Select model")],
    outputs=[gr.Markdown(label="Brochure:")],
    flagging_mode="never"
)
# NOTE(review): share = True also publishes a temporary public gradio.live URL,
# so anyone with the link can trigger page fetches and paid API calls.
view.launch(share = True)

* Running on local URL:  http://127.0.0.1:7885
* Running on public URL: https://b36e6dac76d9b6429d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Found links:
 
{
    "links": [
        {"type": "homepage", "url": "https://huggingface.co/"},
        {"type": "about page", "url": "https://huggingface.co/huggingface"},
        {"type": "enterprise page", "url": "https://huggingface.co/enterprise"},
        {"type": "pricing page", "url": "https://huggingface.co/pricing"},
        {"type": "blog", "url": "https://huggingface.co/blog"},
        {"type": "docs", "url": "https://huggingface.co/docs"},
        {"type": "careers page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "product - models", "url": "https://huggingface.co/models"},
        {"type": "product - datasets", "url": "https://huggingface.co/datasets"},
        {"type": "product - spaces", "url": "https://huggingface.co/spaces"},
        {"type": "brand page", "url": "https://huggingface.co/brand"},
        {"type": "community forum", "url": "https://discuss.huggingface.co"},
        {"type": "learn page", "url": "https://huggingface.co/learn"}
  