In [23]:
# imports
import os
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

In [24]:
# constants
MODEL = "llama3.2:latest"

In [25]:
exclude = [
    "join.discord.com"
]

In [26]:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    "A utility class to represent a Website that we have scraped, now with links"

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator='\n', strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link not in exclude]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\n\nWebpage Contents:\n{self.text}"

In [27]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [28]:
# first step: have llm figure out which links are relevant
link_system_prompt = "You are provider with a list of links found on a webpage. You are able to decide which of the links would be more relevant to include in a brochure about the company, such as links to an About page, or a Company page, or a Company page, or Careers/Jobs pages. \n"
link_system_prompt += "You should respond in JSON as in the example:"
link_system_prompt += """
{
  "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://full.url/goes/here/careers"}
  ]
}
"""

In [29]:
print(link_system_prompt)

You are provider with a list of links found on a webpage. You are able to decide which of the links would be more relevant to include in a brochure about the company, such as links to an About page, or a Company page, or a Company page, or Careers/Jobs pages. 
You should respond in JSON as in the example:
{
  "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://full.url/goes/here/careers"}
  ]
}



In [30]:
def get_links_user_prompt(website):
    user_prompt= f"Here is the list on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [31]:
print(get_links_user_prompt(ed))

Here is the list on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2024/12/21/llm-resou

In [32]:
def get_links(url):
    website = Website(url)
    response = ollama.chat(
        model=MODEL, messages=[{"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(website)}
        ], format="json")
    result = response.message.content
    return json.loads(result)

In [33]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/Qwen/QwQ-32B',
 '/sesame/csm-1b',
 '/google/gemma-3-27b-it',
 '/deepseek-ai/DeepSeek-R1',
 '/SparkAudio/Spark-TTS-0.5B',
 '/models',
 '/spaces/sesame/csm-1b',
 '/spaces/Wan-AI/Wan2.1',
 '/spaces/Qwen/QwQ-32B-Demo',
 '/spaces/ASLP-lab/DiffRhythm',
 '/spaces/nanotron/ultrascale-playbook',
 '/spaces',
 '/datasets/FreedomIntelligence/medical-o1-reasoning-SFT',
 '/datasets/facebook/natural_reasoning',
 '/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k',
 '/datasets/open-r1/codeforces-cots',
 '/datasets/gaia-benchmark/GAIA',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs

In [34]:
get_links(huggingface.url)

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/'},
  {'type': 'company overview', 'url': 'https://huggingface.co/'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}]}

## Second step: make the brohcure!

In [35]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [36]:
print(get_all_details(huggingface.url))

Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co/'}, {'type': 'Company page', 'url': 'https://ui.endpoints.huggingface.co/chat'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'Blog page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.

Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
Qwen/QwQ-32B
Updated
3 days ago
•
296k
•
2.18k
sesame/csm-1b
Updated
1 day ago
•
595
google/gemma-3-27b-it
Updated
2 days ago
•
85.2k
•
562
deepseek-ai/DeepSeek-R1
Updated
19 days ago
•
2.52M
•
11.3k
SparkAudio/Spark-TTS-0.5B
Updated
8 days 

In [37]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from company websites and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. Include details of the company culture, customers and carrers/jobs if you have the information."

In [38]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relavant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt += user_prompt[:5_000] # limit to 5000 characters
    return user_prompt

In [39]:
get_brochure_user_prompt("HuggingFace", huggingface.url)

Found links: {'links': [{'type': 'About page', 'url': 'https://huggingface.co/'}, {'type': 'Company page', 'url': 'https://ui.endpoints.huggingface.co'}, {'type': 'Blog', 'url': 'https://discuss.huggingface.co'}, {'type': 'FAQs/Support', 'url': 'https://status.huggingface.co/'}, {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'}]}


"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relavant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\n\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nQwen/QwQ-32B\nUpdated\n3 days ago\n•\n296k\n•\n2.18k\nsesame/csm-1b\nUpdated\n1 day ago\n•\n595\ngoogle/gemma-3-27b-it\nUpdated\n2 days ago\n•\n85.2k\n•\n562\ndeepseek-ai/DeepSeek-R1\nUpdated\n19 days ago\n•\n2.52M\n•\n11.3k\nSparkAudio/Spark-TTS-0.5B\nUpdated\n8 days ago\n•\n8.91k\n•\n401\nBrowse 1M+ models\nSpaces\nRunning\non\nZero\n242\n242\nSesame CSM\n🌱\nConversational speech generatio

In [40]:
def create_brochure(company_name, url):
    response = ollama.chat(model=MODEL, messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
    ])
    result = response.message.content
    return result

In [41]:
create_brochure("HuggingFace", huggingface.url)

Found links: {'links': [{'type': 'home page', 'url': 'https://huggingface.co'}, {'type': 'About page', 'url': 'https://huggingface.co/brand'}, {'type': 'FAQs/Support', 'url': 'https://huggingface.co/learn'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'About page', 'url': 'https://discuss.huggingface.co/'}]}


"Here's a written version of the webpage:\n\n**Hugging Face – The AI community building the future**\n\nWelcome to Hugging Face, the platform where the machine learning community collaborates on models, datasets, and applications. We provide tools for researchers and developers to build, share, and deploy AI models.\n\n**Explore 1M+ Models**\n\nBrowse our extensive collection of over 1 million pre-trained models, available for various tasks such as:\n\n* Natural Language Processing (NLP)\n* Computer Vision\n* Speech Recognition\n* Audio Processing\n\nDiscover new models, update existing ones, or create your own using our easy-to-use interfaces.\n\n**Trending on this week**\n\nCheck out the latest models and trends in AI research:\n\n* Qwen/QwQ-32B: A state-of-the-art language model for text generation\n* sesame/csm-1b: A conversational speech generation model\n* deepseek-ai/DeepSeek-R1: An advanced large-scale video generative model\n\n**Build your portfolio**\n\nShare your work with t

## Finally a minor improvement

In [53]:
def stream_brochure(company_name, url):
    stream = ollama.chat(model=MODEL, messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
    ], stream=True)
    response = ""

    display_handle = display(Markdown(""), display_id=True)
    for message in stream:
        response += message.message.content
        response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)
    display(Markdown(create_brochure(company_name, url)))

In [56]:
stream_brochure("ED", ed.url)

Found links: {'links': [{'type': 'company page', 'url': 'https://edwarddonner.com/'}, {'type': 'about page', 'url': 'https://edwarddonner.com/about-me-and-about-nebula/'}, {'type': 'blog posts page', 'url': 'https://edwarddonner.com/posts/'}]}


Here is a draft of the company page:

**Home - Edward Donner**

Welcome to Edward Donner's website!

I'm Ed, co-founder and CTO of Nebula.io, an AI-powered platform that helps recruiters source, understand, engage, and manage talent. Our mission is to help people discover their potential and pursue their reason for being.

**About Me**

As a passionate advocate for the power of AI, I've had the privilege of working in this field for over two decades. With a background in software engineering, I founded my first AI startup, untapt, in 2013. We built talent marketplaces and data science software for recruitment firms, specializing in tech roles in financial services.

After selling untapt in 2021, I joined forces with like-minded individuals to create Nebula.io. Our patented matching model has been hailed as a game-changer in the recruitment industry, allowing recruiters to match people with roles without relying on keywords.

**Our Mission**

At Nebula.io, we're committed to helping people find their dream jobs and pursue their passion. We believe that work should be fulfilling and successful, leading to higher levels of human prosperity. With 77% of people feeling uninspired or unengaged at work, it's time for a change.

**Our Technology**

Our cutting-edge AI technology uses Generative Adversarial Networks (GANs) and other machine learning algorithms to match people with roles that are tailored to their unique strengths and interests. No more relying on keywords or generic job descriptions – our model does the hard work for you.

**Get in Touch**

Want to learn more about how Nebula.io can help your organization? Or perhaps you're looking for advice on how to find your dream job? Reach out to me directly at [ed@edwarddonner.com](mailto:ed@edwarddonner.com) or follow me on social media:

* LinkedIn: linkedin.com/in/edwarddonner
* Twitter: twitter.com/edwarddonner
* Facebook: facebook.com/edwarddonner

**Stay Up-to-Date**

Subscribe to my newsletter for the latest updates on AI, recruitment, and personal development.

Type your email address below:

[subscribe form]

Found links: {'links': [{'type': 'company page', 'url': 'https://edwarddonner.com/'}, {'type': 'about page', 'url': 'https://edwarddonner.com/about-me-and-about-nebula/'}, {'type': 'blog posts', 'url': 'https://edwarddonner.com/posts/'}, {'type': 'LinkedIn profile', 'url': 'https://www.linkedin.com/in/eddonner/'}, {'type': 'Twitter profile', 'url': 'https://twitter.com/edwarddonner'}]}


Here is a brief summary of Edward Donner's background and work:

**About**

Edward Donner is the co-founder and CTO of Nebula.io, an AI startup that helps recruiters source, understand, engage, and manage talent. He is also a former founder and CEO of AI startup untapt, which was acquired in 2021.

**Work Experience**

* Co-Founder and CTO, Nebula.io: Applying AI to help people discover their potential and pursue their reason for being.
* Founder and CEO, untapt (2013): Built talent marketplaces and data science software for recruitment firms, specializing in tech roles in financial services.
* Previous experience as a software engineer and AI data scientist.

**Interests**

* DJing
* Amateur electronic music production
* Hacker News

**Contact Information**

* Email: ed[at]edwarddonner[dot]com
* Website: www.edwarddonner.com
* LinkedIn
* Twitter
* Facebook