In [29]:
import requests
import json
from bs4 import BeautifulSoup
from IPython.display import Markdown, display

In [4]:
OLLAMA_API = "http://localhost:11434/api/chat"
HEADERS = {"Content-Type": "application/json"}
MODEL = "llama3.2"

In [5]:
system_prompt = "You are an assistant that analyzes the contents of a website \
and provides a short summary, ignoring text that might be navigation related. \
Respond in markdown."

In [6]:
def simple_chat(message, model="llama3.2"):
    """Simple chat completion with Ollama local model"""
    try:
        payload = {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": message}
            ],
            "stream": False,
            "options": {
                "temperature": 0.7,
                "num_predict": 1000  # Ollama uses num_predict instead of max_tokens
            }
        }
        
        response = requests.post(OLLAMA_API, headers=HEADERS, json=payload)
        response.raise_for_status()  # Raises an exception for bad status codes
        
        result = response.json()
        return result["message"]["content"]
        
    except requests.exceptions.RequestException as e:
        return f"Network Error: {str(e)}"
    except KeyError as e:
        return f"Response format error: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"
        

In [7]:
def user_prompt_for(website):
    user_prompt = f"You are looking at a website titled {website.title}"
    user_prompt += "\nThe contents of this website is as follows; \
please provide a short summary of this website in markdown. \
If it includes news or announcements, then summarize these too.\n\n"
    user_prompt += website.text
    return user_prompt

In [8]:
# Some websites need you to use proper headers when fetching them:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [9]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/',
 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-

In [10]:
# summarize function for Ollama 
def summarize(url):
    website = Website(url)
    user_message = user_prompt_for(website)  # Using the user_prompt_for function
     # Using simple_chat function with Ollama
    response = simple_chat(user_message)
    return response

In [11]:
# Test it
result = summarize("https://www.google.com")
display(Markdown(result))

# Google Website Summary

## Overview

The Google website provides various services and features, including:

* **Gmail**: a popular email service
* **Images**: a search engine for images
* **Advertising**, **Business**, and **How Search works**: sections detailing Google's advertising policies, business offerings, and how its search algorithm works

## News/Announcements

Unfortunately, there are no news or announcements on this website. It appears to be a collection of services and features rather than a platform for sharing updates.

## Language Options

The website offers translations in **اردو**, **پښتو**, and **سنڌي** (Urdu, Pashto, and Sindhi), catering to users from various regions.

## Other Notable Features

* The website has **Settings** and **Privacy** sections, allowing users to manage their account settings and data protection.
* Users can also access their **Search history**, **Send feedback**, and **Dark theme** preferences.

In [12]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [13]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [14]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)

    return user_prompt

In [15]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddo

In [16]:
def get_links(url):
    website = Website(url)
    # Make request to local Ollama instead of OpenAI
    response = requests.post('http://localhost:11434/api/generate',
                           json={
                               'model': 'llama3.2',  # or whatever model you have
                               'prompt': f"""System: You are a helpful assistant that extracts links from website content.

User: Please extract all links from this website content: {get_links_user_prompt(website)}

Return as JSON format: {{"links": [...]}}""",
                               'stream': False,
                               'format': 'json'
                           })
    
    result = response.json()
    try:
        parsed_result = json.loads(result['response'])
        return parsed_result
    except json.JSONDecodeError:
        print("Error parsing JSON:", result['response'])
        return {"links": []}  # Return empty structure on error

In [17]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nanonets/Nanonets-OCR-s',
 '/mistralai/Magistral-Small-2506',
 '/echo840/MonkeyOCR',
 '/tencent/Hunyuan3D-2.1',
 '/Menlo/Jan-nano',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/aisheets/sheets',
 '/spaces/ilcve21/Sparc3D',
 '/spaces/ResembleAI/Chatterbox',
 '/spaces/multimodalart/wan2-1-fast',
 '/spaces',
 '/datasets/nvidia/Nemotron-Personas',
 '/datasets/institutional/institutional-books-1.0',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/open-thoughts/OpenThoughts3-1.2M',
 '/datasets/miriad/miriad-5.8M',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/diffusers',
 '/

In [18]:
get_links("https://huggingface.co")

{'links': ['https://endpoints.huggingface.co',
  'https://discuss.huggingface.co',
  'https://status.huggingface.co/',
  'https://github.com/huggingface',
  'https://twitter.com/huggingface',
  'https://www.linkedin.com/company/huggingface/',
  'https://join.discord.com',
  'https://www.zhihu.com/org/huggingface']}

In [21]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\nLink: {link}\n"
        result += Website(link).get_contents()
    return result

In [22]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': ['https://huggingface.co/docs/transformers', 'https://huggingface.co/docs/diffusers', 'https://huggingface.co/docs/safetensors', 'https://huggingface.co/docs/huggingface_hub', 'https://huggingface.co/docs/tokenizers', 'https://huggingface.co/docs/trl', 'https://huggingface.co/models', 'https://huggingface.co/datasets', 'https://huggingface.co/spaces', 'https://huggingface.co/pricing', 'https://huggingface.co/enterprise', 'https://huggingface.co/allenai', 'https://huggingface.co/facebook', 'https://huggingface.co/amazon', 'https://huggingface.co/google', 'https://huggingface.co/Intel', 'https://huggingface.co/microsoft', 'https://huggingface.co/grammarly', 'https://huggingface.co/Writer']}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collab

In [23]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."|

In [24]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [25]:
def create_brochure(company_name, url):
    response = requests.post('http://localhost:11434/api/generate',
                       json={
                           'model': 'llama3.2',  # or whatever model you have
                           'prompt': f"System: {system_prompt}\n\nUser: {get_brochure_user_prompt(company_name, url)}",
                           'stream': False
                       })
    result = response.json()['response']
    display(Markdown(result))

In [26]:
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': ['https://huggingface.co/models', 'https://huggingface.co/datasets', 'https://huggingface.co/spaces', 'https://huggingface.co/docs', 'https://huggingface.co/enterprise', 'https://huggingface.co/pricing', 'https://huggingface.co/allenai', 'https://huggingface.co/facebook', 'https://huggingface.co/amazon', 'https://huggingface.co/google', 'https://huggingface.co/Intel', 'https://huggingface.co/microsoft', 'https://huggingface.co/grammarly', 'https://huggingface.co/Writer', 'https://huggingface.co/docs/transformers', 'https://huggingface.co/docs/diffusers', 'https://huggingface.co/docs/safetensors', 'https://huggingface.co/docs/huggingface_hub', 'https://huggingface.co/docs/tokenizers', 'https://huggingface.co/docs/trl', 'https://huggingface.co/models', 'https://huggingface.co/datasets', 'https://huggingface.co/spaces', 'https://huggingface.co/changelog']}


# Hugging Face Brochure
======================

Welcome to Hugging Face, the AI community building the future.

## About Us
----------------

Hugging Face is a collaboration platform for machine learning that enables creators to host and collaborate on unlimited public models, datasets, and applications. Our mission is to make AI accessible to everyone.

## Community
-------------

Our community of over 50,000 organizations uses Hugging Face to build and deploy AI models. We're proud to be used by top companies like Meta, Google, Amazon, Intel, Microsoft, and Grammarly.

## Products
------------

* **Models**: Browse 1M+ pre-trained models for text, image, video, audio, or 3D applications.
* **Datasets**: Access 250k+ public datasets for any ML task.
* **Spaces**: Collaborate on unlimited public models, datasets, and applications.

## Culture
---------

Our culture is built around open-source values, community engagement, and innovation. We believe in making AI accessible to everyone and building a platform that fosters collaboration and creativity.

### Key Values

* **Open-source**: We're committed to open-source software and data.
* **Community-driven**: Our platform is built by and for the machine learning community.
* **Innovation**: We strive to innovate and push the boundaries of what's possible with AI.

## Careers
---------

Ready to join our team? Check out our [job openings](https://huggingface.co/jobs).

### Why Work at Hugging Face?

* **Collaborative environment**: Join a community of passionate machine learning enthusiasts.
* **Innovative work**: Be part of building the future of AI and making a real impact.
* **Professional development**: Grow your skills and expertise in the latest ML trends.

## Resources
------------

* [Documentation](https://huggingface.co/docs)
* [Blog](https://huggingface.co/blog)
* [GitHub](https://github.com/huggingface)

Join us on our mission to build a better future with AI.

In [32]:
#Finally - a minor improvement
#With a small adjustment, we can change this so that the results stream back from OpenAI, with the familiar typewriter animation
def stream_brochure(company_name, url):
    r = requests.post('http://localhost:11434/api/generate',
                     json={
                         'model': 'llama3.2',
                         'prompt': f"System: {system_prompt}\n\nUser: {get_brochure_user_prompt(company_name, url)}",
                         'stream': True
                     }, stream=True)
    
    result = ""
    display_handle = display(Markdown(""), display_id=True)
    
    for line in r.iter_lines():
        if line:
            chunk = json.loads(line)
            if 'response' in chunk:
                result += chunk['response']
                display_handle.update(Markdown(result.replace("```", "").replace("markdown", "")))
            if chunk.get('done'):
                break
    
    return result

In [33]:
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': ['https://huggingface.co/models', 'https://huggingface.co/datasets', 'https://huggingface.co/spaces', 'https://huggingface.co/docs', 'https://huggingface.co/enterprise', 'https://huggingface.co/pricing', 'https://huggingface.co/allenai', 'https://huggingface.co/facebook', 'https://huggingface.co/amazon', 'https://huggingface.co/google', 'https://huggingface.co/Intel', 'https://huggingface.co/microsoft', 'https://huggingface.co/grammarly', 'https://huggingface.co/Writer', 'https://huggingface.co/docs/transformers', 'https://huggingface.co/docs/diffusers', 'https://huggingface.co/docs/safetensors', 'https://huggingface.co/docs/huggingface_hub', 'https://huggingface.co/docs/tokenizers', 'https://huggingface.co/docs/trl', 'https://huggingface.co/models', 'https://huggingface.co/datasets', 'https://huggingface.co/spaces', 'https://huggingface.co/changelog', 'https://endpoints.huggingface.co', 'https://discuss.huggingface.co', 'https://status.huggingface.co', 'https://

# Hugging Face Brochure

## About Us

Hugging Face is a leading open-source platform for natural language processing (NLP) and machine learning (ML). Our mission is to build the future of AI by creating a collaborative community that explores, develops, and deploys cutting-edge ML technologies.

## Our Culture

At Hugging Face, we value collaboration, innovation, and diversity. Our culture is centered around the idea of "Open Research" - where everyone can contribute, share, and learn from each other's work. We believe in fostering a community-driven approach to AI development, ensuring that our platform remains accessible, transparent, and inclusive.

## Customers

We serve a wide range of customers across various industries, including:

*   **Companies**: Meta, Amazon, Google, Intel, Microsoft, Grammarly, Writer
*   **Non-Profits**: AI2
*   **Government Agencies**: (Not specified)
*   **Academia**: (Not specified)

## Careers

If you're passionate about AI and want to contribute to a growing community of researchers and developers, we invite you to explore our job opportunities:

*   [Jobs](https://huggingface.co/jobs)
*   [Internships](https://huggingface.co/internships)

We also offer various resources for learning and development, including:

*   **Documentation**: <https://huggingface.co/docs>
*   **Tutorials**: <https://huggingface.co/tutorials>
*   **Blog**: <https://huggingface.co/blog>

## Products

Our flagship products include:

*   **Hugging Face Hub**: A platform for hosting, sharing, and collaborating on ML models, datasets, and applications.
*   **Transformers**: State-of-the-art ML library for PyTorch, TensorFlow, JAX, and more.
*   **Diffusers**: State-of-the-art diffusion models in PyTorch.

## Pricing

We offer a range of pricing plans to suit your needs:

*   **Compute**: Starting at $0.000001 per batch
*   **Inference Endpoints**: Starting at $0.0001 per request

'# Hugging Face Brochure\n\n## About Us\n\nHugging Face is a leading open-source platform for natural language processing (NLP) and machine learning (ML). Our mission is to build the future of AI by creating a collaborative community that explores, develops, and deploys cutting-edge ML technologies.\n\n## Our Culture\n\nAt Hugging Face, we value collaboration, innovation, and diversity. Our culture is centered around the idea of "Open Research" - where everyone can contribute, share, and learn from each other\'s work. We believe in fostering a community-driven approach to AI development, ensuring that our platform remains accessible, transparent, and inclusive.\n\n## Customers\n\nWe serve a wide range of customers across various industries, including:\n\n*   **Companies**: Meta, Amazon, Google, Intel, Microsoft, Grammarly, Writer\n*   **Non-Profits**: AI2\n*   **Government Agencies**: (Not specified)\n*   **Academia**: (Not specified)\n\n## Careers\n\nIf you\'re passionate about AI and