In [2]:
import os
import re
import requests
import socket
from requests.exceptions import ConnectionError
from requests.exceptions import MissingSchema
from requests.exceptions import InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

In [3]:
load_dotenv(override=True)
MODEL ='llama3.2'

In [16]:
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
    (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script","style","img","input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
        

In [17]:
page = Website("https://huggingface.co")

page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/ACE-Step/ACE-Step-v1-3.5B',
 '/Lightricks/LTX-Video',
 '/nari-labs/Dia-1.6B',
 '/lodestones/Chroma',
 '/models',
 '/spaces/smolagents/computer-agent',
 '/spaces/enzostvs/deepsite',
 '/spaces/ByteDance/DreamO',
 '/spaces/ACE-Step/ACE-Step',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/nvidia/Nemotron-CrossThink',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/diffusers',


In [19]:
link_system_prompt =  "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, \
or Careers/Jobs pages.\n"
link_system_prompt += "You should response only in JSON, without text, object as in this example:"
link_system_prompt +="""
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should response only in JSON, without text, object as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [21]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} -"
    user_prompt += "please decide which of these are relevant web links for \
    a brochure about the company, respond with the full https URl in clean JSON format \
    without text json on the beginning of the response. \
    Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

print(get_links_user_prompt(page))

Here is the list of links on the website of https://huggingface.co -please decide which of these are relevant web links for     a brochure about the company, respond with the full https URl in clean JSON format     without text json on the beginning of the response.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/posts
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/nvidia/parakeet-tdt-0.6b-v2
/ACE-Step/ACE-Step-v1-3.5B
/Lightricks/LTX-Video
/nari-labs/Dia-1.6B
/lodestones/Chroma
/models
/spaces/smolagents/computer-agent
/spaces/enzostvs/deepsite
/spaces/ByteDance/DreamO
/spaces/ACE-Step/ACE-Step
/spaces/NihalGazi/FLUX-Pro-Unlimited
/spaces
/datasets/DMindAI/DMind_Benchmark
/datasets/nvidia/OpenCodeReasoning
/datasets/nvidia/OpenMathReasoning
/datasets/nvidia/Nemotron-CrossThink
/datasets/openbmb/Ultra-FineWeb
/datasets
/join
/pricing#endpoints
/pricing#spaces
/pricing
/enterprise
/enterprise
/enterp

In [24]:
def get_links(url):
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        options={"format": "json"}
    )
    result = response['message']['content']
    print(result)
    try:
        content_json = json.loads(result)
        return content_json
    except json.JSONDecodeError:
        print("Odpowiedz nie jest poprawnym JSON")

huggingface = Website("https://huggingface.co")
huggingface.links

get_links("https://huggingface.co")

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"}
    ],
    "datasets": [
        {"type": "Hugging Face Benchmark", "url": "https://datasets.huggingface.co/datasets"},
        {"type": "OpenCodeReasoning", "url": "https://huggingface.co/datasets/nvidia/OpenCodeReasoning"}
    ],
    "blog": {
        "type": "Blog page",
        "url": "https://blog.huggingface.co/"
    },
    "docs": [
        {"type": "Transformers documentation", "url": "https://huggingface.co/docs/transformers"},
        {"type": "Diffusers documentation", "url": "https://huggingface.co/docs/diffusers"}
    ],
    "GitHub": {
        "type": "GitHub repository",
        "url": "https://github.com/huggingface"
    },
    "Twitter": {
        "type": "Twitter handle",
        "url": "https://twitter.com/huggingface"
    },
 

{'links': [{'type': 'About page', 'url': 'https://huggingface.co/'},
  {'type': 'Company page', 'url': 'https://huggingface.co/brand'},
  {'type': 'Careers/Jobs page',
   'url': 'https://apply.workable.com/huggingface/'}],
 'datasets': [{'type': 'Hugging Face Benchmark',
   'url': 'https://datasets.huggingface.co/datasets'},
  {'type': 'OpenCodeReasoning',
   'url': 'https://huggingface.co/datasets/nvidia/OpenCodeReasoning'}],
 'blog': {'type': 'Blog page', 'url': 'https://blog.huggingface.co/'},
 'docs': [{'type': 'Transformers documentation',
   'url': 'https://huggingface.co/docs/transformers'},
  {'type': 'Diffusers documentation',
   'url': 'https://huggingface.co/docs/diffusers'}],
 'GitHub': {'type': 'GitHub repository',
  'url': 'https://github.com/huggingface'},
 'Twitter': {'type': 'Twitter handle',
  'url': 'https://twitter.com/huggingface'},
 'LinkedIn': {'type': 'LinkedIn page',
  'url': 'https://www.linkedin.com/company/huggingface/'}}

In [27]:
def get_all_details(url):
    result = "Landing page:\n"
    result +=Website(url).get_contents()
    links = get_links(url)
    for link in links["links"]:
        try:
            result += f"\n\n{link['type']}\n"
            result += Website(link["url"]).get_contents()
        except socket.gaierror as e:
            print(f"DNS resolution failes: {e}")
        except NameResolutionError as e:
            print(f"Name resolution error: {e}")
        except MaxRetryError as e:
            print(f"Max retries exceeded: {e}")
        except ConnectionError as e:
            print(f"Connection error: {e}")
        except MissingSchema as e:
            print(f"Invalid URL schema: {e}")
        except InvalidSchema as e:
            print(f"Omitted unsupported URL (InvalidSchema): {e}")
    return result

print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"}
    ]
}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
nvidia/parakeet-tdt-0.6b-v2
Updated
13 days ago
•
109k
•
794
ACE-Step/ACE-Step-v1-3.5B
Updated
about 23 hours ago
•
395
Lightricks/LTX-Video
Updated
8 days ago
•
250k
•
1.45k
nari-labs/Dia-1.6B
Updated
about 9 hours ago
•
159k
•
2.11k
lodestones/Chroma
Updated
about 4 hours ago
•
476
Browse 1M+ models
Spaces
Running
on
CPU Upgrade
540
540
Computer Agent
🖥
Run 

In [30]:
system_prompt ="You are an assistant that analyzes the content of several relevent pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Iclude details of company culture, customers and careers/jobs if you have the information."


In [33]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\m"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; \
    use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]
    return user_prompt

get_brochure_user_prompt("HuggingFace", "https://huggingface.co")
    

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company/Brand page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Blog page", "url": "https://discuss.huggingface.co"},
        {"type": "Status page", "url": "https://status.huggingface.co/"},
        {"type": "GitHub repository", "url": "https://github.com/huggingface"},
        {"type": "Twitter handle", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn company page", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
}


'You are looking at a company called: HuggingFace\\mHere are the contents of its landing page and other relevant pages;     use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nnvidia/parakeet-tdt-0.6b-v2\nUpdated\n13 days ago\n•\n109k\n•\n794\nACE-Step/ACE-Step-v1-3.5B\nUpdated\nabout 23 hours ago\n•\n395\nLightricks/LTX-Video\nUpdated\n8 days ago\n•\n250k\n•\n1.45k\nnari-labs/Dia-1.6B\nUpdated\nabout 9 hours ago\n•\n159k\n•\n2.11k\nlodestones/Chroma\nUpdated\nabout 4 hours ago\n•\n476\nBrowse 1M+ models\nSpaces\nRunning\non\nCPU Upgrade\n540\n540\nComputer Age

In [35]:
def create_brochure(company_name, url):
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content" : system_prompt},
            {"role": "user:", "content" : get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response['message']['content']
    display(Markdown(result))

create_brochure("HuggingFace", "https://huggingface.co")

{
    "links": [
        {"type": "About", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Jobs", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Products/Services", "url": "https://huggingface.co/pricing#endpoints"},
        {"type": "Pricing", "url": "https://huggingface.co/pricing"},
        {"type": "Enterprise", "url": "https://huggingface.co/enterprise"},
        {"type": "Products/Datasets", "url": "https://huggingface.co/datasets"},
        {"type": "Products/Speech Models", "url": "https://nvidia.com/en-us/gradients/models/hugging-face.html"},
        {"type": "GitHub Repository", "url": "https://github.com/huggingface"},
        {"type": "Twitter Handle", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn Company Page", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
}



In [42]:
def stream_brochure(company_name, url):
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content" : system_prompt},
            {"role": "user:", "content" : get_brochure_user_prompt(company_name, url)}
        ],
        stream=True
    )
    result =""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        result += chunk['message']['content'] or ''
        result = result.replace("```","").replace("markdown","")
        update_display(Markdown(result),display_id=display_handle.display_id)
        
            

In [43]:
stream_brochure("HuggingFace", "https://huggingface.co")

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Blog page", "url": "https://discuss.huggingface.co/"},
        {"type": "GitHub page", "url": "https://github.com/huggingface"},
        {"type": "Twitter page", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn page", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
}


