In [2]:
!pip install requests beautifulsoup4 trafilatura



In [3]:
import trafilatura

def scrape_clean_text(url):
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        return trafilatura.extract(downloaded, include_comments=False, include_tables=False)
    else:
        return None


In [4]:
url = "https://docs.utilihive.io/utilihive-integration/"
content = scrape_clean_text(url)

if content:
    print(content[:1000])
else:
    print("❌ Failed to fetch or parse.")

Utilihive Integration
Learn about Utilihive
-
See what’s available in the Utilihive platform.
Start Developing
-
Get up and running with the tools to write flow configurations.
-
Jump into the detailed developer guides.
-
Ready to deploy a flow? Try the SDK Deployer.
Explore the Console
-
Monitor your data with Heartbeat.
-
Utility-specific needs? Check out the Utilihive Accelerators like Datalake and Ghostwriter.


In [5]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

def scrape_site(url, max_pages=5):
    visited = set()
    to_visit = [url]
    contents = []

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue

        try:
            response = requests.get(current_url, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
            main_text = trafilatura.extract(response.text)

            if main_text:
                contents.append({
                    "url": current_url,
                    "text": main_text
                })

            visited.add(current_url)

            # Get internal links
            for a in soup.find_all("a", href=True):
                next_url = urljoin(url, a['href'])
                if url in next_url and next_url not in visited and next_url not in to_visit:
                    to_visit.append(next_url)

        except Exception as e:
            print(f"⚠️ Skipped {current_url}: {e}")

    return contents


In [6]:
pages = scrape_site("https://docs.utilihive.io/utilihive-integration/", max_pages=1000)
print(f"Scraped {len(pages)} pages.")
# print(pages[0]['text'][:1000])

Scraped 668 pages.


In [7]:
for page in pages:
    print(page['url'])
    # print(page['text'][:1000])
    print()

https://docs.utilihive.io/utilihive-integration/

https://docs.utilihive.io/utilihive-integration/core-concepts/

https://docs.utilihive.io/utilihive-integration/core-concepts/platform/

https://docs.utilihive.io/utilihive-integration/core-concepts/flows/

https://docs.utilihive.io/utilihive-integration/core-concepts/flow-server/

https://docs.utilihive.io/utilihive-integration/core-concepts/resources/

https://docs.utilihive.io/utilihive-integration/core-concepts/console/

https://docs.utilihive.io/utilihive-integration/getting-started/

https://docs.utilihive.io/utilihive-integration/getting-started/introduction/

https://docs.utilihive.io/utilihive-integration/getting-started/installation/

https://docs.utilihive.io/utilihive-integration/getting-started/advanced-setup/

https://docs.utilihive.io/utilihive-integration/getting-started/kotlin/

https://docs.utilihive.io/utilihive-integration/getting-started/first-flow/

https://docs.utilihive.io/utilihive-integration/getting-started/ex

In [9]:
pages[0]

{'url': 'https://docs.utilihive.io/utilihive-integration/',
 'text': 'Utilihive Integration\nLearn about Utilihive\n-\nSee what’s available in the Utilihive platform.\nStart Developing\n-\nGet up and running with the tools to write flow configurations.\n-\nJump into the detailed developer guides.\n-\nReady to deploy a flow? Try the SDK Deployer.\nExplore the Console\n-\nMonitor your data with Heartbeat.\n-\nUtility-specific needs? Check out the Utilihive Accelerators like Datalake and Ghostwriter.'}