In [1]:
import asyncio
import logging
import urllib.parse
from typing import List
from urllib.parse import unquote

import aiohttp
from bs4 import BeautifulSoup
from haystack.components.converters import PyPDFToDocument
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack.dataclasses import ByteStream

In [2]:
def scrap_urls(url_list: List[str]) -> List[ByteStream]:
    """Fetch every URL in ``url_list`` with the notebook-level ``fetcher``.

    Args:
        url_list: URLs handed to ``fetcher.run``.

    Returns:
        The value stored under the ``"streams"`` key of the fetcher's result.
    """
    return fetcher.run(urls=url_list)["streams"]
    
async def fetch(url: str) -> bytes:
    """Download the raw body of ``url`` (used in this notebook for the sitemap page).

    Args:
        url: Address requested with HTTP GET.

    Returns:
        The response body as bytes. When the request fails (connection error,
        error status raised by ``raise_for_status``, or timeout) the failure
        is logged and ``b""`` is returned, so the result is always ``bytes``.
    """
    # The notebook never defined a ``logger``; create one here so the error
    # path logs instead of raising NameError.
    logger = logging.getLogger(__name__)
    # Explicit timeout object instead of a bare number of seconds.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                response.raise_for_status()
                return await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # Timeouts surface as asyncio.TimeoutError, which is not a ClientError,
        # so it has to be caught explicitly.
        logger.error("Failed to fetch sitemap: %s", e)
        return b""

def get_soup(response, parser: str = "html.parser"):
    """Parse ``response`` into a BeautifulSoup tree.

    Args:
        response: Markup passed to ``BeautifulSoup``.
        parser: Value forwarded as the ``features`` argument.

    Returns:
        The constructed ``BeautifulSoup`` object.
    """
    soup = BeautifulSoup(response, features=parser)
    return soup
    
def contains_memento_url(tag):
    """Tell whether ``tag`` is an ``a`` element whose href contains "Merkblätter/".

    The href is percent-decoded before the check, so an encoded
    "Merkbl%C3%A4tter/" matches as well. Tags that are not ``a`` or that have
    no ``href`` attribute yield ``False``.
    """
    is_link_with_href = tag.name == "a" and "href" in tag.attrs
    if not is_link_with_href:
        return False
    return "Merkblätter/" in unquote(tag["href"])

def remove_duplicate_links(links):
    """Keep only the first tag seen for each distinct ``href`` value.

    Args:
        links: Iterable of items that support ``item['href']``.

    Returns:
        A list of the retained items, in order of first appearance.
    """
    # dicts preserve insertion order, so the first item per href wins and the
    # original ordering is kept.
    first_by_href = {}
    for link in links:
        first_by_href.setdefault(link['href'], link)
    return list(first_by_href.values())
    
def extract_urls_from_html(html: bytes) -> List[str]:
    """Return the distinct memento hrefs found in ``html``.

    The markup is parsed with "html.parser", every tag accepted by
    ``contains_memento_url`` is collected, duplicates (by href) are dropped
    with ``remove_duplicate_links``, and the remaining href values are returned.
    """
    memento_anchors = get_soup(html, "html.parser").find_all(contains_memento_url)
    unique_anchors = remove_duplicate_links(memento_anchors)
    return [anchor["href"] for anchor in unique_anchors]

def get_pdf_paths(soup):
    """Collect hrefs of ``a`` tags with class "co-document-content" that contain "/p/".

    Args:
        soup: Object whose ``find_all`` is queried for the anchors.

    Returns:
        List of matching href values in document order.
    """
    pdf_paths = []
    for anchor in soup.find_all("a", {"class": "co-document-content"}):
        href = anchor["href"]
        # Anchors whose href lacks "/p/" are skipped.
        if "/p/" in href:
            pdf_paths.append(href)
    return pdf_paths

In [3]:
# Haystack components shared by the cells below.
fetcher = LinkContentFetcher()  # used by scrap_urls() to download URLs
converter = PyPDFToDocument()  # turns the fetched PDF streams into documents
cleaner = DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_repeated_substrings=False,
)
splitter = DocumentSplitter(split_by="passage", split_length=1, split_overlap=0)

# Scrape the https://ahv-iv.ch memento sitemap

In [4]:
sitemap_url = "https://www.ahv-iv.ch/de/Sitemap-DE"

sitemap = await fetch(sitemap_url)

url_list = extract_urls_from_html(sitemap)

response = scrap_urls(url_list)

soups = []
for res in response:
    soups.append(get_soup(res.data))

pdf_paths = []
for soup in soups:
    pdf_paths.extend(get_pdf_paths(soup))

# Get PDF paths

In [5]:
# Prefix each collected PDF path with the site origin to get absolute URLs.
base_url = "https://ahv-iv.ch"
pdf_urls = [base_url + path for path in pdf_paths]
pdf_urls[:5]

['https://ahv-iv.ch/p/1.01.d',
 'https://ahv-iv.ch/p/1.02.d',
 'https://ahv-iv.ch/p/1.03.d',
 'https://ahv-iv.ch/p/1.04.d',
 'https://ahv-iv.ch/p/1.05.d']

In [6]:
# Add the ".f" and ".i" variants of every ".d" URL (presumably the French and
# Italian editions of each memento — confirm against the site).
#
# Both variants are derived from the ".d" URLs only. The previous second
# extend() iterated over a list that already contained the ".f" URLs, and since
# replace(".d", ".i") leaves those untouched, every ".f" URL was appended twice.
# Only the trailing suffix is swapped, and the final order-preserving
# de-duplication makes the cell safe to re-run.
source_pdf_urls = [pdf_url for pdf_url in pdf_urls if pdf_url.endswith(".d")]
variant_pdf_urls = [
    pdf_url[: -len(".d")] + suffix
    for suffix in (".f", ".i")
    for pdf_url in source_pdf_urls
]
pdf_urls = list(dict.fromkeys(pdf_urls + variant_pdf_urls))
pdf_urls

['https://ahv-iv.ch/p/1.01.d',
 'https://ahv-iv.ch/p/1.02.d',
 'https://ahv-iv.ch/p/1.03.d',
 'https://ahv-iv.ch/p/1.04.d',
 'https://ahv-iv.ch/p/1.05.d',
 'https://ahv-iv.ch/p/1.07.d',
 'https://ahv-iv.ch/p/2.01.d',
 'https://ahv-iv.ch/p/2.02.d',
 'https://ahv-iv.ch/p/2.03.d',
 'https://ahv-iv.ch/p/2.04.d',
 'https://ahv-iv.ch/p/2.05.d',
 'https://ahv-iv.ch/p/2.06.d',
 'https://ahv-iv.ch/p/2.07.d',
 'https://ahv-iv.ch/p/2.08.d',
 'https://ahv-iv.ch/p/2.09.d',
 'https://ahv-iv.ch/p/2.10.d',
 'https://ahv-iv.ch/p/2.11.d',
 'https://ahv-iv.ch/p/2.12.d',
 'https://ahv-iv.ch/p/31.d',
 'https://ahv-iv.ch/p/3.01.d',
 'https://ahv-iv.ch/p/3.02.d',
 'https://ahv-iv.ch/p/3.03.d',
 'https://ahv-iv.ch/p/3.04.d',
 'https://ahv-iv.ch/p/3.05.d',
 'https://ahv-iv.ch/p/3.06.d',
 'https://ahv-iv.ch/p/3.07.d',
 'https://ahv-iv.ch/p/3.08.d',
 'https://ahv-iv.ch/p/4.01.d',
 'https://ahv-iv.ch/p/4.02.d',
 'https://ahv-iv.ch/p/4.03.d',
 'https://ahv-iv.ch/p/4.04.d',
 'https://ahv-iv.ch/p/4.05.d',
 'https://

In [7]:
# Download only the first five PDF URLs; `response` is rebound to the fetched streams.
response = scrap_urls(pdf_urls[:5])

# Scrape PDFs

In [8]:
# Convert the fetched PDF streams with the PyPDFToDocument component.
docs = converter.run(sources=response)

In [9]:
# Display the converted documents held under the "documents" key.
docs["documents"]

[Document(id=6910366e25bf9cc83d320ecc6edf5e3ab575067ff0c46abd7d98b53329229ea2, content: '1.01 Allgemeines
 Auszug aus dem  
 Stand am 1. Januar 2015Individuellen Konto (IK)
 2Auf einen Blick
 D...', meta: {'content_type': 'application/pdf', 'url': 'https://ahv-iv.ch/p/1.01.d'}),
 Document(id=30c414533cf702df3517a8141ab0bad4ec8254c1b5135b34808e2c7538f045e8, content: '1.02 Allgemeines
 Splitting bei Scheidung
 Stand am 1. Januar 2024
 2Auf einen Blick
 Splitting ist die ...', meta: {'content_type': 'application/pdf', 'url': 'https://ahv-iv.ch/p/1.02.d'}),
 Document(id=3b592abac551ea224beec93069b927cdf82c825e5b185c6039ec62dfd2a090b1, content: '1.03 Allgemeines
 Betreuungsgutschriften
 Stand am 1. Januar 2021
 2Auf einen Blick
 Die gesetzlichen Be...', meta: {'content_type': 'application/pdf', 'url': 'https://ahv-iv.ch/p/1.03.d'}),
 Document(id=7b7c616c6d019f0f0f4e54f5041adb97c52fb31b18312f3f6675be5dd14d96f8, content: 'Seiten  
 2-6Erläuterungen zum  
 Auszug aus dem  
 Individuellen Konto

# Clean document

In [11]:
# Run the DocumentCleaner over the current documents; `docs` is rebound to the cleaner's result.
docs = cleaner.run(documents=docs["documents"])

# Split document

In [13]:
# Run the DocumentSplitter over the current documents; `docs` is rebound to the splitter's result.
docs = splitter.run(documents=docs["documents"])