# Notes

- You need to run `docker-compose up` to initialize the db

In [None]:
import os
import sys
from dotenv import load_dotenv
load_dotenv()

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from config.base_config import rag_config
from rag.rag_processor import processor
from rag.rag_processor import llm_client
from rag.models import RAGRequest

from indexing.pipelines.ahv import ahv_indexer
from database.service import document_service
from schemas.document import DocumentCreate

import tiktoken
import pandas as pd
import matplotlib.pyplot as plt
import tqdm

### Define utility functions

In [2]:
# Postgres connection settings taken from the environment (None when a variable is unset).
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_PORT = os.getenv("POSTGRES_PORT")
POSTGRES_DB = os.getenv("POSTGRES_DB")

In [3]:
def get_db():
    """
    Opens a new SQLAlchemy session on the Postgres instance at localhost.

    The connection URL is assembled from the module-level POSTGRES_* settings.
    Every call creates its own engine and session factory; the session is
    returned open and is not closed here.

    Returns:
    - A session bound to the newly created engine.
    """
    connection_url = (
        f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}"
        f"@localhost:{POSTGRES_PORT}/{POSTGRES_DB}"
    )
    session_factory = sessionmaker(
        bind=create_engine(connection_url),
        autoflush=False,
        autocommit=False,
    )
    return session_factory()

### Setup config

In [4]:
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [None]:
rag_config

### Connect to db

In [6]:
db = get_db()

# Scraping/Indexing

### GOAL: CHUNK PDFS BY SECTION
    - FAMILIENZULAGEN
        - ANSPRUCH, UNTERSTELLUNG, etc.
    - BEITRÄGE:
        - ...

If that doesn't work, try recursive summarization. Open question: are the sections mutually exclusive?

The sections of each PDF need to be identified by hand (manual task).

In [None]:
from indexing.scraper import scraper
from indexing.pipelines.ahv import AHVParser
from bs4 import BeautifulSoup

from io import BytesIO
from pdfminer.high_level import extract_pages
from pdfminer.layout import (
    LTTextContainer,
    LTChar,
    LTTextLine,
    LTTextLineHorizontal,
)
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import resolve1
import json
from itertools import groupby

def extract_urls(pdf_bytes):
    """
    Extracts URLs from a PDF byte stream in the order they appear.

    Only link annotations whose action is of type URI are considered.
    Annotations with a missing subtype, action or URI are skipped instead of
    raising.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.

    Returns:
    - urls: List of URL strings, page by page, in annotation order.
    """
    urls = []

    parser = PDFParser(BytesIO(pdf_bytes))
    document = PDFDocument(parser)

    # Iterate through pages to extract annotations
    for page in PDFPage.create_pages(document):
        # resolve1 follows indirect references and passes None / direct objects through
        annotations = resolve1(page.annots)
        if not annotations:
            continue
        for annotation in annotations:
            annotation_data = resolve1(annotation)
            if not isinstance(annotation_data, dict):
                continue
            # Check if the annotation is a link
            subtype = resolve1(annotation_data.get('Subtype'))
            if subtype is None or subtype.name != 'Link':
                continue
            # The action can be an inline dictionary as well as an indirect
            # reference, so resolve it with resolve1 rather than .resolve()
            action = resolve1(annotation_data.get('A'))
            if not isinstance(action, dict):
                continue
            action_type = resolve1(action.get('S'))
            if action_type is None or action_type.name != 'URI':
                continue
            uri = resolve1(action.get('URI'))
            if not uri:
                continue
            urls.append(uri.decode('utf-8') if isinstance(uri, bytes) else uri)
    return urls

def process_paragraph(chars, url_list, url_index):
    """
    Processes a paragraph, replacing italic text with markdown links.

    Consecutive characters are grouped by whether their font name contains
    'Italic'. Each non-empty italic run is paired with the next unused URL.
    Runs that contain only whitespace are dropped and never consume a URL, so
    a stray italic space cannot shift the text/URL pairing.

    Parameters:
    - chars: List of LTChar objects in the paragraph.
    - url_list: List of URLs extracted from the PDF.
    - url_index: Current index in the URL list.

    Returns:
    - A dictionary with updated text and url_index.
    """
    new_paragraph_parts = []

    # Group chars by their style (italic or regular)
    for is_italic, group in groupby(chars, key=lambda c: 'Italic' in c.fontname):
        text = ''.join(c.get_text() for c in group)
        text = ' '.join(text.split())  # Remove extra spaces
        if not text:
            # Whitespace-only run: nothing to emit and nothing to link
            continue
        if is_italic and url_index < len(url_list):
            # Replace italic text with markdown link
            new_paragraph_parts.append(f"[{text}]({url_list[url_index]})")
            url_index += 1
        else:
            # Regular text, or italic text with no URL left: keep as is
            new_paragraph_parts.append(text)

    new_paragraph = ' '.join(new_paragraph_parts)
    # Remove any extra spaces in the final paragraph
    new_paragraph = ' '.join(new_paragraph.split())

    return {'text': new_paragraph, 'url_index': url_index}

def clean_subsections(extraction):
    """
    Cleans up subsection keys and joins paragraph lists into strings.

    Leading digits and spaces are stripped from every subsection key; keys
    that end up empty are dropped together with their content. When two keys
    reduce to the same title, their texts are concatenated in order instead of
    the later one silently replacing the earlier one. The inner dictionaries
    are updated in place.

    Parameters:
    - extraction: The nested dictionary of extracted sections and subsections.

    Returns:
    - extraction: The cleaned and updated nested dictionary.
    """
    for subsections in extraction.values():
        cleaned = {}
        for key, value in subsections.items():
            # Strip leading numbers and whitespace from subsection keys
            new_key = key.lstrip("0123456789 ").strip()
            if not new_key:
                continue
            # Join paragraph lists into single strings without extra spaces;
            # values that are already strings (earlier cleaning pass) are kept
            if isinstance(value, list):
                value = ' '.join(' '.join(value).split())
            if new_key in cleaned:
                # Same title after stripping the numbering: merge, don't overwrite
                value = ' '.join(f"{cleaned[new_key]} {value}".split())
            cleaned[new_key] = value
        # Rebuild in place so existing references to the inner dict stay valid
        subsections.clear()
        subsections.update(cleaned)

    return extraction

def extract(pdf_bytes, topic, pdf_url, save_json=False):
    """
    Extracts structured text from a PDF byte stream and replaces italic text with markdown links.
    Handles multiline sections and subsections regardless of the starting character's case.

    Each text line is classified by character size and colour as a "break",
    section header, subsection header or paragraph (see text_styles below).
    Consecutive header lines are buffered and joined into one title when the
    first line of a different kind is seen. The first page is skipped.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.
    - topic: Name of the sub-folder of indexing/data/ahv_parsed/ used for the final JSON file.
    - pdf_url: URL of the PDF; its last path component becomes the JSON file name.
    - save_json: Boolean flag to save the extraction result as a JSON file.

    Returns:
    - extraction: A nested dictionary containing sections, subsections, and paragraphs.
      Every section has a "main" entry for text that precedes its first subsection.
    """
    pdf_stream = BytesIO(pdf_bytes)

    extraction = {}
    section = ""
    subsection = ""
    current_section_lines = []
    current_subsection_lines = []

    # Extract URLs in order
    url_list = extract_urls(pdf_bytes)
    url_index = 0  # To keep track of the current URL

    # Iterate over pages in the PDF
    for page_layout in extract_pages(pdf_stream):
        page_number = page_layout.pageid

        # Skip the first page
        if page_number == 1:
            continue

        # Iterate over elements in the page layout
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                # For each text line or character within the text container
                for obj in element:
                    # Handle cases where obj is a text line or character
                    if isinstance(obj, (LTTextLine, LTTextLineHorizontal)):
                        text_line = obj
                    elif isinstance(obj, LTChar):
                        # Wrap the character in a list to process it as a single-character line
                        text_line = [obj]
                    else:
                        continue  # Skip other types

                    # Extract characters from the text line
                    # NOTE(review): both branches below are identical; the isinstance check is redundant
                    if isinstance(text_line, list):
                        chars = [char for char in text_line if isinstance(char, LTChar)]
                    else:
                        chars = [char for char in text_line if isinstance(char, LTChar)]
                    if not chars:
                        continue  # Skip if there are no characters

                    # Get character sizes and colours
                    # NOTE: despite its name, char_fonts holds graphicstate.ncolor values
                    # (compared against the 'color' entries below), not font names
                    char_sizes = [char.size for char in chars]
                    char_fonts = [
                        char.graphicstate.ncolor
                        for char in chars
                        if hasattr(char.graphicstate, 'ncolor')
                    ]

                    # Define thresholds and font colors for different text types
                    text_styles = {
                        'section': {'size': 12.5, 'color': [1]},
                        'subsection': {'size': 11, 'color': [0, 0, 0, 1]},
                        'paragraph': {'size': 10, 'color': [0, 0, 0, 1]},
                        'break': {'size': 14, 'color': [0, 0, 0, 1]}
                    }

                    # Function to check if text line matches a style:
                    # every character must be within 0.1 of the style's size and have exactly its colour
                    def matches_style(style):
                        return (
                            all(abs(size - style['size']) < 0.1 for size in char_sizes) and
                            all(font == style['color'] for font in char_fonts)
                        )

                    # Check for break condition
                    if matches_style(text_styles['break']):
                        # Process any pending subsection
                        if current_subsection_lines:
                            subsection_text = ' '.join(current_subsection_lines)
                            if section:
                                if subsection_text not in extraction[section]:
                                    extraction[section][subsection_text] = []
                                subsection = subsection_text
                            current_subsection_lines = []

                        # Process any pending section
                        if current_section_lines:
                            section_text = ' '.join(current_section_lines)
                            section_text = ' '.join(section_text.split())  # Remove extra spaces
                            extraction[section_text] = {"main": []}
                            section = section_text
                            current_section_lines = []

                        # Clean up
                        # NOTE: this turns the paragraph lists collected so far into strings
                        extraction = clean_subsections(extraction)
                        if save_json:
                            # NOTE(review): intermediate dump goes to a fixed ./extraction.json,
                            # not to the per-PDF path used at the end of the function
                            with open('./extraction.json', 'w', encoding="utf-8") as fp:
                                json.dump(extraction, fp, ensure_ascii=False)
                        # Reset variables to start processing next section;
                        # with section empty, paragraphs are skipped until the next section header
                        section = ""
                        subsection = ""
                        continue  # Continue processing the next lines

                    # Check if the text line is a section header
                    if matches_style(text_styles['section']):
                        # Append the text line to current_section_lines
                        text_line_text = ''.join(char.get_text() for char in chars)
                        text_line_text = ' '.join(text_line_text.split())  # Remove extra spaces
                        current_section_lines.append(text_line_text)
                        continue

                    else:
                        # If current_section_lines is not empty, process the section
                        if current_section_lines:
                            # Process any pending subsection
                            if current_subsection_lines:
                                subsection_text = ' '.join(current_subsection_lines)
                                if section:
                                    if subsection_text not in extraction[section]:
                                        extraction[section][subsection_text] = []
                                    subsection = subsection_text
                                current_subsection_lines = []

                            # Join the collected section lines to form the section title
                            # (an existing section with the same title is replaced)
                            section_text = ' '.join(current_section_lines)
                            section_text = ' '.join(section_text.split())  # Remove extra spaces
                            extraction[section_text] = {"main": []}
                            section = section_text
                            subsection = ""
                            current_section_lines = []

                    # Filter out subsection numbers (e.g., "[0, 0, 0, 0.3]")
                    filtered_fonts = [font for font in char_fonts if font != [0, 0, 0, 0.3]]

                    # Check if the text line is a subsection header
                    if (
                        all(abs(size - text_styles['subsection']['size']) < 0.1 for size in char_sizes) and
                        all(font == text_styles['subsection']['color'] for font in filtered_fonts)
                    ):
                        text_line_text = ''.join(char.get_text() for char in chars)
                        text_line_text = ' '.join(text_line_text.split())  # Remove extra spaces
                        current_subsection_lines.append(text_line_text)
                        continue

                    else:
                        # Process any pending subsection
                        if current_subsection_lines:
                            subsection_text = ' '.join(current_subsection_lines)
                            if section:
                                if subsection_text not in extraction[section]:
                                    extraction[section][subsection_text] = []
                                subsection = subsection_text
                            current_subsection_lines = []

                    # Check if the text line is a paragraph
                    if matches_style(text_styles['paragraph']):
                        # If section is empty, skip processing the paragraph
                        if not section:
                            continue

                        # Process the paragraph and replace italic text with markdown links
                        paragraph = process_paragraph(
                            chars, url_list, url_index
                        )
                        url_index = paragraph['url_index']  # Update the URL index
                        paragraph_text = paragraph['text']

                        # Remove leading/trailing numbers and spaces
                        paragraph_text = paragraph_text.strip("0123456789 ").strip()
                        paragraph_text = ' '.join(paragraph_text.split())  # Remove extra spaces

                        # Append paragraph to the appropriate section and subsection
                        if subsection:
                            extraction[section][subsection].append(paragraph_text)
                        else:
                            extraction[section]["main"].append(paragraph_text)

        # After all lines are processed on the page, check for any pending subsection
        if current_subsection_lines:
            subsection_text = ' '.join(current_subsection_lines)
            if section:
                if subsection_text not in extraction[section]:
                    extraction[section][subsection_text] = []
                subsection = subsection_text
            current_subsection_lines = []

        # Also check for any pending section
        if current_section_lines:
            section_text = ' '.join(current_section_lines)
            section_text = ' '.join(section_text.split())  # Remove extra spaces
            extraction[section_text] = {"main": []}
            section = section_text
            current_section_lines = []

    # Clean and finalize the extraction
    extraction = clean_subsections(extraction)
    if save_json:
        # File name is the last path component of the PDF URL plus ".json";
        # the target directory is not created here
        filename = pdf_url.split("/")[-1]
        with open(f'indexing/data/ahv_parsed/{topic}/{filename}.json', 'w', encoding="utf-8") as fp:
            json.dump(extraction, fp, ensure_ascii=False)
    return extraction

async def extract_pdf_content(topic):
    """
    Downloads and parses every PDF of one section of the ahv-iv.ch sitemap.

    The first sitemap URL containing `topic` is scraped for PDF paths. For
    each German PDF (URL ending in ".d") the French (".f") and Italian (".i")
    variants are added, then every PDF is passed to extract() with
    save_json=True.

    Parameters:
    - topic: Substring identifying the sitemap section, e.g. "Familienzulagen".
      It is also forwarded to extract() as the output sub-folder.

    Raises:
    - IndexError: If no sitemap URL contains `topic`.
    """
    parser = AHVParser()

    sitemap_url = "https://www.ahv-iv.ch/de/Sitemap-DE"

    sitemap = await scraper.fetch(sitemap_url)
    url_list = parser.parse_urls(sitemap)

    section_to_scrap = next((url for url in url_list if topic in url), None)
    if section_to_scrap is None:
        # Same exception type as indexing an empty match list, with a readable message
        raise IndexError(f"No sitemap URL contains topic {topic!r}")
    print(section_to_scrap)

    content = scraper.scrap_urls([section_to_scrap])

    soups = [BeautifulSoup(page.data, features="html.parser") for page in content]

    # Get PDF paths from each memento section
    pdf_paths = []
    for soup in soups:
        pdf_paths.extend(parser.get_pdf_paths(soup))

    # Scrap PDFs from each memento section
    german_urls = ["https://www.ahv-iv.ch" + pdf_path for pdf_path in pdf_paths]

    # Add "it", "fr" pdf paths. Only the trailing ".d" language suffix is
    # swapped, so a ".d" elsewhere in the URL is never rewritten.
    pdf_urls = set(german_urls)
    for german_url in german_urls:
        if german_url.endswith(".d"):
            stem = german_url[:-len(".d")]
            pdf_urls.update((stem + ".f", stem + ".i"))

    pdf_urls = list(pdf_urls)
    print(pdf_urls)

    content = scraper.scrap_urls(pdf_urls)

    for c in content:
        pdf_bytes = c.data
        pdf_url = c.meta["url"]
        print(pdf_url)
        print(topic)
        # The parsed result is persisted by extract(); the return value is not needed here
        extract(pdf_bytes, topic, pdf_url, save_json=True)


In [None]:
await extract_pdf_content(topic="Andere-Sozialversicherungen")

### Upsert to db

#### Name mapping

In [17]:
import json
import re

In [97]:
# Metadata per memento number: title plus the "last_modification" and "etat" dates.
mapping = {
    "1.01": {
        "topic": "Extrait du Compte Individuel (CI)",
        "last_modification": "01.01.2024",
        "etat": "01.01.2015",
    },
    "1.02": {
        "topic": "Splitting en cas de divorce",
        "last_modification": "01.01.2024",
        "etat": "01.01.2024",
    },
    "1.03": {
        "topic": "Bonifications pour tâches d’assistance",
        "last_modification": "01.01.2024",
        "etat": "01.01.2021",
    },
    "1.04": {
        "topic": "Explications concernant l’extrait du Compte Individuel (CI)",
        "last_modification": "01.01.2024",
        "etat": "01.01.2024",
    },
    "1.05": {
        "topic": "Explications concernant l’aperçu des comptes",
        "last_modification": "31.12.2021",
        "etat": "01.01.2015",
    },
    "1.07": {
        "topic": "Bonifications pour tâches éducatives",
        "last_modification": "01.01.2024",
        "etat": "01.01.2016",
    },
}

In [93]:
# Select one parsed memento and load its section/subsection dictionary.
tag = "Allgemeines"
filename = "1.07.f.json"
sitemap_url = "https://www.ahv-iv.ch/de/Sitemap-DE"
url = f"https://www.ahv-iv.ch/p/{filename.replace('.json', '')}"
# The parsed files are written as UTF-8 with ensure_ascii=False, so read them
# back as UTF-8 explicitly instead of relying on the platform default encoding.
with open(f"indexing/data/ahv_parsed/{tag}/{filename}", "r", encoding="utf-8") as f:
    doc = json.load(f)

In [None]:
filename_m = re.sub(r'\.(d|i|f)\.json$', '', filename)
filename_m

In [None]:
mapping[filename_m]

In [96]:
to_csv = True

# Build one row per non-empty subsection. The row text is the memento
# metadata, the section title, the subsection title and the subsection body,
# separated by newlines.
csv = []
for section in doc:
    for subsection in doc[section]:
        if not doc[section][subsection]:
            continue
        text = str(mapping[filename_m]) + "\n" + section + "\n" + subsection + "\n" + doc[section][subsection]
        if to_csv:
            csv.append({
                "url": url,
                "text": text,
                "source": sitemap_url,
                "tag": tag
            })

# Write the file once, after all sections have been collected, rather than
# rewriting it on every section iteration.
if to_csv:
    pd.DataFrame(csv).to_csv(f"indexing/data/to_upsert/{tag}/{filename.replace('.json', '')}.csv", index=None)

In [None]:
# Collect text chunks into CSV rows and, optionally, upsert them into the database.
# NOTE(review): clean_splits, tokenizer, max_tokens and documents are not defined in
# the visible part of this notebook — presumably left over from an earlier cell; confirm
# before running.
embed = True
to_csv = True
upsert = False
csv = []

for i, doc in enumerate(clean_splits):

    n_tokens = len(tokenizer.encode(doc))
    if n_tokens > max_tokens:
        # Print the index of the first chunk over the token limit and stop:
        # this chunk and every later one are neither exported nor upserted
        print(i)
        break
    else:
        text = doc
        # Every row uses the URL of the first entry in documents["documents"]
        url = documents["documents"][0].meta["url"]
        language = "de"
        # CAREFUL: the tag is hard-coded and must match the section being processed
        tag = "Familienzulagen"
        if to_csv:
            csv.append({
                "url": url,
                "text": text,
                "source": sitemap_url,
                "tag": tag
            })
        if upsert:
            document_service.upsert(db, DocumentCreate(url=url, text=text, source=sitemap_url, tag=tag), embed=embed)

# Written once after the loop, with the rows collected before any break
if to_csv:
    pd.DataFrame(csv).to_csv("indexing/data/parsed/FZ_noheader_1.csv", index=None)

### 1. Fetch sections of ahv-iv.ch

In [None]:
from indexing.scraper import scraper
from indexing.pipelines.ahv import AHVParser
from bs4 import BeautifulSoup

parser = AHVParser()

In [None]:
sitemap_url = "https://www.ahv-iv.ch/de/Sitemap-DE"

sitemap = await scraper.fetch(sitemap_url)
url_list = parser.parse_urls(sitemap)
url_list

### 2. Select section to scrap

In [None]:
# Choose section to parse
topics = ["Allgemeines",
          "Beiträge-AHV-IV-EO-ALV",
          "Leistungen-der-AHV",
          "Leistungen-der-IV",
          "Ergänzungsleistungen-zur-AHV-und-IV",
          "Überbrückungsleistungen",
          "Leistungen-der-EO-MSE-EAE-BUE-AdopE",
          "Familienzulagen",
          "International",
          "Andere-Sozialversicherungen",
          #"Jährliche-Neuerungen"
         ]
topic = topics[-2]

In [None]:
section_to_scrap = url_list[[i for i, url in enumerate(url_list) if topic in url][0]]
section_to_scrap

In [None]:
content = scraper.scrap_urls([section_to_scrap])

#### --- OPTIONAL: Auto parsing for all other sections

In [None]:
# remove FZ PDFs (manually checked OK)
url_list.remove('https://www.ahv-iv.ch/de/Merkblätter-Formulare/Merkblätter/Familienzulagen')
print(url_list)
content = scraper.scrap_urls(url_list)

### 3. Get PDF URLs

In [None]:
# Parse every scraped section page
soups = [BeautifulSoup(page.data, features="html.parser") for page in content]

# Get PDF paths from each memento section
pdf_paths = []
for soup in soups:
    pdf_paths.extend(parser.get_pdf_paths(soup))

# Scrap PDFs from each memento section
pdf_urls = ["https://www.ahv-iv.ch" + pdf_path for pdf_path in pdf_paths]

# Add "it", "fr" pdf paths. Only the trailing ".d" language suffix is swapped,
# so a ".d" occurring elsewhere in the URL is never rewritten.
pdf_urls.extend(
    pdf_url[:-len(".d")] + suffix
    for pdf_url in list(pdf_urls)  # iterate over a copy while extending
    if pdf_url.endswith(".d")
    for suffix in (".f", ".i")
)

pdf_urls = list(set(pdf_urls))
len(pdf_urls)

In [None]:
pdf_urls

#### --- OPTIONAL: Filter docs by language

In [None]:
# keep only german docs
pdf_urls = [url for url in pdf_urls if url.endswith(".d")]
pdf_urls

In [None]:
# keep only french docs
pdf_urls = [url for url in pdf_urls if url.endswith(".f")]
pdf_urls

### 4. Scrap PDFs

In [None]:
content = scraper.scrap_urls(pdf_urls)

### 5. Custom PDF Parser

In [5]:
from io import BytesIO
from pdfminer.high_level import extract_pages
from pdfminer.layout import (
    LTTextContainer,
    LTChar,
    LTTextLine,
    LTTextLineHorizontal,
)
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import resolve1
import json
from itertools import groupby

ModuleNotFoundError: No module named 'pdfminer.high_level'

: 

In [None]:
def extract_urls(pdf_bytes):
    """
    Extracts URLs from a PDF byte stream in the order they appear.

    Only link annotations whose action is of type URI are considered.
    Annotations with a missing subtype, action or URI are skipped instead of
    raising.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.

    Returns:
    - urls: List of URL strings, page by page, in annotation order.
    """
    urls = []

    parser = PDFParser(BytesIO(pdf_bytes))
    document = PDFDocument(parser)

    # Iterate through pages to extract annotations
    for page in PDFPage.create_pages(document):
        # resolve1 follows indirect references and passes None / direct objects through
        annotations = resolve1(page.annots)
        if not annotations:
            continue
        for annotation in annotations:
            annotation_data = resolve1(annotation)
            if not isinstance(annotation_data, dict):
                continue
            # Check if the annotation is a link
            subtype = resolve1(annotation_data.get('Subtype'))
            if subtype is None or subtype.name != 'Link':
                continue
            # The action can be an inline dictionary as well as an indirect
            # reference, so resolve it with resolve1 rather than .resolve()
            action = resolve1(annotation_data.get('A'))
            if not isinstance(action, dict):
                continue
            action_type = resolve1(action.get('S'))
            if action_type is None or action_type.name != 'URI':
                continue
            uri = resolve1(action.get('URI'))
            if not uri:
                continue
            urls.append(uri.decode('utf-8') if isinstance(uri, bytes) else uri)
    return urls

def process_paragraph(chars, url_list, url_index):
    """
    Processes a paragraph, replacing italic text with markdown links.

    Consecutive characters are grouped by whether their font name contains
    'Italic'. Each non-empty italic run is paired with the next unused URL.
    Runs that contain only whitespace are dropped and never consume a URL, so
    a stray italic space cannot shift the text/URL pairing.

    Parameters:
    - chars: List of LTChar objects in the paragraph.
    - url_list: List of URLs extracted from the PDF.
    - url_index: Current index in the URL list.

    Returns:
    - A dictionary with updated text and url_index.
    """
    new_paragraph_parts = []

    # Group chars by their style (italic or regular)
    for is_italic, group in groupby(chars, key=lambda c: 'Italic' in c.fontname):
        text = ''.join(c.get_text() for c in group)
        text = ' '.join(text.split())  # Remove extra spaces
        if not text:
            # Whitespace-only run: nothing to emit and nothing to link
            continue
        if is_italic and url_index < len(url_list):
            # Replace italic text with markdown link
            new_paragraph_parts.append(f"[{text}]({url_list[url_index]})")
            url_index += 1
        else:
            # Regular text, or italic text with no URL left: keep as is
            new_paragraph_parts.append(text)

    new_paragraph = ' '.join(new_paragraph_parts)
    # Remove any extra spaces in the final paragraph
    new_paragraph = ' '.join(new_paragraph.split())

    return {'text': new_paragraph, 'url_index': url_index}

def clean_subsections(extraction):
    """
    Cleans up subsection keys and joins paragraph lists into strings.

    Leading digits and spaces are stripped from every subsection key; keys
    that end up empty are dropped together with their content. When two keys
    reduce to the same title, their texts are concatenated in order instead of
    the later one silently replacing the earlier one. The inner dictionaries
    are updated in place.

    Parameters:
    - extraction: The nested dictionary of extracted sections and subsections.

    Returns:
    - extraction: The cleaned and updated nested dictionary.
    """
    for subsections in extraction.values():
        cleaned = {}
        for key, value in subsections.items():
            # Strip leading numbers and whitespace from subsection keys
            new_key = key.lstrip("0123456789 ").strip()
            if not new_key:
                continue
            # Join paragraph lists into single strings without extra spaces;
            # values that are already strings (earlier cleaning pass) are kept
            if isinstance(value, list):
                value = ' '.join(' '.join(value).split())
            if new_key in cleaned:
                # Same title after stripping the numbering: merge, don't overwrite
                value = ' '.join(f"{cleaned[new_key]} {value}".split())
            cleaned[new_key] = value
        # Rebuild in place so existing references to the inner dict stay valid
        subsections.clear()
        subsections.update(cleaned)

    return extraction

def extract(pdf_bytes, save_json=False):
    """
    Extracts structured text (sections, subsections, paragraphs) from a PDF byte stream.

    Every text line after the first page is classified by the size and the
    non-stroking colour of its characters (see `text_styles` below):

    - section header lines are accumulated (titles may span several lines) and
      open a new ``{"main": []}`` entry once the title is complete;
    - subsection header lines are accumulated the same way and open a new list
      under the current section;
    - paragraph lines are passed to `process_paragraph` together with the URL
      list and appended to the current subsection, or to "main" when no
      subsection is open;
    - a "break" line closes everything that is pending, cleans the result and
      resets the current section, so following paragraphs are ignored until
      the next section header.

    The URL list is read from 'sources/pdf_urls.json' (a fixed selection of
    its entries) and consumed in order through `url_index`.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.
    - save_json: Boolean flag to save the extraction result as a JSON file
      ('./extraction.json'); the file is rewritten at every break line and
      once more at the end.

    Returns:
    - extraction: A nested dictionary containing sections, subsections, and paragraphs.
    """
    # Size / colour signature of each kind of line the parser distinguishes
    text_styles = {
        'section': {'size': 12.5, 'color': [1]},
        'subsection': {'size': 11, 'color': [0, 0, 0, 1]},
        'paragraph': {'size': 10, 'color': [0, 0, 0, 1]},
        'break': {'size': 14, 'color': [0, 0, 0, 1]}
    }
    # Colour of the subsection numbers, ignored when matching subsection headers
    subsection_number_color = [0, 0, 0, 0.3]

    pdf_stream = BytesIO(pdf_bytes)

    extraction = {}
    section = ""
    subsection = ""
    current_section_lines = []
    current_subsection_lines = []

    # Extract URLs in order
    with open('sources/pdf_urls.json', 'r') as file:
        pdf_urls = json.load(file)
    pdf_urls = pdf_urls[:20] + pdf_urls[460:480] + pdf_urls[-20:]
    url_list = [item['url'] for item in pdf_urls]
    url_index = 0  # To keep track of the current URL

    def matches_style(style, sizes, colors):
        """Return True when every size and every colour of a line fits `style`."""
        return (
            all(abs(size - style['size']) < 0.1 for size in sizes) and
            all(color == style['color'] for color in colors)
        )

    def line_text(line_chars):
        """Return the text of a line with runs of whitespace collapsed."""
        return ' '.join(''.join(char.get_text() for char in line_chars).split())

    def flush_subsection():
        """Turn the accumulated subsection title lines into the current subsection."""
        nonlocal subsection, current_subsection_lines
        if not current_subsection_lines:
            return
        title = ' '.join(current_subsection_lines)
        if section:
            extraction[section].setdefault(title, [])
            subsection = title
        # Without an open section the title is discarded
        current_subsection_lines = []

    def flush_section():
        """Turn the accumulated section title lines into the current section."""
        nonlocal section, subsection, current_section_lines
        if not current_section_lines:
            return
        title = ' '.join(' '.join(current_section_lines).split())
        extraction[title] = {"main": []}
        section = title
        # A new section never inherits the previous section's subsection.
        # Keeping the old name would make the next paragraph index a key that
        # does not exist in the new section (KeyError).
        subsection = ""
        current_section_lines = []

    def finalize():
        """Clean the extraction and optionally write it to './extraction.json'."""
        nonlocal extraction
        extraction = clean_subsections(extraction)
        if save_json:
            with open('./extraction.json', 'w', encoding="utf-8") as fp:
                json.dump(extraction, fp, ensure_ascii=False)
        return extraction

    # Iterate over pages in the PDF
    for page_layout in extract_pages(pdf_stream):
        # Skip the first page
        if page_layout.pageid == 1:
            continue

        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue

            for obj in element:
                # A container yields text lines, or occasionally bare characters
                if isinstance(obj, (LTTextLine, LTTextLineHorizontal)):
                    chars = [char for char in obj if isinstance(char, LTChar)]
                elif isinstance(obj, LTChar):
                    chars = [obj]  # Treat it as a single-character line
                else:
                    continue  # Skip other types
                if not chars:
                    continue

                char_sizes = [char.size for char in chars]
                char_colors = [
                    char.graphicstate.ncolor
                    for char in chars
                    if hasattr(char.graphicstate, 'ncolor')
                ]

                # Break line: close what is pending and stop collecting
                if matches_style(text_styles['break'], char_sizes, char_colors):
                    flush_subsection()
                    flush_section()
                    finalize()
                    section = ""
                    subsection = ""
                    continue

                # Section header line: keep accumulating the (multi-line) title
                if matches_style(text_styles['section'], char_sizes, char_colors):
                    current_section_lines.append(line_text(chars))
                    continue

                # First non-header line after a section title: the title is complete.
                # The pending subsection (if any) still belongs to the previous section,
                # so it is flushed first.
                if current_section_lines:
                    flush_subsection()
                    flush_section()

                # Subsection header line (its number uses a different colour)
                header_colors = [
                    color for color in char_colors if color != subsection_number_color
                ]
                if matches_style(text_styles['subsection'], char_sizes, header_colors):
                    current_subsection_lines.append(line_text(chars))
                    continue

                # Any other line ends a pending subsection title
                flush_subsection()

                # Paragraph line; ignored while no section is open
                if not section:
                    continue
                if not matches_style(text_styles['paragraph'], char_sizes, char_colors):
                    continue

                paragraph = process_paragraph(chars, url_list, url_index)
                url_index = paragraph['url_index']  # Update the URL index

                # Remove leading/trailing numbers and spaces, then extra spaces
                paragraph_text = paragraph['text'].strip("0123456789 ").strip()
                paragraph_text = ' '.join(paragraph_text.split())

                target = subsection if subsection else "main"
                extraction[section][target].append(paragraph_text)

        # Titles are not carried over a page boundary
        flush_subsection()
        flush_section()

    # Clean and finalize the extraction
    return finalize()


In [None]:
# Pick one entry of `content` (defined in an earlier cell) to experiment with
i = 8
pdf_bytes = content[i].data
# Last expression of the cell: displays the metadata of the selected entry
content[i].meta

In [None]:
extraction = extract(pdf_bytes, save_json=True)

In [None]:
# Reload the extraction from './extraction.json'
with open('./extraction.json', 'r', encoding="utf-8") as file:
    extraction = json.load(file)

# Pretty-print it; ensure_ascii=False keeps non-ASCII characters readable
print(json.dumps(extraction,
                 sort_keys=False,
                 indent=4,
                 ensure_ascii=False))

### Extract all content by topic

In [None]:
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfcolor import PDFColorSpace, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK, LITERAL_DEVICE_GRAY
from pdfminer.pdffont import PDFFontError

class TextPropertyDevice(PDFDevice):
    """
    Interpreter device that records one entry per rendered character.

    Every entry appended to `self.characters` is a dict with the keys
    'char', 'fontname', 'size' and 'fill_color'.
    """

    def __init__(self, rsrcmgr):
        super().__init__(rsrcmgr)
        self.characters = []
        # Not updated by this class; kept so existing users of the attribute still work
        self.current_color_space = None

    @staticmethod
    def _literal_name(literal):
        """Return the plain name of a pdfminer literal (or the value itself if it has none)."""
        return getattr(literal, 'name', literal)

    def render_string(self, textstate, seq, ncs, graphicstate):
        """
        Record the characters of one text-showing operation.

        Parameters:
        - textstate: Current text state; its font and fontsize are recorded.
        - seq: Operands of the operator; numbers are spacing adjustments and are skipped.
        - ncs: Non-stroking colour space in effect.
        - graphicstate: Current graphics state; its non-stroking colour is recorded.
        """
        font = textstate.font
        fontsize = textstate.fontsize
        color_values = self.get_color_values(graphicstate.ncolor, ncs)
        for obj in seq:
            if isinstance(obj, (int, float)):
                # Handle text spacing adjustments
                continue
            try:
                cids = font.decode(obj)
            except PDFFontError:
                # Handle font decoding errors
                continue
            for cid in cids:
                # font.decode() yields character ids, not text; map each id to
                # its unicode string so 'char' really holds the character.
                try:
                    char = font.to_unichr(cid)
                except PDFFontError:
                    # No unicode mapping for this id: keep a readable placeholder
                    char = f"(cid:{cid})"
                self.characters.append({
                    'char': char,
                    'fontname': font.basefont,
                    'size': fontsize,
                    'fill_color': color_values,
                    # Background color extraction would require additional processing
                })

    def get_color_values(self, color, colorspace):
        """
        Describe `color` as a dict keyed by the components of its colour space.

        Parameters:
        - color: Colour value taken from the graphics state.
        - colorspace: Colour space as a name, or an object with a `name` attribute.

        Returns:
        - {'r', 'g', 'b'}, {'c', 'm', 'y', 'k'} or {'gray'} for the device
          colour spaces, otherwise {'color': color} with the raw value (also
          used when the value does not have the expected number of components).
        """
        if isinstance(colorspace, str):
            colorspace_name = colorspace
        elif hasattr(colorspace, 'name'):
            colorspace_name = colorspace.name
        else:
            colorspace_name = str(colorspace)

        # The LITERAL_DEVICE_* constants are literal objects, which do not
        # compare equal to a plain string; compare against their names instead.
        try:
            if colorspace_name == self._literal_name(LITERAL_DEVICE_RGB):
                r, g, b = color
                return {'r': r, 'g': g, 'b': b}
            if colorspace_name == self._literal_name(LITERAL_DEVICE_CMYK):
                c, m, y, k = color
                return {'c': c, 'm': m, 'y': y, 'k': k}
            if colorspace_name == self._literal_name(LITERAL_DEVICE_GRAY):
                # A gray level may be a bare number or a one-element sequence
                gray = color[0] if isinstance(color, (list, tuple)) else color
                return {'gray': gray}
        except (TypeError, ValueError, IndexError):
            # Unexpected number/kind of components: fall back to the raw value
            pass

        # Other colour spaces: no conversion available, return the raw value
        return {'color': color}

def extract_text_properties(pdf_bytes):
    """
    Print character, font name, size and fill colour for every character of a PDF.

    All pages are run through a `TextPropertyDevice`; the characters it
    recorded are printed afterwards, one line each.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.
    """
    from io import BytesIO

    document = PDFDocument(PDFParser(BytesIO(pdf_bytes)))

    resource_manager = PDFResourceManager()
    recorder = TextPropertyDevice(resource_manager)
    page_interpreter = PDFPageInterpreter(resource_manager, recorder)

    # Interpret every page first; the device accumulates across pages
    for pdf_page in PDFPage.create_pages(document):
        page_interpreter.process_page(pdf_page)

    for c in recorder.characters:
        print(f"Character: {c['char']}, Font: {c['fontname']}, Size: {c['size']}, Color: {c['fill_color']}")




In [None]:
# Example usage:

extract_text_properties(pdf_bytes)

In [None]:
from io import BytesIO
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal

def print_char_sizes_and_colors(pdf_bytes):
    """
    Print text, size, font name and non-stroking colour of every character in a PDF.

    A "Page <n>" line is printed before the characters of each page.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.
    """
    for page_layout in extract_pages(BytesIO(pdf_bytes)):
        page_number = page_layout.pageid
        print(f"Page {page_number}")

        # Only text containers carry characters
        containers = (item for item in page_layout if isinstance(item, LTTextContainer))
        for container in containers:
            for obj in container:
                # A container yields text lines, or occasionally bare characters
                if isinstance(obj, LTChar):
                    glyphs = [obj]
                elif isinstance(obj, (LTTextLine, LTTextLineHorizontal)):
                    glyphs = [child for child in obj if isinstance(child, LTChar)]
                else:
                    continue

                for char in glyphs:
                    size = char.size
                    fontname = char.fontname
                    ncolor = getattr(char.graphicstate, 'ncolor', None)
                    text = char.get_text()
                    # To narrow the dump, filter here, e.g. on
                    # ncolor == [0, 0, 0, 0] and fontname == "LBNQWH+FrutigerLTStd-Bold"
                    print(f"Character: '{text}' Size: {size} Font: {fontname} Color: {ncolor}")


In [None]:
print_char_sizes_and_colors(pdf_bytes)

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.layout import LAParams, LTPage, LTTextBox, LTTextLine, LTChar, LTFigure, LTRect
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage

def extract_text_properties(pdf_bytes):
    """
    Print font, size, colour and background colour of every character in a PDF.

    For each page the layout is flattened with `parse_layout` into characters
    and filled rectangles. The background colour reported for a character is
    the fill colour of the first collected rectangle whose bounding box
    overlaps the character's (None when there is none).

    Parameters:
    - pdf_bytes: Bytes of the PDF file.
    """
    from io import BytesIO

    document = PDFDocument(PDFParser(BytesIO(pdf_bytes)))

    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    for pdf_page in PDFPage.create_pages(document):
        page_interpreter.process_page(pdf_page)

        # Characters and filled rectangles of this page only
        chars, rects = [], []
        parse_layout(aggregator.get_result(), chars, rects)

        for c in chars:
            char_bg_color = next(
                (
                    rect['fill_color']
                    for rect in rects
                    if bbox_overlap(c.bbox, rect['bbox'])
                ),
                None,
            )
            # NOTE(review): relies on the character object exposing
            # `non_stroking_color` - confirm against the installed pdfminer version
            print(f"Character: '{c.get_text()}', Font: {c.fontname}, Size: {c.size}, "
                  f"Color: {c.non_stroking_color}, Background Color: {char_bg_color}")

def parse_layout(layout_obj, chars, rects):
    """
    Recursively collect characters and filled rectangles from a layout tree.

    Parameters:
    - layout_obj: Iterable layout object (page, text box, text line or figure).
    - chars: List that receives every character object found (modified in place).
    - rects: List that receives one {'bbox', 'fill_color'} dict per rectangle
      that has a non-stroking colour (modified in place).
    """
    for obj in layout_obj:
        if isinstance(obj, LTChar):
            chars.append(obj)
            continue

        if isinstance(obj, LTRect):
            # Rectangles without a fill colour cannot be a background
            if obj.non_stroking_color is None:
                continue
            rects.append({
                'bbox': obj.bbox,
                'fill_color': obj.non_stroking_color
            })
            continue

        # Containers are descended into; anything else is ignored
        if isinstance(obj, (LTTextBox, LTTextLine, LTFigure, LTPage)):
            parse_layout(obj, chars, rects)

def bbox_overlap(bbox1, bbox2):
    """
    Tell whether two (x0, y0, x1, y1) bounding boxes overlap.

    Boxes that merely touch along an edge or at a corner do not count as
    overlapping.
    """
    a_x0, a_y0, a_x1, a_y1 = bbox1
    b_x0, b_y0, b_x1, b_y1 = bbox2

    # The boxes are disjoint as soon as they are separated along one axis
    apart_on_x = a_x1 <= b_x0 or b_x1 <= a_x0
    apart_on_y = a_y1 <= b_y0 or b_y1 <= a_y0
    return not (apart_on_x or apart_on_y)

extract_text_properties(pdf_bytes)


In [None]:
import fitz  # PyMuPDF

def get_text_in_filled_bboxes(pdf_path):
    """
    Extracts text from bounding boxes that have a fill color and returns the fill color in RGB.

    Every drawing of every page that has a fill colour contributes one result;
    the text is read from the drawing's area enlarged by one unit on each side.

    Parameters:
    - pdf_path: Path to the PDF file.

    Returns:
    - A list of dictionaries containing page number ('page_number', 1-based),
      fill color ('fill_color_rgb', integers obtained by truncating
      component * 255), bounding box ('bounding_box') and extracted text ('text').
    """
    results = []

    # Context manager: the document is closed even if extraction fails
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, 1):
            # Get drawing objects on the page
            for d in page.get_drawings():
                fill_color = d.get('fill')
                if fill_color is None:
                    continue  # Stroke-only drawing

                # Convert fill color from floats (0-1) to RGB integers (0-255)
                rgb_color = tuple(int(c * 255) for c in fill_color)

                # PyMuPDF documents the area of a path under 'rect'; 'bbox' is
                # still honoured first in case a drawing dict provides it.
                area = d['bbox'] if 'bbox' in d else d['rect']
                rect = fitz.Rect(area)
                # Expand the rectangle slightly to ensure all text is captured
                expanded_rect = rect + (-1, -1, 1, 1)

                # Extract text within the bounding box
                text = page.get_text("text", clip=expanded_rect)

                results.append({
                    'page_number': page_number,
                    'fill_color_rgb': rgb_color,
                    'bounding_box': rect,
                    'text': text.strip()
                })

    return results

# Example usage:
# Run the filled-box extraction on a local sample PDF
pdf_path = '2.03_i.pdf'
filled_texts = get_text_in_filled_bboxes(pdf_path)

# Print the extracted information
for item in filled_texts:
    print(f"Page {item['page_number']}, Fill Color RGB: {item['fill_color_rgb']}")
    print(f"Bounding Box: {item['bounding_box']}")
    print(f"Text within the filled bounding box:\n{item['text']}\n")


In [None]:
import fitz  # PyMuPDF

# Scan a local sample PDF for drawings filled with pure red.
# NOTE(review): the document is left open for the rest of the session.
doc = fitz.open('2.03_i.pdf')

# (page_number, rect) pairs, page numbers are 1-based
red_rects = []

for page_number, page in enumerate(doc, 1):

    # The first page is skipped
    if page_number == 1:
        continue

    # Get drawing objects on the page
    drawables = page.get_drawings()

    for d in drawables:
        if d['fill'] is not None:
            fill_color = d['fill']
            # Debug output: every fill colour met, whatever its value
            print(page_number, "---", fill_color)
            # Check if the fill color is red (RGB: 1, 0, 0); exact match only
            if fill_color == (1, 0, 0):
                # NOTE(review): PyMuPDF documents the path area under 'rect';
                # confirm that 'bbox' is present in these dicts
                rect = fitz.Rect(d['bbox'])
                red_rects.append((page_number, rect))

In [None]:
red_rects

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTLine, LTFigure

def extract_table_elements(pdf_path):
    """
    Print the text boxes and lines found at the top level of every page layout.

    Text boxes are printed as "Text: <content>", lines as "Line: (x0, y0, x1, y1)".
    Every other layout element, figures included, is ignored.

    Parameters:
    - pdf_path: Path to the PDF file.
    """
    with open(pdf_path, 'rb') as file:
        # Layout-analysis pipeline: interpreter -> aggregating device
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        def page_layouts():
            # One analysed layout per page, produced lazily
            for page in PDFPage.get_pages(file):
                interpreter.process_page(page)
                yield device.get_result()

        for layout in page_layouts():
            for element in layout:
                if isinstance(element, LTTextBox):
                    print(f'Text: {element.get_text()}')
                    continue
                if isinstance(element, LTLine):
                    print(f'Line: {element.x0, element.y0, element.x1, element.y1}')

extract_table_elements("1.04_m.pdf")

In [None]:
import pdfplumber

def extract_tables_with_pdfplumber(pdf_bytes, target_page=4):
    """
    Return the tables pdfplumber finds on one page of a PDF.

    A "---Page <n>" line is printed for every page up to and including the
    requested one. Tables are only extracted from the requested page; earlier
    pages are walked through without being analysed.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.
    - target_page: 1-based number of the page whose tables are returned
      (defaults to 4, the page this notebook inspects).

    Returns:
    - The result of `page.extract_tables()` for the requested page, or None
      when the document has no such page.
    """
    with pdfplumber.open(BytesIO(pdf_bytes)) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            print(f"\n---Page {page_number}")
            if page_number == target_page:
                return page.extract_tables()
    return None


In [None]:
tables = extract_tables_with_pdfplumber(pdf_bytes)
tables

In [None]:
tables

In [None]:
tables[0]

In [None]:
from io import BytesIO
from pdfminer.high_level import extract_pages
from pdfminer.layout import (
    LTPage,
    LTTextContainer,
    LTChar,
    LTTextLine,
    LTTextLineHorizontal,
)

def extract_tables_from_pdf_bytes(pdf_bytes, tolerance=5):
    """
    Extracts and prints table data from a PDF byte stream using pdfminer.six.

    Text lines of a page are grouped into rows when their lower y-coordinate
    lies within `tolerance` of the first line of the row. Rows are printed
    from the highest y-coordinate down, each as a pipe-delimited line with its
    cells ordered by x-coordinate. A "---Page <n>" line precedes every page.

    Parameters:
    - pdf_bytes: Bytes of the PDF file.
    - tolerance: Maximum difference in y0 for two text lines to be put in the
      same row. Adjust this value based on your PDF's characteristics.
    """
    pdf_stream = BytesIO(pdf_bytes)

    # For each page in the PDF
    for page_number, page_layout in enumerate(extract_pages(pdf_stream), start=1):
        print(f"\n---Page {page_number}")

        # Collect text lines with their positions
        text_elements = []
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                if not isinstance(text_line, (LTTextLine, LTTextLineHorizontal)):
                    continue
                line_text = ''.join(
                    char.get_text() for char in text_line if isinstance(char, LTChar)
                )
                x0, y0, x1, y1 = text_line.bbox
                text_elements.append({
                    'text': line_text.strip(),
                    'x0': x0,
                    'x1': x1,
                    'y0': y0,
                    'y1': y1,
                })

        # Group text elements into rows based on y-coordinate with a tolerance
        rows = []
        for element in text_elements:
            for row in rows:
                if abs(element['y0'] - row['y']) <= tolerance:
                    row['elements'].append(element)
                    break
            else:
                # No existing row is close enough: start a new one.
                # A row keeps the y of its first element as reference.
                rows.append({'y': element['y0'], 'elements': [element]})

        # Sort rows by descending y-coordinate
        rows.sort(key=lambda r: -r['y'])

        # For each row, sort elements by x-coordinate (from left to right)
        for row in rows:
            sorted_elements = sorted(row['elements'], key=lambda e: e['x0'])
            row_text = [elem['text'] for elem in sorted_elements if elem['text']]
            # Print the row as pipe-delimited values
            print('|' + '|'.join(row_text) + '|')


In [None]:
extract_tables_from_pdf_bytes(pdf_bytes)

In [None]:
import tabula
import pandas as pd

def extract_tables_tabula(pdf_path):
    """
    Read every table of a PDF with tabula and print them.

    Prints the number of tables found, then each table preceded by a
    "Table <n>" line (numbered from 1).

    Parameters:
    - pdf_path: Path to the PDF file.

    Returns:
    - dfs: The list returned by `tabula.read_pdf` (one entry per table).
    """
    # All pages, one entry per detected table
    dfs = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)

    print(f"Total tables extracted: {len(dfs)}")

    for table_number, table in enumerate(dfs, start=1):
        print(f"\nTable {table_number}")
        print(table)
        # A table can be persisted here if needed, e.g. with
        # to_csv(f'table_{n}.csv', index=False) or to_json(f'table_{n}.json', orient='records')

    return dfs

In [None]:
# Usage
pdf_path = '2.03_i.pdf'
dfs = extract_tables_tabula(pdf_path)

In [None]:
len(dfs)

In [None]:
dfs[3]

In [None]:
print(dfs[2].to_markdown())

In [None]:
dfs[2].to_json()

### TEST: gpt-4o OCR VLM

In [None]:
import os
import ast
from dotenv import load_dotenv

from openai import OpenAI
from openai.types.beta.threads.message_create_params import (
    Attachment,
    AttachmentToolFileSearch,
)

In [None]:
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None)

In [None]:
# Ask an OpenAI assistant with file search to extract all tables of a local PDF
filename = "1.04_m.pdf"
prompt = """You have been provided with a PDF. Extract all tables in json format as A LIST OF TUPLES [(page_n, table_json), etc.].
CAREFULLY EXTRACT TABLE HEADERS WHICH MIGHT BE COMPLICATED.
OUTPUT ONLY A LIST OF TUPLES"""

client = OpenAI(api_key=OPENAI_API_KEY)

# Create assistant
pdf_assistant = client.beta.assistants.create(
    model="gpt-4o",
    description="An assistant to extract the contents of PDF files.",
    tools=[{"type": "file_search"}],
    name="PDF assistant",
)

# Create thread
thread = client.beta.threads.create()

# Upload the PDF; the local handle is closed as soon as the upload returns
with open(filename, "rb") as pdf_handle:
    file = client.files.create(file=pdf_handle, purpose="assistants")

# Add the user message, with the uploaded file attached, to the thread
client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    attachments=[
        Attachment(
            file_id=file.id, tools=[AttachmentToolFileSearch(type="file_search")]
        )
    ],
    content=prompt,
)

# Run thread and wait for it to finish
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=pdf_assistant.id, timeout=1000
)

if run.status != "completed":
    raise Exception("Run failed:", run.status)

# Collect every message of the thread
messages_cursor = client.beta.threads.messages.list(thread_id=thread.id)
messages = list(messages_cursor)

In [None]:
# Output text
res_txt = messages[0].content[0].text.value
res_txt

In [None]:
# Parse the assistant's answer as a Python literal after removing the
# Markdown code fence around it: first assume a ```python fence, and retry
# with a ```json fence if that does not parse.
# NOTE(review): ast.literal_eval raises ValueError (not SyntaxError) for JSON
# literals such as true/false/null, which this handler does not catch.
try:
    extraction = ast.literal_eval(res_txt.replace("```python\n", "").replace("\n```", ""))
except SyntaxError as e:
    extraction = ast.literal_eval(res_txt.replace("```json\n", "").replace("\n```", ""))
# Last expression of the cell: displays the parsed result
extraction

In [None]:
extraction.keys()

In [None]:
tables = [
    (4, '{"columns":[["Jahr","Arbeitnehmende / Nichterwerbstätige","Selbständigerwerbende"],["1948-1968","300.00","600.00"],["1969-1972","800.00","1 540.00"],["1973-1975","1 000.00","2 000.00"],["1976-1978","1 000.00","1 950.00"],["1979-1981","2 000.00","3 960.00"],["1982-1985","2 500.00","4 940.00"],["1986-1989","3 000.00","5 930.00"],["1990-1991","3 208.00","6 334.00"],["1992-1995","3 564.00","7 038.00"],["1996-2002","3 861.00","7 623.00"],["2003-2006","4 208.00","8 307.00"],["2007-2008","4 406.00","8 698.00"],["2009-2010","4 554.00","8 991.00"],["2011-2012","4 612.00","9 094.00"],["2013-2018","4 667.00","9 333.00"],["2019","4 702.00","9 405.00"],["2020","4 701.00","9 402.00"],["2021-2022","4 747.00","9 494.00"],["ab 2023","4 851.00","9 701.00"]]}'),
    (5, '{"columns":[["Verjährung",""],["9","Können die Beiträge rückwirkend entrichtet oder"],["","eingefordert werden?"],["Nein. Werden Beiträge nicht innert fünf Jahren nach Ablauf des Kalender-",""],["jahres, für das sie geschuldet sind, durch Verfügung geltend gemacht, so",""],["können sie nicht mehr eingefordert oder entrichtet werden. Beitragslücken",""],["können unter Umständen zu einer späteren Rentenkürzung führen.",""],["Beanstandung der Eintragung",""],["10","Kann ich eine Berichtigung verlangen?"],["Sie können innert 30 Tagen nach der Zustellung des Kontoauszugs bei der",""],["Ausgleichskasse, die das beanstandete Konto führt, eine Berichtigung ver-",""],["langen, wenn Sie die Richtigkeit der Einträge nicht anerkennen. Den Ent-",""],["scheid  über  das  Berichtigungsbegehren  fällt  die  Ausgleichskasse  in  Form",""],["einer Kassenverfügung.",""]]}'),
    (6, '{"columns":[["Auskünfte und weitere",""],["Informationen",""],["Dieses  Merkblatt  vermittelt  nur  eine  Übersicht.  Für  die  Beurteilung",""],["von  Einzelfällen  sind  ausschliesslich  die  gesetzlichen Bestimmungen",""],["massgebend. Die Ausgleichskassen und ihre Zweigstellen geben gerne",""],["Auskunft.  Ein  Verzeichnis  aller  Ausgleichskassen  finden  Sie  unter",""],["www.ahv-iv.ch.",""],["Die Zivilstandsbezeichnungen haben auch die folgende Bedeutung:",""],["•","Ehe/Heirat: eingetragene Partnerschaft"],["•","Scheidung: gerichtliche Auflösung der Partnerschaft"],["•","Verwitwung: Tod der eingetragenen Partnerin / des eingetragenen"],["","Partners"],["Herausgegeben von der Informationsstelle AHV/IV in Zusammenarbeit",""],["mit dem Bundesamt für Sozialversicherungen.",""],["Ausgabe  November  2023.  Auch  auszugsweiser  Abdruck  ist  nur  mit",""],["schriftlicher Einwilligung der Informationsstelle AHV/IV erlaubt.",""],["Dieses  Merkblatt  kann  bei  den  Ausgleichskassen  und  deren  Zweig-",""],["stellen sowie den IV-Stellen bezogen werden. Bestellnummer 1.04. Es",""],["ist ebenfalls unter www.ahv-iv.ch verfügbar.",""]]}'),
    (7, '{"columns":[["Explications concernant l’extrait",""],["du Compte Individuel (CI)",""],["En bref",""],["L’extrait de compte indique tous les revenus et bonifications pour tâches",""],["d’assistance communiqués aux caisses de compensation.",""],["Les  revenus  de  l’année  courante  ne  sont  pas  encore  inscrits  et  ceux  de",""],["l’année précédente peuvent également ne pas l’être si la déclaration de sa-",""],["laire correspondante n’a pas encore été traitée. Les inscriptions au Compte",""],["Individuel  des  indépendants  et  des  personnes  sans  activité  lucrative  ne",""],["peuvent  être  effectuées  qu’une  fois  les  cotisations  définitivement  fixées.",""],["Il est de ce fait possible que des inscriptions manquent bien que les cotisa-",""],["tions AVS/AI/APG aient été payées.",""],["Une vidéo explicative vous montre comment demander, de manière simple",""],["et rapide, un extrait de votre compte individuel: www.ahv-iv.ch/r/ci",""],["Code de revenu",""],["1","Quelle est la signification du code de revenu ?"],["Le code de revenu se trouve dans la colonne 2 de l’extrait de compte ; s’il",""],["est précédé d’un chiffre, celui-ci indique une correction.",""]]}'),
    (8, '{"columns":[["Mois de cotisation"],["2 Qu’entend-on par mois de cotisation ?"],["Les mois de cotisations reflètent la durée de l’activité et sont inscrits dans"],["la colonne 4 et numérotés de 1 à 12. Ils sont enregistrés pour les étrangers"],["depuis 1969 et pour les Suisses depuis 1979."],["3 Qu’entend-on par inscriptions particulières ?"],["InscriptionSignification"],["66Début ou fin de la durée de cotisation indéterminés"],["77Bénéfices en capital et indemnités pour travail con-"],["sacré à la famille (rémunération des domestiques, des"],["moines et des religieuses, ainsi que dons)"],["99Revenu (et non durée de cotisation) modifié après"],["coup"],["Revenu"],["4 Où les revenus sont-ils inscrits ?"],["Les revenus sont inscrits dans la colonne 6. Les inscriptions correspondent"],["aux revenus ou aux prestations d’assurance sur lesquels des cotisations ont"],["été perçues."],["5 Quel revenu est inscrit pour les personnes sans"],["activité lucrative ?"],["Le revenu inscrit pour les personnes sans activité lucrative est celui qui cor-"],["respond aux cotisations AVS/AI/APG versées."],["6 Où sont inscrites les bonifications pour"],["tâches d’assistance ?"],["Seul le droit aux bonifications pour tâches d’assistance est inscrit dans la"],["colonne 3. Le montant des bonifications sera fixé au moment


### TEST: gemini OCR VLM

In [None]:
import google.generativeai as genai

In [None]:
# Read the Gemini key from the environment; it is None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# Model handle used by the extraction test in the following cells.
model = genai.GenerativeModel("gemini-1.5-pro")

In [None]:
pwd

In [None]:
file_path = "./1.01_f.pdf"

sample_file = genai.upload_file(path=file_path, display_name="1.01_f.pdf")

In [None]:
# Send the uploaded PDF together with an extraction instruction to Gemini in one request.
# NOTE: the trailing backslash continues the string literal onto the next line, so that
# line's leading indentation becomes part of the prompt text.
response = model.generate_content([sample_file,
                                   "Extract all sections (red headers) and following text paragraph in JSON format {section: paragraph}. If a paragraph contains subsection headers (usually bold and numbered) \
                                   create a nested dict (eg. {section: {subsection: paragraph}}"])

In [None]:
response

In [None]:
print(response.text)

In [None]:
print(response.text.replace("```json\n", "").replace("\n```", ""))

### TEST: PDF TO HTML

In [None]:
from pdfminer.high_level import extract_text
import fitz  # PyMuPDF
import io

def pdf_bytes_to_html(pdf_bytes):
    """Convert a PDF given as raw bytes into a simple HTML document.

    The text is extracted with pdfminer and emitted as one ``<p>`` element per
    non-blank line. Embedded images are then extracted with PyMuPDF, written
    to the current working directory and referenced by ``<img>`` tags placed
    after the text, inside the document body.

    Args:
        pdf_bytes: Content of the PDF file as ``bytes``.

    Returns:
        The generated HTML document as a string.

    Side effects:
        Writes one ``image_page<N>_<i>.<ext>`` file per embedded image into
        the current working directory.
    """
    # Local import: only needed here, to escape extracted text for HTML.
    from html import escape

    # Extract text using pdfminer
    text = extract_text(io.BytesIO(pdf_bytes))

    # HTML head; the doubled braces are literal CSS braces inside the f-string.
    parts = [f"""
    <html>
    <head>
    <style>
        body {{ font-family: Arial, sans-serif; }}
        p {{ margin: 0; padding: 5px; }}
    </style>
    </head>
    <body>
    """]

    # One paragraph per non-blank line; escape so that '<', '>' and '&' in the
    # PDF text cannot break the markup.
    for line in text.split('\n'):
        if line.strip():
            parts.append(f"<p>{escape(line)}</p>")

    # Extract embedded images with PyMuPDF and reference them from the HTML.
    doc = fitz.open(stream=io.BytesIO(pdf_bytes), filetype="pdf")
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_name = f"image_page{page_num + 1}_{img_index}.{base_image['ext']}"

                # Images are saved next to the notebook and linked by file name.
                with open(image_name, "wb") as image_file:
                    image_file.write(base_image["image"])

                parts.append(
                    f'<img src="{image_name}" alt="Page {page_num + 1} Image {img_index}"><br>'
                )
    finally:
        # Close the PDF even if image extraction or a file write fails.
        doc.close()

    # Close the document exactly once, after both text and images.
    parts.append("</body></html>")

    return "".join(parts)

In [None]:
text = extract_text(io.BytesIO(content[0].data))

In [None]:
print(text)

##### -------- END TEST

In [None]:
# Turn the scraped `content` into document objects, then drop empty ones and clean the rest.
# NOTE(review): `parser` and `content` come from earlier cells. Later cells index the final
# result as documents["documents"] — confirm that clean_documents returns that shape.
documents = parser.convert_to_documents(content)

# Remove empty documents
documents = parser.remove_empty_documents(documents["documents"])

# Clean documents
documents = parser.clean_documents(documents)

# Display the cleaned result.
documents

In [None]:
len(documents["documents"])

In [None]:
print(documents["documents"][0].content)
print(documents["documents"][0].meta["url"])

### 5. Chunk documents by subtopic header

In [None]:
import re

In [None]:
text = documents["documents"][1].content

#### TO DO: dict with {pdf_name: sections}

In [None]:
sections = {
    "https://www.ahv-iv.ch/p/1.01.d": ["Auf einen Blick", "Antrag für den Kontoauszug", "Beanstandung der Eintragungen"],
    "https://www.ahv-iv.ch/p/1.02.d": [],
    "https://www.ahv-iv.ch/p/1.03.d": [],
    "https://www.ahv-iv.ch/p/1.04.d": [],
    "https://www.ahv-iv.ch/p/1.05.d": [],
    "https://www.ahv-iv.ch/p/1.07.d": [],
    "https://www.ahv-iv.ch/p/1.01.f": [],
    "https://www.ahv-iv.ch/p/1.02.f": [],
    "https://www.ahv-iv.ch/p/1.03.f": [],
    "https://www.ahv-iv.ch/p/1.04.f": [],
    "https://www.ahv-iv.ch/p/1.05.f": [],
    "https://www.ahv-iv.ch/p/1.07.f": [],
    "https://www.ahv-iv.ch/p/1.01.i": [],
    "https://www.ahv-iv.ch/p/1.02.i": [],
    "https://www.ahv-iv.ch/p/1.03.i": [],
    "https://www.ahv-iv.ch/p/1.04.i": [],
    "https://www.ahv-iv.ch/p/1.05.i": [],
    "https://www.ahv-iv.ch/p/1.07.i": [],
    "https://www.ahv-iv.ch/p/6.08.d": ["Auf einen Blick", "Anspruch", "Unterstellung", "Finanzierung", "Verfahren", "Auskünfte und weitere Informationen"],
    "https://www.ahv-iv.ch/p/6.09.d": ["Auf einen Blick", "Anspruch", "Anspruchskonkurrenz und Differenzzahlung bei derselben Person", "Anspruchskonkurrenz und Differenzzahlung bei verschiedenen Personen", "Beispiele zur Anspruchskonkurrenz, wenn FamZG und FLG betroffen sind", "Finanzierung", "Verfahren"],
}

In [None]:
# Alternative header list, kept commented out for reference.
#sections = [
#    "Auf einen Blick",
#    "Anspruch",
#    "Unterstellung",
#    "Finanzierung",
#    "Verfahren",
#    "Auskünfte und weitere Informationen"
#]

# NOTE: this rebinds `sections`, replacing the URL -> section-titles dict built in the
# previous cell with a single hard-coded list of headers.
sections = [
    "Auf einen Blick",
    "Anspruch",
    "Anspruchskonkurrenz und Differenzzahlung bei derselben Person",
    "Anspruchskonkurrenz und Differenzzahlung bei verschiedenen Personen",
    "Beispiele zur Anspruchskonkurrenz, wenn FamZG und FLG betroffen sind",
    "Finanzierung",
    "Verfahren",
]

#sections = sections_608 + sections_609
#sections = list(set(sections))

# Construct regex pattern
# Each header may be preceded by one newline or form feed and by digits, and must be
# followed by a newline; the header text is escaped so that it matches literally.
patterns = [rf"[\n\x0c]?\d*{re.escape(section)}\n" for section in sections]
pattern = '|'.join(patterns)

# Split the text at every header match. The pattern has no capturing group, so the
# matched headers themselves are not kept in `splits`.
splits = re.split(pattern, text)

# Display the number of pieces (text before the first header plus one per match).
len(splits)

In [None]:
# splits[0] is the text before the first matched header, so it gets no title.
section_bodies = splits[1:]

# zip() pairs titles and bodies purely by position and stops at the shorter input.
# If a header is missing from the text or matched more than once, every following
# title would be attached to the wrong body, so make the mismatch visible.
if len(section_bodies) != len(sections):
    print(
        f"WARNING: found {len(section_bodies)} section bodies for "
        f"{len(sections)} section titles; titles may be misaligned"
    )

splits_with_section = []

for split, sec in zip(section_bodies, sections):
    # Put the section title back in front of its body.
    split = sec + "\n\n" + split
    splits_with_section.append(split)
    print(split)
    print("----------------------------")

#### Remove footer (Weitere Informationen)

In [None]:
# Patterns stripped from every chunk: the "further information" footer header and the
# closing disclaimer, which (with DOTALL) is removed through the end of the chunk.
footer = [r"\x0c12Auskünfte und weitere Informationen",
          r"Dieses Merkblatt vermittelt nur eine Übersicht.*"]

# Compile once outside the loop; DOTALL lets ".*" run across newlines.
footer_regexes = [re.compile(footer_pattern, flags=re.DOTALL) for footer_pattern in footer]
footer_title = "12Auskünfte und weitere Informationen"

clean_splits = []
# Distinct loop names are used on purpose: reusing `pattern` here would overwrite the
# section-splitting regex held in the notebook-level `pattern` variable.
for section_text in splits_with_section:
    for footer_regex in footer_regexes:
        section_text = footer_regex.sub('', section_text)
    # Also drop the footer header when it is not preceded by a form feed.
    section_text = section_text.replace(footer_title, "")
    clean_splits.append(section_text)


In [None]:
clean_splits

In [None]:
for split in clean_splits:
    print(split)
    print("-------------------")

In [None]:
# merge split 0 with all splits
#header = clean_splits[0]

#final_splits = []
#for split in clean_splits:
#    split_with_header = header + "\n\n" + split
#    final_splits.append(split_with_header)
#    print(split_with_header)
#    print("-------------------")

In [None]:
#for split in final_splits:
#    print(split)
#    print("----------------")

In [None]:
max_tokens = 8191
tokenizer = tiktoken.get_encoding("cl100k_base")

In [None]:
# Export the cleaned section chunks to CSV and/or upsert them into the database.
embed = True
to_csv = True
upsert = False
csv = []

# Index of the document whose content was split into `clean_splits`. The chunking
# cell reads documents["documents"][1].content, so the URL must come from the same
# document (index 0 labelled the chunks with another document's URL).
source_doc_index = 1
url = documents["documents"][source_doc_index].meta["url"]
# NOTE(review): `language` is not passed to DocumentCreate below — confirm whether it should be.
language = "de"
# CAREFUL: the tag is hard-coded and must match the document being indexed.
tag = "Familienzulagen"

for i, doc in enumerate(clean_splits):

    n_tokens = len(tokenizer.encode(doc))
    if n_tokens > max_tokens:
        # Chunk exceeds max_tokens: report its index and skip only this chunk,
        # instead of silently dropping every chunk that follows it.
        print(i)
        continue

    if to_csv:
        csv.append({
            "url": url,
            "text": doc,
            "source": sitemap_url,
            "tag": tag,
        })
    if upsert:
        document_service.upsert(db, DocumentCreate(url=url, text=doc, source=sitemap_url, tag=tag), embed=embed)

if to_csv:
    # index=False is the documented way to omit the row index from the file.
    pd.DataFrame(csv).to_csv("indexing/data/parsed/FZ_noheader_1.csv", index=False)

#### TO DO:
1. evaluate retrieval + on this chunking/parsing
2. evaluate retrieval on **adding short topic/subtopic summary as header** (--> see medium article)

# Continue with non FZ sections

In [None]:
chunks = documents

In [None]:
tags = [url.split("/")[-1] for url in url_list]
tags

In [None]:
tags = {
    "Allgemeines": ["1.01", "1.02", "1.03", "1.04", "1.05", "1.07"],
    "Beiträge-AHV-IV-EO-ALV": ["2.01", "2.02", "2.03", "2.04", "2.05", "2.06", "2.07", "2.08", "2.09", "2.10", "2.11", "2.12"],
    "Leistungen-der-AHV": ["31", "3.01", "3.02", "3.03", "3.04", "3.05", "3.06", "3.07", "3.08"],
    "Leistungen-der-IV": ["4.01", "4.02", "4.03", "4.04", "4.05", "4.06", "4.07", "4.08", "4.09", "4.11", "4.12", "4.13", "4.14", "4.15", "4.16"],
    "Ergänzungsleistungen-zur-AHV-und-IV": ["5.01", "5.02", "51", "52"],
    "Überbrückungsleistungen": ["5.03"],
    "Leistungen-der-EO-MSE-EAE-BUE-AdopE": ["6.01", "6.02", "6.04", "6.10", "6.11"],
    "International": ["10.01", "10.02", "10.03", "11.01", "880", "890"],
    "Andere-Sozialversicherungen": ["6.05", "6.06", "6.07"],
    "Jährliche-Neuerungen": ["1.2024", "1.2023", "1.2021", "1.2020", "1.2019", "1.2016", "1.2015", "1.2014", "1.2013", "1.2012", "1.2011", "1.2009", "1.2008", "1.2007", "1.2005"],
}

In [None]:
def find_tag_key(tags, search_string):
    """Return the first key of *tags* whose value contains *search_string*.

    Entries are checked in the mapping's iteration order with the ``in``
    operator on each value. ``None`` is returned when no value contains
    *search_string*.
    """
    matching_keys = (
        name for name, members in tags.items() if search_string in members
    )
    return next(matching_keys, None)

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")

In [None]:
from schemas.document import DocumentCreate

# Upsert every document that fits within the token limit; collect the others in `long_docs`.
embed = True
# Same limit as the FZ indexing cell earlier in this notebook (8191). The previous value
# of 8192 let through a document one token over that limit.
# NOTE(review): presumably this is the input limit of text-embedding-ada-002 — confirm.
max_tokens = 8191
long_docs = []

for i, doc in enumerate(chunks["documents"]):

    n_tokens = len(tokenizer.encode(doc.content))
    if n_tokens > max_tokens:
        # Too long to index as a single document: report its index and keep it for later.
        print(i)
        long_docs.append(doc)
        continue

    text = doc.content
    url = doc.meta["url"]
    # NOTE(review): `language` is not passed to DocumentCreate below — confirm whether it should be.
    language = "fr"
    # Last URL segment without the ".f" suffix, e.g. ".../1.01.f" -> "1.01".
    # Only ".f" is stripped, so URLs ending in ".d" or ".i" would not match any tag.
    pdf_id = doc.meta["url"].split("/")[-1].replace(".f", "")
    # `tag` is None when pdf_id is not listed under any topic in `tags`.
    tag = find_tag_key(tags, pdf_id)
    print(tag)
    document_service.upsert(db, DocumentCreate(url=url, text=text, source=sitemap_url, tag=tag), embed=embed)

# Long docs

In [None]:
len(long_docs)

### Evaluate RAG pipeline

# EVAL HERE

### Get all FZ docs (unchunked)

In [None]:
docs = document_service.get_all_documents(db)
len(docs)

In [None]:
for doc in docs:
    print(doc.text, doc.url)
    print("--------------------")

### Evaluate retrieval

- Is correct doc retrieved for FZ questions?

In [None]:
# load FZ questions
fz_eval = pd.read_csv("indexing/data/memento_eval_qa_FZ.csv")
fz_eval.head()

In [None]:
k=100

In [None]:
# Retrieve the top-k documents for every evaluation question.
# `recall` maps question -> retrieved documents. The scoring cell zips recall.items()
# with fz_eval.url, so it needs one entry per question, in order; the stray `break`
# that stopped after the first question has been removed.
# NOTE(review): a question that appears twice in fz_eval would collapse into one entry
# and shift the alignment with fz_eval.url — confirm that questions are unique.
recall = {}

for question in fz_eval.question:
    request = RAGRequest(query=question)
    doc = processor.retrieve(db, request, language=None, tag=None, k=k)
    recall[question] = doc

In [None]:
# Score each question: 1 when the expected URL (with "www." removed) is among the
# URLs of the retrieved documents, 0 otherwise. Also print both for inspection.
retrieval_recall = {}
for (question, doc), url in zip(recall.items(), fz_eval.url):
    retrieved_urls = [d.url for d in doc]
    expected_url = url.replace("www.", "")
    retrieval_recall[question] = int(expected_url in retrieved_urls)

    separator = "----------------------"
    print(
        question,
        "\n".join(retrieved_urls),
        separator,
        url,
        separator,
        separator,
        sep="\n",
    )

In [None]:
sum(retrieval_recall.values())/len(retrieval_recall)

In [None]:
retrieval_recall

# Retrieval results

eak.admin.ch

avg recall
- TopKRetriever(k=1), text-embedding-ada-002: 0.375
- TopKRetriever(k=10), text-embedding-ada-002: 0.905
- **top_k_retriever(k=100), reranking(k=5), text-embedding-ada-002: 1**
- TopKRetriever(k=1), text-embedding-3-small: 0 --> NEED TO RE-EMBED
- TopKRetriever(k=10), text-embedding-3-small: 0.048 --> NEED TO RE-EMBED

ahv-iv

avg recall
- TopKRetriever(k=1), text-embedding-ada-002: 0.069
- TopKRetriever(k=10), text-embedding-ada-002: 0.483
- top_k_retriever(k=100), reranking(k=5), text-embedding-ada-002: 0.79
- **top_k_retriever(k=100), reranking(k=10), text-embedding-ada-002: 0.897** --> need to solve large pdf chunking

### Make request

In [None]:
request = RAGRequest(query="hello")

# test
processor.retrieve(db, request, language=None, tag=None, k=1)

### Setup LLM client

In [None]:
llm_client.max_output_tokens = 10000

In [None]:
prompt = "Write a 10000 token poem"

In [None]:
messages = [{"role": "system", "content": prompt},]

# test
llm_client.generate(messages).choices[0].message.content

# LLM chunking

The idea is to prompt an LLM to semantically chunk documents. This approach diverges from the semantic chunking methodology where actual text embeddings are being optimized to be as similar as possible for chunks containing similar information, and dissimilar for chunks containing dissimilar information.

For each document, we chunk it into paragraphs and track the following:
- **text**: text chunk
- **url**: source url of the document
- **language**: language of the document
- **tag**: document topic
- **n_tokens**: number of tokens per chunk
- **parent_doc**: the url of the document from which this chunk originates

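We compute token statistics with tiktoken's `cl100k_base` encoding (note: this is the encoding used by `gpt-4`/`gpt-3.5-turbo` and the OpenAI embedding models, whereas `gpt-4o` uses `o200k_base`) and only call the chunker LLM on documents whose token count exceeds the 75th percentile plus one standard deviation of the per-document token counts.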

### Retrieve content

##### https://www.eak.admin.ch/eak/de/home.sitemap.xml

In [None]:
sitemap_url = "https://www.eak.admin.ch/eak/de/home.sitemap.xml"
embed = False
admin_indexer.splitter = None

In [None]:
# index admin data
await admin_indexer.index(sitemap_url, db, embed=embed)

In [None]:
# retrieve all raw documents
docs = document_service.get_all_documents(db)

In [None]:
len(docs)

### Compute token statistics

In [None]:
tokenizer = tiktoken.get_encoding("cl100k_base")

In [None]:
# Token count and text per document, keyed by URL (a later document with the same
# URL replaces an earlier one).
tokens = {
    d.url: {"n_tokens": len(tokenizer.encode(d.text)), "text": d.text}
    for d in docs
}

# One row per URL, with columns "n_tokens" and "text".
tokens_df = pd.DataFrame.from_dict(tokens, orient="index")
tokens_df.head()

In [None]:
token_stats = tokens_df.describe()
token_stats

In [None]:
# Bar chart of the per-document token counts, with a dashed red line at
# the 75th percentile plus one standard deviation.
threshold_line = token_stats.loc["75%", "n_tokens"] + token_stats.loc["std", "n_tokens"]

fig, ax = plt.subplots(figsize=(20, 5))
tokens_df.plot(kind="bar", ax=ax)
plt.axhline(y=threshold_line, color='r', linestyle='--', linewidth=1)
plt.show()

In [None]:
# Select the documents whose token count is above the 75th percentile plus one
# standard deviation of all token counts.
token_cutoff = token_stats.loc["75%", "n_tokens"] + token_stats.loc["std", "n_tokens"]
is_long = tokens_df["n_tokens"] > token_cutoff

# (url, text) pairs in the DataFrame's row order; the index holds the URLs.
long_docs = list(tokens_df.loc[is_long, "text"].items())

len(long_docs)

#### LLM chunker

In [None]:
# Prompt template for the LLM chunker; `{doc}` is filled with the document text via str.format().
# NOTE: the string is not raw, so the "\n\n" in step 4 reaches the model as two real newlines
# between quotes rather than as the literal characters backslash-n. The model's answer is
# later split on "\n\n" to recover the chunks.
prompt = """You are a highly advanced language model trained for the task of segmenting documents into meaningful and independent chunks
for Retrieval-Augmented Generation (RAG) purposes. Your goal is to process a provided document and split it into distinct chunks
that can be understood on their own. Each chunk should contain a self-contained idea or piece of information that is unrelated to
the other chunks.

Here’s how you should approach this task:

1. Chunk Identification: Carefully read through the document and identify potential breakpoints where a new, independent idea or topic begins.

2. Chunk Validation: Ensure that each identified chunk can be understood independently without requiring context from previous or subsequent chunks.

3. Chunk Creation: If a segment of the document can be split based on the criteria above, separate it into a distinct chunk. If not, do not split the text.

4. Output Format: Provide each chunk separated by "\n\n"

Remember, only create a chunk if the information it contains is unrelated to the other chunks and can be understood independently and
extract text chunks *AS IS*, without editing them.

You must try to create as large chunks as possible and ALL text must be present in the chunks.

DOCUMENT: {doc}

CHUNKS:"""

In [None]:
# Smoke test of the chunker prompt: only the first long document is processed.
for doc in tqdm.tqdm(long_docs):


    # Each entry of long_docs is a (url, text) tuple, so doc[1] is the document text.
    messages = [{"role": "system", "content": prompt.format(doc=doc[1])},]
    res = llm_client.generate(messages).choices[0].message.content
    # Stop after the first document; `doc` and `res` are inspected in the following cells.
    break

In [None]:
doc

In [None]:
len(tokenizer.encode(res))

In [None]:
print(res)

In [None]:
for chunk in res.split("\n\n"):
    print(chunk)
    print("--------_")