In [2]:
import re
import os
import uuid
import json
from pathlib import Path
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTImage, LTFigure, LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

In [10]:
# Rensning af tekst
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
# Substitution table used by clean(): every key found in the text is replaced
# by its value. It covers typographic dashes/quotes/ellipsis, the 1/2 glyph,
# circled numbers (rewritten as "N."), and newlines (folded into spaces).
to_be_replaced = {
    "½": "1/2", "–": "-", "‘": "'", "’": "'", "…": "...", "₋": "-", "−": "-",
    "⓫": "11.", "⓬": "12.", "⓭": "13.", "⓮": "14.", "◦": "°",
    "❶": "1.", "❷": "2.", "❸": "3.", "❹": "4.", "❺": "5.",
    "❻": "6.", "❼": "7.", "❽": "8.", "❾": "9.", "❿": "10.",
    "\n": " ",
}

def clean(text):
    """Normalize a piece of text extracted from a PDF.

    Steps, in order:
      1. delete every string listed in ``to_be_removed``;
      2. apply the ``to_be_replaced`` substitutions (this also turns
         newlines into spaces);
      3. insert a space between a lowercase letter that is directly followed
         by an uppercase letter (Latin-1 accented letters included);
      4. collapse every run of two or more whitespace characters to one space;
      5. drop the single space in front of commas and full stops.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text. Leading/trailing whitespace is not stripped.
    """
    for char in to_be_removed:
        text = text.replace(char, "")
    for char, replacement in to_be_replaced.items():
        text = text.replace(char, replacement)
    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
    # Collapse whitespace BEFORE fixing punctuation. Doing it afterwards left
    # a stray space whenever several blanks preceded the punctuation mark
    # ("word  ," became "word ," instead of "word,").
    text = re.sub(r"\s{2,}", " ", text)
    return text.replace(" ,", ",").replace(" .", ".")

def save_image(image_stream, image_folder):
    """Store image bytes on disk under a freshly generated UUID.

    The bytes are written unchanged to ``<image_folder>/<uuid4>.png``; no
    format conversion is performed, the ``.png`` suffix is applied regardless
    of what the bytes actually contain.

    Args:
        image_stream (bytes): Raw bytes to write.
        image_folder (str): Existing directory that receives the file.

    Returns:
        str: The generated id (the file name without the ``.png`` suffix).
    """
    new_id = f"{uuid.uuid4()}"
    target = os.path.join(image_folder, new_id + ".png")
    with open(target, "wb") as handle:
        handle.write(image_stream)
    return new_id

def find_images(layout_obj):
    """Recursively collect the images in a layout object (also inside LTFigure).

    An ``LTImage`` is returned as a one-element list and is not descended
    into. Any other object that has an ``_objs`` attribute is searched child
    by child, depth first, so the result keeps the order of the layout tree.

    Args:
        layout_obj: A pdfminer layout object (or any object; objects without
            ``_objs`` that are not images yield an empty list).

    Returns:
        list: The ``LTImage`` instances found.
    """
    if isinstance(layout_obj, LTImage):
        return [layout_obj]

    found = []
    for child in getattr(layout_obj, "_objs", ()):
        found += find_images(child)
    return found

def extract_page_data(path, output_folder, min_font_size=12, footer_font_threshold=10,
                      header_footer_zone=0.8, bottom_footer_zone=0.2):
    """Extract text, images and metadata from every page of a PDF.

    For each page the text lines and images are collected, ordered from the
    top of the page to the bottom, and split into a title, main text and
    small-print header/footer lines. Images are written to
    ``<output_folder>/images/<pdf base name>/`` and referenced in the main
    text by an ``<image-ID>`` tag.

    Args:
        path (str): Path to the PDF file.
        output_folder (str | os.PathLike): Root folder for the image output.
        min_font_size (float): Minimum average font size for a line to be a
            title candidate; the largest candidate becomes the page title.
            Pages without a candidate inherit the previous page's title
            (or "Untitled").
        footer_font_threshold (float): Lines whose average font size is below
            this value are header/footer candidates.
        header_footer_zone (float): Fraction of the page height; small-print
            lines whose top edge is at or above it go to ``footer_top``.
        bottom_footer_zone (float): Fraction of the page height; small-print
            lines whose top edge is at or below it go to the bottom footer.

    Returns:
        list[dict]: One dict per kept page with the keys ``id``,
        ``source_file``, ``client``, ``page_number``, ``section_name``,
        ``title``, ``text_for_embedding``, ``raw_text_with_tags``,
        ``image_ids`` and ``footer_top``. Pages whose main text is (almost)
        nothing but the title are skipped; images found on such pages have
        already been written to disk at that point.
    """
    filename = os.path.basename(path)
    base_name = os.path.splitext(filename)[0]
    image_output_folder = os.path.join(output_folder, "images", base_name)
    os.makedirs(image_output_folder, exist_ok=True)

    pages_json = []
    last_title = None

    # One resource manager / device / interpreter serves the whole document;
    # rebuilding them for every page threw away the shared resource cache.
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    with open(path, 'rb') as f:
        for page_number, page in enumerate(PDFPage.get_pages(f), start=1):
            interpreter.process_page(page)
            layout = device.get_result()
            page_height = layout.height

            # Each entry: (sort key = top y, kind, avg font size, content, y0, y1)
            elements = []
            image_ids = []

            for element in layout:
                # Text
                if isinstance(element, LTTextContainer):
                    for text_line in element:
                        line_text = text_line.get_text().strip()
                        if not line_text:
                            continue
                        font_sizes = [char.size for char in text_line if isinstance(char, LTChar)]
                        avg_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0
                        _, y0, _, y1 = text_line.bbox
                        elements.append((y1, "text", avg_size, line_text, y0, y1))

                # Images (recursive)
                for img in find_images(element):
                    try:
                        stream = img.stream.get_rawdata()
                        image_id = save_image(stream, image_output_folder)
                        image_tag = f"<image-{image_id}>"
                        image_ids.append(image_id)
                        y0, y1 = img.bbox[1], img.bbox[3]
                        elements.append((y1, "image", 0, image_tag, y0, y1))
                    except Exception as e:
                        # Best effort: a broken image must not abort the page.
                        print(f"Kunne ikke gemme billede på side {page_number}: {e}")

            # PDF y coordinates grow upwards, so descending y is reading order.
            elements.sort(reverse=True, key=lambda x: x[0])

            main_lines = []
            footer_top_lines = []
            footer_bottom_lines = []

            # Title: the text line with the largest average font size among
            # those that reach min_font_size.
            title_candidates = [e for e in elements if e[1] == "text" and e[2] >= min_font_size]
            if title_candidates:
                title_element = max(title_candidates, key=lambda x: x[2])
                title = title_element[3]
                last_title = title
            else:
                title = last_title or "Untitled"
                title_element = None

            for elem in elements:
                y1, kind, size, content, _, _ = elem
                if kind == "image":
                    main_lines.append(content)
                elif title_element and elem == title_element:
                    # The title is reported separately, not as body text.
                    continue
                elif size >= footer_font_threshold:
                    main_lines.append(content)
                elif y1 >= page_height * header_footer_zone:
                    footer_top_lines.append(content)
                elif y1 <= page_height * bottom_footer_zone:
                    footer_bottom_lines.append(content)
                else:
                    # Small print in the middle of the page is still body text.
                    main_lines.append(content)

            raw_text_with_tags = clean(" ".join(main_lines))
            text_for_embedding = re.sub(r"<image-[^>]+>", "", raw_text_with_tags).strip()
            footer_top = clean(" ".join(footer_top_lines))
            footer_bottom = clean(" ".join(footer_bottom_lines))

            # NOTE(review): the section name is the bottom footer minus its
            # last 7 characters - presumably a fixed-width page marker at the
            # end of the footer; confirm against the source PDFs.
            # The formerly computed (and never used) page string was dropped:
            # it raised IndexError for a footer consisting only of whitespace.
            section_name = footer_bottom[:-7].strip() if footer_bottom else ""

            # Skip pages that carry (almost) nothing besides their title.
            title_text = title.strip()
            same_exact = (text_for_embedding == title_text)
            almost_same = (
                len(text_for_embedding) <= len(title_text) + 10
                and text_for_embedding.startswith(title_text)
            )
            if same_exact or almost_same:
                continue

            pages_json.append({
                "id": str(uuid.uuid4()),
                "source_file": filename,
                # File name without its extension (was a fixed 4-character cut).
                "client": base_name,
                "page_number": page_number,
                "section_name": section_name,
                "title": title,
                "text_for_embedding": f"Section: {section_name}\nTitle: {title}\n\n{text_for_embedding}",
                "raw_text_with_tags": raw_text_with_tags,
                "image_ids": image_ids,
                "footer_top": footer_top
            })

    return pages_json

#Behandlingsfunktion for mappe
def process_folder(folder_path: str, output_dir: str = None, min_font_size: int = 19):
    """
    Process every PDF file in folder_path and store each page as its own JSON file.
    Example file name: "mailguide_p3.json"

    A PDF that fails is reported on stdout and skipped; the remaining files
    are still processed.

    :param folder_path: Path to the folder containing the PDF files.
    :param output_dir: (Optional) output folder. "./output" is used when empty or None.
    :param min_font_size: Font size forwarded to extract_page_data.
    :raises ValueError: If folder_path is not an existing directory.
    """
    source_dir = Path(folder_path)
    if not source_dir.is_dir():
        raise ValueError(f"{folder_path} er ikke en gyldig mappe")

    output_folder = Path("./output") if not output_dir else Path(output_dir)
    output_folder.mkdir(parents=True, exist_ok=True)

    for pdf_file in source_dir.glob("*.pdf"):
        try:
            print(f"Behandler: {pdf_file.name}")
            pages_json = extract_page_data(
                str(pdf_file),
                output_folder,
                min_font_size=min_font_size,
            )

            # One JSON document per page, named after the PDF and page number
            for page in pages_json:
                page_num = page.get("page_number", "NA")
                target = output_folder / f"{pdf_file.stem}_p{page_num}.json"
                with open(target, "w", encoding="utf-8") as handle:
                    json.dump(page, handle, ensure_ascii=False, indent=2)

            print(f"Gemte {len(pages_json)} sider fra {pdf_file.name} som enkeltfiler.")

        except Exception as e:
            print(f"Fejl ved behandling af {pdf_file.name}: {e}")

In [None]:
# Run the extraction for every PDF in ./knowledgebase.
# output_dir is reused by the upload cells further down.
output_dir = "./billede_test0707"
process_folder("./knowledgebase", output_dir)  # alt. arguments: "./knowledgebase", "./json_output3"

# Resulting layout:
# - output_dir
#  - - <pdf name>_p<page>.json
#  - - <pdf name>_p<page>.json
#  - - images/<pdf name>/<image id>.png


Upload til Cosmos DB

In [16]:
from azure.cosmos import CosmosClient, PartitionKey


In [14]:


# Cosmos DB access.
# The account key is a secret and must never be stored in the notebook: it is
# read from the COSMOS_KEY environment variable (KeyError if it is not set).
# The key that used to be hard-coded here has been exposed and should be
# rotated in the Azure portal.
COSMOS_ENDPOINT = os.environ.get(
    "COSMOS_ENDPOINT", "https://valeur-rag-cosmosdb.documents.azure.com:443/"
)
COSMOS_KEY = os.environ["COSMOS_KEY"]
DATABASE_NAME = "knowledgebase_db"
CONTAINER_NAME = "kb_container"
PARTITION_KEY_FIELD = "/source_file"

# Connect to Cosmos DB
client = CosmosClient(COSMOS_ENDPOINT, COSMOS_KEY)
db = client.get_database_client(DATABASE_NAME)
container = db.get_container_client(CONTAINER_NAME)

In [19]:
# Upload every .json file in the output folder to Cosmos DB
VECTOR_DIR = Path(output_dir)
for path in VECTOR_DIR.glob("*.json"):
    with path.open("r", encoding="utf-8") as f:
        doc = json.load(f)

    # source_file is the partition key value: it must be a non-blank string
    pk = doc.get("source_file")
    if not (isinstance(pk, str) and pk.strip()):
        print(f"❌ Ugyldig partition key i: {path.name}")
        continue

    # Fall back to the file stem when the document has no usable string id
    if not (isinstance(doc.get("id"), str) and doc["id"].strip()):
        doc["id"] = path.stem

    try:
        # Serialization probe - catches values such as datetime before the upload
        json.dumps(doc)

        print(f"🔍 {path.name} - source_file = {pk}, id = {doc['id']}")
        container.upsert_item(doc)
        print(f"✅ Uploaded: {path.name}")
    except Exception as e:
        # Show the offending document and stop at the first failure
        print(f"❌ Fejl ved {path.name}: {e}")
        print(json.dumps(doc, indent=2))
        break
    finally:
        print("🚀 Upload forsøg færdig")
print("done")

🔍 Airtox_p1.json - source_file = Airtox.pdf, id = f7e03f24-b9ec-41b8-9fc4-252ad71fdca7
✅ Uploaded: Airtox_p1.json
🚀 Upload forsøg færdig
🔍 Airtox_p10.json - source_file = Airtox.pdf, id = f02f19d9-3165-4fd3-863f-a58450dc8261
✅ Uploaded: Airtox_p10.json
🚀 Upload forsøg færdig
🔍 Airtox_p11.json - source_file = Airtox.pdf, id = 67d231a3-02d6-4266-a5cd-1f5f215d961d
✅ Uploaded: Airtox_p11.json
🚀 Upload forsøg færdig
🔍 Airtox_p12.json - source_file = Airtox.pdf, id = fda46e25-e851-403f-b11c-49300b44b5e7
✅ Uploaded: Airtox_p12.json
🚀 Upload forsøg færdig
🔍 Airtox_p13.json - source_file = Airtox.pdf, id = f0fecb81-224e-41eb-a9f9-e3d019afdb4b
✅ Uploaded: Airtox_p13.json
🚀 Upload forsøg færdig
🔍 Airtox_p14.json - source_file = Airtox.pdf, id = 3405e42c-bddf-429b-959f-45be56057010
✅ Uploaded: Airtox_p14.json
🚀 Upload forsøg færdig
🔍 Airtox_p15.json - source_file = Airtox.pdf, id = 7d77e5b3-96d1-4596-b481-4de2aa2c913c
✅ Uploaded: Airtox_p15.json
🚀 Upload forsøg færdig
🔍 Airtox_p2.json - source_fil

In [None]:
import os
from azure.storage.blob import BlobServiceClient, ContentSettings

def upload_images_to_blob_storage(folder_path, connection_string, container_name):
    """
    Upload every PNG file in a folder and its sub-folders to Azure Blob Storage.

    The blob name is the file's path relative to ``folder_path`` (with forward
    slashes), so the folder structure is preserved inside the container.
    Existing blobs with the same name are overwritten.

    Args:
        folder_path (str): Root folder with images and sub-folders.
        connection_string (str): Azure Blob Storage connection string.
        container_name (str): Name of the blob container; created if missing.

    Returns:
        list: Blob names (relative paths) of the successfully uploaded files.
    """
    uploaded_files = []

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    # Create the container only when it really is missing. The former
    # catch-all `except Exception` also tried to create it after
    # authentication or network failures, which hid the actual error.
    if not container_client.exists():
        container_client.create_container()

    # Walk all files in all sub-folders. Only .png files are uploaded; extend
    # the suffix check (and the content type) if other formats are produced.
    for root, _, files in os.walk(folder_path):
        for filename in files:
            if not filename.lower().endswith(".png"):
                continue

            file_path = os.path.join(root, filename)

            # Path relative to the root folder, normalised to "/" for blob names
            relative_path = os.path.relpath(file_path, folder_path).replace("\\", "/")

            blob_client = container_client.get_blob_client(relative_path)

            with open(file_path, "rb") as data:
                blob_client.upload_blob(
                    data,
                    overwrite=True,
                    # Fixed MIME type: deriving it from the raw extension
                    # produced "image/PNG" for upper-case file names.
                    content_settings=ContentSettings(content_type="image/png")
                )
            uploaded_files.append(relative_path)
            print(f"✅ Uploadet: {relative_path}")

    print("🎉 Alle billeder er uploadet.")
    return uploaded_files
# The connection string embeds the storage account key, so it is read from the
# AZURE_STORAGE_CONNECTION_STRING environment variable instead of being written
# into the notebook. The key that was hard-coded here should be rotated.
upload_images_to_blob_storage(
    output_dir + "/images",
    os.environ["AZURE_STORAGE_CONNECTION_STRING"],
    "kb-images-test-upload",
)

In [31]:
# The connection string embeds the storage account key, so it is read from the
# AZURE_STORAGE_CONNECTION_STRING environment variable instead of being written
# into the notebook. The key that was hard-coded here should be rotated.
upload_images_to_blob_storage(
    output_dir + "/images",
    os.environ["AZURE_STORAGE_CONNECTION_STRING"],
    "kb-images-test-upload",
)