In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Run a quick validation that we have an entry for the OPEN_API_KEY within environment variables
assert "OPENAI_API_KEY" in os.environ, "OPENAI_API_KEY environment variable must be set"
assert "LLAMA_CLOUD_API_KEY" in os.environ, "LLAMA_CLOUD_API_KEY environment variable must be set"

# Load Test PDF Files

In [23]:
# Create dataset

from pathlib import Path
from src.processing.process import extract_text_from_pdf
import pandas as pd
from typing import List

sample_documents_dir = "../sample-documents"

def load_file_paths(dir_path: str) -> List[str]:
    """
    Create a DataFrame with the file paths and the extracted text
    """
    dir_path = Path(dir_path)
    file_paths = list(dir_path.glob('**/*'))
    file_paths = [str(Path(fp).relative_to(dir_path)) for fp in file_paths if Path(fp).suffix == ".pdf"]
    print("input files:")
    print("\n".join(file_paths))
    # df = pd.DataFrame([str(s) for s in file_paths], columns=["file_path"])

    return file_paths

# Load the file paths
file_paths = load_file_paths(sample_documents_dir)


input files:
51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2.pdf
bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565).pdf
jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung).pdf
ur/consultation_4938_2024-10-25/52908-de-VARIOUS_TEXT-8c60b02a6fcdec2c36cdf03798228246c7e3ec17.pdf
be/consultation_4815_2024-07-05/51795-de-VARIOUS_TEXT-311a4a0b45e158432d632d85e8923e7f5a44703c.pdf
be/consultation_4815_2024-07-05/51796-de-VARIOUS_TEXT-d7895c24109e0cbf25703a63e441e0e88ee8ac66.pdf
gr/consultation_4950_2024-11-12/53093-de-DRAFT-9db14301e7aa5a6ad4dcd2159d727dfed7cb44bb.pdf
ow/consultation_4898_2024-09-10/52543-de-VARIOUS_TEXT-82873ab353d57790835154655a8354602662464a.pdf
ow/consultation_4898_2024-09-10/52544-de-VARIOUS_TEXT-b15ceaeede1e67f131613005de5d9e520bab7e86.pdf
ch/consultation_4956_2024-11-21/53136-fr-DRAFT-8ebcf00d65424618bcfd8b2b792bfacdc7514935.pdf
ch/consultation_4956_2024-11-21/53137-it-DRAFT-9231653af57

In [63]:
file_paths[:2]

base_path = Path("..")
filename = Path(file_paths[0]).stem
file_paths[3]#.split("/")[-2:]



'ur/consultation_4938_2024-10-25/52908-de-VARIOUS_TEXT-8c60b02a6fcdec2c36cdf03798228246c7e3ec17.pdf'

In [77]:
raw_texts

[]

# Evaluate Parsing Model and Log to MLflow

Run local MLflow tracking server in terminal, this creates and writes to the directories `research/mlruns` and `research/mlartifacts`:
````
cd research && mlflow ui
````


In [115]:
# Prevent 'RuntimeError: This event loop is already running' in Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()


import mlflow
from src.evaluation import evaluate
import json

# Define parameters
file_paths = file_paths
experiment_name="structure-extraction"
# model_path="../models/chatgpt_file_upload_model.py"
model_path="../models/llama_parse_markdown_model.py"
# model_path="../models/llama_parse_chatgpt_model.py"


"""
Evaluate the model and log the results to MLflow
"""
# Define the relative path for MLflow tracking
mlflow.set_tracking_uri("http://127.0.0.1:5000")
exp_info = mlflow.get_experiment_by_name(experiment_name)
exp_id = exp_info.experiment_id if exp_info else mlflow.create_experiment(experiment_name)
description = model_path.split("/")[-1].replace(".py", "")

file_paths2 = file_paths[:]

with mlflow.start_run(experiment_id=exp_id, description=description):
    # Log the model
    model_info = mlflow.pyfunc.log_model(
        python_model=model_path,
        artifact_path="model",
        #input_example=["../sample-documents/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2.pdf"],
    )

    # Load the model
    model = mlflow.pyfunc.load_model(model_info.model_uri)

    # Predict
    result_dict = model.predict(file_paths2)

    parsed_dicts = result_dict["parsed_files"]
    raw_texts = result_dict.get("raw_texts", None)
    markdown_texts = result_dict.get("parsed_markdowns", None)
    num_input_tokens = result_dict.get("num_input_tokens", None)
    num_output_tokens = result_dict.get("num_output_tokens", None)
    model_str = result_dict.get("model_str", None)

    # General
    mlflow.set_tag("description", model_path.split("/")[-1])
    mlflow.log_metric("num_files", len(file_paths2))
    mlflow.log_metric("parsed_files", sum([p.get("status") != "failed" for p in parsed_dicts]))
    valid_schemas = evaluate.validate_json_schema(parsed_dicts)
    mlflow.log_metric("valid_schema", sum(valid_schemas))
    
    # Costs
    if model_str and num_input_tokens and num_output_tokens:
        mlflow.log_metric("num_input_tokens", sum(num_input_tokens))
        mlflow.log_metric("num_output_tokens", sum(num_output_tokens))
        costs = evaluate.get_costs(model_str, num_input_tokens, num_output_tokens)
        mlflow.log_metric("costs", sum(costs))
        mlflow.log_param("model_str", model_str)

    # Extract text from PDFs using PyPDF2 and pdfminer
    pypdf2_texts = [extract_text_from_pdf("../sample-documents/" + p, "PyPDF2") for p in file_paths2]
    pdfminer_texts = [extract_text_from_pdf("../sample-documents/" + p, "pdfminer") for p in file_paths2]
    
    # Compare parsed dictionaries with the extracted text 
    percnt_missing_characters, percnt_added_characters = evaluate.percnt_missing_and_added_characters(parsed_dicts, pypdf2_texts)
    mlflow.log_metric("avg_percnt_missing_chars_pypdf2", sum(percnt_missing_characters) / len(file_paths2))
    mlflow.log_metric("avg_percnt_added_chars_pypdf2", sum(percnt_added_characters) / len(file_paths2))
    mlflow.log_text(
        pd.DataFrame(
            zip(file_paths2, percnt_missing_characters, percnt_added_characters, valid_schemas), 
            columns=["file_path", "percnt_missing_characters", "percnt_added_characters", "valid_schema"]
        ).to_csv(index=False),
        "evaluation_all_files_pypdf2.csv"
    )
    percnt_missing_characters, percnt_added_characters = evaluate.percnt_missing_and_added_characters(parsed_dicts, pdfminer_texts)
    mlflow.log_metric("avg_percnt_missing_chars_pdfminer", sum(percnt_missing_characters) / len(file_paths2))
    mlflow.log_metric("avg_percnt_added_chars_pdfminer", sum(percnt_added_characters) / len(file_paths2))
    mlflow.log_text(
        pd.DataFrame(
            zip(file_paths2, percnt_missing_characters, percnt_added_characters, valid_schemas), 
            columns=["file_path", "percnt_missing_characters", "percnt_added_characters", "valid_schema"]
        ).to_csv(index=False),
        "evaluation_all_files_pdfminer.csv"
    )

    mlflow.log_artifact(model_path)
    
    html_diff_pypdf2 = evaluate.compare_texts_html(parsed_dicts, pypdf2_texts)
    html_diff_pdfminer = evaluate.compare_texts_html(parsed_dicts, pdfminer_texts)
    if raw_texts:
        html_diff_raw = evaluate.compare_texts_html(parsed_dicts, raw_texts)
    
    # Log the parsed dictionaries and HTML diffs
    for i in range(len(file_paths2)):
        # filename_parsed = file_paths2[i].split('/')[-1].replace(".pdf", "_parsed.json")
        # print(filename_parsed)
        # mlflow.log_text(json.dumps(parsed_dicts[i], indent=4), filename_parsed)
        
        # if markdown_texts:
        #     filename_md = file_paths2[i].split('/')[-1].replace(".pdf", "_md.md")
        #     print(filename_md)
        #     mlflow.log_text(markdown_texts[i], filename_md)

        # filename_pypdf2 = file_paths2[i].split('/')[-1].replace(".pdf", "_pypdf2.html")
        # print(filename_pypdf2)
        # mlflow.log_text(html_diff_pypdf2[i], filename_pypdf2)

        # filename_pdfminer = file_paths2[i].split('/')[-1].replace(".pdf", "_pdfminer.html")
        # print(filename_pdfminer)
        # mlflow.log_text(html_diff_pdfminer[i], filename_pdfminer)

        base_path = Path("..")
        html_path = base_path / "sample-outputs" / file_paths2[i].replace(".pdf", "_pypdf2_diff.html")
        with open(html_path, "w") as f:
            f.write(html_diff_pypdf2[i])



2024/12/13 17:02:00 INFO mlflow.types.utils: Unsupported type hint: typing.Dict[str, typing.Any], skipping schema inference
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 455.30it/s]


Loading cached result from ../intermediate_results/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_llamaparse.pkl
Writing parsed markdown with footnotes to ../intermediate_results/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_footnotes.md
Writing parsed JSON to ../sample-outputs/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_json_schema.json
Loading cached result from ../intermediate_results/bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_llamaparse.pkl
Writing parsed markdown with footnotes to ../intermediate_results/bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_footnotes.md
Writing parsed JSON to ../sample-outputs/bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_json_schema.json
Loading cached result from ../intermediate_results/jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung)_llamaparse.

2024/12/13 17:02:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run redolent-flea-791 at: http://127.0.0.1:5000/#/experiments/359746332828124238/runs/b3603405136f440bb7f4637d9404e644.
2024/12/13 17:02:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/359746332828124238.


# Remove all "deleted" artifacts and runs

In [None]:
import shutil

mlflow.get_run("0b1a0326da3b40a18b2f5600965850ec").info.lifecycle_stage
display(mlflow.search_experiments())

experient_id = "359746332828124238"
# run_view_type=2 == only deleted runs
df = mlflow.search_runs(experiment_ids=[experient_id], run_view_type=2)

for run_id in df.run_id:
    # Define the directory to be deleted
    dir_to_delete = f"/Users/thiloweber/Repositories/demokratis-ml/research/mlartifacts/{experient_id}/{run_id}"
    shutil.rmtree(dir_to_delete)

    dir_to_delete = f"/Users/thiloweber/Repositories/demokratis-ml/research/mlruns/{experient_id}/{run_id}"
    shutil.rmtree(dir_to_delete)

mlflow.search_runs(experiment_ids=[experient_id], run_view_type=2)


[<Experiment: artifact_location='mlflow-artifacts:/359746332828124238', creation_time=1731813855663, experiment_id='359746332828124238', last_update_time=1731813855663, lifecycle_stage='active', name='structure-extraction', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1731813834126, experiment_id='0', last_update_time=1731813834126, lifecycle_stage='active', name='Default', tags={}>]

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time


# Trying out stuff...

In [69]:
# Test: LlamaParse API request, continous_mode and JSON response

from llama_parse import LlamaParse

parsing_instruction = """
This PDF is a German document proposing an amendment to a Swiss federal law, or introducing a new federal law.
- Format all article titles as headers.
- Combine multiple lines for each paragraph and list item into one line, also if words are split at the end of a line with "-".
- List all paragraphs for each article as unordered lists. Set list indices in square brackets, where existing, e.g. "[1.], [a.], [abis.], ...".
- Text on top of a new page might still belong to the article on the previous page. Merge this text with the previous page and don't invent artificial headers.
- If there is a footnote section at the bottom of a page, start it with an extra header "# [Fussnoten]" and list footnotes section as unordered list. Each footnote should start with the identifier in square brackets with "^", e.g., "[^1], [^2], ...".
- If there are references to footnotes within the text in the middle of a page, set the references in square brackets with "^", e.g., "[^1], [^2], ...". Sometimes the reference is following a number, e.g., "2006[^3], 1997[^4], ...".
- Do not insert a note or any other text that doesn't exist in the document.
"""

params = {
    'parsing_instruction': parsing_instruction,
    'language': 'de',
    "bounding_box": "0.09,0,0.07,0",
    "take_screenshot": False,
    #"premium_mode": True,
    "continuous_mode": True,
    # "page_separator": "\n=================\n",
}

# Initialize the LlamaParse parser
pdf_parser = LlamaParse(**params)

pdf_parser.load_data(file_paths[0])

import requests
import time

headers = {"Authorization": f'Bearer {os.environ["LLAMA_CLOUD_API_KEY"]}'}
# file_path = "./attention.pdf"
base_url = "https://api.cloud.llamaindex.ai/api/parsing"

# with open(file_path, "rb") as f:
#     # mime_type = mimetypes.guess_type(file_path)[0]
#     # files = {"file": (f.name, f, mime_type)}
files = [(
    'file', 
    (os.path.basename(file_path), open(file_path, 'rb'), 'application/pdf')
    ) for file_path in file_paths[:1]]


# send the request, upload the file
url = f"{base_url}/upload"
response = requests.post(url, headers=headers, files=files)

response.raise_for_status()
# get the job id for the result_url
job_id = response.json()["id"]
result_type = "json"  # or "markdown" or "text"
result_url = f"{base_url}/job/{job_id}/result/{result_type}"

# check for the result until its ready
while True:
    response = requests.get(result_url, headers=headers, data=params)
    if response.status_code == 200:
        break

    time.sleep(2)

# download the result
if result_type == "json":
    result = response.json()
    output = result["pages"]
    print(json.dumps(result['job_metadata'], indent=4))
    print(json.dumps(output, indent=4))
else:
    result = response.json()
    output = result[result_type]
    print(output)


Started parsing the file under job_id 494a43d1-45de-49f6-8497-b172fc5bbe43
{
    "credits_used": 10.0,
    "job_credits_usage": 0,
    "job_pages": 0,
    "job_is_cache_hit": true,
    "credits_max": 1000
}
[
    {
        "page": 1,
        "text": "Schweizerische Eidgenossenschaft                                                       \u00ab$$QrCode\u00bb\nContederation suisse                              \u00ab$$e-seal\u00bb\nContederazione Svizzera\nConfederaziun svizra\n                                                                                       Vorentwurf\nBundesgesetz\n\u00fcber Radio und Fernsehen\n(RTVG)\n\n(Abgabenanteile f\u00fcr lokale Radio- und regionale Fernsehveranstalter\nund F\u00f6rdermassnahmen zugunsten der elektronischen Medien)\n\n\u00c4nderung vom \u2026\n\nDie Bundesversammlung der Schweizerischen Eidgenossenschaft,\nnach Einsicht in den Bericht der eidgen\u00f6ssischen Kommission f\u00fcr Verkehr und\nFernmeldewesen des St\u00e4nderates [Datum des Ent

In [70]:
# Compare markdown response

result_type = "markdown"  # or "json" or "text"
result_url = f"{base_url}/job/{job_id}/result/{result_type}"

# check for the result until its ready
while True:
    response = requests.get(result_url, headers=headers, data=params)
    if response.status_code == 200:
        break

    time.sleep(2)

# download the result
if result_type == "json":
    result = response.json()
    output = result["pages"]
    print(json.dumps(result['job_metadata'], indent=4))
    print(json.dumps(output, indent=4))
else:
    result = response.json()
    output = result[result_type]
    print(output)


# Vorentwurf

# Bundesgesetz über Radio und Fernsehen (RTVG)

# (Abgabenanteile für lokale Radio- und regionale Fernsehveranstalter und Fördermassnahmen zugunsten der elektronischen Medien)

# Änderung vom …

Die Bundesversammlung der Schweizerischen Eidgenossenschaft, nach Einsicht in den Bericht der eidgenössischen Kommission für Verkehr und Fernmeldewesen des Ständerates [Datum des Entscheids der Kommission]1 und in die Stellungnahme des Bundesrates vom [Datum]2, beschliesst:

# Minderheit (Friedli Esther, Stark)

Nichteintreten

# I

Das Bundesgesetz vom 24. März 20063 über Radio und Fernsehen wird wie folgt geändert:

# Art. 1 Sachüberschrift, Abs. 1 und 1bis

# Gegenstand und Geltungsbereich

1 Dieses Gesetz regelt:

- a. die Veranstaltung, die Aufbereitung, die Übertragung und den Empfang von Radio- und Fernsehprogrammen;
- b. die Fördermassnahmen zugunsten der elektronischen Medien.

1 BBl 2024 …

2 BBl 2024 …

3 SR 784.40

2024-...
---
# Bundesgesetz über Radio und Fernsehen

