In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Run a quick validation that we have an entry for the OPEN_API_KEY within environment variables
assert "OPENAI_API_KEY" in os.environ, "OPENAI_API_KEY environment variable must be set"
assert "LLAMA_CLOUD_API_KEY" in os.environ, "LLAMA_CLOUD_API_KEY environment variable must be set"

# Define Parsing Model as mlflow "Model from code"

In [2]:
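# NOTE: this entire cell is commented out and therefore does nothing when run.
# It appears to be a draft of the ChatGPT file-upload parsing model, kept for
# reference only -- TODO confirm whether it can be deleted.
# %%writefile "tmp/chatgpt_file_upload_model.py"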

# import os
# from openai import OpenAI, AsyncOpenAI
# from openai import files
# from typing import List, Dict, Any
# from mlflow.pyfunc import PythonModel
# from mlflow.models import set_model
# import asyncio
# import json
# import time


# class ChatgptFileParsing(PythonModel):
#     def __init__(self):
#         self.client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
#         self.model_str = "gpt-4o-mini"

#         # Define the prompt
#         self.prompt = """Answer from file context:
#         This PDF is a German document proposing an amendment to a Swiss federal law, or introducing a new federal law. 
#         Parse the document structure and return it in JSON format.
#         - Respond with only JSON without using markdown code blocks.
#         - The structure (or outline) should be a hierarchy of titles, sections, articles etc. There may also be one or more appendices.
#         - A section title consists of a roman numeral.
#         - Make sure you don't skip any headings and text inside paragraphs.
#         - Itemize each paragraph inside the articles as well.
#         - For each article, put its title separately from the actual paragraphs.
#         - For each paragraph, include its number or letter (as in the original document) in a separate JSON item.
#         - When there is a letter-indexed list inside of a paragraph, break out the list items as children of the paragraph. Make sure to place the index (letter or number) separately from the list item text.
#         - Place footnotes in their own JSON element. Replace the references to footnotes in the text with '{{footnote_id}}'.
#         - List all sections in the document.
#         - List all articles for each section.
#         - List all paragraphs for each article.
#         - Return a valid json. Don't fill in placeholders like "// List articles under this section".

#         JSON structure:
#         {
#             "document_title": "",
#             "amendment": "",
#             "sections": [
#                 {
#                 "section": "",
#                 "articles": [
#                     "article": "",
#                     "title": "",
#                     "minorities": "",
#                     "text": "",
#                     "paragraphs": [
#                     {
#                         "number": "",
#                         "text": "",
#                         "list": [
#                         {
#                             "index": "",
#                             "text": "",
#                         },
#                         ]
#                     },
#                     ],
#                 ],
#                 },
#             ],
#             "footnotes": [
#                 {
#                 "footnote_id": "",
#                 "text": "",
#                 },
#             ],
#         }
#         """

#     async def upload_file(self, filename: str) -> Dict[str, Any]:
#         # Upload the file to OpenAI
#         file = await self.client.files.create(
#             file=open(filename, "rb"), 
#             purpose="assistants"
#         )
#         return file

#     async def delete_file(self, file_id):
#         # Delete file from OpenAI
#         try:
#             response = await self.client.files.delete(file_id)
#             print(f"File {file_id} deleted successfully.")
#         except Exception as e:
#             print(f"Failed to delete file {file_id}: {e}")

#     async def parse_file(self, file_path: str, semaphore: asyncio.Semaphore) -> Dict[str, str]:
#         async with semaphore:
#             # Upload the file to OpenAI
#             print(f"Uploading file {file_path}...")
#             file = await self.upload_file(file_path)

#             # Create assistant
#             pdf_assistant = await self.client.beta.assistants.create(
#                 name="PDF assistant",
#                 model=self.model_str,
#                 description="An assistant to extract the contents of PDF files.",
#                 tools=[{"type": "file_search"}]
#             )

#             # Create thread
#             thread = await self.client.beta.threads.create(
#                 messages=[
#                     {
#                     "role": "user",
#                     "content": self.prompt,
#                     # Attach the new file to the message.
#                     "attachments": [
#                         { "file_id": file.id, "tools": [{"type": "file_search"}] }
#                     ],
#                     }
#                 ]
#             )

#             # Run thread
#             print("Running thread...")
#             run = await self.client.beta.threads.runs.create_and_poll(
#                 thread_id=thread.id, 
#                 assistant_id=pdf_assistant.id, 
#                 timeout=60
#             )
#             print("Thread completed.")

#             run_status = await self.client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
#             print(run_status.status)
#             if run_status.status != 'completed':
#                 return {'status': run_status.status}, run.usage.prompt_tokens, run.usage.completion_tokens

#             # Get messages
#             # messages = await self.client.beta.threads.messages.list(thread_id=thread.id)

#             messages_cursor = await self.client.beta.threads.messages.list(thread_id=thread.id)
#             messages = [message for message in messages_cursor]
#             # print(messages)

#             # Output text
#             text_res = messages[0][1][0].content[0].text.value

#             # Convert text to JSON
#             dict_res = json.loads(text_res)
                                
#             # Delete file from OpenAI
#             await self.delete_file(file.id)

#         return dict_res, run.usage.prompt_tokens, run.usage.completion_tokens


#     async def predict(self, context, model_input: List[str]) -> List[Dict[str, Any]]:
#         max_concurrent_tasks = 3
#         semaphore = asyncio.Semaphore(max_concurrent_tasks)
#         tasks = [self.parse_file(filename, semaphore) for filename in model_input]

#         results = await asyncio.gather(*tasks)

#         parsed_dicts, num_input_tokens, num_output_tokens = zip(*results)            
        
#         # # Use multiprocessing to parse multiple files concurrently
#         # with multiprocessing.Pool() as pool:
#         #     results = pool.map(parse_file, model_input)

#         return parsed_dicts, num_input_tokens, num_output_tokens
        
#         # results = []
#         # num_input_tokens = []
#         # num_output_tokens = []
#         # for file in model_input:
#         #     parsed_dict, num_input_token, num_output_token = parse_file(file)
#         #     results.append(parsed_dict)
#         #     num_input_tokens.append(num_input_token)
#         #     num_output_tokens.append(num_output_token)
        
#         # return results, num_input_tokens, num_output_tokens


# # Specify which definition in this script represents the model instance
# set_model(ChatgptFileParsing())

# Load Test PDF Files

In [23]:
# Create dataset

from pathlib import Path
from src.processing.process import extract_text_from_pdf
import pandas as pd
from typing import List

sample_documents_dir = "../sample-documents"

def load_file_paths(dir_path: str, pattern: str = '**/*') -> List[str]:
    """
    Collect the paths of all files found below a directory.

    Only regular files are returned; directories matched by the glob pattern
    are skipped. The result is sorted so that the order is reproducible
    across runs and file systems. The collected paths are also printed.

    Args:
        dir_path: Directory to search.
        pattern: Glob pattern relative to ``dir_path``. The default matches
            everything recursively; pass e.g. ``'**/*.pdf'`` to restrict the
            result to PDF files.

    Returns:
        The matching file paths as strings, in sorted order. The list is
        empty when nothing matches.
    """
    root = Path(dir_path)
    # Skip directories: the glob yields them too, but they are not documents.
    file_paths = sorted(str(p) for p in root.glob(pattern) if p.is_file())
    print("input files:")
    print("\n".join(file_paths))

    return file_paths

# Collect the sample document paths; `file_paths` is reused by the evaluation
# and experimentation cells further down.
file_paths = load_file_paths(sample_documents_dir)


input files:
../sample-documents/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2.pdf
../sample-documents/bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565).pdf
../sample-documents/jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung).pdf


# Evaluate Parsing Model and Log to MLflow

In [40]:
# Prevent 'RuntimeError: This event loop is already running' in Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()

import json
from pathlib import Path

import mlflow
from src.evaluation import evaluate

# --- Parameters ---------------------------------------------------------------
# `file_paths` and `extract_text_from_pdf` come from the "Load Test PDF Files"
# cell above, which must have been executed first.
experiment_name = "structure-extraction"
tracking_uri = "http://127.0.0.1:5000"
# model_path="../models/chatgpt_file_upload_model.py"
# model_path="../models/llama_parse_markdown_model.py"
model_path = "../models/llama_parse_chatgpt_model.py"

# Artifact/metric suffix -> backend name passed to `extract_text_from_pdf`.
text_extractors = {"pypdf2": "PyPDF2", "pdfminer": "pdfminer"}


def _total(value):
    """Return ``value`` unchanged if it is a number, otherwise the sum of its items.

    The model output may report token counts (and costs) either per file or as
    a single aggregate; the recorded output of the print cell below shows
    scalar counts, for which a plain ``sum()`` would raise ``TypeError``.
    """
    return value if isinstance(value, (int, float)) else sum(value)


# The averages below divide by the number of files, so refuse to start a run
# (and log a model) when there is nothing to evaluate.
num_files = len(file_paths)
if num_files == 0:
    raise ValueError("file_paths is empty; nothing to evaluate")

# --- Evaluate the model and log the results to MLflow ---------------------------
mlflow.set_tracking_uri(tracking_uri)
exp_info = mlflow.get_experiment_by_name(experiment_name)
exp_id = exp_info.experiment_id if exp_info else mlflow.create_experiment(experiment_name)

model_file = Path(model_path).name  # e.g. "llama_parse_chatgpt_model.py"
description = Path(model_path).stem  # same name without the ".py" suffix

with mlflow.start_run(experiment_id=exp_id, description=description):
    # Log the model ("model from code": the path of the model script is logged)
    model_info = mlflow.pyfunc.log_model(
        python_model=model_path,
        artifact_path="model",
        #input_example=["../sample-documents/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2.pdf"],
    )

    # Load the model back from the run and predict on all files
    model = mlflow.pyfunc.load_model(model_info.model_uri)
    result_dict = model.predict(file_paths)

    # "parsed_dicts" is mandatory; the remaining keys are optional.
    parsed_dicts = result_dict["parsed_dicts"]
    markdown_texts = result_dict.get("markdown_texts", None)
    num_input_tokens = result_dict.get("num_input_tokens", None)
    num_output_tokens = result_dict.get("num_output_tokens", None)
    model_str = result_dict.get("model_str", None)

    # General
    mlflow.set_tag("description", model_file)
    mlflow.log_metric("num_files", num_files)
    mlflow.log_metric("parsed_files", sum(p.get("status") != "failed" for p in parsed_dicts))
    mlflow.log_metric("valid_schema", sum(evaluate.validate_json_schema(parsed_dicts)))

    # Costs (only when the model reports its name and token usage)
    if model_str and num_input_tokens and num_output_tokens:
        mlflow.log_metric("num_input_tokens", _total(num_input_tokens))
        mlflow.log_metric("num_output_tokens", _total(num_output_tokens))
        costs = evaluate.get_costs(model_str, num_input_tokens, num_output_tokens)
        mlflow.log_metric("costs", _total(costs))
        mlflow.log_param("model_str", model_str)

    # Extract reference text from the PDFs with each extraction backend
    reference_texts = {
        suffix: [extract_text_from_pdf(p, backend) for p in file_paths]
        for suffix, backend in text_extractors.items()
    }

    # Compare parsed dictionaries with the extracted text
    for suffix, texts in reference_texts.items():
        percnt_missing_characters, percnt_added_characters = (
            evaluate.percnt_missing_and_added_characters(parsed_dicts, texts)
        )
        mlflow.log_metric(f"avg_percnt_missing_chars_{suffix}", sum(percnt_missing_characters) / num_files)
        mlflow.log_metric(f"avg_percnt_added_chars_{suffix}", sum(percnt_added_characters) / num_files)

    mlflow.log_artifact(model_path)

    html_diffs = {
        suffix: evaluate.compare_texts_html(parsed_dicts, texts)
        for suffix, texts in reference_texts.items()
    }

    # Log the parsed dictionaries, markdown (if any) and HTML diffs per file
    for i, file_path in enumerate(file_paths):
        # Use the stem rather than replacing ".pdf" so that the artifacts of a
        # file never collide on one name, whatever its extension.
        stem = Path(file_path).stem

        artifacts = [(f"{stem}_parsed.json", json.dumps(parsed_dicts[i], indent=4))]
        if markdown_texts:
            artifacts.append((f"{stem}_md.md", markdown_texts[i]))
        for suffix, diffs in html_diffs.items():
            artifacts.append((f"{stem}_{suffix}.html", diffs[i]))

        for artifact_name, artifact_text in artifacts:
            print(artifact_name)
            mlflow.log_text(artifact_text, artifact_name)


[autoreload of code_model_d5670ec0040b4b30b63801f4b92fba96 failed: Traceback (most recent call last):
  File "/Users/thiloweber/miniforge3/envs/demokratis-ml-py312/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/thiloweber/miniforge3/envs/demokratis-ml-py312/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/Users/thiloweber/miniforge3/envs/demokratis-ml-py312/lib/python3.12/importlib/__init__.py", line 130, in reload
    raise ModuleNotFoundError(f"spec not found for the module {name!r}", name=name)
ModuleNotFoundError: spec not found for the module 'code_model_d5670ec0040b4b30b63801f4b92fba96'
]
2024/11/17 04:58:31 INFO mlflow.types.utils: Unsupported type hint: typing.Dict[str, typing.Any], skipping schema inference
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 514.56it/s]


Started parsing the file under job_id a1b0f66c-25ca-4dd2-bf9c-4dcf52dd5abc
Started parsing the file under job_id 81bdc070-9627-4d79-8a99-62c415451c91
Started parsing the file under job_id 13e27603-a7d3-4130-8364-f151f5d52702


2024/11/17 04:59:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run magnificent-ant-231 at: http://127.0.0.1:5000/#/experiments/359746332828124238/runs/cb6e055261494916844df6a002280b0b.
2024/11/17 04:59:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/359746332828124238.


51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_parsed.json
51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_md.md
51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_pypdf2.html
51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_pdfminer.html
bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_parsed.json
bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_md.md
bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_pypdf2.html
bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_pdfminer.html
jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung)_parsed.json
jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung)_md.md
jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung)_pypdf2.html
jpxdh228 - zh - Entwu

In [38]:
# Show the model name and token usage taken from the evaluation cell's result.
print(model_str, num_input_tokens, num_output_tokens)

None 38264 19602


# Trying out stuff...

In [9]:
# Prevent 'RuntimeError: This event loop is already running' in Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()

# Make the parent of the working directory importable so that the `models`
# package resolves. NOTE: this rebinds `project_root` from the setup cell.
project_root = os.path.abspath(os.path.join(current_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from models.llama_parse_chatgpt_model import LlamaParseMarkdownParser

model = LlamaParseMarkdownParser()

parsed_files = []
markdown_texts = []
# Only the first file is processed while experimenting.
for file_path in file_paths[:1]:
    # Parse the documents (pages) for file.
    # This assignment must stay active: without it `documents` is undefined on
    # a fresh kernel (NameError) or silently reuses data from an earlier run.
    # NOTE(review): presumably this calls the external parsing service and may
    # incur API usage -- confirm before re-running repeatedly.
    documents = model.pdf_parser.load_data(file_path)

    # Insert footnotes into the text
    documents = model.markdown_insert_footnotes_into_text(documents)

    markdown_texts.append("\n".join([doc.text for doc in documents]))

    # Extract nodes from the documents
    nodes = model.extract_nodes(documents)

    # Convert nodes to JSON schema
    json_schema = model.nodes_to_json(nodes)
    parsed_files.append(json_schema)


[{'label': '', 'type': 'document', 'content': ['Vorentwurf'], 'children': []}, {'label': '', 'type': 'document', 'content': ['Bundesgesetz über Radio und Fernsehen (RTVG)', '(Abgabenanteile für lokale Radio- und regionale Fernsehveranstalter und Fördermassnahmen zugunsten der elektronischen Medien)', 'Änderung vom ...', 'Die Bundesversammlung der Schweizerischen Eidgenossenschaft, nach Einsicht in den Bericht der eidgenössischen Kommission für Verkehr und Fernmeldewesen des Ständerates [Datum des Entscheids der Kommission][^BBl 2024 ...] und in die Stellungnahme des Bundesrates vom [Datum][^BBl 2024 ...], beschliesst:', 'Minderheit (Friedli Esther, Stark) Nichteintreten', 'I', 'Das Bundesgesetz vom 24. März 2006[^SR 784.40] über Radio und Fernsehen wird wie folgt geändert:'], 'children': [{'label': '1', 'type': 'heading', 'content': ['Bundesgesetz über Radio und Fernsehen (RTVG)'], 'children': []}, {'label': '2', 'type': 'heading', 'content': ['(Abgabenanteile für lokale Radio- und reg

In [10]:
# Display the JSON structures collected in `parsed_files` by the experimentation cell above.
parsed_files

[{'label': '',
  'type': 'document',
  'content': [],
  'children': [{'label': '1',
    'type': 'heading',
    'content': ['Bundesgesetz über Radio und Fernsehen (RTVG)'],
    'children': []},
   {'label': '2',
    'type': 'heading',
    'content': ['(Abgabenanteile für lokale Radio- und regionale Fernsehveranstalter und Fördermassnahmen zugunsten der elektronischen Medien)'],
    'children': []},
   {'label': '3',
    'type': 'heading',
    'content': ['Änderung vom ...'],
    'children': []},
   {'label': '4',
    'type': 'heading',
    'content': ['Die Bundesversammlung der Schweizerischen Eidgenossenschaft,...'],
    'children': []},
   {'label': '5',
    'type': 'heading',
    'content': ['Minderheit (Friedli Esther, Stark) Nichteintreten'],
    'children': []},
   {'label': '6', 'type': 'heading', 'content': ['I'], 'children': []},
   {'label': '7',
    'type': 'heading',
    'content': ['Das Bundesgesetz vom 24. März 2006 über Radio und Fernsehen wird wie folgt geändert:'],
    