In [4]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(current_dir, '..', '..', '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Run a quick validation that we have an entry for the OPEN_API_KEY within environment variables
assert "OPENAI_API_KEY" in os.environ, "OPENAI_API_KEY environment variable must be set"

# Define the Parsing Model as an MLflow "Model from Code"

In [52]:
%%writefile "tmp/chatgpt_file_upload_model.py"

import os
from openai import OpenAI, AsyncOpenAI
from openai import files
from typing import List, Dict, Any
from mlflow.pyfunc import PythonModel
from mlflow.models import set_model
import asyncio
import json
import time


class ChatgptFileParsing(PythonModel):
    """MLflow pyfunc model that extracts the structure of PDF files with OpenAI.

    For every input file the model uploads the file to OpenAI, creates an
    assistant with the ``file_search`` tool, sends the parsing prompt with the
    file attached, and decodes the assistant's reply as JSON. The uploaded
    file and the assistant are removed again once the file has been handled.
    """

    def __init__(self):
        """Create the async OpenAI client and define the model name and prompt.

        Raises:
            KeyError: if ``OPENAI_API_KEY`` is not set in the environment.
        """
        self.client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
        self.model_str = "gpt-4o"

        # Define the prompt
        # NOTE(review): in the JSON template below the entries of "articles"
        # are not wrapped in braces and there are trailing commas, so the
        # template itself is not valid JSON. It is left untouched here because
        # any change to the prompt changes what the model returns.
        self.prompt = """Answer from file context:
        This PDF is a German document proposing an amendment to a Swiss federal law, or introducing a new federal law. 
        Parse the document structure and return it in JSON format.
        - Respond with only JSON without using markdown code blocks.
        - The structure (or outline) should be a hierarchy of titles, sections, articles etc. There may also be one or more appendices.
        - A section title consists of a roman numeral.
        - Make sure you don't skip any headings and text inside paragraphs.
        - Itemize each paragraph inside the articles as well.
        - For each article, put its title separately from the actual paragraphs.
        - For each paragraph, include its number or letter (as in the original document) in a separate JSON item.
        - When there is a letter-indexed list inside of a paragraph, break out the list items as children of the paragraph. Make sure to place the index (letter or number) separately from the list item text.
        - Place footnotes in their own JSON element. Replace the references to footnotes in the text with '{{footnote_id}}'.
        - List all sections in the document.
        - List all articles for each section.
        - List all paragraphs for each article.
        - Return a valid json. Don't fill in placeholders like "// List articles under this section".

        JSON structure:
        {
            "document_title": "",
            "amendment": "",
            "sections": [
                {
                "section": "",
                "articles": [
                    "article": "",
                    "title": "",
                    "minorities": "",
                    "text": "",
                    "paragraphs": [
                    {
                        "number": "",
                        "text": "",
                        "list": [
                        {
                            "index": "",
                            "text": "",
                        },
                        ]
                    },
                    ],
                ],
                },
            ],
            "footnotes": [
                {
                "footnote_id": "",
                "text": "",
                },
            ],
        }
        """

    async def upload_file(self, filename: str) -> Any:
        """Upload a local file to OpenAI for use with assistants.

        Args:
            filename: path of the file to upload.

        Returns:
            The file object returned by the OpenAI client; its ``id`` is used
            to attach and later delete the file.
        """
        # The context manager closes the handle once the upload has finished.
        with open(filename, "rb") as file_handle:
            return await self.client.files.create(
                file=file_handle,
                purpose="assistants"
            )

    async def delete_file(self, file_id):
        """Delete an uploaded file from OpenAI (best effort; failures are only printed)."""
        try:
            await self.client.files.delete(file_id)
            print(f"File {file_id} deleted successfully.")
        except Exception as e:
            print(f"Failed to delete file {file_id}: {e}")

    async def _delete_assistant(self, assistant_id):
        """Delete an assistant from OpenAI (best effort; failures are only printed)."""
        try:
            await self.client.beta.assistants.delete(assistant_id)
            print(f"Assistant {assistant_id} deleted successfully.")
        except Exception as e:
            print(f"Failed to delete assistant {assistant_id}: {e}")

    async def parse_file(self, file_path: str, semaphore: asyncio.Semaphore) -> tuple:
        """Parse a single file and report the token usage of the run.

        Args:
            file_path: path of the file to parse.
            semaphore: limits how many files are processed concurrently.

        Returns:
            A tuple ``(result, prompt_tokens, completion_tokens)``. ``result``
            is the decoded JSON reply when the run completed, otherwise
            ``{'status': <run status>}``. Token counts are 0 when the run
            carries no usage information.

        Raises:
            json.JSONDecodeError: if the assistant's reply is not valid JSON.
        """
        async with semaphore:
            # Upload the file to OpenAI
            print(f"Uploading file {file_path}...")
            file = await self.upload_file(file_path)

            pdf_assistant = None
            try:
                # Create assistant
                pdf_assistant = await self.client.beta.assistants.create(
                    name="PDF assistant",
                    model=self.model_str,
                    description="An assistant to extract the contents of PDF files.",
                    tools=[{"type": "file_search"}]
                )

                # Create thread
                thread = await self.client.beta.threads.create(
                    messages=[
                        {
                        "role": "user",
                        "content": self.prompt,
                        # Attach the new file to the message.
                        "attachments": [
                            { "file_id": file.id, "tools": [{"type": "file_search"}] }
                        ],
                        }
                    ]
                )

                # Run thread; create_and_poll already returns the run in its
                # final state, so no separate retrieve call is needed.
                print("Running thread...")
                run = await self.client.beta.threads.runs.create_and_poll(
                    thread_id=thread.id,
                    assistant_id=pdf_assistant.id,
                    timeout=60
                )
                print("Thread completed.")

                # Usage may be missing on a run that did not finish normally;
                # report 0 so callers can still sum the token counts.
                usage = run.usage
                prompt_tokens = usage.prompt_tokens if usage else 0
                completion_tokens = usage.completion_tokens if usage else 0

                print(run.status)
                if run.status != 'completed':
                    return {'status': run.status}, prompt_tokens, completion_tokens

                # Get messages; the first entry of the page is the reply to parse
                messages_page = await self.client.beta.threads.messages.list(thread_id=thread.id)
                text_res = messages_page.data[0].content[0].text.value

                # Convert text to JSON
                dict_res = json.loads(text_res)

                return dict_res, prompt_tokens, completion_tokens
            finally:
                # Clean up remote resources on every exit path (success,
                # non-completed run, or exception).
                await self.delete_file(file.id)
                if pdf_assistant is not None:
                    await self._delete_assistant(pdf_assistant.id)

    # The type hints on predict are kept exactly as they were: MLflow reads
    # them when the model is logged. The value actually returned is a 3-tuple,
    # as described in the docstring.
    async def predict(self, context, model_input: List[str]) -> List[Dict[str, Any]]:
        """Parse several files concurrently (at most three at a time).

        Args:
            context: MLflow model context (unused).
            model_input: paths of the files to parse.

        Returns:
            A tuple ``(parsed_dicts, num_input_tokens, num_output_tokens)`` of
            three equally long tuples, one entry per input file and in input
            order. All three are empty when ``model_input`` is empty.
        """
        max_concurrent_tasks = 3
        semaphore = asyncio.Semaphore(max_concurrent_tasks)
        tasks = [self.parse_file(filename, semaphore) for filename in model_input]

        results = await asyncio.gather(*tasks)

        # zip(*[]) yields nothing to unpack, so handle empty input explicitly
        if not results:
            return (), (), ()

        parsed_dicts, num_input_tokens, num_output_tokens = zip(*results)

        return parsed_dicts, num_input_tokens, num_output_tokens


# Specify which definition in this script represents the model instance.
# Note: constructing ChatgptFileParsing reads OPENAI_API_KEY from the
# environment, so the variable must be set whenever this script is executed.
set_model(ChatgptFileParsing())

Writing tmp/chatgpt_file_upload_model.py


# Load Test PDF Files

In [12]:
# Create dataset

from pathlib import Path
from src.preprocessing import preprocess
import pandas as pd


# Collect the PDF files in the 'sample-documents' folder (recursively).
# Directories and non-PDF files are skipped because every path is passed to
# the PDF text extractors below; sorting keeps the row order reproducible.
sample_documents_dir = Path('../sample-documents')
file_paths = sorted(
    path
    for path in sample_documents_dir.glob('**/*')
    if path.is_file() and path.suffix.lower() == ".pdf"
)
assert file_paths, f"No PDF files found under {sample_documents_dir}"
print(file_paths)

test_df = pd.DataFrame([str(s) for s in file_paths], columns=["file_path"])

# Reference texts from two different extractors, used later for comparison
test_df["pypdf2_text"] = test_df["file_path"].apply(lambda x: preprocess.extract_text_from_pdf(x, "PyPDF2"))
test_df["pdfminer_text"] = test_df["file_path"].apply(lambda x: preprocess.extract_text_from_pdf(x, "pdfminer"))

test_df

[PosixPath('../sample-documents/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2.pdf'), PosixPath('../sample-documents/bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565).pdf'), PosixPath('../sample-documents/jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung).pdf')]


Unnamed: 0,file_path,pypdf2_text,pdfminer_text
0,../sample-documents/51276-de-DRAFT-92be4e18116...,\n«$$e-seal» «$$QrCode »\n2024-... «%ASFF_YYYY...,«$$QrCode»\n Vorentwurf\n «$$e-seal»\n Änderun...
1,../sample-documents/bp6wfzuy - zg - Entwurf de...,Kanton Zug [Fundst. od. Gesch.-Nr.] (ID 2565)\...,Kanton Zug\n[Fundst. od. Gesch.-Nr.] (ID 2565)...
2,../sample-documents/jpxdh228 - zh - Entwurf-1_...,\n Kanton Zürich\nBaudirektion\nVernehmlassung...,Entwurf\n26. August 2021\nKanton Zürich\nBaudi...


# Evaluate the Parsing Model and Log Results to MLflow

In [55]:
import mlflow
from src.evaluation import evaluate
import json

mlflow.set_experiment("structure-extraction")

model_path = "tmp/chatgpt_file_upload_model.py"

with mlflow.start_run():
    # Log the model
    model_info = mlflow.pyfunc.log_model(
        python_model=model_path,
        artifact_path="chagpt_file_upload_model",
        #input_example=["../sample-documents/51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2.pdf"],
    )

    # Load the model
    model = mlflow.pyfunc.load_model(model_info.model_uri)

    # Predict
    parsed_dicts, num_input_tokens, num_output_tokens = await model.predict(test_df["file_path"].tolist())
    test_df["parsed_dicts"] = parsed_dicts
    test_df["num_input_tokens"] = num_input_tokens
    test_df["num_output_tokens"] = num_output_tokens

    mlflow.log_param("model", model.unwrap_python_model().model_str)

    mlflow.log_metric("num_files", len(test_df))
    mlflow.log_metric("parsed_files", sum([p.get("status") != "failed" for p in test_df["parsed_dicts"].tolist()]))
    
    mlflow.log_metric("num_input_tokens", sum(num_input_tokens))
    mlflow.log_metric("num_output_tokens", sum(num_output_tokens))
    
    percnt_missing_characters, percnt_added_characters = evaluate.percnt_missing_and_added_characters(parsed_dicts, test_df["pypdf2_text"].tolist())
    test_df["avg_percnt_missing_chars_pypdf2"] = percnt_missing_characters
    test_df["avg_percnt_added_chars_pypdf2"] = percnt_added_characters
    mlflow.log_metric("avg_percnt_missing_chars_pypdf2", sum(percnt_missing_characters) / len(test_df))
    mlflow.log_metric("avg_percnt_added_chars_pypdf2", sum(percnt_added_characters) / len(test_df))

    percnt_missing_characters, percnt_added_characters = evaluate.percnt_missing_and_added_characters(parsed_dicts, test_df["pdfminer_text"].tolist())
    test_df["avg_percnt_missing_chars_pdfminer"] = percnt_missing_characters
    test_df["avg_percnt_added_chars_pdfminer"] = percnt_added_characters
    mlflow.log_metric("avg_percnt_missing_chars_pdfminer", sum(percnt_missing_characters) / len(test_df))
    mlflow.log_metric("avg_percnt_added_chars_pdfminer", sum(percnt_added_characters) / len(test_df))

    costs = evaluate.get_costs(
        model.unwrap_python_model().model_str, 
        num_input_tokens, 
        num_output_tokens
    )
    test_df["costs"] = costs
    mlflow.log_metric("costs", test_df["costs"].sum())

    mlflow.log_artifact(model_path)
    
    test_df["html_diff_pypdf2"] = evaluate.compare_texts_html(parsed_dicts, test_df["pypdf2_text"])
    test_df["html_diff_pdfminer"] = evaluate.compare_texts_html(parsed_dicts, test_df["pdfminer_text"])
    
    # Log the parsed dictionaries and HTML diffs
    for idx, row in test_df.iterrows():
        filename_parsed = row["file_path"].split('/')[-1].replace(".pdf", "_parsed.json")
        print(filename_parsed)
        mlflow.log_text(json.dumps(row["parsed_dicts"], indent=4), filename_parsed)

        filename_pypdf2 = row["file_path"].split('/')[-1].replace(".pdf", "_pypdf2.html")
        print(filename_pypdf2)
        mlflow.log_text(row["html_diff_pypdf2"], filename_pypdf2)

        filename_pdfminer = row["file_path"].split('/')[-1].replace(".pdf", "_pdfminer.html")
        mlflow.log_text(row["html_diff_pdfminer"], filename_pdfminer)
        


2024/11/01 14:49:41 INFO mlflow.types.utils: Unsupported type hint: typing.List[typing.Dict[str, typing.Any]], skipping schema inference


51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_parsed.json
51276-de-DRAFT-92be4e18116eab4615da2a4279771eb05b4f47e2_pypdf2.html
bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_parsed.json
bp6wfzuy - zg - Entwurf des totalrevidierten Gesetzes ueber Ausbildungsbeitraege (ID 2565)_pypdf2.html
jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung)_parsed.json
jpxdh228 - zh - Entwurf-1_(EnerG-Aenderung-Staerkung-Versorgungssicherheit_Vernehmlassung)_pypdf2.html
