In [1]:
!python database_helpers.py

  Base = declarative_base()


In [2]:
import pymupdf
from typing import Optional
from llama_cpp import LogitsProcessorList, Llama
from lmformatenforcer import CharacterLevelParser
from lmformatenforcer.integrations.llamacpp import build_llamacpp_logits_processor
from IPython.display import display, Markdown
from llama_cpp import Llama
from lmformatenforcer import JsonSchemaParser
from huggingface_hub import hf_hub_download
from database_helpers import (
    ForeclosureCaseSchema,
    ForeclosureObjectSchema,
    ForclosureModel,
    engine,
    session
)
import os
import tempfile
import requests

In [3]:
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. 
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.
"""

def get_prompt(message: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f'[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]'

def pdf_to_string(file_path):
    pdf_document = pymupdf.open(file_path)
    text = ""
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        text += page.get_text()
    pdf_document.close()
    return text


def display_header(text):
    display(Markdown(f'**{text}**'))

def display_content(text):
    display(Markdown(f'```\n{text}\n```'))

def llamacpp_with_character_level_parser(llm: Llama, prompt: str, character_level_parser: Optional[CharacterLevelParser]) -> str:
    logits_processors: Optional[LogitsProcessorList] = None
    if character_level_parser:
        logits_processors = LogitsProcessorList([build_llamacpp_logits_processor(llm, character_level_parser)])
    
    output = llm(prompt, logits_processor=logits_processors, max_tokens=1000)
    text: str = output['choices'][0]['text']
    return text

downloaded_model_path = hf_hub_download(repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", filename="Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf") 
llm = Llama(model_path=downloaded_model_path, n_ctx=16384, n_threads=8, n_gpu_layers=-1, verbose=False, temperature=0)

In [4]:
def download_pdf(url, destination_path):
    try:
        response = requests.get(url, headers={"Referer": "https://www.zvg-portal.de/index.php?button=Suchen"}, stream=True)
        response.raise_for_status()
        if 'application/pdf' not in response.headers.get('Content-Type', ''):
            raise Exception("The URL does not point to a valid PDF file.")

        with open(destination_path, 'wb') as pdf_file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk: 
                    pdf_file.write(chunk)

        return True
    except Exception as e:
        return False

In [5]:
with open("../experiments/examples/5shot.txt", "r", encoding="UTF-8") as file:
    examples = file.read()

valid_idx = 0
all_forclosures = session.query(ForeclosureCaseSchema).all()
for idx, foreclosure_case in enumerate(all_forclosures):
    
    if idx < 4700 and not idx in [2597, 4360]:
        continue
    
    print(idx)
    if session.query(ForeclosureObjectSchema).filter_by(foreclosurecase_link = foreclosure_case.link).first() or not foreclosure_case.amtliche_bekanntmachung:
        continue

    with tempfile.TemporaryDirectory() as tmp_dir:
        try:
            pdf_path = os.path.join(tmp_dir, "amtliche_bekanntmachung.pdf")
            amtliche_bekanntmachung_link = f"https://www.zvg-portal.de/index.php{foreclosure_case.amtliche_bekanntmachung}"
            if not download_pdf(amtliche_bekanntmachung_link, pdf_path):
                continue
            pdf_text = pdf_to_string(pdf_path)
            
            question_with_schema = (
                f"The following text lists examples for your task: {examples}.\n"
                f"You MUST answer using the following JSON schema: {ForclosureModel.model_json_schema()}.\n"
                f"Please extract information about the following PDF content: {pdf_text}."
            )
            prompt = get_prompt(question_with_schema)
            result = llamacpp_with_character_level_parser(
                llm, prompt, JsonSchemaParser(ForclosureModel.model_json_schema())
            )

            if not result:
                continue
            
            foreclosure_data = ForclosureModel.model_validate_json(result.replace("\r\n", "\n"))
            foreclosure_case.verkehrswert = foreclosure_data.gesamtverkehrswert
            
            for obj in foreclosure_data.objekte:
                obj_dict = obj.model_dump()
                obj_dict['raum_typen'] =",".join(obj.raum_typen)
                obj_data = ForeclosureObjectSchema(
                    foreclosurecase_link = foreclosure_case.link,
                    **obj_dict
                )
                session.add(obj_data)
                
            session.commit()
            valid_idx += 1
            if valid_idx % 10 == 0:
                print(f"Processed: {idx} Valid: {valid_idx} Overall: {len(all_forclosures)}")  
                
        except Exception as e:
            print(f"Error processing foreclosure case ID {foreclosure_case.link}: {e}")
            session.rollback()

2597
Error processing foreclosure case ID https://www.zvg-portal.de/index.php?button=showZvg&zvg_id=52907&land_abk=he: 1 validation error for ForclosureModel
  Invalid JSON: EOF while parsing a string at line 1 column 2301 [type=json_invalid, input_value='  { "objekte": [ { "flae...l, "raum_typen": [], "b', input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/json_invalid
4360
Error processing foreclosure case ID https://www.zvg-portal.de/index.php?button=showZvg&zvg_id=2117&land_abk=rp: 1 validation error for ForclosureModel
  Invalid JSON: EOF while parsing an object at line 3 column 2361 [type=json_invalid, input_value=' \n\n{ "objekte": [ { "r...430 }, { "raeume": null', input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/json_invalid


In [6]:
session.close()
engine.dispose()
llm.close()