# Create Legal DB

This notebook takes the raw XML data, converts it to JSON, and then creates a VectorDB from it.

In [2]:
import json
from dotenv import load_dotenv
load_dotenv()

from backend.database import LawArticle, law_book_db, CaseRecord, case_records_db

RuntimeError: Storage folder storage/qdrant is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.

## XML -> JSON

In [None]:
import xml.etree.ElementTree as ET

# Raw law-book XML files to convert; one JSON file is produced per input.
xml_files = [
    "data/BJNR001950896.xml",
    "data/BJNR005330950.xml",
    "data/BJNR258700008.xml",
]


def _element_text(element):
    """Return ``element.text``, or "" when the element or its text is missing."""
    if element is None or element.text is None:
        return ""
    return element.text


def convert_xml_to_json(xml_file):
    """Convert one law-book XML file into a JSON list of section records.

    Every ``norm`` element directly under the XML root becomes one dict with
    the keys ``jurabk``, ``enbez``, ``titel`` and ``content``. Elements that
    are absent (or have no text) yield an empty string for that key.

    Args:
        xml_file: Path of the XML file to read.

    Returns:
        The path of the JSON file that was written. It sits next to the
        source XML and carries the suffix ``_output.json``.
    """
    root = ET.parse(xml_file).getroot()

    sections = [
        {
            "jurabk": _element_text(norm_element.find("metadaten/jurabk")),
            "enbez": _element_text(norm_element.find("metadaten/enbez")),
            "titel": _element_text(norm_element.find("metadaten/titel")),
            # NOTE(review): only the leading text of the first <P> is kept;
            # further paragraphs and text after inline child elements are
            # not captured - confirm this is intended.
            "content": _element_text(norm_element.find("textdaten/text/Content/P")),
        }
        for norm_element in root.findall("norm")
    ]

    # Keep the directory of the source file so the JSON ends up in the same
    # folder as the XML (e.g. data/BJNR001950896_output.json).
    json_filename = xml_file.replace('.xml', '_output.json')
    with open(json_filename, "w", encoding="utf-8") as json_file:
        json.dump(sections, json_file, ensure_ascii=False, indent=2)
    return json_filename


# Convert each file exactly once.
for xml_file in xml_files:
    convert_xml_to_json(xml_file)
            

## JSON -> LawBook DB

In [1]:
def load_lawbook(book):
    """Load the articles of one law book from its converted JSON file.

    Only the JSON file belonging to ``book`` is opened.

    Args:
        book: Law-book abbreviation. 'BGB', 'ZPO' and 'FamFG' are known.

    Returns:
        A list of ``LawArticle`` objects, one per JSON record whose
        ``jurabk`` equals ``book`` and whose ``enbez``, ``titel`` and
        ``content`` are all non-empty. An unknown ``book`` yields ``[]``.

    Raises:
        FileNotFoundError: If the JSON file of a known book does not exist.
    """
    json_path_by_book = {
        'BGB': 'data/BJNR001950896_output.json',
        'ZPO': 'data/BJNR005330950_output.json',
        'FamFG': 'data/BJNR258700008_output.json',
    }

    json_path = json_path_by_book.get(book)
    if json_path is None:
        return []

    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, so non-ASCII characters decode the same way everywhere.
    with open(json_path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    return [
        LawArticle(
            book=article['jurabk'],
            article_number=article['enbez'],
            title=article['titel'],
            content=article['content']
        )
        for article in records
        # Skip records of other books and records with any empty field.
        if article.get('jurabk') == book
        and article.get('enbez')
        and article.get('titel')
        and article.get('content')
    ]

In [None]:
# Law books to (re)build a vector DB for, in processing order.
books = ['BGB', 'ZPO', 'FamFG']

for book in books:
    law_articles = load_lawbook(book)
    law_db = law_book_db(book)
    law_db.reset()  # start from an empty DB for this book

    if book != 'BGB':
        # Every book other than BGB is added in a single call.
        law_db.add(law_articles)
        continue

    # BGB is added in slices of 200 articles per call.
    batch_size = 200
    for start in range(0, len(law_articles), batch_size):
        stop = start + batch_size
        law_db.add(law_articles[start:stop])