In [3]:
import requests

from requests_cache import CachedSession
from datetime import timedelta

import os 
import zipfile
import io
from tqdm.notebook import tqdm

# HTTP session whose responses are stored in a cache named 'ris_cache'.
# expire_after=-1 is requests-cache's "never expire" setting, so repeated runs
# reuse earlier responses instead of requesting them again.
# NOTE(review): a later cell rebinds the name `session` to a SQLAlchemy session;
# this cell has to be re-run before the download cells can be executed again.
session = CachedSession('ris_cache', expire_after=-1)




# Download Data

In [None]:
# Request the first page (100 entries) of the legislation listing.
# `res` is kept so the following cell can read the total item count from it.
base_url = "https://testphase.rechtsinformationen.bund.de"
res = session.get(
    f"{base_url}/v1/legislation",
    params=dict(searchTerm="", size=100, pageIndex=0),
)

In [None]:
# Walk every page of the legislation listing and unpack each document's ZIP
# bundle into a local directory that mirrors the document's URL path
# (relative to the current working directory).
page_size = 100
total_items = res.json().get("totalItems") or 0  # a missing count means "nothing to fetch"
print("Total items:", total_items)
pages = -(-total_items // page_size)  # ceiling division


for page in range(pages):
    res = session.get(
        f"{base_url}/v1/legislation",
        params={
            "searchTerm": "",
            "size": page_size,
            "pageIndex": page
        }
    )
    res.raise_for_status()  # fail with the HTTP error instead of an obscure JSON/attribute error
    listing = res.json()    # parse the page once and reuse it
    print(f"Page {page + 1}/{pages} - Total Items: {listing.get('totalItems')}")

    # Absolute URL of each listed document.
    urls = [
        base_url + element.get("item").get("workExample").get("@id")
        for element in listing.get("member") or []
    ]

    # Download and extract the ZIP bundle of each document.
    for url in tqdm(urls):
        response = session.get(url)
        response.raise_for_status()
        encodings = response.json().get("workExample").get("encoding") or []

        # Look the ZIP up by its content URL rather than by a fixed list position,
        # so a different ordering or number of encodings does not break the download.
        zip_content_url = next(
            (
                encoding.get("contentUrl")
                for encoding in encodings
                if "zip" in (encoding.get("contentUrl") or "")
            ),
            None,
        )
        if zip_content_url is None:
            print(f"Skipping {url} as it does not contain a zip file.")
            continue

        zip_response = session.get(base_url + zip_content_url)
        zip_response.raise_for_status()

        # Target directory: the URL path without the host and the leading slash.
        path = url.replace(base_url, "")[1:]
        os.makedirs(path, exist_ok=True)
        with zipfile.ZipFile(io.BytesIO(zip_response.content)) as archive:
            archive.extractall(path)

# Init Database

In [None]:
from typing import List
from typing import Optional
from sqlalchemy import ForeignKey
from sqlalchemy import Column, Integer, String, DateTime, Text
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship
from pgvector.sqlalchemy import Vector

In [None]:
from db_schema import Document, Article, Paragraph, Base

In [9]:
# Connection settings for the PostgreSQL database.
# Database name and credentials are read from the environment and fall back to
# placeholder defaults; host, port and driver are fixed values.
db_params = dict(
    host='127.0.0.1',
    database=os.environ.get("DB_NAME", "default_df"),
    username=os.environ.get("DB_USER", "default_user"),
    password=os.environ.get("DB_PASSWORD", "default_password"),
    port='5432',
    drivername='postgresql+psycopg2',
    query={},
)


In [10]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.engine.url import URL
from sqlalchemy import text

def db_connect():
    """
    Create a SQLAlchemy engine from the module-level ``db_params`` mapping.

    The URL is built with ``URL.create``, the documented public constructor,
    so the components are validated and normalized instead of being stored
    as passed.

    Returns:
        sqlalchemy engine instance
    """
    return create_engine(URL.create(**db_params))
def create_deals_table(engine):
    """
    Create the tables of all models registered on ``Base``.

    Args:
        engine: SQLAlchemy engine to create the tables on.
    """
    # The models' table metadata lives on the project's `Base`, not on
    # SQLAlchemy's generic `DeclarativeBase` class.
    Base.metadata.create_all(engine)
def db_session():
    """Open and return a new ORM session bound to an engine from ``db_connect()``."""
    session_factory = sessionmaker(bind=db_connect())
    return session_factory()
# Open the database session used by the rest of the notebook.
# NOTE(review): this rebinds `session`, which until here referred to the cached
# HTTP session created in the first cell.
session = db_session()
# Enable the `vector` extension if it is not installed yet, and persist that change.
session.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
session.commit()

In [None]:

# Remove the three tables if they exist, in the order paragraph, article, document.
for table_name in ("paragraph", "article", "document"):
    session.execute(text(f"DROP TABLE IF EXISTS {table_name};"))
session.commit()

In [None]:
def create_tables(engine):
    """Create the tables of all models registered on ``Base`` using ``engine``."""
    Base.metadata.create_all(bind=engine)

# Build an engine from the connection settings and create the ORM tables on it.
engine = db_connect()
create_tables(engine)

# Insert Data

In [4]:
from lxml import etree


ModuleNotFoundError: No module named 'lxml'

In [None]:
def parse_article(element, document_uri=None):
    """
    Convert one ``akn:article`` XML element into an ``Article`` with its paragraphs.

    Args:
        element: lxml element of the article; its ``nsmap`` must declare the
            ``akn`` prefix used in the XPath expressions.
        document_uri: URI of the owning document, stored on the article.

    Returns:
        Article: the article with one ``Paragraph`` per ``akn:paragraph``
        descendant. ``number`` and ``heading`` are ``None`` when the
        corresponding element is missing.
    """
    def first_text(node, path):
        # Text of the first match, or None when the element is absent,
        # instead of failing with an IndexError.
        matches = node.xpath(path, namespaces=node.nsmap)
        return matches[0].text if matches else None

    guid = element.get("GUID")
    paragraphs = []
    for child in element.xpath(".//akn:paragraph", namespaces=element.nsmap):
        # itertext() also collects text nested in inline child elements;
        # `.text` alone stops at the first child and would truncate the paragraph.
        content = "\n".join(
            "".join(p.itertext())
            for p in child.xpath(".//akn:p", namespaces=child.nsmap)
        )
        paragraphs.append(
            Paragraph(
                content=content,
                number=first_text(child, ".//akn:num"),
                guid=child.get("GUID", None),
                article_guid=guid,
            )
        )
    return Article(
        guid=guid,
        number=first_text(element, ".//akn:num"),
        heading=first_text(element, ".//akn:heading"),
        paragraphs=paragraphs,
        document_uri=document_uri,
    )

# parse_article(root.getchildren()[0].getchildren()[3].getchildren()[2])

In [None]:
def parse_document(root, session):
    """
    Parse one legislation XML document and store it, replacing any earlier version.

    Args:
        root: lxml root element of the document; its ``nsmap`` must declare the
            ``akn`` prefix used in the XPath expressions.
        session: SQLAlchemy session used to look up, delete and insert rows.

    Returns:
        Document: the newly stored document.

    Raises:
        IndexError: if the document has no FRBRExpression/FRBRthis element (no URI).
        Exception: any database error is re-raised after the transaction was rolled back.
    """
    ns = root.nsmap

    def first(path):
        # First matching element, or None when nothing matches.
        matches = root.xpath(path, namespaces=ns)
        return matches[0] if matches else None

    def text_of(path):
        node = first(path)
        return node.text if node is not None else None

    def attr_of(path, name):
        node = first(path)
        return node.get(name) if node is not None else None

    # The URI identifies the document and is required; the remaining fields are
    # read leniently so a document without e.g. a short title can still be stored.
    uri = root.xpath("//akn:FRBRExpression//akn:FRBRthis", namespaces=ns)[0].get("value")
    doc = Document(
        uri=uri,
        date=attr_of("//akn:date", "date"),
        author=attr_of("//akn:FRBRauthor", "href"),
        title=text_of("//akn:longTitle//akn:docTitle"),
        short_title=text_of("//akn:longTitle//akn:shortTitle"),
        abbreviation=text_of("//akn:longTitle//akn:inline"),
        preamble=text_of("//akn:preamble//akn:p") or "",
    )
    parsed_articles = [
        parse_article(article, document_uri=uri)
        for article in root.xpath("//akn:body//akn:article", namespaces=ns)
    ]

    try:
        # Overwrite if the document already exists. The delete is only flushed,
        # not committed, so removal and re-insert happen in a single transaction.
        existing_doc = session.query(Document).filter_by(uri=uri).first()
        if existing_doc:
            session.delete(existing_doc)
            session.flush()
        session.add_all([doc] + parsed_articles)
        session.commit()
    except Exception:
        # Leave the session usable for the next document.
        session.rollback()
        raise
    return doc
    


In [None]:
# Directory holding the extracted ZIP contents. The download cell unpacks
# relative to the working directory (presumably under "v1/..."), so the same
# relative location is the default; set RIS_DATA_DIR to point somewhere else.
data_dir = os.environ.get("RIS_DATA_DIR", "v1")
if not os.path.isdir(data_dir):
    # os.walk would silently yield nothing for a missing directory.
    raise FileNotFoundError(f"Data directory not found: {data_dir!r}")

# Parse and store every promulgated regulation text found below data_dir.
for dirpath, _dirnames, filenames in os.walk(data_dir):
    for filename in filenames:
        if "regelungstext-verkuendung" in filename:
            xml_path = os.path.join(dirpath, filename)
            print(xml_path)
            xml_root = etree.parse(xml_path).getroot()
            parse_document(xml_root, session)
                


# Calculate Embeddings

In [None]:
# Load every Document row and embed all titles with a single embedder call.
# NOTE(review): `embedder` is only assigned in the "Model Tests" section further
# down, so on a fresh kernel that cell has to be executed before this one.
documents = session.query(Document).all()
titles = [doc.title for doc in documents]
embeddings = embedder(titles)

In [None]:
# Attach each embedding to the document at the same position, then persist.
for position in range(len(documents)):
    documents[position].embedding = embeddings[position]
session.commit()

In [None]:
# Load every Article row and embed its string representation.
articles = session.query(Article).all()
# Named `article_texts` rather than `repr` so the builtin repr() stays usable
# in the rest of the notebook.
article_texts = [str(article) for article in articles]
embeddings = embedder(article_texts)

In [None]:
# Attach each embedding to the article at the same position, then persist.
for position in range(len(articles)):
    articles[position].embedding = embeddings[position]
session.commit()

# Model Tests

In [None]:
# Load tokens securely from .env file
import os
from dotenv import load_dotenv

# Read variables from a .env file into the process environment.
load_dotenv()

# API token taken from the IONOS_TOKEN variable; None when it is not set.
api_token = os.getenv('IONOS_TOKEN')

# Report only whether a token is present; the value itself is never printed.
print(
    "API token loaded securely."
    if api_token
    else "API token not found. Please set it in your .env file."
)

In [None]:
import dspy

# All three clients use the same OpenAI-compatible endpoint and API token.
IONOS_API_BASE = "https://openai.inference.de-txl.ionos.com/v1"

# Chat models.
llm = dspy.LM(
    "openai/mistralai/Mixtral-8x7B-Instruct-v0.1",
    base_url=IONOS_API_BASE,
    api_key=api_token,
)
llm_llama = dspy.LM(
    "openai/meta-llama/Llama-3.3-70B-Instruct",
    base_url=IONOS_API_BASE,
    api_key=api_token,
)

# Embedding model used for the document and article embeddings.
embedder = dspy.Embedder(
    "openai/BAAI/bge-m3",
    api_base=IONOS_API_BASE,
    api_key=api_token,
)

In [None]:
import requests

# Endpoint that lists the models offered by the inference service.
# The request authenticates with the previously loaded `api_token`.
endpoint = "https://openai.inference.de-txl.ionos.com/v1/models"

# Make the request to list models. The timeout keeps the cell from hanging
# indefinitely when the service does not respond.
response = requests.get(
    endpoint,
    headers={"Authorization": f"Bearer {api_token}"},
    timeout=30,
)

# Check if the request was successful
if response.status_code == 200:
    models = response.json()["data"]
    print("Available Models:")
    for model in models:
        print(model["id"])
    # Filter for embeddings models (matches on the substring "embedding" in the
    # model id, so models without it in their id are not listed here).
    embeddings_models = [model for model in models if "embedding" in model["id"]]
    print("Embeddings Models:")
    for model in embeddings_models:
        print(model["id"])
else:
    print(f"Failed to retrieve models: {response.status_code} - {response.text}")