In [1]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from pprint import PrettyPrinter

In [2]:
import os
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from dotenv import load_dotenv

load_dotenv()

# Embeddings & LLM models
llm = AzureChatOpenAI(
    openai_api_base="https://free-cdo.openai.azure.com/openai/deployments/cod-free-gpt4o/chat/completions?api-version=2024-02-15-preview",
    openai_api_version="2024-02-15-preview",
    openai_api_type="azure",
    temperature=0.2,
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

embeddings = AzureOpenAIEmbeddings(    
    openai_api_base="https://cdo-aac-dev-open.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2023-05-15",
    openai_api_version="2023-05-15",
    openai_api_type="azure",
    openai_api_key=os.getenv("OPENAI_API_KEY_EMBED")
)

In [3]:
# !ollama list

#!ollama pull llama3

In [4]:
#!ollama rm llama3.1:8b

In [5]:
filename = "01_contract.pdf"

file_path = f"data/{filename}"

In [6]:
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
import pdfplumber

class PDFPlumberLoader(BaseLoader):
    def __init__(self, file_path: str):
        self.file_path = file_path

    def load(self) -> List[Document]:
        documents = []
        with pdfplumber.open(self.file_path) as pdf:
            for page_number, page in enumerate(pdf.pages):
                text = page.extract_text()
                metadata = {
                    "source": self.file_path,
                    "page": page_number + 1,
                    "total_pages": len(pdf.pages)
                }
                documents.append(Document(page_content=text, metadata=metadata))
        return documents

In [7]:
# from docling.document_converter import DocumentConverter

# converter = DocumentConverter()
#result = converter.convert(source = file_path)

In [8]:
#text = result.document.export_to_markdown()

#print(text)

In [9]:
# from typing import Iterator
# from langchain_core.document_loaders import BaseLoader
# from langchain_core.documents import Document
# from docling.document_converter import DocumentConverter

# class DoclingPDFLoader(BaseLoader):

#     def __init__(self, file_path: str | list[str]) -> None:
#         self._file_paths = file_path if isinstance(file_path, list) else [file_path]
#         self._converter = DocumentConverter()

#     def lazy_load(self) -> Iterator[Document]:
#         for source in self._file_paths:
#             dl_doc = self._converter.convert(source).document
#             text = dl_doc.export_to_markdown()
#             yield Document(page_content=text)

In [10]:
	
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = PDFPlumberLoader(file_path=file_path)
#loader = DoclingPDFLoader(file_path=file_path)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
)

In [11]:
docs = loader.load()

chunks = text_splitter.split_documents(docs)

In [12]:
from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [13]:
# vectorstore = Chroma.from_documents(
#         documents=chunks, 
#         embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=False),
#         collection_name="local-rag"
# )

In [14]:
retriever = vectorstore.as_retriever()

In [15]:
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser

class OutputStructure(BaseModel):
    input_text: str = Field(description="The input text", example="Wer ist der Ehepartner?", default="Wer ist der Ehepartner?")
    response: str = Field(description="The response to the query", example="Der Name des Ehepartners ist nicht bekannt.", default="Der Name des Ehepartners ist nicht bekannt.")
    reasoning: str = Field(description="The reasoning behind the response", example="Der Name des Ehepartners ist nicht im Vertrag angegeben.", default="Der Name des Ehepartners ist nicht im Vertrag angegeben.")
 
parser = JsonOutputParser(pydantic_object=OutputStructure)

In [16]:
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="""
    You are an AI legal assistant. You are being provided a marriage contract in German language. 
    Answer all questions in German and please give also a reasoning for your response.
    Here is the question {question} you have to answer based on the following contract context {context}.
    Limit your answer to 1000 characters.\n {format_instructions}.
    """,
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [17]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# llm = ChatOllama(model="llama3", temperature=0.5)

In [18]:
# retriever = MultiQueryRetriever.from_llm(
#     vectorstore.as_retriever(), 
#     llm,
#     # prompt=prompt,
# )

In [19]:
# chain = (
#     {"context": lambda x: retriever.get_relevant_documents(x["question"]), 
#      "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | parser
# )

# chain = prompt | llm | StrOutputParser()

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | parser
)

In [21]:
question = "Wie heißt der Notar?"

# result = chain.invoke({"question": question})
result = await chain.ainvoke({"question": question})

PrettyPrinter().pprint(result)

TypeError: argument 'text': 'dict' object cannot be converted to 'PyString'

In [22]:
question_text = "Wie heißt der Notar?"

# Process all chunks in parallel using batch
results = chain.batch(
     [{"context": chunk.page_content, "question": question_text} for chunk in chunks], 
      #{"max_concurrency": 3}
)

# results = await chain.abatch(
#     [{"context": chunk.page_content, "question": question_text} for chunk in chunks], 
#     {"max_concurrency": 10}
# )

TypeError: argument 'text': 'dict' object cannot be converted to 'PyString'

In [43]:
#response = chain.invoke({"question": "Wie heißt der Notar?"})

# PrettyPrinter().pprint(results)