### Loading the Keys

In [None]:
import os
from dotenv import load_dotenv
# Load settings from azure.env into the process environment; override=True
# lets values from the file replace variables that are already set.
load_dotenv('azure.env',override = True)

### Importing the libraries

In [None]:
from langchain_openai import AzureOpenAIEmbeddings,AzureChatOpenAI
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.schema import Document
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.schema.runnable import RunnableMap
from langchain_core.runnables import RunnableConfig
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.memory import PostgresChatMessageHistory
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from azure.ai.documentintelligence.models import DocumentAnalysisFeature
from langchain_community.document_loaders.doc_intelligence import AzureAIDocumentIntelligenceLoader

### Uploading the document through Azure Document Intelligence

In [None]:
# Document Intelligence credentials read from the environment; os.getenv
# yields None for a variable that is not set.
api_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
api_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
# Debug aid (disabled). NOTE: enabling it would print the secret key into the
# notebook output.
# print(f"API Key: {api_key}, Type: {type(api_key)}")
# print(f"API Endpoint: {api_endpoint}, Type: {type(api_endpoint)}")

In [None]:
# Source PDF to analyse (absolute path on the authoring machine).
pdf_path = r'C:\Users\nag\Documents\Microsoft\Customer data\Infy_Helix_Data\multi_page_table.pdf'

# Analyse the PDF with the "prebuilt-layout" model, high-resolution OCR enabled,
# and request the result in markdown mode.
loader = AzureAIDocumentIntelligenceLoader(
    file_path=pdf_path,
    api_key=os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY"),
    api_endpoint=os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"),
    api_model="prebuilt-layout",
    api_version="2024-02-29-preview",
    mode='markdown',
    analysis_features=[DocumentAnalysisFeature.OCR_HIGH_RESOLUTION],
)
docs = loader.load()

In [None]:
from IPython.display import display, Markdown, Latex
# Render the markdown content of the last loaded document inline in the notebook.
display(Markdown(docs[-1].page_content))

### Split the document into chunks based on markdown headers.

In [None]:
# Pair every heading marker from "#" to "########" with the label
# "Header 1" ... "Header 8" that is handed to the splitter.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 9)]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the markdown of the first loaded document along those headings.
docs_string = docs[0].page_content
docs_result = text_splitter.split_text(docs_string)

print(f"Length of splits: {len(docs_result)}")

In [None]:
# Inspect the metadata attached to one split. Index 6 is a hard-coded sample,
# so this raises IndexError when fewer than seven splits were produced.
docs_result[6].metadata

### Character Splitter to Split based on Chunk Size as well as image

In [None]:
from __future__ import annotations

import re
from typing import Any, List, Optional

from langchain_text_splitters.base import Language, TextSplitter

class CustomCharacterTextSplitter(TextSplitter):
    """Split text on a separator and return the raw pieces.

    The pieces are returned exactly as ``re.split`` produces them (minus blank
    ones); this class does not merge or re-size them afterwards. When the
    separator is a regular expression containing a capturing group, the
    matched separator text is kept as a piece of its own.
    """

    def __init__(
        self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
    ) -> None:
        """Create a new splitter.

        Args:
            separator: Literal string or regular expression to split on.
            is_separator_regex: When False, ``separator`` is escaped and
                matched literally; when True it is used as a regex as-is.
            **kwargs: Forwarded unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(**kwargs)
        self._separator = separator
        self._is_separator_regex = is_separator_regex

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` on the configured separator.

        Args:
            text: The text to split.

        Returns:
            The non-blank pieces, in their original order.
        """
        pattern = (
            self._separator if self._is_separator_regex else re.escape(self._separator)
        )
        # DOTALL lets "." span newlines, so a regex separator can match a
        # multi-line block.
        pieces = re.split(pattern, text, flags=re.DOTALL)
        # re.split yields None for a capturing group that did not take part in
        # a match (e.g. with alternated groups); skip those before calling
        # strip(), together with empty and whitespace-only pieces.
        return [piece for piece in pieces if piece and piece.strip()]

# The capturing group makes re.split keep each matched <figure>...</figure>
# block as a piece of its own instead of discarding it.
figure_block_pattern = r'(<figure>.*?</figure>)'
text_splitter = CustomCharacterTextSplitter(
    separator=figure_block_pattern, is_separator_regex=True
)
child_docs = text_splitter.split_documents(docs_result)
print(f"Length of splits: {len(child_docs)}")

### Load the LangChain OpenAI Embedding Class

In [None]:
# Embedding client for the "text-embedding-ada-002" deployment.
# The key is read with os.getenv (None when unset) while the endpoint is read
# with os.environ[...] (KeyError when unset).
aoai_embeddings = AzureOpenAIEmbeddings(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_deployment="text-embedding-ada-002",
    openai_api_version="2024-03-01-preview",
    azure_endpoint =os.environ["AZURE_OPENAI_ENDPOINT"]
)

### Create the Azure AI Search Index Structure

In [None]:
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights
)
# Query-embedding callable; it is also used below to discover the vector size.
embedding_function = aoai_embeddings.embed_query
# Field definitions for the custom Azure AI Search index.
fields = [
    # Document key.
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    # Chunk text.
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Embedding of the chunk. The dimension is obtained by embedding the probe
    # string "Text" once (a live call to the embedding service) and taking the
    # length of the returned vector.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_profile_name="myHnswProfile",
    ),
    # Metadata stored as a string; excluded from search.
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=False,
    ),
    # Additional searchable field for the chunk's header information.
    SearchableField(
        name="header",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field holding the image attached to a chunk; it is stored
    # only (neither filterable nor searchable).
    SimpleField(
        name="image",
        type=SearchFieldDataType.String,
        filterable=False,
        searchable=False,
    ),
]

### Create the AI Search Index

In [None]:
# Name of the Azure AI Search index used by this notebook.
index_name: str = "langchain-vector-demo-custom3"

# Vector store bound to that index, using the custom field layout in `fields`.
# Endpoint and key are read with os.environ[...], so a missing variable raises
# KeyError here.
vector_store_multi_modal: AzureSearch = AzureSearch(
    azure_search_endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],
    azure_search_key=os.environ["AZURE_SEARCH_KEY"],
    index_name=index_name,
    embedding_function=embedding_function,
    fields=fields,
)

In [None]:
import re
import json
def find_figure_indices(text):
    """Return the figure numbers referenced in *text*, in order of appearance.

    A reference is a markdown image with an empty alt text that points at
    ``figures/<number>``, i.e. ``![](figures/<number>)``.
    """
    return [
        int(reference.group(1))
        for reference in re.finditer(r'!\[\]\(figures/(\d+)\)', text)
    ]

### Ingest the chunks into Index

In [None]:
# Image entries collected by the loader, looked up by figure number below.
image_metadata = docs[-1].metadata['images']
lst_docs = []
for doc in child_docs:
    figure_indices = find_figure_indices(doc.page_content)
    # One index entry per referenced figure; a chunk without figures is
    # indexed once with no image attached.
    if figure_indices:
        images = (image_metadata[idx] for idx in figure_indices)
    else:
        images = (None,)
    for image in images:
        lst_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={
                    "header": json.dumps(doc.metadata),
                    "source": "multi_page_table.pdf",
                    "image": image,
                },
            )
        )
vector_store_multi_modal.add_documents(documents=lst_docs)

In [None]:
# Build a FAISS index from the chunks in child_docs using the Azure OpenAI
# embeddings, then expose it as a similarity retriever with k=5.
# NOTE: top-level `await` works in a notebook cell but is a syntax error in a
# plain Python script.
index = await FAISS.afrom_documents(documents=child_docs, embedding=aoai_embeddings)
retriever_base = index.as_retriever(search_type="similarity",search_kwargs = {"k" : 5})

### Let's do the RAG now

### Load the AOAI Chat Class from LangChain

In [None]:
# Chat model client. The deployment name comes from the
# AZURE_OPENAI_DEPLOYMENT_NAME environment variable; streaming is enabled.
llm = AzureChatOpenAI(api_key = os.environ["AZURE_OPENAI_API_KEY"],  
                      api_version = "2024-06-01",
                      azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"],
                      model= os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                      streaming=True)
# Smoke test of the connection. `invoke` is used instead of calling the model
# object directly, because the direct-call form is deprecated in LangChain.
llm.invoke([HumanMessage("Hi")])

### Multi Modal RAG (Ingestion Side Only)

In [None]:
from operator import itemgetter
# Pull the "rlm/rag-prompt" prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")
from langchain.schema.runnable import RunnableMap

def format_docs(docs):
    """Render documents as one context string.

    Each document becomes its metadata (``str()`` form) on one line followed
    by its page content; documents are separated by a blank line.
    """
    rendered = []
    for entry in docs:
        rendered.append("\n".join((str(entry.metadata), entry.page_content)))
    return "\n\n".join(rendered)
    
def _documents_to_context(payload):
    """Format the retrieved documents of *payload* into the prompt context."""
    return format_docs(payload["documents"])


def _documents_to_metadata(payload):
    """Return only the metadata of each retrieved document in *payload*."""
    return [doc.metadata for doc in payload["documents"]]


# Answering stage: expects {"documents": [...], "question": ...}.
rag_chain_from_docs = (
    {"context": _documents_to_context, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

# Retrieval stage: fetch documents for the question and pass the question on.
retrieval_stage = RunnableMap(
    {"documents": retriever_base, "question": RunnablePassthrough()}
)

# Full chain: returns the metadata of the retrieved documents plus the answer.
rag_chain_with_source = retrieval_stage | {
    "documents": _documents_to_metadata,
    "answer": rag_chain_from_docs,
}
rag_chain_with_source.invoke("Does Quality consultant has Controlling field office write permission?")

In [None]:
# Second sample question against the same chain.
rag_chain_with_source.invoke("Does Quality Consultant has any write permission")

In [None]:
# Third sample question against the same chain.
rag_chain_with_source.invoke("what permissions does Implementation Manager has in terms of write")

### Multi Modal RAG (Both Ingestion Side + Calling Side)

In [None]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

def get_image_text(docs):
    """Separate retrieved documents into image payloads and plain texts.

    A document whose metadata carries a truthy ``"image"`` value contributes
    that value to ``"images"``; every other document contributes its page
    content to ``"texts"``. Documents whose metadata has no ``"image"`` key at
    all are treated as text instead of raising ``KeyError``.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content``.

    Returns:
        A dict ``{"images": [...], "texts": [...]}``, each list in input order.
    """
    b64_images = []
    texts = []
    for doc in docs:
        # .get() tolerates documents that were stored without an "image" entry.
        image = doc.metadata.get('image')
        if image:
            b64_images.append(image)
        else:
            texts.append(doc.page_content)
    return {"images": b64_images, "texts": texts}

def img_prompt_func(data_dict):
    """
    Build the multimodal prompt for the chat model.

    Reads ``data_dict["question"]`` plus ``data_dict["context"]["texts"]`` and
    ``data_dict["context"]["images"]``, and returns a one-element list holding
    a HumanMessage whose content is the instruction text followed by one
    ``image_url`` part per image.
    """
    context = data_dict["context"]
    formatted_texts = "\n".join(context["texts"])

    # Instruction, question and textual context go into a single text part.
    instruction = (
        "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n"
        "You will be given a mixed of text, tables, and image(s) usually of charts or graphs.\n"
        f"User-provided question: {data_dict['question']}\n\n"
        "Text and / or tables:\n"
        f"{formatted_texts}"
    )
    content_parts = [{"type": "text", "text": instruction}]

    # Append one image part per retrieved image; nothing is added when the
    # image collection is empty or missing a value.
    content_parts.extend(
        {"type": "image_url", "image_url": {"url": f"{image}"}}
        for image in (context["images"] or ())
    )
    return [HumanMessage(content=content_parts)]

# Similarity retriever over the Azure AI Search vector store.
retriever_multi_modal = vector_store_multi_modal.as_retriever(search_type="similarity")

# RAG pipeline: retrieve -> separate images from texts -> build the multimodal
# prompt -> chat model -> plain string.
split_retrieved_context = retriever_multi_modal | RunnableLambda(get_image_text)
build_multimodal_prompt = RunnableLambda(img_prompt_func)
chain_multimodal_rag = (
    {"context": split_retrieved_context, "question": RunnablePassthrough()}
    | build_multimodal_prompt
    | llm
    | StrOutputParser()
)
chain_multimodal_rag.invoke("Which component are part of RLHF shown in green dash lines?")