# Test LangChain with Azure OpenAI deployment

## Setup

In [None]:
import os
from dotenv import load_dotenv # type: ignore
from langchain_openai import AzureChatOpenAI # type: ignore
from langchain_core.prompts import ChatPromptTemplate # type: ignore

In [None]:
# Load settings from a local .env file into the process environment.
load_dotenv()

## Testing basic model

In [None]:
# Name of the Azure OpenAI chat deployment; passed as azure_deployment below.
oai_model = "guiden-gpt-4o-mini"

In [None]:
# Chat client bound to the deployment named in oai_model.
# NOTE(review): no endpoint or API key is passed here; presumably they come
# from environment variables loaded by load_dotenv() - confirm.
llm = AzureChatOpenAI(
    azure_deployment=oai_model,
    api_version="2024-02-01",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [None]:
# Conversation as (role, content) pairs: a system instruction followed by the
# sentence to translate.
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "Loretta did nothing wrong."),
]
# Send the conversation to the model; the trailing bare expression makes the
# notebook display the full reply object.
ai_msg = llm.invoke(messages)
ai_msg

In [None]:
# Print only the content of the reply.
print(ai_msg.content)

## Testing chaining

In [None]:
# Prompt template with three placeholders: {input_language}, {output_language}
# and {input}. Values are supplied when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that translates {input_language} to {output_language}.",
        ),
        ("human", "{input}"),
    ]
)

In [None]:
# Compose prompt and model: the formatted prompt is fed to the chat model.
chain = prompt | llm

In [None]:
# Fill the template placeholders and run the chain; the notebook displays the
# returned value.
chain.invoke(
    {
        "input_language": "English",
        "output_language": "Japanese",
        "input": "Loretta did nothing wrong.",
    }
)

## RAG
Goal: Adapt this tutorial to Azure: https://github.com/GaetanHHHHHH/deeplearningai-short-courses/blob/main/langchain_chat_with_your_data/notebooks/03_vectorstores_and_embeddings.ipynb

### Setup + load doc

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from azure.core.credentials import AzureKeyCredential
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import numpy as np
# Move from LangChain doc processor to Document Intelligence:
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
from langchain_core.documents import Document


### Testing: Document Intelligence loading 

In [None]:
# Reload .env, then take the chat deployment name from the environment rather
# than the hard-coded value used in the basic test. os.getenv returns None if
# AZURE_DEPLOYMENT_MODEL is not set.
load_dotenv()
oai_model = os.getenv("AZURE_DEPLOYMENT_MODEL")

In [None]:
# Chat model used later by the retrieval QA chains.
llm = AzureChatOpenAI(
    azure_deployment=oai_model,
    api_version="2024-02-01",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Embedding client, configured with its own endpoint and key.
# NOTE(review): the "Embeddings" section further down passes
# model="guiden-text-embedding-3-small" instead; confirm which value matches
# the Azure deployment.
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-3-small",
    azure_endpoint=os.getenv("AZURE_OPENAI_EMBED_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_EMBED_API_KEY")
)

# Azure AI Search vector store; queries and added texts are embedded with
# embeddings.embed_query. AZURE_SEARCH_API_KEY is read via os.environ, so a
# missing key raises KeyError here.
vector_store = AzureSearch(
    azure_search_endpoint=os.getenv("AZURE_SEARCH_SERVICE"),
    azure_search_key=os.environ["AZURE_SEARCH_API_KEY"],
    index_name="dataroots-guidelines-vector-index",
    embedding_function=embeddings.embed_query,
    additional_search_client_options={"retry_total": 4},
)

In [None]:
# Document Intelligence (Form Recognizer) client used by process_blob_document.
# Endpoint and key come from the environment; os.getenv yields None when unset.
document_client = DocumentAnalysisClient(
    endpoint=os.getenv("AZURE_DOC_INT_ENDPOINT"),
    credential=AzureKeyCredential(os.getenv("AZURE_DOC_INT_API_KEY"))
)

In [None]:
def process_blob_document(container_name: str, blob_name: str):
    """Download a blob and analyze it with Azure Document Intelligence.

    Simulates the blob trigger function so the document processing logic can
    be tested without needing Azure Functions.

    Args:
        container_name: Name of the storage container that holds the blob.
        blob_name: Name of the blob to download and analyze.

    Returns:
        The analysis result returned by the ``prebuilt-document`` poller.

    Raises:
        KeyError: If ``AZURE_STORAGE_CONNECTION_STRING`` is not set.
    """
    # Index os.environ (as done for AZURE_SEARCH_API_KEY) so a missing setting
    # fails with the variable's name instead of an opaque error inside the SDK.
    connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]

    # The context manager closes the storage client once the bytes are read,
    # so repeated calls do not accumulate open connections.
    with BlobServiceClient.from_connection_string(connection_string) as blob_service_client:
        blob_client = blob_service_client.get_container_client(container_name).get_blob_client(blob_name)
        blob_content = blob_client.download_blob().readall()

    # begin_analyze_document (not begin_analyze_document_from_url) because the
    # blob content is already in memory as bytes.
    poller = document_client.begin_analyze_document(
        "prebuilt-document",
        blob_content,
    )

    result = poller.result()
    print(f"Processed document: {blob_name}")

    return result

In [None]:
def chunk_blob_document(container_name: str, blob_name: str, target_chunk_size: int = 1000) -> list:
    """Analyze a blob and group its paragraphs into text chunks.

    Paragraphs are appended to the current chunk until adding the next one
    would push the chunk past ``target_chunk_size`` characters; the chunk is
    then closed and a new one started. A single paragraph longer than the
    target is kept whole as its own chunk. Paragraphs inside a chunk are
    joined with a single space (the joining spaces are not counted towards
    the size).

    Args:
        container_name: Name of the storage container that holds the blob.
        blob_name: Name of the blob to analyze.
        target_chunk_size: Soft upper bound, in characters, for each chunk.
            Defaults to 1000.

    Returns:
        A list of chunk strings, in document order. Empty when the analysis
        result contains no paragraphs.
    """
    result = process_blob_document(container_name, blob_name)

    document_chunks = []
    current_chunk = []
    current_length = 0

    # "or []" keeps the function usable if the result carries no paragraph list.
    for paragraph in result.paragraphs or []:
        paragraph_text = paragraph.content

        # Close the current chunk first if this paragraph would overflow it.
        # The "and current_chunk" guard avoids emitting an empty chunk when a
        # single paragraph is itself larger than the target.
        if current_chunk and current_length + len(paragraph_text) > target_chunk_size:
            document_chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(paragraph_text)
        current_length += len(paragraph_text)

    # Flush whatever is left after the last paragraph.
    if current_chunk:
        document_chunks.append(" ".join(current_chunk))

    return document_chunks

In [None]:
def embed_and_upload_blob_document(container_name: str, blob_name: str, batch_size: int = 5):
    """Chunk a blob document and upload the chunks to the vector store.

    Each chunk is uploaded with metadata carrying a stable ``id``
    (``<blob name without '.pdf'>-chunk-<n>``), the ``source`` blob name and
    its ``chunk_id``. Chunks are sent in batches of ``batch_size``.

    Embeddings are computed by ``vector_store.add_texts`` through the
    embedding function the store was built with. The previous version also
    called ``embeddings.embed_query`` per chunk and stored the raw vector in
    the metadata, which embedded every chunk twice and inflated the stored
    metadata without being used for retrieval.

    Args:
        container_name: Name of the storage container that holds the blob.
        blob_name: Name of the blob to process.
        batch_size: Number of chunks sent per ``add_texts`` call. Defaults
            to 5.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunks = chunk_blob_document(container_name, blob_name)
    id_prefix = blob_name.replace('.pdf', '')

    # Walk the chunk list in fixed-size windows; the final window may be shorter.
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        vector_store.add_texts(
            texts=batch,
            metadatas=[
                {
                    "id": f"{id_prefix}-chunk-{i}",
                    "source": blob_name,
                    "chunk_id": i,
                }
                for i in range(start, start + len(batch))
            ],
        )

In [None]:
# Try the Document Intelligence step on one PDF from the storage container.
container_name = "st-dataroots-guiden-pdfstorage"
test_blob_name = "xmas_project_1.pdf"
# Despite its name, this holds the analysis result object returned by
# process_blob_document, not plain text.
extracted_text = process_blob_document(container_name, test_blob_name)

In [None]:
# Display the analysis result in the notebook.
extracted_text

In [None]:
# chunk_blob_document calls process_blob_document itself, so this downloads
# and analyzes the blob again before chunking.
chunks = chunk_blob_document(container_name, test_blob_name)

In [None]:
# Display the list of chunk strings.
chunks

In [None]:
# Run the full pipeline (download, analyze, chunk, upload) for the test blob.
# This writes to the search index configured in vector_store.
embed_and_upload_blob_document(container_name, test_blob_name)

### End Testing: Document Intelligence loading

In [None]:
# PDF loader for a local file; the path is relative to the notebook's working directory.
loader = PyMuPDFLoader("../files/Development of a RAG-Chatbot for Rule and Guideline Retrieval.pdf")

In [None]:
# Collect everything the loader returns into a fresh list.
docs = list(loader.load())

In [None]:
# Display the loaded documents.
docs

In [None]:
# Splitter producing chunks of up to 1500 units with a 150-unit overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [None]:
# Split the loaded documents and show how many pieces were produced.
splits = text_splitter.split_documents(docs)
len(splits)

### Embeddings

In [None]:
# Rebinds `embeddings`, replacing the client created in the Document
# Intelligence test cell.
# NOTE(review): that earlier cell passes model="text-embedding-3-small";
# confirm which value matches the Azure deployment.
embeddings = AzureOpenAIEmbeddings(
    model="guiden-text-embedding-3-small",
    azure_endpoint=os.getenv("AZURE_OPENAI_EMBED_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_EMBED_API_KEY")
)

In [None]:
# Two near-synonymous sentences and one unrelated sentence for the similarity check.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [None]:
# Embed the three sample sentences, in order, with the embedding client.
embedding1, embedding2, embedding3 = [
    embeddings.embed_query(text)
    for text in (sentence1, sentence2, sentence3)
]

In [None]:
# Pairwise dot products used as a similarity score: (1,2), (1,3), (2,3).
# NOTE(review): this equals cosine similarity only if the vectors are
# unit-length - presumably true for this model; confirm.
np.dot(embedding1, embedding2), np.dot(embedding1, embedding3), np.dot(embedding2, embedding3)

### Vector storage

In [None]:
# Rebinds `vector_store` so it uses the embedding client defined in the
# "Embeddings" section; same index name as the earlier test cell.
vector_store = AzureSearch(
    azure_search_endpoint=os.getenv("AZURE_SEARCH_SERVICE"),
    azure_search_key=os.environ["AZURE_SEARCH_API_KEY"],
    index_name="dataroots-guidelines-vector-index",
    embedding_function=embeddings.embed_query,
    additional_search_client_options={"retry_total": 4},
)

In [None]:
# Upload the PDF splits to the search index.
vector_store.add_documents(documents=splits)

#### Similarity search

In [None]:
# Retrieve the 3 best matches for the question using the "hybrid" search type.
# Note: this rebinds `docs`, which previously held the loaded PDF pages.
docs = vector_store.similarity_search(
    query="What is the tech stack of the project?",
    k=3,
    search_type="hybrid",
)

len(docs)

In [None]:
# Text of the first returned document.
docs[0].page_content

### Retrieval & question answering

#### RetrievalQA chain

In [None]:
# Question-answering chain: documents come from the vector store's retriever
# and are passed to the chat model. No custom prompt is given here.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vector_store.as_retriever()
)

In [None]:
# Question put to the first QA chain.
query="What is the tech stack of the project?"

In [None]:
# Run the QA chain through .invoke(), the same calling convention used for
# llm and chain earlier in this notebook; calling the chain object directly is
# deprecated in LangChain. The answer text is under the "result" key.
result = qa_chain.invoke({"query": query})
result["result"]

#### Prompt

In [None]:
# Custom QA prompt. {context} and {question} are the template's placeholders;
# the text also fixes the answer style (five sentences maximum, closing question).
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use five sentences maximum. Keep the answer as concise as possible. Always say "Would you like more information?" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

# Wrap the raw string in a PromptTemplate so it can be handed to the chain.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [None]:
# Rebuild the QA chain with the custom prompt; return_source_documents=True
# makes the result also expose the retrieved documents under "source_documents".
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vector_store.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [None]:
# Question put to the prompt-customized QA chain.
question = "Are on-prem technologies needed for this project?"

In [None]:
# Run the prompt-customized QA chain through .invoke(), the same calling
# convention used for llm and chain earlier in this notebook; calling the
# chain object directly is deprecated in LangChain.
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
# First retrieved document that was supplied to the model as context.
result["source_documents"][0]