In [1]:
import getpass
import os

if "AZURE_OPENAI_API_KEY" not in os.environ:
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass(
        "Enter your AzureOpenAI API key: "
    )
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://langrag.openai.azure.com/"

In [5]:
from langchain_openai import AzureChatOpenAI

llm = AzureChatOpenAI(
    azure_deployment="gpt-4",  # or your deployment
    api_version="2024-08-01-preview",  # or your api version
    temperature=0.2,
    max_tokens=50,
    timeout=None,
    max_retries=2,
    # other params...
)

In [3]:
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
ai_msg

AIMessage(content="J'adore programmer.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 31, 'total_tokens': 37, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-4-turbo-2024-04-09', 'system_fingerprint': 'fp_5603ee5e2e', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'jailbreak': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'se

In [4]:
print(ai_msg.content)


J'adore programmer.


In [6]:
# prompt: write code for creating embedding of "Hello World!"
from langchain_openai import AzureOpenAIEmbeddings
text = "Hello World!"
embeddings = AzureOpenAIEmbeddings(
    deployment="text-embedding-3-small",  # Add your Azure deployment name
    model="text-embedding-3-small",
    azure_endpoint="https://langrag.openai.azure.com/",
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-15-preview"  # Make sure to use the correct API version
)
text_embedding = embeddings.embed_query(text)
text_embedding

[-0.003034229390323162,
 -0.056672804057598114,
 0.029482627287507057,
 0.042976152151823044,
 -0.04082879424095154,
 -0.025202423334121704,
 -0.012789830565452576,
 0.03522825613617897,
 -0.031571947038173676,
 -0.011135785840451717,
 -0.015887537971138954,
 -0.031107652932405472,
 -0.020269306376576424,
 -0.0247381292283535,
 0.029642228037118912,
 0.03586665913462639,
 -0.03813008964061737,
 0.017817256972193718,
 0.011396950110793114,
 0.0407707579433918,
 0.047561049461364746,
 0.0025100859347730875,
 -0.006355015095323324,
 -0.013943308964371681,
 0.0348220020532608,
 -0.01222397293895483,
 -0.044398050755262375,
 0.018499188125133514,
 0.02363543212413788,
 -0.04338240996003151,
 0.045007433742284775,
 -0.036272916942834854,
 -0.010221707634627819,
 0.0059560127556324005,
 0.006220805458724499,
 0.0006257077911868691,
 -0.0016730882925912738,
 0.006442070007324219,
 -0.0014282461488619447,
 -0.023954633623361588,
 0.0121006453409791,
 -0.027756035327911377,
 0.010286998935043812

## Loading the Documents

To load the documents, we use the `DirectoryLoader` class from `langchain.document_loaders`. This class allows us to load all PDF files from a specified directory. Below is the code snippet used to load the documents:


In [7]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
import os

# Define the path to your PDF documents
PDF_DIR = "Data/raw"

# Check if the directory exists, if not create it
if not os.path.exists(PDF_DIR):
    os.makedirs(PDF_DIR)
    print(f"Created directory: {PDF_DIR}")

def load_pdfs(pdf_dir):
    loader = DirectoryLoader(pdf_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

documents = load_pdfs(PDF_DIR)
print(f"Loaded {len(documents)} documents.")

Loaded 15 documents.


## Splitting the Documents into Chunks

In the context of Retrieval-Augmented Generation (RAG), dividing large documents into smaller, more manageable chunks is crucial. RAG models combine retrieval-based and generation-based approaches to improve the quality of generated text by incorporating relevant information from external documents.

By splitting documents into chunks, we can:

### Improve Retrieval Accuracy: 
Smaller chunks allow the retrieval model to find and return more precise and relevant pieces of information, rather than dealing with entire large documents.
### Enhance Generation Quality: 
The generation model can use these precise chunks to produce more accurate and contextually relevant responses.
### Optimize Memory Usage:
Processing smaller chunks helps manage memory constraints, making it feasible to handle large datasets without running into memory issues.

Overall, chunking documents is a key step in the RAG pipeline to ensure efficient and effective retrieval and generation of information.

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(documents)
    return chunks

chunks = split_documents(documents)
print(f"Split into {len(chunks)} chunks.")

Split into 52 chunks.


## Generate Embeddings

In [9]:
# Generate embeddings for each chunk
def generate_embeddings(chunks, embeddings):
    """
    Generate embeddings for a list of text chunks.

    Args:
        chunks (list): A list of text chunks, where each chunk is an object with a 'page_content' attribute.
        embeddings (object): An embeddings object with an 'embed_query' method that generates embeddings for a given text.

    Returns:
        list: A list of embeddings corresponding to the input text chunks.
    """
    embeddings_list = []
    for chunk in chunks:
        embedding = embeddings.embed_query(chunk.page_content)
        embeddings_list.append(embedding)
    return embeddings_list

embeddings_list = generate_embeddings(chunks, embeddings)
print("Embeddings generated.")

Embeddings generated.


## Vector Store Setup

In [10]:
from langchain.vectorstores import Chroma

def setup_vectorstore(chunks, embeddings, persist_directory="chroma_db"):
    """
    Sets up a vector store using the provided document chunks and embeddings, and persists it to disk.

    Args:
        chunks (list): A list of document chunks to be stored in the vector store.
        embeddings (object): The embeddings to be used for the document chunks.
        persist_directory (str, optional): The directory where the vector store will be persisted. Defaults to "chroma_db".

    Returns:
        object: The initialized and persisted vector store.
    """
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    vectorstore.persist()
    return vectorstore

vectorstore = setup_vectorstore(chunks, embeddings)
print("Vector store setup and persisted.")

Vector store setup and persisted.


  vectorstore.persist()


## RAG Pipeline with LLM Integration

In [11]:
from langchain.chains import RetrievalQA



def setup_rag_chain(llm, vectorstore, top_k=3):
    """
    Set up a Retrieval-Augmented Generation (RAG) chain for question answering.

    This function initializes a RetrievalQA chain using the provided language model (llm) and vector store.
    The chain retrieves relevant documents from the vector store and uses the language model to generate answers.

    Args:
        llm: The language model to be used for generating answers.
        vectorstore: The vector store to be used for retrieving relevant documents.
        top_k (int, optional): The number of top documents to retrieve. Defaults to 3.

    Returns:
        RetrievalQA: An instance of the RetrievalQA chain configured with the specified parameters.
    """
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": top_k}),
        return_source_documents=True
    )
    return qa_chain

qa_chain = setup_rag_chain(llm, vectorstore)
print("RAG chain setup.")

RAG chain setup.


## Testing the RAG Pipeline

In [11]:
def query_document(qa_chain, query):
    """
    Queries a document using the provided QA chain.

    Args:
        qa_chain (callable): A function or callable object that processes the query and returns the result.
        query (str): The query string to be processed by the QA chain.

    Returns:
        dict: The result of the query as returned by the QA chain.
    """
    result = qa_chain({"query": query})
    return result

# Example query
query = "What are the main findings of the paper on attention mechanism?"
response = query_document(qa_chain, query)
print("Response:", response['result'])

  result = qa_chain({"query": query})


Response: The main finding of the paper "Attention Is All You Need" is the introduction of the Transformer, a new simple network architecture that is based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. This architecture allows for significantly more parallelization


In [12]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [13]:
# Initialize Conversation Buffer Memory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

  memory = ConversationBufferMemory(


In [14]:
# Setup Conversational RAG Chain with Memory
def setup_conversational_rag_chain(llm, vectorstore, memory, top_k=3):
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": top_k}),
        memory=memory,
        return_source_documents=True
    )
    return qa_chain

qa_chain = setup_conversational_rag_chain(llm, vectorstore, memory)
print("Conversational RAG chain with memory setup.")

Conversational RAG chain with memory setup.


In [15]:
def query_with_memory(qa_chain, query):
    result = qa_chain({"question": query})
    return result

# Example Queries
queries = [
    "What are the main findings of the paper on attention mechanism?",
    "Can you elaborate on the methodologies used?",
    "How do these findings impact future research?"
]

for query in queries:
    response = query_with_memory(qa_chain, query)
    print(f"Q: {query}\nA: {response['answer']}\n")

  result = qa_chain({"question": query})


Q: What are the main findings of the paper on attention mechanism?
A: The main finding of the paper "Attention Is All You Need" is the introduction of a new simple network architecture called the Transformer. This architecture is based solely on attention mechanisms, and it dispenses with recurrence and convolutions which are commonly used in other

Q: Can you elaborate on the methodologies used?
A: The Transformer architecture uses multi-head attention in three distinct ways:

1. **Encoder-Decoder Attention**: In these layers, the queries come from the previous decoder layer, while the memory keys and values come from the output of the encoder. This setup allows

Q: How do these findings impact future research?
A: I don't have information on the specific impact of the findings from the paper "Attention Is All You Need" on future research.

