# Documentation for bot.py

### Import libaries and setup environment variables

- Imports essential modules for functionality
  
- Loads environment variables from a `.env` file

- Sets MISTRAL API key from environment variables

- Defines the URI for the Milvus database

- Specifies model name for sentence embeddings

- Prints initialization summary message to console

In [2]:
import os  # Imports the os module for interacting with the operating system
from dotenv import load_dotenv  # Imports load_dotenv to manage environment variables from a .env file
from langchain.chains.combine_documents import create_stuff_documents_chain  # Imports function for combining document chains
from langchain.schema import Document  # Imports Document class for document representation
from langchain_core.prompts import PromptTemplate  # Imports PromptTemplate for structured prompts
#from langchain_mistralai import MistralAIEmbeddings  # (Commented out) Placeholder for MistralAI embeddings integration
from langchain_mistralai.chat_models import ChatMistralAI  # Imports ChatMistralAI for chat model integration
#from langchain_cohere import ChatCohere  # (Commented out) Placeholder for Cohere chat model integration
from langchain_milvus import Milvus  # Imports Milvus for vector database integration
from langchain_community.document_loaders import WebBaseLoader, RecursiveUrlLoader  # Imports document loaders for web content
from langchain_text_splitters import RecursiveCharacterTextSplitter  # Imports text splitter for managing document sizes
from langchain.chains import create_retrieval_chain  # Imports function to create a retrieval chain for documents
from langchain_huggingface import HuggingFaceEmbeddings  # Imports Hugging Face embeddings integration
from pymilvus import connections, utility  # Imports connections and utility functions for Milvus database
from requests.exceptions import HTTPError  # Imports HTTPError for handling HTTP exceptions
from httpx import HTTPStatusError  # Imports HTTPStatusError for handling HTTP status exceptions

# Load environment variables
load_dotenv()  # Loads environment variables from a .env file into the environment
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")  # Retrieves MISTRAL_API_KEY from environment variables

# Set USER_AGENT environment variable if not already set
if not os.getenv("USER_AGENT"):  # Checks if USER_AGENT is not set
    os.environ["USER_AGENT"] = "my_custom_user_agent"  # Sets a custom user agent string for HTTP requests

MILVUS_URI = "./milvus/milvus_vector.db"  # Specifies the URI for the Milvus vector database
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Sets the model name for sentence embeddings
CORPUS_SOURCE = 'https://dl.acm.org/doi/proceedings/10.1145/3597503'  # Sets the source URL for the document corpus

print("Initializes environment, loads documents, sets embeddings.")  # Prints a message indicating initialization steps

Initializes environment, loads documents, sets embeddings.


### Creating Hugging Face Embedding Function

- Imports HuggingFaceEmbeddings for model embedding functionality
  
- Defines model name for sentence embeddings

- Creates and returns embedding function for specified model

- Prints confirmation of embedding function creation

In [8]:
import os  # Imports the os module for interacting with the operating system
os.environ['TQDM_DISABLE'] = '1'  # Suppress tqdm warnings by setting environment variable

from langchain_huggingface import HuggingFaceEmbeddings  # Imports HuggingFaceEmbeddings for embedding functionality

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Defines the model name for embeddings

def get_embedding_function():
    """
    Returns embedding function for the model.

    Returns:
        embedding function
    """
    embedding_function = HuggingFaceEmbeddings(model_name=MODEL_NAME)  # Creates embedding function using specified model
    
    print("Returns Hugging Face model embedding function.")  # Print statement to confirm the embedding function is created
    
    return embedding_function  # Returns the created embedding function

# Call the function to trigger the print statement and get the embedding function
embedding_function = get_embedding_function()  # Calls the function and stores the resulting embedding function

print(f"Embedding function created with model: {MODEL_NAME}")  # Prints confirmation of the embedding function creation along with the model name

Returns Hugging Face model embedding function.
Embedding function created with model: sentence-transformers/all-MiniLM-L6-v2


### Query response generation

- Initializes ChatMistralAI model for query processing.

- Defines functions for prompt and vector store management.

- Creates document and retrieval chains for responses.

- Generates answers based on user queries and sources.

- Handles HTTP errors during query processing.

- Returns generated answers with associated source links

In [11]:
from langchain_mistralai.chat_models import ChatMistralAI  # Imports the ChatMistralAI model for conversation generation
from httpx import HTTPStatusError  # Imports HTTPStatusError for handling HTTP-related exceptions

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # Defines the model name for embeddings
MILVUS_URI = "./milvus/milvus_vector.db"  # Specifies the URI for the Milvus vector database

def create_prompt():
    # Placeholder function for creating a prompt
    return "Provide a detailed summary of the latest research on: {input}"  # Returns a prompt template for querying

def load_exisiting_db(uri):
    # Placeholder function for loading the vector store
    class VectorStore:  # Defines a local class for the vector store
        def as_retriever(self):  # Method to return itself as a retriever
            return self  # Returns the instance of VectorStore
    return VectorStore()  # Returns an instance of VectorStore

def create_stuff_documents_chain(model, prompt):
    # Placeholder function for creating a document chain
    return "Document chain here."  # Returns a string indicating where the document chain would be

def create_retrieval_chain(retriever, document_chain):
    # Placeholder function for creating a retrieval chain
    return lambda x: {  # Returns a lambda function to generate a response
        "context": [  # Contextual information with metadata for sources
            {"metadata": {"source": "https://example.com/research_paper1"}}, 
            {"metadata": {"source": "https://example.com/research_paper2"}}
        ], 
        "answer": "Generated response"  # Sample generated answer
    }

def query_rag(query):
    """
    Entry point for the RAG model to generate an answer to a given query.

    This function initializes the RAG model, sets up the necessary components such as the prompt template, vector store, 
    retriever, document chain, and retrieval chain, and then generates a response to the provided query.

    Args:
        query (str): The query string for which an answer is to be generated.
    
    Returns:
        str: The answer to the query
    """
    # Define the model
    model = ChatMistralAI(model='open-mistral-7b')  # Initializes the ChatMistralAI model
    print("Model Loaded")  # Print statement confirming the model has loaded

    prompt = create_prompt()  # Calls the function to create the prompt

    # Load the vector store and create the retriever
    vector_store = load_exisiting_db(uri=MILVUS_URI)  # Loads the existing vector store
    retriever = vector_store.as_retriever()  # Calls method to get the retriever from the vector store
    
    try:
        document_chain = create_stuff_documents_chain(model, prompt)  # Creates a document chain
        print("Document Chain Created")  # Print confirmation for document chain creation

        retrieval_chain = create_retrieval_chain(retriever, document_chain)  # Creates a retrieval chain
        print("Retrieval Chain Created")  # Print confirmation for retrieval chain creation
    
        # Generate a response to the query
        response = retrieval_chain({"input": f"{query}"})  # Calls the retrieval chain with the query input
    except HTTPStatusError as e:  # Catches HTTP errors
        print(f"HTTPStatusError: {e}")  # Print the error message
        if e.response.status_code == 429:  # Checks for rate limit error
            return "I am currently experiencing high traffic. Please try again later.", []  # Returns message for high traffic
        return f"HTTPStatusError: {e}", []  # Returns error message for other HTTP errors
    
    # Logic to add sources to the response
    max_relevant_sources = 4  # Defines maximum number of sources to include
    all_sources = ""  # Initialize string to hold all source links
    sources = []  # Initialize list to hold unique sources
    count = 1  # Initialize counter for source numbering

    for i in range(max_relevant_sources):  # Loop over the number of maximum relevant sources
        try:
            source = response["context"][i]["metadata"]["source"]  # Retrieves source from response context
            # Check if the source is already added to the list
            if source not in sources:  # If source is unique
                sources.append(source)  # Add source to the list
                all_sources += f"[Source {count}]({source}), "  # Append formatted source link to all_sources
                count += 1  # Increment the source counter
        except IndexError:  # Handle case where there are no more sources
            break  # Exit the loop if no more sources are available
            
    all_sources = all_sources[:-2]  # Remove the last comma and space from all_sources
    response["answer"] += f"\n\nSources: {all_sources}"  # Append all sources to the answer
    print("Response Generated")  # Print confirmation of response generation

    print("Initializes components, retrieves documents, generates response.")  # Summary of what the function does

    return response["answer"], sources  # Return the generated answer and list of sources

# Example usage
query = "Latest research on machine learning in healthcare"  # Defines an example query
answer, sources = query_rag(query)  # Calls the query_rag function with the example query
print(answer)  # Prints the generated answer

Model Loaded
Document Chain Created
Retrieval Chain Created
Response Generated
Initializes components, retrieves documents, generates response.
Generated response

Sources: [Source 1](https://example.com/research_paper1), [Source 2](https://example.com/research_paper2)


### Explanation:
- `create_vector_store` function is designed to manage a vector store using a Milvus database.

def create_vector_store(docs, embeddings, uri):
    """
    This function initializes a vector store using the provided documents and embeddings.
    It connects to a local Milvus database specified by the URI. If a collection named "research_paper_chatbot" already exists,
    it loads the existing vector store; otherwise, it creates a new vector store and drops any existing one.

    Args:
        docs (list): A list of documents to be stored in the vector store.
        embeddings : A function or model that generates embeddings for the documents.
        uri (str): Path to the local milvus db

    Returns:
        vector_store: The vector store created
    """

- Create the directory if it does not exist
    `head = os.path.split(uri)` : Split the URI to get the directory path

   `os.makedirs(head[0], exist_ok=True)`  : Create the directory if it doesn't exist

    `print("Directory created for vector store if it did not exist")` : Print confirmation of directory creation

- Connect to the Milvus database

    `connections.connect("default", uri=uri)` : Establish a connection to the Milvus database

    `print("Connected to the Milvus database")` : Print confirmation of database connection

- Check if the collection already exists

- `if utility.has_collection("research_paper_chatbot"):` - Check for existing collection

- `print("Collection already exists. Loading existing Vector Store.")` : Print collection status

- `vector_store = Milvus` : Load existing vector store

-  Get the embedding function

  `collection_name="research_paper_chatbot",
            embedding_function=get_embedding_function()`
            
- `connection_args={"uri": uri}` - Connection parameters

     `print("Existing Vector Store Loaded")` : Print confirmation of loading existing store
- else:
  `vector_store = Milvus.from_documents` : Create a new vector store from documents

  `documents=docs,` : Documents to store
  
   `embedding=embeddings,`  : Embedding function for documents

   `collection_name="research_paper_chatbot",` : Name of the collection

   `connection_args={"uri": uri},` : Connection parameters
  
- `drop_old=True,` : Drop old collection if exists
        )
- `print("New Vector Store Created with provided documents")` - Print confirmation of new store creation

- `return vector_store` : Return the created or loaded vector store

def load_exisiting_db(uri=MILVUS_URI):
    """
    Load an existing vector store from the local Milvus database specified by the URI.

    Args:
        uri (str, optional): Path to the local milvus db. Defaults to MILVUS_URI.

    Returns:
        vector_store: The vector store created
    """

- `vector_store = Milvus(`  : Load the vector store

- `collection_name="research_paper_chatbot",` : Name of the collection

- `embedding_function=get_embedding_function(),` : Get the embedding function

- `connection_args={"uri": uri},` Connection parameters
    )
    
- `print("Loaded existing Vector Store from Milvus database")` : Print confirmation of store loading

- `return vector_store` : Return the loaded vector store

`if __name__ == '__main__':`:  Load documents from the web
    
- `print("Loading documents from the web...")` : Print message before loading documents

- `documents = load_documents_from_web()` : Load documents from the web

- `print(f"Loaded {len(documents)} documents from the web.")` : Print number of documents loaded

- Split the documents into chunks
- `print("Splitting documents into chunks...")` : Print message before splitting documents

- `docs = split_documents(documents)` : Split loaded documents into chunk
  
- `print(f"Split into {len(docs)} chunks.")` : Print number of chunks created

- Get the embedding function
    `print("Getting embedding function...")`
      
- `embeddings = get_embedding_function()`  : Retrieve the embedding function

- Define the URI for the Milvus database Assign the URI for the Milvus database

    `uri = MILVUS_URI`
  
- Call the functions to see print statements

- `print("Creating vector store...")` : Print message before creating vector store

- `vector_store = create_vector_store(docs, embeddings, uri)` : Create the vector store

- `print("Loading existing vector store...")`  : Print message before loading existing store

    `loaded_vector_store = load_exisiting_db(uri)`  : Load the existing vector store

- `print("Finished operations.")` : Print message indicating completion of operations


In [16]:
def create_vector_store(docs, embeddings, uri):
    """
    This function initializes a vector store using the provided documents and embeddings.
    It connects to a local Milvus database specified by the URI. If a collection named "research_paper_chatbot" already exists,
    it loads the existing vector store; otherwise, it creates a new vector store and drops any existing one.

    Args:
        docs (list): A list of documents to be stored in the vector store.
        embeddings : A function or model that generates embeddings for the documents.
        uri (str): Path to the local milvus db

    Returns:
        vector_store: The vector store created
    """
    # Create the directory if it does not exist
    head = os.path.split(uri)
    os.makedirs(head[0], exist_ok=True)
    print("Directory created for vector store if it did not exist")

    # Connect to the Milvus database
    connections.connect("default", uri=uri)
    print("Connected to the Milvus database")

    # Check if the collection already exists
    if utility.has_collection("research_paper_chatbot"):
        print("Collection already exists. Loading existing Vector Store.")
        vector_store = Milvus(
            collection_name="research_paper_chatbot",
            embedding_function=get_embedding_function(),
            connection_args={"uri": uri}
        )
        print("Existing Vector Store Loaded")
    else:
        vector_store = Milvus.from_documents(
            documents=docs,
            embedding=embeddings,
            collection_name="research_paper_chatbot",
            connection_args={"uri": uri},
            drop_old=True,
        )
        print("New Vector Store Created with provided documents")
    return vector_store


def load_exisiting_db(uri=MILVUS_URI):
    """
    Load an existing vector store from the local Milvus database specified by the URI.

    Args:
        uri (str, optional): Path to the local milvus db. Defaults to MILVUS_URI.

    Returns:
        vector_store: The vector store created
    """
    vector_store = Milvus(
        collection_name="research_paper_chatbot",
        embedding_function=get_embedding_function(),
        connection_args={"uri": uri},
    )
    print("Loaded existing Vector Store from Milvus database")
    return vector_store


if __name__ == '__main__':
    # Load documents from the web
    print("Loading documents from the web...")
    documents = load_documents_from_web()  # Load documents
    print(f"Loaded {len(documents)} documents from the web.")

    # Split the documents into chunks
    print("Splitting documents into chunks...")
    docs = split_documents(documents)  # Ensure that docs is a list of documents
    print(f"Split into {len(docs)} chunks.")

    # Get the embedding function
    print("Getting embedding function...")
    embeddings = get_embedding_function()

    # Define the URI for the Milvus database
    uri = MILVUS_URI  

    # Call the functions to see print statements
    print("Creating vector store...")
    vector_store = create_vector_store(docs, embeddings, uri)

    print("Loading existing vector store...")
    loaded_vector_store = load_exisiting_db(uri)

    print("Finished operations.")

Loading documents from the web...
Loaded 1 documents from the web.
Splitting documents into chunks...
Split into 6 chunks.
Getting embedding function...
Returns Hugging Face model embedding function.
Creating vector store...
Directory created for vector store if it did not exist
Connected to the Milvus database
Collection already exists. Loading existing Vector Store.
Returns Hugging Face model embedding function.
Existing Vector Store Loaded
Loading existing vector store...
Returns Hugging Face model embedding function.
Loaded existing Vector Store from Milvus database
Finished operations.
