# Documentation for bot.py

### Explanation

- `import os`  : Import the os module for operating system functionality

- `from dotenv import load_dotenv` : Import load_dotenv to load environment variables from a `.env` file

- `from langchain.chains.combine_documents import create_stuff_documents_chain` : Import function for combining documents

- `from langchain.schema import Document` : Import Document schema from langchain

- `from langchain_core.prompts import PromptTemplate` : Import prompt template for generating prompts

- `from langchain_mistralai.chat_models import ChatMistralAI` : Import ChatMistralAI model for conversation

- `from langchain_milvus import Milvus` : Import Milvus for vector storage

- `from langchain_community.document_loaders import WebBaseLoader, RecursiveUrlLoader` : Import loaders for web documents

- `from langchain_text_splitters import RecursiveCharacterTextSplitter` : Import splitter for text documents

- `from langchain.chains import create_retrieval_chain` : Import function to create a retrieval chain

- `from langchain_huggingface import HuggingFaceEmbeddings` : Import HuggingFace embeddings

- `from pymilvus import connections, utility` : Import Milvus connection and utility functions

- `from requests.exceptions import HTTPError`  : Import HTTPError for handling HTTP exceptions

- `from httpx import HTTPStatusError` : Import HTTPStatusError for HTTP status exceptions

- Load environment variables from `.env` file

`load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")  # Get MISTRAL API key from environment variables`

- Set USER_AGENT environment variable if not already set

`if not os.getenv("USER_AGENT"): 
    os.environ["USER_AGENT"] = "my_custom_user_agent"  

- `MILVUS_URI = "./milvus/milvus_vector.db"` : Define the URI for the Milvus database

- `MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"` : Specify the model name for embeddings

- `CORPUS_SOURCE = 'https://dl.acm.org/doi/proceedings/10.1145/3597503'` : Define the source of documents

- Print statement summarizing the purpose of the code
    `print("Initializes environment, loads documents, sets embeddings.")`  # Output a summary of initialization actions


In [2]:
import os
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.schema import Document
from langchain_core.prompts import PromptTemplate
#from langchain_mistralai import MistralAIEmbeddings
from langchain_mistralai.chat_models import ChatMistralAI
#from langchain_cohere import ChatCohere
from langchain_milvus import Milvus
from langchain_community.document_loaders import WebBaseLoader, RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain_huggingface import HuggingFaceEmbeddings
from pymilvus import connections, utility
from requests.exceptions import HTTPError
from httpx import HTTPStatusError

# Load environment variables
load_dotenv()
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

# Set USER_AGENT environment variable if not already set
if not os.getenv("USER_AGENT"):
    os.environ["USER_AGENT"] = "my_custom_user_agent"  # You can customize this string

MILVUS_URI = "./milvus/milvus_vector.db"
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
CORPUS_SOURCE = 'https://dl.acm.org/doi/proceedings/10.1145/3597503'

print("Initializes environment, loads documents, sets embeddings.")

Initializes environment, loads documents, sets embeddings.


### Explanation

- `import os` - Import the os module for operating system functionality

- `os.environ['TQDM_DISABLE'] = '1' ` : Suppress tqdm warnings by disabling its output

- `from langchain_huggingface import HuggingFaceEmbeddings` : Import HuggingFaceEmbeddings for embedding generation

- `MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"` : Specify the model name for embeddings

- Create an embedding function using the specified model

   `def get_embedding_function():
    """
    Returns embedding function for the model.

    Returns:
        embedding function
    """
    embedding_function = HuggingFaceEmbeddings(model_name=MODEL_NAME)`
    
- Output a summary of the function's purpose
    print("Returns Hugging Face model embedding function.")  

- Return the created embedding function
    return embedding_function 

- Call the function to trigger the print statement, get the embedding function and store the returned embedding function in a variable
`embedding_function = get_embedding_function()  
print(f"Embedding function created with model: {MODEL_NAME}")`  


In [8]:
import os
os.environ['TQDM_DISABLE'] = '1'  # Suppress tqdm warnings

from langchain_huggingface import HuggingFaceEmbeddings

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

def get_embedding_function():
    """
    Returns embedding function for the model.

    Returns:
        embedding function
    """
    embedding_function = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    
    print("Returns Hugging Face model embedding function.")
    
    return embedding_function

# Call the function to trigger the print statement and get the embedding function
embedding_function = get_embedding_function()

print(f"Embedding function created with model: {MODEL_NAME}")

Returns Hugging Face model embedding function.
Embedding function created with model: sentence-transformers/all-MiniLM-L6-v2


- `from langchain_mistralai.chat_models import ChatMistralAI` : Import ChatMistralAI for generating responses

- `from httpx import HTTPStatusError` : Import HTTPStatusError for handling HTTP errors

- `MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"` : Define the model name for embeddings

- `MILVUS_URI = "./milvus/milvus_vector.db"` : Specify the URI for the local Milvus database

- Placeholder function for creating a prompt
`def create_prompt():`

- Return a prompt template
    `return "Provide a detailed summary of the latest research on: {input}"` 

- Placeholder function for loading the vector store
   `def load_exisiting_db(uri):`
    
- Return itself for retrieval

   `class VectorStore:
        def as_retriever(self):
            return self`

- Return an instance of the VectorStore class
    return VectorStore() 

- Placeholder function for creating a document chain
   `def create_stuff_documents_chain(model, prompt):`

- Return a placeholder for the document chain
    `return "Document chain here."`

- Placeholder function for creating a retrieval chain
    `def create_retrieval_chain(retriever, document_chain):`

- Return a lambda function simulating a retrieval process
    `return lambda x: { 
        "context": [  # Example context with metadata sources
            {"metadata": {"source": "https://example.com/research_paper1"}},
            {"metadata": {"source": "https://example.com/research_paper2"}}
        ],`
        "answer": "Generated response"  # Placeholder answer
    }

 def query_rag(query):
    """
    Entry point for the RAG model to generate an answer to a given query

    This function initializes the RAG model, sets up the necessary components such as the prompt template, vector store, 
    
    retriever, document chain, and retrieval chain, and then generates a response to the provided query.

    Args:
        query (str): The query string for which an answer is to be generated.
    
    Returns:
        str: The answer to the query
    """
    
- Define the model
    `model = ChatMistralAI(model='open-mistral-7b')`  : Instantiate the ChatMistralAI model

    `print("Model Loaded")` : Print confirmation of model loading
    
    `prompt = create_prompt()` : Create a prompt for the model

- Load the vector store and create the retriever

    `vector_store = load_exisiting_db(uri=MILVUS_URI)` : Load the existing vector store


    `retriever = vector_store.as_retriever()`  : Get the retriever from the vector store
    
    try:

  `document_chain = create_stuff_documents_chain(model, prompt)` : Create the document chain

  `print("Document Chain Created")` : Print confirmation of document chain creation

  `retrieval_chain = create_retrieval_chain(retriever, document_chain)` : Create the retrieval chain

  `print("Retrieval Chain Created")` : Print confirmation of retrieval chain creation
    
- Generate a response to the query and invoke the retrieval chain with the query
        response = retrieval_chain({"input": f"{query}"})

    except HTTPStatusError as e:

  `print(f"HTTPStatusError: {e}") `: Print any HTTP errors encountered

  `if e.response.status_code == 429:` : Check for high traffic error

   `return "I am currently experiencing high traffic. Please try again later.", []` : Return a message for high traffic

   `return f"HTTPStatusError: {e}", []` : Return the error message if other error occurs
    
- Logic to add sources to the response

   `max_relevant_sources = 4`  : Set the maximum number of sources to add to the response

    `all_sources = ""` : Initialize string to hold all sources

   `sources = []`  : Initialize list to hold unique sources

    `count = 1 ` : Initialize a counter for source numbering
    
    `for i in range(max_relevant_sources):` : Loop through the maximum number of sources

        `try:
            source = response["context"][i]["metadata"]["source"]`  :  Get the source from the response

  - Check if the source is already added to the list
    `if source not in sources:` : If source is not already in the list

    `sources.append(source)`  : Add the source to the list
    
    `all_sources += f"[Source {count}]({source}), "` : Format the source for output

    `count += 1` : Increment the source count
  
    `except IndexError:` : If there are no more sources to add

    `break` : Exit the loop if an IndexError occurs
    
   `all_sources = all_sources[:-2]`  : Remove the last comma and space from the sources string

    `response["answer"] += f"\n\nSources: {all_sources}"` : Append the sources to the response answer

    ` print("Response Generated")` : Print confirmation of response generation

    `print("Initializes components, retrieves documents, generates response.")` : Print summary of process

    `return response["answer"], sources`  : Return the generated answer and list of sources

# Example usage
- query = "Latest research on machine learning in healthcare"  : Define a sample query
  
- answer, sources = query_rag(query) : Call the query_rag function with the sample query

- print(answer) : Print the generated answer

In [11]:
from langchain_mistralai.chat_models import ChatMistralAI
from httpx import HTTPStatusError

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
MILVUS_URI = "./milvus/milvus_vector.db"

def create_prompt():
    # Placeholder function for creating a prompt
    return "Provide a detailed summary of the latest research on: {input}"

def load_exisiting_db(uri):
    # Placeholder function for loading the vector store
    class VectorStore:
        def as_retriever(self):
            return self
    return VectorStore()

def create_stuff_documents_chain(model, prompt):
    # Placeholder function for creating a document chain
    return "Document chain here."

def create_retrieval_chain(retriever, document_chain):
    # Placeholder function for creating a retrieval chain
    return lambda x: {"context": [{"metadata": {"source": "https://example.com/research_paper1"}}, {"metadata": {"source": "https://example.com/research_paper2"}}], "answer": "Generated response"}

def query_rag(query):
    """
    Entry point for the RAG model to generate an answer to a given query

    This function initializes the RAG model, sets up the necessary components such as the prompt template, vector store, 
    retriever, document chain, and retrieval chain, and then generates a response to the provided query.

    Args:
        query (str): The query string for which an answer is to be generated.
    
    Returns:
        str: The answer to the query
    """
    # Define the model
    model = ChatMistralAI(model='open-mistral-7b')
    print("Model Loaded")

    prompt = create_prompt()

    # Load the vector store and create the retriever
    vector_store = load_exisiting_db(uri=MILVUS_URI)
    retriever = vector_store.as_retriever()
    try:
        document_chain = create_stuff_documents_chain(model, prompt)
        print("Document Chain Created")

        retrieval_chain = create_retrieval_chain(retriever, document_chain)
        print("Retrieval Chain Created")
    
        # Generate a response to the query
        response = retrieval_chain({"input": f"{query}"})
    except HTTPStatusError as e:
        print(f"HTTPStatusError: {e}")
        if e.response.status_code == 429:
            return "I am currently experiencing high traffic. Please try again later.", []
        return f"HTTPStatusError: {e}", [] 
    
    # Logic to add sources to the response
    max_relevant_sources = 4  # number of sources at most to be added to the response
    all_sources = ""
    sources = []
    count = 1
    for i in range(max_relevant_sources):
        try:
            source = response["context"][i]["metadata"]["source"]
            # Check if the source is already added to the list
            if source not in sources:
                sources.append(source)
                all_sources += f"[Source {count}]({source}), "
                count += 1
        except IndexError:  # if there are no more sources to add
            break
    all_sources = all_sources[:-2]  # remove the last comma and space
    response["answer"] += f"\n\nSources: {all_sources}"
    print("Response Generated")

    print("Initializes components, retrieves documents, generates response.")

    return response["answer"], sources

# Example usage
query = "Latest research on machine learning in healthcare"
answer, sources = query_rag(query)
print(answer)

Model Loaded
Document Chain Created
Retrieval Chain Created
Response Generated
Initializes components, retrieves documents, generates response.
Generated response

Sources: [Source 1](https://example.com/research_paper1), [Source 2](https://example.com/research_paper2)


### Explanation:
- `create_vector_store` function is designed to manage a vector store using a Milvus database.

def create_vector_store(docs, embeddings, uri):
    """
    This function initializes a vector store using the provided documents and embeddings.
    It connects to a local Milvus database specified by the URI. If a collection named "research_paper_chatbot" already exists,
    it loads the existing vector store; otherwise, it creates a new vector store and drops any existing one.

    Args:
        docs (list): A list of documents to be stored in the vector store.
        embeddings : A function or model that generates embeddings for the documents.
        uri (str): Path to the local milvus db

    Returns:
        vector_store: The vector store created
    """

- Create the directory if it does not exist
    `head = os.path.split(uri)` : Split the URI to get the directory path

   `os.makedirs(head[0], exist_ok=True)`  : Create the directory if it doesn't exist

    `print("Directory created for vector store if it did not exist")` : Print confirmation of directory creation

- Connect to the Milvus database

    `connections.connect("default", uri=uri)` : Establish a connection to the Milvus database

    `print("Connected to the Milvus database")` : Print confirmation of database connection

- Check if the collection already exists

- `if utility.has_collection("research_paper_chatbot"):` - Check for existing collection

- `print("Collection already exists. Loading existing Vector Store.")` : Print collection status

- `vector_store = Milvus` : Load existing vector store

-  Get the embedding function

  `collection_name="research_paper_chatbot",
            embedding_function=get_embedding_function()`
            
- `connection_args={"uri": uri}` - Connection parameters

     `print("Existing Vector Store Loaded")` : Print confirmation of loading existing store
- else:
  `vector_store = Milvus.from_documents` : Create a new vector store from documents

  `documents=docs,` : Documents to store
  
   `embedding=embeddings,`  : Embedding function for documents

   `collection_name="research_paper_chatbot",` : Name of the collection

   `connection_args={"uri": uri},` : Connection parameters
  
- `drop_old=True,` : Drop old collection if exists
        )
- `print("New Vector Store Created with provided documents")` - Print confirmation of new store creation

- `return vector_store` : Return the created or loaded vector store

def load_exisiting_db(uri=MILVUS_URI):
    """
    Load an existing vector store from the local Milvus database specified by the URI.

    Args:
        uri (str, optional): Path to the local milvus db. Defaults to MILVUS_URI.

    Returns:
        vector_store: The vector store created
    """

- `vector_store = Milvus(`  : Load the vector store

- `collection_name="research_paper_chatbot",` : Name of the collection

- `embedding_function=get_embedding_function(),` : Get the embedding function

- `connection_args={"uri": uri},` Connection parameters
    )
    
- `print("Loaded existing Vector Store from Milvus database")` : Print confirmation of store loading

- `return vector_store` : Return the loaded vector store

`if __name__ == '__main__':`:  Load documents from the web
    
- `print("Loading documents from the web...")` : Print message before loading documents

- `documents = load_documents_from_web()` : Load documents from the web

- `print(f"Loaded {len(documents)} documents from the web.")` : Print number of documents loaded

- Split the documents into chunks
- `print("Splitting documents into chunks...")` : Print message before splitting documents

- `docs = split_documents(documents)` : Split loaded documents into chunk
  
- `print(f"Split into {len(docs)} chunks.")` : Print number of chunks created

- Get the embedding function
    `print("Getting embedding function...")`
      
- `embeddings = get_embedding_function()`  : Retrieve the embedding function

- Define the URI for the Milvus database Assign the URI for the Milvus database

    `uri = MILVUS_URI`
  
- Call the functions to see print statements

- `print("Creating vector store...")` : Print message before creating vector store

- `vector_store = create_vector_store(docs, embeddings, uri)` : Create the vector store

- `print("Loading existing vector store...")`  : Print message before loading existing store

    `loaded_vector_store = load_exisiting_db(uri)`  : Load the existing vector store

- `print("Finished operations.")` : Print message indicating completion of operations


In [16]:
def create_vector_store(docs, embeddings, uri):
    """
    This function initializes a vector store using the provided documents and embeddings.
    It connects to a local Milvus database specified by the URI. If a collection named "research_paper_chatbot" already exists,
    it loads the existing vector store; otherwise, it creates a new vector store and drops any existing one.

    Args:
        docs (list): A list of documents to be stored in the vector store.
        embeddings : A function or model that generates embeddings for the documents.
        uri (str): Path to the local milvus db

    Returns:
        vector_store: The vector store created
    """
    # Create the directory if it does not exist
    head = os.path.split(uri)
    os.makedirs(head[0], exist_ok=True)
    print("Directory created for vector store if it did not exist")

    # Connect to the Milvus database
    connections.connect("default", uri=uri)
    print("Connected to the Milvus database")

    # Check if the collection already exists
    if utility.has_collection("research_paper_chatbot"):
        print("Collection already exists. Loading existing Vector Store.")
        vector_store = Milvus(
            collection_name="research_paper_chatbot",
            embedding_function=get_embedding_function(),
            connection_args={"uri": uri}
        )
        print("Existing Vector Store Loaded")
    else:
        vector_store = Milvus.from_documents(
            documents=docs,
            embedding=embeddings,
            collection_name="research_paper_chatbot",
            connection_args={"uri": uri},
            drop_old=True,
        )
        print("New Vector Store Created with provided documents")
    return vector_store


def load_exisiting_db(uri=MILVUS_URI):
    """
    Load an existing vector store from the local Milvus database specified by the URI.

    Args:
        uri (str, optional): Path to the local milvus db. Defaults to MILVUS_URI.

    Returns:
        vector_store: The vector store created
    """
    vector_store = Milvus(
        collection_name="research_paper_chatbot",
        embedding_function=get_embedding_function(),
        connection_args={"uri": uri},
    )
    print("Loaded existing Vector Store from Milvus database")
    return vector_store


if __name__ == '__main__':
    # Load documents from the web
    print("Loading documents from the web...")
    documents = load_documents_from_web()  # Load documents
    print(f"Loaded {len(documents)} documents from the web.")

    # Split the documents into chunks
    print("Splitting documents into chunks...")
    docs = split_documents(documents)  # Ensure that docs is a list of documents
    print(f"Split into {len(docs)} chunks.")

    # Get the embedding function
    print("Getting embedding function...")
    embeddings = get_embedding_function()

    # Define the URI for the Milvus database
    uri = MILVUS_URI  

    # Call the functions to see print statements
    print("Creating vector store...")
    vector_store = create_vector_store(docs, embeddings, uri)

    print("Loading existing vector store...")
    loaded_vector_store = load_exisiting_db(uri)

    print("Finished operations.")

Loading documents from the web...
Loaded 1 documents from the web.
Splitting documents into chunks...
Split into 6 chunks.
Getting embedding function...
Returns Hugging Face model embedding function.
Creating vector store...
Directory created for vector store if it did not exist
Connected to the Milvus database
Collection already exists. Loading existing Vector Store.
Returns Hugging Face model embedding function.
Existing Vector Store Loaded
Loading existing vector store...
Returns Hugging Face model embedding function.
Loaded existing Vector Store from Milvus database
Finished operations.
