# Textbook Chatbot 

The Textbook Chatbot project for CSE 6550 is designed to assist with queries related to the textbook "Software Engineering: A Practitioner's Approach." The chatbot serves as an educational tool, helping users by providing information, answering questions, and possibly retrieving content from the textbook.

## Table of contents 

-- write the contents 

## Setup and imports 


In [12]:
# Install the LangChain packages used by this notebook.
# NOTE(review): later cells also import FAISS, BM25Retriever, PyPDFDirectoryLoader,
# a tiktoken-based text splitter and dotenv; these presumably need faiss-cpu,
# rank_bm25, pypdf, tiktoken and python-dotenv installed as well — confirm and add if missing.
!pip install langchain langchain_community langchain-mistralai langchain-huggingface


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-huggingface
  Obtaining dependency information for langchain-huggingface from https://files.pythonhosted.org/packages/9d/f8/77a303ddc492f6eed8bf0979f2bc6db4fa6eb1089c5e9f0f977dd87bc9c2/langchain_huggingface-0.1.2-py3-none-any.whl.metadata
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
  Obtaining dependency information for langchain-huggingface from https://files.pythonhosted.org/packages/19/0a/5c3c0fbed6a0c82949e72950f0b11c5c9b6f7eb7cf2d208df7a90f36d481/langchain_huggingface-0.1.1-py3-none-any.whl.metadata
  Downloading langchain_huggingface-0.1.1-py3-none-any.whl.metadata (1.3 kB)
  Obtaining dependency information for langchain-huggingface from https://files.pythonhosted.org/packages/39/ce/ad7f50a6289cf562


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Document loading 

In [None]:
# Import necessary libraries for interacting with vector stores, 
# retrieving relevant information from text data, and loading documents
import os

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader


In [14]:
# Embedding configuration shared by the vector-store helpers below.
EMBEDDING_MODEL_NAME = "Alibaba-NLP/gte-large-en-v1.5"  # Embedding model (https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5)
# NOTE(review): trust_remote_code presumably allows code shipped in the model
# repository to run when the model is loaded — confirm the source is trusted.
model_kwargs = {'trust_remote_code': True}
# Module-level embedding function; load_or_create_faiss_vector_store passes it
# to FAISS both when loading an existing index and when building a new one.
EMBEDDING_FUNCTION = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs=model_kwargs)

def load_documents_from_directory(
    document_path: str,
    chunk_size: int = 2048,
    chunk_overlap: int = 200
):
    """
    Load PDF documents from a directory and split them into chunks.

    The PDFs are loaded unsplit and then chunked exactly once with a
    tiktoken-based splitter, so ``chunk_size`` and ``chunk_overlap`` are the
    only settings that decide where chunk boundaries fall.

    Args:
        document_path (str): Path to the directory containing PDF files.
        chunk_size (int): Size of each text chunk, measured by the tiktoken
            encoder (default: 2048).
        chunk_overlap (int): Overlap between chunks (default: 200).
    Returns:
        List of document chunks.
    """
    print(f"Loading documents from {document_path}...")
    # Use load() rather than load_and_split(): the latter already chunks the
    # loaded pages with a default splitter, which would pre-cut the text
    # before the token-based splitter below ever sees it.
    documents = PyPDFDirectoryLoader(document_path).load()
    # Create a text splitter using tiktoken encoder
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Split the documents into chunks
    return text_splitter.split_documents(documents)


def load_or_create_faiss_vector_store(
    documents,
    persist_directory,
    collection_name="collection"
):
    """
    Return the FAISS store for a collection, building it on first use.

    The store lives at ``<persist_directory>/<collection_name>``. When that
    path already exists the saved index is loaded; otherwise a new index is
    built from ``documents`` and written to that path.

    Args:
        documents: List of documents to be indexed (used only when a new
            index has to be created).
        persist_directory (str): Directory to save/load the FAISS index.
        collection_name (str): Name of the collection.
    Returns:
        FAISS vector store object.
    """
    store_location = os.path.join(persist_directory, f'{collection_name}')

    if not os.path.exists(store_location):
        # Nothing persisted yet: embed the documents and save the result.
        print(f"Creating new FAISS vector store in {store_location}...\n")
        new_store = FAISS.from_documents(documents, embedding=EMBEDDING_FUNCTION)
        new_store.save_local(store_location)
        return new_store

    # A saved index is present: reuse it instead of re-embedding.
    print(f"Loading existing FAISS vector store from {store_location}...\n")
    return FAISS.load_local(
        store_location,
        embeddings=EMBEDDING_FUNCTION,
        allow_dangerous_deserialization=True,
    )

def similarity_search(
    question,
    vector_store,
    k,
    distance_threshold = 420.0
):
    """
    Fetch up to k scored matches and keep those within the distance limit.

    Args:
        question: The user question
        vector_store: Store exposing ``similarity_search_with_score``
        k: Number of documents to request from the store
        distance_threshold: Maximum score a document may have to be kept
    Returns:
        list[Document]: The retrieved documents whose score is at most
        ``distance_threshold``, in the order the store returned them
    """
    kept = []
    for candidate, distance in vector_store.similarity_search_with_score(question, k=k):
        # Keep only matches at or under the threshold.
        if distance <= distance_threshold:
            kept.append(candidate)
    return kept

def get_hybrid_retriever(documents, vector_store, k):
    """
    Create a hybrid retriever combining BM25 and vector search.

    Both underlying retrievers are asked for ``k`` documents; the ensemble
    weights BM25 at 0.2 and vector search at 0.8.

    Args:
        documents: List of documents for BM25 retriever.
        vector_store: FAISS vector store for vector retriever.
        k (int): Number of documents to retrieve.
    Returns:
        EnsembleRetriever object combining BM25 and vector search.
    """
    # Create BM25 retriever. It must use the caller's k: with k = 0 it would
    # return no documents and the "hybrid" retriever would be vector-only.
    bm25_retriever = BM25Retriever.from_documents(
        documents,
        k=k,
    )
    # Create vector retriever
    vector_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': k,
        },
    )
    # Combine retrievers with specified weights
    fusion_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vector_retriever],
        weights=[0.2, 0.8],
    )
    return fusion_retriever

Need to compile C++ extensions to get sparse attention suport. Please run python setup.py build develop


Could not find module 'C:\Users\pavan\AppData\Roaming\Python\Python310\site-packages\xformers\_C.pyd' (or one of its dependencies). Try using the full path with constructor syntax.


## Prompt template

#### Description:

- Imports `ChatPromptTemplate` : Facilitates structured conversation prompts for efficient chatbot interactions.

- Defines system instructions: Establishes rules for accurate, concise responses about software engineering.

- Limits response length: Restricts answers to 256 tokens for user clarity.

- Clarifies chatbot identity: Ensures users know they are interacting with a chatbot.

- Encourages clarification requests: Promotes seeking additional information for ambiguous or unclear questions.

- Creates prompt template: Structures the conversation flow between chatbot and user effectively.

- Describes chatbot function: Provides a clear explanation of the chatbot's purpose and scope.

In [None]:
from langchain_core.prompts import ChatPromptTemplate 

# Prompts
# System instructions sent verbatim to the model. Keep explanatory notes out of
# the string: everything between the triple quotes becomes prompt text, so
# inline "# ..." annotations would be read by the model as instructions.
system_prompt = """
You are a chatbot answering questions about "Software Engineering: A Practitioner's Approach" textbook.

1. Always identify yourself as a chatbot, not the textbook.
2. Answer based only on provided context.
3. If unsure, say "I don't have enough information to answer."
4. For unclear questions, ask for clarification.
5. Keep responses under 256 tokens.
6. Don't invent information.
7. Use context only if relevant.
8. To questions about your purpose, say: "I'm a chatbot designed to answer questions about the 'Software Engineering: A Practitioner's Approach' textbook."

Be accurate and concise. Answer only what's asked.
"""

# Build the chat prompt: the system instructions followed by one human turn.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    # {input} and {context} are template placeholders filled in when the prompt is formatted.
    ("human", "Question: {input}\n\nRelevant Context:\n{context}"),
])

def get_chatbot_prompt_description():
    """Return a one-line description of what the chatbot prompt is for."""
    description = "Chatbot prompt for answering textbook-related questions."
    return description

# Print the prompt description as a quick check that this cell ran.
output = get_chatbot_prompt_description()  
print(output)

## Environment Setup


- Imports modules for textbook document retrieval and processing.

- Loads environment variables for the chatbot configuration.

- Loads the textbook documents and manages FAISS vector storage.

- Creates a hybrid retriever for querying the textbook content.

- Retrieves the Mistral API key for chatbot integration.

- Raises errors if the corpus path or API key is missing.

In [None]:
import os
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from dotenv import load_dotenv

# Read variables from a .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)
def load_embeddings():
    """
    Build the hybrid retriever over the corpus named by CORPUS_SOURCE.

    Reads the corpus directory from the CORPUS_SOURCE environment variable,
    loads and chunks its documents, loads or creates the FAISS store under
    "<corpus>/faiss_indexes", and combines both in a hybrid retriever that
    requests 15 documents.

    Returns:
        retriever: Hybrid retriever produced by get_hybrid_retriever.

    Raises:
        ValueError: If CORPUS_SOURCE is unset or empty, or if no documents
            were loaded from the corpus directory.
    """
    corpus_dir = os.getenv("CORPUS_SOURCE")
    if not corpus_dir:
        raise ValueError("CORPUS_SOURCE not found in environment variables.")

    # Load and chunk the corpus; an empty result means the path is wrong or empty.
    chunks = load_documents_from_directory(corpus_dir)
    if not chunks:
        raise ValueError("No documents loaded. Please check the document path.")

    # The FAISS index is persisted inside the corpus directory itself.
    index_root = os.path.join(corpus_dir, "faiss_indexes")
    result_count = 15
    vector_store = load_or_create_faiss_vector_store(chunks, index_root)
    hybrid = get_hybrid_retriever(chunks, vector_store, result_count)

    print("Embeddings and retriever loaded.")

    return hybrid

def get_api_key():
    """
    Read the Mistral API key from the MISTRAL_API_KEY environment variable.

    Returns:
        str: The value of MISTRAL_API_KEY.

    Raises:
        ValueError: If MISTRAL_API_KEY is unset or empty.
    """
    key = os.environ.get("MISTRAL_API_KEY")
    if key:
        return key
    raise ValueError("MISTRAL_API_KEY not found in environment variables.")


# Example usage
if __name__ == "__main__":
    try:
        # Build the retriever and fetch the API key; both names are left at
        # module level for the cells that follow.
        retriever = load_embeddings()
        api_key = get_api_key()
        # Confirm retrieval without echoing the secret: printed output is
        # stored with the notebook and would leak the key.
        print("Successfully retrieved API Key.")
    except ValueError as e:
        # Report a missing corpus path, empty corpus, or missing API key.
        print(f"Error: {e}")


### RAG chain implementation

- Function usage: Generates answers using Retrieval-Augmented Generation (RAG) chain

- Question logging: Prints the question being processed for debugging

- Chain creation: Sets up question-answer and retrieval chains

- Response initialization: Initializes a dictionary for storing answers and context

- Response streaming: Streams answers and context from the RAG chain

- Output structure: Returns complete answer and model name in dictionary

In [None]:
# Mistral chat model client used by load_llm_api below.
from langchain_mistralai import ChatMistralAI
def load_llm_api(model_name):
    """
    Build the Mistral chat model with this notebook's generation settings.

    The API key is taken from the module-level ``api_key`` variable.

    Args:
        model_name: Model identifier passed to ChatMistralAI as ``model``.
    Returns:
        ChatMistralAI: Configured LLM instance.
    """
    llm_settings = {
        "model": model_name,
        "mistral_api_key": api_key,
        "temperature": 0.2,
        "max_tokens": 256,
        "top_p": 0.4,
    }
    return ChatMistralAI(**llm_settings)
# Model identifier passed to load_llm_api and echoed back in each response dictionary.
MODEL_NAME = "open-mistral-7b"
# Shared LLM instance used by chat_completion_as_dict.
llm = load_llm_api(MODEL_NAME)

def chat_completion_as_dict(question):
    """
    Generate a response to a given question using the RAG chain,
    returning only the answer in a dictionary.

    The chain is built from the module-level ``llm``, ``prompt`` and
    ``retriever``; its streamed answer pieces are joined into one string.
    Retrieved context is not part of the returned value.

    Args:
        question (str): The user question to be answered.

    Returns:
        dict: ``{"complete_answer": <answer text>, "model": MODEL_NAME}``.
    """
    print(f"Running prompt: {question}")  # Debug log of the incoming question.

    # Answer-generation step (LLM + prompt) wrapped by the retrieval step.
    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)

    # Streamed chunks are dicts; only those carrying an "answer" key
    # contribute text. Collect the pieces and join once at the end.
    answer_parts = [
        chunk["answer"]
        for chunk in rag_chain.stream({"input": question})
        if "answer" in chunk
    ]

    return {
        "complete_answer": "".join(answer_parts),
        "model": MODEL_NAME,
    }

# Example usage
if __name__ == "__main__":
    # Sample question used to exercise the RAG chain end to end.
    question = "What are the benefits of Retrieval-Augmented Generation?"
    
    # The result is a dict with "complete_answer" and "model" keys.
    response = chat_completion_as_dict(question)
    
    print(f"Response: {response['complete_answer']}\nModel: {response['model']}")