<a href="https://colab.research.google.com/github/Bharath1510/LLM_Apps/blob/main/Rag_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the notebook's third-party dependencies. `-q` plus the redirect to
# /dev/null hides all pip output, including installation errors.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
! pip install -q langchain_community langchainhub chromadb langchain sentence_transformers pypdf langchain-huggingface gradio langchain_chroma > /dev/null 2>&1

In [2]:
import os
from google.colab import userdata
import warnings
# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.filterwarnings('ignore')

# Enable LangChain tracing and point it at the hosted endpoint.
os.environ.update(
    LANGCHAIN_TRACING_V2='true',
    LANGCHAIN_ENDPOINT='https://api.smith.langchain.com',
)

# Credentials are pulled from the Colab secret store instead of being hard-coded.
os.environ['LANGCHAIN_API_KEY'] = userdata.get('Langchain_API_Key')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HF_TOKEN')

In [3]:
import gradio as gr
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from transformers import pipeline, AutoTokenizer
import torch
from sentence_transformers import SentenceTransformer
from huggingface_hub import login
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory

# Authenticate against the Hugging Face Hub with the token exported earlier.
# NOTE(review): presumably needed because the Llama checkpoint is access-restricted — confirm.
login(token=os.environ['HUGGINGFACEHUB_API_TOKEN'])

# Sentence-transformer model used to embed both document chunks and queries.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Choose the device at runtime so the cell also runs on CPU-only runtimes
# instead of failing on a hard-coded "cuda". The GPU path is unchanged
# (bfloat16 on CUDA); on CPU fall back to float32.
llm_device = "cuda" if torch.cuda.is_available() else "cpu"
llm_dtype = torch.bfloat16 if llm_device == "cuda" else torch.float32

# Load the LLM. The tokenizer is loaded once and handed to the pipeline so it
# is not fetched a second time; its EOS id doubles as the padding id.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation",
                model=model_id,
                tokenizer=tokenizer,
                torch_dtype=llm_dtype,
                device_map=llm_device,
                pad_token_id=tokenizer.eos_token_id,
                clean_up_tokenization_spaces=True
                )

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [4]:
# Conversation memory: the user turn is read from the "query" key and the
# model turn from the "answer" key when a turn is saved.
memory = ConversationBufferMemory(output_key="answer", input_key="query")

# Chroma vector store backed by the "sample" directory, using the shared embedding model.
vectorstore = Chroma(persist_directory="sample", embedding_function=embeddings)


  memory = ConversationBufferMemory(input_key="query", output_key="answer")


In [None]:
def rewrite_query(query):
    """Expand a user query into up to three alternative phrasings via the LLM.

    Args:
        query: The user's original question.

    Returns:
        A list of one to three non-empty query strings. If the model produces
        no usable line, the list contains only the original ``query`` so that
        retrieval still has something to search with.
    """
    system_rewrite = """You are a helpful assistant that generates multiple search queries based on a single input query.

                        Perform query expansion. If there are multiple common ways of phrasing a user question
                        or common synonyms for key words in the question, make sure to return multiple versions
                        of the query with the different phrasings.

                        If there are acronyms or words you are not familiar with, do not try to rephrase them.

                        Return 3 different versions of the question."""
    # Prompt for rewriting the query
    rewrite_prompt = f"{system_rewrite}\nOriginal Query: {query}\nRewritten Queries:"

    # One completion is requested; the prompt asks the model to list all
    # three rewrites inside that single completion.
    responses = pipe(rewrite_prompt, max_new_tokens=100, num_return_sequences=1)
    generated_text = responses[0]['generated_text']

    # The pipeline echoes the prompt. Cut it off by length so a repeated
    # "Rewritten Queries:" marker inside the completion cannot shift the cut.
    if generated_text.startswith(rewrite_prompt):
        continuation = generated_text[len(rewrite_prompt):]
    else:
        continuation = generated_text.split("Rewritten Queries:")[-1]

    # Drop blank lines before taking the first three, so empty strings are
    # never sent to the vector store as search queries.
    stripped_lines = (line.strip() for line in continuation.splitlines())
    rewritten_queries = [line for line in stripped_lines if line][:3]
    return rewritten_queries or [query]

# Define the RAG bot function
def rag_bot(query, prompt, max_length):
    """Answer ``query`` from the indexed documents, with conversation memory.

    Args:
        query: The user's question.
        prompt: Template formatted with ``context`` and ``question`` fields.
        max_length: Maximum number of new tokens the model may generate.

    Returns:
        The generated answer with the echoed prompt removed. The
        question/answer pair is also saved to the module-level ``memory``.
    """
    # Expand the question, then search the store once per phrasing.
    retrieved_docs = []
    for rewritten_query in rewrite_query(query):
        docs = vectorstore.similarity_search(rewritten_query, k=5)
        retrieved_docs.extend(docs)
        # Log the hit count only; dumping whole documents floods the cell output.
        print("Retrieved:", rewritten_query, len(docs), "documents")

    # De-duplicate on chunk text, keeping the first occurrence of each chunk.
    # Every unique chunk is used; no further top-k cut is applied.
    unique_texts = dict.fromkeys(doc.page_content for doc in retrieved_docs)
    context = "\n".join(unique_texts)

    # Prepend the conversation so far to the retrieved context.
    conversation_history = memory.load_memory_variables({})["history"]
    full_context = f"{conversation_history}\nContext: {context}"

    # Format the prompt, including chat history
    input_text = prompt.format(context=full_context, question=query)

    # Generate response with the model
    response = pipe(input_text, max_new_tokens=max_length, num_return_sequences=1, truncation=True)
    generated_text = response[0]['generated_text']

    # The pipeline echoes the prompt. Removing it by length works even when a
    # user-edited prompt has no "Answer:" marker (splitting on the marker
    # would then return the whole prompt and context as the answer).
    if generated_text.startswith(input_text):
        answer = generated_text[len(input_text):].strip()
    else:
        answer = generated_text.split("Answer:")[-1].strip()

    # Save the new query and response to memory
    memory.save_context({"query": query}, {"answer": answer})

    return answer

# Function to load and process uploaded files
def load_files(files, chunk_size, chunk_overlap):
    """Load uploaded PDFs, split them into chunks and index them in Chroma.

    Args:
        files: Uploaded files, given as objects exposing the path in ``.name``
            or as plain path strings. May be ``None`` or empty when nothing
            was uploaded.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        ``"Success"`` once every file has been processed, or a short notice
        when there was nothing to process.
    """
    global vectorstore
    vectorstore = Chroma(embedding_function=embeddings, persist_directory="sample")

    # Nothing uploaded: report it instead of failing on iteration over None.
    if not files:
        return "No files provided."

    # The splitter does not depend on the file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(#separators=[". ", "? ", "! "],
                                                   chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)

    # Load and split each document into chunks, then store embeddings
    for file in files:
        pdf_path = getattr(file, "name", file)
        documents = PyPDFLoader(pdf_path).load()
        chunks = text_splitter.split_documents(documents)

        # Skip empty batches (e.g. a PDF with no extractable text) rather
        # than handing an empty list to the store.
        if chunks:
            vectorstore.add_documents(documents=chunks)

    return "Success"

# Function to clear all embeddings from Chroma
def clear_embeddings():
    """Wipe the Chroma collection and rebind ``vectorstore`` to an empty one.

    Returns:
        A confirmation message for the status textbox.
    """
    global vectorstore

    # Drop the collection held by the current store instance.
    vectorstore.delete_collection()

    # Re-open the store on the same directory and reset its collection so the
    # module-level handle points at an empty collection.
    vectorstore = Chroma(persist_directory="sample", embedding_function=embeddings)
    vectorstore.reset_collection()

    confirmation = "All embeddings cleared from Chroma."
    return confirmation


# Default prompt template, used as the initial value of the "Prompt" textbox.
# `rag_bot` fills it with `str.format(context=..., question=...)`, so an edited
# prompt must keep the `{context}` and `{question}` placeholders, and any other
# literal braces must be doubled. The template ends with an "Answer:" line,
# after which the model's reply is expected.
default_prompt = """
Please answer the question using the provided context. Your response should be clear and understandable.

If the question is not related or there is no answer available in context, politely respond that you are not allowed answer questions that are unrelated to the context.

Context: {context}

Question: {question}

Answer:
"""

# Gradio interface setup

def handle_chat(query, prompt, max_length, chat_history):
    """Run one chat turn and append it to the chatbot history.

    Args:
        query: Text typed by the user.
        prompt: Prompt template from the "Prompt" textbox.
        max_length: Value of the "Max New Tokens" field.
        chat_history: Current chatbot history as (query, answer) pairs; may be
            ``None`` before the first turn.

    Returns:
        A tuple ``("", chat_history)``: the empty string clears the query box
        and the history refreshes the chatbot.
    """
    chat_history = chat_history or []

    # Do not spend a generation on a blank query.
    if not query or not query.strip():
        return "", chat_history

    answer = rag_bot(query, prompt, max_length)
    chat_history.append((query, answer))
    return "", chat_history


def clear_memory():
    """Forget the stored conversation and report it in the status box."""
    memory.clear()
    return "Memory cleared"


with gr.Blocks() as app:
    gr.Markdown("# RAG Chatbot with Document Upload and Memory")

    with gr.Row():
        # Left column for parameter settings and file upload.
        # Column scales must be integers; 1 and 2 keep the original 1:2 width ratio.
        with gr.Column(scale=1):
            chunk_size_input = gr.Number(value=512, label="Chunk Size", precision=0)
            chunk_overlap_input = gr.Number(value=20, label="Chunk Overlap", precision=0)
            max_length_input = gr.Number(value=2048, label="Max New Tokens", precision=0)

            files_input = gr.File(label="Upload PDFs", file_count="multiple")
            upload_button = gr.Button("Process Documents")
            clear_button = gr.Button("Clear All Embeddings")
            clear_memory_button = gr.Button("Clear Memory")
            status_text = gr.Textbox(label="Status", placeholder="Upload documents to process or clear embeddings.")

            upload_button.click(load_files, inputs=[files_input, chunk_size_input, chunk_overlap_input], outputs=[status_text])
            clear_button.click(clear_embeddings, outputs=[status_text])

        # Right column for the prompt editor and the chat itself.
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(value=default_prompt, label="Prompt", max_lines=5)

            chatbot = gr.Chatbot(label="Chatbot")

            query_input = gr.Textbox(label="Your Query", placeholder="Ask a question...")
            submit_button = gr.Button("Get Answer")

            submit_button.click(handle_chat, inputs=[query_input, prompt_input, max_length_input, chatbot], outputs=[query_input, chatbot])
            # The "Clear Memory" button lives in the left column but is wired here.
            clear_memory_button.click(clear_memory, outputs=[status_text])

# Launch the Gradio app (debug=True keeps the cell running and shows errors/logs)
app.launch(debug=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://53be3042f60cc8228e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
