<a href="https://colab.research.google.com/github/Finlay-J/PDF-RAG/blob/main/End_to_End_RAG_UVICAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# File QA RAG Chatbot App with Gemini

The app provides the following features:
1. PDF document upload and indexing
2. RAG system for query analysis and response
3. Result streaming capabilities
4. Show sources

## Pip install

In [None]:
# Install the packages the notebook and app.py depend on.
# Most are pinned to exact versions; sentence-transformers and pytesseract are
# left unpinned, so their versions may differ between runs.
!pip install langchain==0.1.12
# Supplies the "sentence-transformers/all-mpnet-base-v2" embedding model that
# app.py loads through HuggingFaceEmbeddings.
!pip install sentence-transformers   # includes Hugging Face + Accelerate
# NOTE(review): langchain_openai is not imported anywhere in the visible code — confirm it is still needed.
!pip install langchain_openai==0.0.8
!pip install langchain-google-genai==0.0.8
!pip install langchain_community==0.0.29
!pip install streamlit==1.32.2
!pip install PyMuPDF==1.24.0
!pip install chromadb==0.4.24
# pyngrok is used in the last cell to expose the Streamlit port publicly.
!pip install pyngrok==7.1.5
# pytesseract is imported by app.py for image-based PDFs but not otherwise used there.
!pip install pytesseract

## Put API key in YAML file

In [6]:
from google.colab import userdata
from google import genai
import os
import yaml, pathlib

# Read the Gemini API key from Colab's secret store (secret name: "GeminiAPI").
gemini_key  = userdata.get("GeminiAPI")   # what you already called key

# Fail here, with a clear message, instead of writing a null key to disk and
# getting an opaque authentication error later inside the Streamlit app.
if not gemini_key:
    raise ValueError(
        "Colab secret 'GeminiAPI' is missing or empty; add it in the Secrets panel."
    )

# build the structure you want in the file
# (app.py reads it back as secrets["gemini"]["api_key"])
secrets_dict = {
    "gemini":  {"api_key": gemini_key},
}

# save it to disk
# The file holds a credential, so create it readable/writable by the owner only.
# os.open() applies the mode only when it creates the file, hence the explicit
# chmod for the case where secrets.yaml already exists from an earlier run.
secrets_fd = os.open("secrets.yaml", os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(secrets_fd, "w") as f:
    yaml.safe_dump(secrets_dict, f, sort_keys=False)
os.chmod("secrets.yaml", 0o600)

## Write app.py

In [None]:
%%writefile app.py

### Imports
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain_core.callbacks.base import BaseCallbackHandler
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from operator import itemgetter
import streamlit as st
import tempfile
import os
import pandas as pd
import torch
import yaml, pathlib

# imports I used to deal with image-based pdfs, won't be used in codealong
import pytesseract, io
from PIL import Image
import fitz


### API key ###
# Parse secrets.yaml (written by the notebook cell above) and pull out the
# Gemini key stored under gemini -> api_key.
_secrets_file = pathlib.Path("secrets.yaml")
_secrets = yaml.safe_load(_secrets_file.read_text())
api_key = _secrets["gemini"]["api_key"]
# alternatively, api_key = [insert API key here]

### USER MODIFIABLE CODE ###

@st.cache_resource(ttl="1h") # Caches the object returned for 1 hr
def configure_retriever(uploaded_files):
  ''' Build a retriever over the uploaded PDFs.

      Steps:
        1. Write each upload to a temporary file and load it with PyMuPDFLoader.
        2. Split the loaded documents into chunks.
        3. Embed the chunks with a sentence-transformers model.
        4. Store chunks + embeddings in a Chroma vector DB and return its retriever.

      Args:
        uploaded_files: iterable of Streamlit uploaded-file objects; each must
          expose `.name` and `.getvalue()`.

      Returns:
        The retriever produced by `Chroma.as_retriever()`. The result is cached
        by Streamlit for one hour.

      Known limitation: some pdfs are image based and yield no extractable text.
      Handling them needs an extra OCR step between step 1 and step 2, which is
      not implemented here.
  '''

  # Step 1: load files
  docs = []
  # The context manager guarantees the temp directory is removed, instead of
  # relying on garbage collection of the TemporaryDirectory object.
  with tempfile.TemporaryDirectory() as temp_dir:
    for file in uploaded_files:
      # basename() keeps a client-supplied name with path components from
      # escaping the temp directory.
      temp_filepath = os.path.join(temp_dir, os.path.basename(file.name))
      with open(temp_filepath, "wb") as f:
        f.write(file.getvalue())

      # Load only AFTER the `with` block has flushed and closed the file;
      # reading it while still open for writing can see a truncated PDF.
      # (Image-only PDFs still produce no text here.)
      loader = PyMuPDFLoader(temp_filepath)
      docs.extend(loader.load())

  # Step 2: Split files into chunks
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=200)
  docs_chunks = text_splitter.split_documents(docs)

  # Step 3: Vectorize/embed chunks
  # Alternative model that could be swapped in: BAAI/bge-base-en-v1.5
  model_name = "sentence-transformers/all-mpnet-base-v2" #arbitrary model
  embeddings_model = HuggingFaceEmbeddings(
      model_name=model_name,
      # Use the GPU when one is available, otherwise fall back to CPU.
      model_kwargs={"device" : "cuda" if torch.cuda.is_available() else "cpu"},
      encode_kwargs={"batch_size": 32, "normalize_embeddings": True},
  )

  # Step 4: Return VectorDB
  vectordb = Chroma.from_documents(docs_chunks, embeddings_model)
  # The similarity search itself is run by the vector store when the retriever is called.
  retriever = vectordb.as_retriever()
  return retriever


### Misc ###
# Page-level configuration and the title shown at the top of the app.
st.set_page_config(page_title="File QA Chatbot")
st.title("Welcome to File QA RAG Chatbot")

# Creates UI element to accept PDF uploads
# (sidebar widget, PDF only, several files allowed at once)
uploaded_files = st.sidebar.file_uploader(
    label="Upload PDF files", type=["pdf"],
    accept_multiple_files=True
)
# Nothing below can work without documents: show a hint and halt this script run.
if not uploaded_files:
  st.info("Please upload PDF documents to continue")
  st.stop()

# Chat model used to generate answers; the key comes from secrets.yaml (api_key above).
# streaming=True is requested so the callback handlers can receive tokens as they arrive.
gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.1,
    max_retries=2,
    streaming=True,
    google_api_key=api_key,
)

# Prompt for the RAG chain. It exposes exactly two placeholders, {context}
# (the retrieved document text) and {question} (the user's query).
# The instruction text replaces a bare "." placeholder, which gave the model
# no guidance on how to use the retrieved context.
qa_template = """
Use only the following pieces of retrieved context to answer the question.
If the answer is not contained in the context, say that you don't know
instead of making one up. Keep the answer concise and grounded in the context.

context:
{context}

question:
{question}
"""

###  Streamlit helper classes: ###
class StreamHandler(BaseCallbackHandler):
  ''' Streams an LLM answer into a Streamlit container.

      Each new token is appended to the accumulated text and the container is
      re-rendered as markdown, so the answer appears to grow live.
  '''
  def __init__(self, container, initial_text=""):
    # `container` must expose .markdown(); `initial_text` seeds the buffer.
    self.text = initial_text
    self.container = container

  def _render(self) -> None:
    # Redraw the container with everything accumulated so far.
    self.container.markdown(self.text)

  def on_llm_new_token(self, token: str, **kwargs) -> None:
    self.text = self.text + token
    self._render()

  def on_llm_end(self, response, **kwargs):
    # Tokens were already streamed and displayed: nothing left to do.
    if self.text:
      return
    # Otherwise show the complete text of the first generation in one go.
    self.text = response.generations[0][0].text
    self._render()


class PostMessageHandler(BaseCallbackHandler):
  ''' Callback handler which does some post-processing on the LLM response.
      Used to post the top 3 document sources used by the LLM in the RAG response.

      Args:
        msg: Streamlit container the sources are rendered into. If it is None
          (or lacks a `.markdown` method) rendering falls back to the `st`
          module, i.e. the currently active Streamlit context.
  '''
  def __init__(self, msg):
    super().__init__()
    self.msg = msg
    # One dict per unique (source, page): {"source", "page", "content"}.
    self.sources = []

  def on_retriever_end(self, documents, *, run_id=None, parent_run_id=None, **kwargs):
    ''' Record the unique (source, page) pairs of the retrieved documents. '''
    # Seed from already-recorded sources so repeated retriever calls do not
    # add duplicates; a set gives O(1) membership tests.
    seen = {(s["source"], s["page"]) for s in self.sources}
    for d in documents: # retrieved documents from retriever based on user query
      # .get() tolerates documents whose loader did not set these keys.
      source = d.metadata.get("source")
      page = d.metadata.get("page")
      idx = (source, page)
      if idx in seen:
        continue
      seen.add(idx)
      self.sources.append({
          "source": source,
          "page": page,
          "content": d.page_content[:200], # short preview only
      })

  def on_llm_end(self, response, *, run_id=None, parent_run_id=None, **kwargs):
    ''' Render up to three recorded sources once the LLM has finished. '''
    if not self.sources:
      return
    # Draw into the container supplied by the caller so the table lands where
    # the caller reserved space, regardless of the active Streamlit context.
    target = self.msg if hasattr(self.msg, "markdown") else st
    target.markdown("__Sources:__" + "\n")
    target.dataframe(data=pd.DataFrame(self.sources[:3]),
                     width=1000)

# Turn the raw template string into a chat prompt with {context}/{question} inputs.
qa_prompt = ChatPromptTemplate.from_template(qa_template)
# Index the uploaded PDFs; configure_retriever is wrapped in st.cache_resource
# with a one-hour TTL, so the result is cached rather than rebuilt every time.
retriever = configure_retriever(uploaded_files)

def format_docs(docs):
  ''' Concatenate the `page_content` of every document in `docs`,
      separating consecutive documents with one blank line.
  '''
  page_texts = []
  for doc in docs:
    page_texts.append(doc.page_content)
  separator = "\n\n"
  return separator.join(page_texts)

# Branch that turns the user's question into one context string:
# question -> retriever -> retrieved documents -> joined text.
context_branch = itemgetter("question") | retriever | format_docs

# Inputs handed to the prompt: the retrieved context plus the question as typed.
prompt_inputs = {
    "context": context_branch,
    "question": itemgetter("question"),
}

# Full RAG pipeline: gather inputs, fill the prompt, call the model.
qa_rag_chain = prompt_inputs | qa_prompt | gemini

# store conversation history in Streamlit session state
streamlit_msg_history = StreamlitChatMessageHistory(key="langchain_messages")

# Seed a fresh conversation with a greeting.
if not streamlit_msg_history.messages:
  streamlit_msg_history.add_ai_message("Please ask your question")

# render current messages from streamlitChatMessageHistory
for msg in streamlit_msg_history.messages:
  st.chat_message(msg.type).write(msg.content)

# if user inputs a new prompt, display it and show response
if user_prompt := st.chat_input():
  st.chat_message("human").write(user_prompt)
  with st.chat_message("ai"):
    # Placeholder that the stream handler keeps overwriting with the growing answer.
    token_box = st.empty()
    stream_handler = StreamHandler(token_box)
    # Space reserved below the answer for the sources table.
    sources_box = st.container()
    pm_handler = PostMessageHandler(sources_box)

    response = qa_rag_chain.invoke(
        {"question": user_prompt},
        {"callbacks": [stream_handler, pm_handler]},
    )

  # Persist the exchange. Without this the history only ever contains the
  # greeting, so every question/answer pair vanishes on the next rerun.
  # Prefer the text that was actually displayed; fall back to the model output.
  answer_text = stream_handler.text or str(getattr(response, "content", ""))
  streamlit_msg_history.add_user_message(user_prompt)
  streamlit_msg_history.add_ai_message(answer_text)

## Run App

In [30]:
# Launch app.py with Streamlit on port 8989 in the background (trailing `&`),
# sending its output to .logs.txt so the cell returns immediately.
!streamlit run app.py --server.port=8989 &>.logs.txt &

In [None]:
from pyngrok import ngrok
import yaml
from google.colab import userdata

# Must match the --server.port value used when launching Streamlit.
STREAMLIT_PORT = 8989

# Shut down any ngrok process left over from a previous run of this cell.
ngrok.kill()

# Authenticate with the token stored in Colab secrets under 'ngrok_auth_token'.
NGROK_AUTH_TOKEN = userdata.get('ngrok_auth_token')
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open a tunnel to the local Streamlit server and print its public address.
ngrok_tunnel = ngrok.connect(STREAMLIT_PORT)
print("Streamlit App:", ngrok_tunnel.public_url)