<a href="https://colab.research.google.com/github/Anubh-debug/learning_LLMs/blob/gradio_apps/constitution_gradio_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U -q datasets
!pip install -q PyPDF2
%pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
%pip install -U -q langchain-openai
%pip install -q chromadb

In [None]:
from langchain.vectorstores import Chroma
from langchain.schema import Document
from PyPDF2 import PdfReader
# PdfReader converts pdf document into text.
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from langchain.chat_models import init_chat_model
from google.colab import userdata
import getpass
import os
from langchain_openai import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import ChatPromptTemplate

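Download the Constitution of India (English PDF) from https://legislative.gov.in/constitution-of-india/ and save it to the top level of your Google Drive as `indian_constitution_eng.pdf`; the notebook reads it from `/content/drive/MyDrive/indian_constitution_eng.pdf`.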

Reading the file

In [None]:
# Make Google Drive visible under /content/drive before reading from it, so the
# cell also works on a fresh runtime where Drive has not been mounted yet.
from google.colab import drive

drive.mount("/content/drive")

reader = PdfReader("/content/drive/MyDrive/indian_constitution_eng.pdf")
number_of_pages = len(reader.pages)

# Extract the text of every page, in order. `or ""` keeps the list string-only
# should a page yield no text (defensive; assumes extract_text could return
# None for such a page -- TODO confirm for the installed PyPDF2 version).
pages = [page.extract_text() or "" for page in reader.pages]

# Merge all pages into a single string, each page preceded by one space.
# Joining once avoids re-copying the growing string on every iteration.
one_page = "".join(f" {page_text}" for page_text in pages)

Splitting the text into overlapping chunks with `RecursiveCharacterTextSplitter`

In [None]:
# Splitter configuration kept in one place so it is easy to tune.
splitter_options = {"chunk_size": 1200, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the merged document text into chunks and report how many were produced.
texts = text_splitter.split_text(one_page)
chunk_count = len(texts)
print(f"Number of chunks: {chunk_count}")

Initialize the chat model

In [None]:
# Read the OpenAI API key from Colab's secret store. If the secret is missing
# or this notebook has not been granted access to it, ask for the key
# interactively instead of aborting; the key is never written into the notebook.
try:
    open_ai_key = userdata.get('OPENAI_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    open_ai_key = getpass.getpass("OpenAI API key: ")

# Fail here with a clear message rather than later inside an API call.
if not open_ai_key:
    raise ValueError("An OpenAI API key is required to continue.")

# Export the key through the environment for the OpenAI-backed models below.
os.environ["OPENAI_API_KEY"] = open_ai_key
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

Initialize the embedding model

In [None]:
# Embedding model handed to the Chroma vector store in the cells below.
# The chunks are deliberately not embedded here: Chroma.from_documents receives
# this object and computes the embeddings itself, so a separate
# embeddings.embed_documents(texts) call would only repeat the same paid API
# requests for a result nothing else in the notebook reads.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

Initializing the Chroma database

In [None]:
# Directory in which the Chroma store is persisted.
db_name = "chroma_IN_constitution"

# Start from a clean slate: when a persisted store from an earlier run is on
# disk, open it and drop its collection before the index is built again.
if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

In [None]:
# Wrap every text chunk in a Document, the input type Chroma.from_documents takes.
documents = [Document(page_content=chunk) for chunk in texts]

# Build the vector store from the documents and persist it under db_name.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=db_name,
)

# Report how many entries the underlying collection now holds.
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

In [None]:
# Fetch a single stored record together with its embedding and report the
# length of that vector.
collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]

dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
SYSTEM_MESSAGE = "You are an Indian constitution expert."

# Prompt for the answering step: the system turn carries the retrieved chunks
# through {context}; the human turn carries the prior conversation followed by
# the new question.
system_template = SYSTEM_MESSAGE + "\n\nHere is some relevant context:\n{context}"
human_template = "{chat_history}\n{question}"
custom_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_template),
        ("human", human_template),
    ]
)

# Retriever view over the vector store, configured with k=5 results per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Conversation memory, exposed to the chain under the key "chat_history".
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Wire the LLM, the retriever, the memory and the custom prompt into one chain.
chain_options = {
    "llm": llm,
    "retriever": retriever,
    "memory": memory,
    "combine_docs_chain_kwargs": {"prompt": custom_prompt},
}
conversation_chain = ConversationalRetrievalChain.from_llm(**chain_options)

In [None]:
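# Optional smoke test for the chain: uncomment the three lines below to run one query.
# Note that the chain is built with a memory object, so running this adds a turn to it.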
# query = "Tell me about president powers"
# result = conversation_chain.invoke({"question":query})
# print(result["answer"])

In [None]:
import gradio as gr

def chat(message, history):
    """Answer one chat message using the retrieval chain.

    Args:
        message: The text the user just submitted.
        history: Prior turns supplied by Gradio. Unused here: the chain is
            invoked with the question only and relies on the memory object
            it was constructed with. That memory is a single module-level
            object, so every caller of this function shares it.

    Returns:
        The chain's ``"answer"`` value, or a short prompt asking for a
        question when ``message`` is empty or whitespace-only.
    """
    # Skip the retrieval + LLM round trip for blank submissions.
    if not message or not message.strip():
        return "Please enter a question about the Constitution of India."

    result = conversation_chain.invoke({"question": message})
    return result["answer"]


# Build the chat UI and keep a handle to the interface itself, then start the
# server. Binding `view` to the interface (rather than to the value returned by
# launch()) lets a later cell stop the server with view.close().
view = gr.ChatInterface(chat, type="messages")
view.launch(inbrowser=True)