In [None]:
!pip3 install langchain langchain_community sentence-transformers pypdf langchain_chroma google.generativeai langchain-google-genai streamlit pyngrok



In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma

In [None]:
def load_pdf_from_directory(directory_name):
    """Load every ``*.pdf`` file found in ``directory_name``.

    Each PDF is read with ``PyPDFLoader`` through a ``DirectoryLoader``.

    Args:
        directory_name: Path of the directory to scan for PDF files.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(
        directory_name,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()


# Read the source PDFs that the rest of the notebook works on.
sample_docs = load_pdf_from_directory("data_pdf/")

In [None]:
# Recursive text splitter to split text into smaller chunks.
def split_the_texts(documents, chunk_size=500, chunk_overlap=100):
    """Split documents into overlapping chunks with a recursive character splitter.

    Args:
        documents: Documents to split (as returned by the PDF loader).
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        The list of chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Try paragraph breaks first, then lines, then words, then characters.
        separators=["\n\n", "\n", " ", ""],
    )
    return text_splitter.split_documents(documents)


text_chunks = split_the_texts(sample_docs)
print("Length of text chunks: ", len(text_chunks))
# Preview only the first few chunks; displaying the whole list floods the output.
text_chunks[:3]

Length of text chunks:  8995


[Document(metadata={'source': 'data_pdf/the-gale-encyclopedia-of-medicine_compress.pdf', 'page': 0}, page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'),
 Document(metadata={'source': 'data_pdf/the-gale-encyclopedia-of-medicine_compress.pdf', 'page': 1}, page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2'),
 Document(metadata={'source': 'data_pdf/the-gale-encyclopedia-of-medicine_compress.pdf', 'page': 2}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\n

In [None]:
def download_huggingface_embedding_model():
    """Create the HuggingFace embeddings wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``all-MiniLM-L6-v2`` model name given below.
    """
    return HuggingFaceEmbeddings(model_name="Sentence-transformers/all-MiniLM-L6-v2")


# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_huggingface_embedding_model()

In [None]:
# Display the embeddings object to inspect its configuration (model name, kwargs, ...)
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='Sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
# Testing whether the embedding model works properly or not
demo_embedding_result = embeddings.embed_query("Hello, how are you?")
print("The Length of the embedding result: ", len(demo_embedding_result))
# Show only a short preview: printing every component of the vector adds
# hundreds of numbers to the output without telling the reader anything more.
print("The embedding result (first 5 values): ", demo_embedding_result[:5])

The Length of the embedding result:  384
The embedding result:  [0.019096745178103447, 0.03446515277028084, 0.09162791073322296, 0.07016526162624359, -0.029946595430374146, -0.08419135957956314, 0.045813582837581635, 0.004958623088896275, -0.09189332276582718, 0.01740061491727829, -0.00881614163517952, -0.0006614544545300305, -0.028556959703564644, -0.021949755027890205, 0.05516670644283295, -0.04983645677566528, 0.08988089859485626, -0.08895716071128845, -0.11235620081424713, 0.039000559598207474, -0.06607074290513992, 0.026095163077116013, 0.03653067722916603, 0.06139037385582924, -0.05712493136525154, -0.05463940650224686, 0.03036554902791977, 0.032387517392635345, 0.012644659727811813, -0.10568573325872421, -0.05834553390741348, 0.06732944399118423, -0.04075591266155243, 0.006439807824790478, 0.005698672030121088, 0.05285317078232765, -0.039775289595127106, -0.11855245381593704, 0.0021161711774766445, -0.016692832112312317, 0.02833813615143299, -0.03743794932961464, -0.021371422335

In [None]:
# Embed every chunk with `embeddings` and store the result in a Chroma vector store.
db = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
)

In [None]:
# Sanity-check retrieval: look up the chunks most similar to a sample question.
query = "What is Cluster Headache?"
docs = db.similarity_search(query=query)
docs

[Document(metadata={'page': 212, 'source': 'data_pdf/the-gale-encyclopedia-of-medicine_compress.pdf'}, page_content='The primary cluster headache symptom is excruciat-\ning one-sided head pain centered behind an eye or nearthe temple. This pain may radiate outward from the ini-tial focus and encompass the mouth and teeth. For thisreason, some cluster headache sufferers may mistakenlyattribute their pain to a dental problem. Secondary symp-toms, occurring on the same side as the pain, include eyetearing, nasal congestion followed by a runny nose, pupilcontraction, and facial drooping or flushing.\nDiagnosis'),
 Document(metadata={'page': 213, 'source': 'data_pdf/the-gale-encyclopedia-of-medicine_compress.pdf'}, page_content='Cluster HeadacheAreas of pain\nNerves\nThe primary cluster headache symptom is excruciating one-\nsided head pain located behind an eye or near the temple.Secondary symptoms include eye tearing, nasal congestion,and a runny nose. (Illustration by Electronic Illustra

In [None]:
# Plain-text question-answering prompt with {context} and {question} placeholders.
# NOTE(review): no other code visible in this notebook references `prompt_template`;
# the chains further down build their own ChatPromptTemplate objects. Confirm
# whether this template is still needed.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [None]:
# Defining the retriever: expose the Chroma store `db` through its retriever
# interface. No arguments are passed, so the library's default search settings apply.
retriever = db.as_retriever()

In [None]:
from langchain_google_genai import GoogleGenerativeAI

In [None]:
import os
from getpass import getpass

# SECURITY: never hard-code API keys in a notebook. The key that used to be
# embedded on this line has been exposed and should be revoked and rotated.
# Read the key from the GOOGLE_API_KEY environment variable, falling back to an
# interactive prompt so the secret is never saved with the notebook.
google_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=google_api_key)

In [None]:
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

In [None]:
# System instruction for the question-rewriting step: turn a follow-up question
# into one that can be understood without the preceding conversation.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message order: system instruction, prior turns, then the newest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

In [None]:
# Combine the LLM, the retriever and the question-rewriting prompt into a
# retriever that takes the chat history into account.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [None]:
# System instruction for the answering step; the "{context}" placeholder at the
# end is the slot for the retrieved passages.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Message order: system instruction, prior turns, then the newest user input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Chain that passes the documents plus the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The ``ChatMessageHistory`` registered under ``session_id``.
    """
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register an empty history.
        history = store[session_id] = ChatMessageHistory()
        return history


# Wrap the RAG chain with message-history handling. The key names tell the
# wrapper which input field holds the user message, which prompt variable
# receives the history, and which output field holds the answer.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    output_messages_key="answer",
    history_messages_key="chat_history",
)

In [None]:
# Ask a first question. The session id "abc123" becomes a key in `store`.
session_config = {"configurable": {"session_id": "abc123"}}
response = conversational_rag_chain.invoke(
    {"input": "What is coagulation disorder?"},
    config=session_config,
)
response["answer"]

'A coagulation disorder is a condition in which the body cannot control blood clotting. This can lead to excessive bleeding or the formation of blood clots in the blood vessels. The most common coagulation disorder is hemophilia, in which patients bleed for long periods of time before clotting.'

In [None]:
from langchain_core.messages import AIMessage

# Replay the conversation stored for session "abc123", labelling each message
# by speaker (AI messages vs. everything else).
for message in store["abc123"].messages:
    prefix = "AI" if isinstance(message, AIMessage) else "User"
    print(f"{prefix}: {message.content}\n")

User: What is coagulation disorder?

AI: A coagulation disorder is a condition in which the body cannot control blood clotting. This can lead to excessive bleeding or the formation of blood clots in the blood vessels. The most common coagulation disorder is hemophilia, in which patients bleed for long periods of time before clotting.



In [None]:
%%writefile app.py

# Streamlit app for the above project
import streamlit as st

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma

from langchain_google_genai import GoogleGenerativeAI

from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

from langchain_core.messages import AIMessage

# Load PDF from a directory
def load_pdf_from_directory(directory_name):
    """Load every ``*.pdf`` file in ``directory_name`` using ``PyPDFLoader``."""
    loader = DirectoryLoader(directory_name,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    return loader.load()


# Recursive text splitter to split text into smaller chunks.
def split_the_texts(documents):
    """Split ``documents`` into 500-character chunks with 100 characters of overlap."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    return text_splitter.split_documents(documents)


# Load the embeddings model
def download_huggingface_embedding_model():
    """Create the HuggingFace embeddings wrapper for the MiniLM model named below."""
    return HuggingFaceEmbeddings(model_name="Sentence-transformers/all-MiniLM-L6-v2")


# Streamlit re-executes this script from top to bottom on every user
# interaction. Without caching, each rerun would parse the PDFs again, reload
# the embedding model and re-embed every chunk into a new vector store.
# st.cache_resource runs the pipeline once per server process and hands the same
# objects back on later reruns (treat them as read-only).
@st.cache_resource
def _build_index(directory_name):
    """Load, split and embed the PDFs in ``directory_name``; cached across reruns.

    Returns:
        A tuple ``(documents, chunks, embedding_model, vector_store)``.
    """
    documents = load_pdf_from_directory(directory_name)
    chunks = split_the_texts(documents)
    embedding_model = download_huggingface_embedding_model()
    # Storing the text chunks into the database
    vector_store = Chroma.from_documents(chunks, embedding_model)
    return documents, chunks, embedding_model, vector_store


sample_docs, text_chunks, embeddings, db = _build_index("data_pdf/")

# Defining the retriever
retriever = db.as_retriever()

# Defining the chat-model
# SECURITY: never hard-code API keys in source. The key that used to be embedded
# on this line has been exposed and should be revoked and rotated. The key is
# now read from the GOOGLE_API_KEY environment variable.
import os

google_api_key = os.environ.get("GOOGLE_API_KEY")
if not google_api_key:
    # Fail with a readable message in the UI instead of an authentication error
    # on the first request.
    st.error("The GOOGLE_API_KEY environment variable is not set.")
    st.stop()

llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=google_api_key)

# Question-rewriting prompt: lets the LLM turn a follow-up question into one
# that can be understood without the preceding conversation.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message order: system instruction, prior turns, then the newest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# History-aware retriever: combines the LLM, the base retriever and the
# question-rewriting prompt.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

# System prompt for the answering step; the "{context}" placeholder at the end
# is the slot for the retrieved passages.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Question-answer prompt: system instruction, prior turns, newest user input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Chain that passes the documents plus the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)


# Defining the RunnableWithMessageHistory
# Streamlit re-executes this script on every interaction, so a plain
# module-level dict would be recreated empty each time and the chain would never
# see earlier turns. Keeping the registry in st.session_state lets it survive
# reruns for the lifetime of the browser session.
if "store" not in st.session_state:
    st.session_state["store"] = {}
store = st.session_state["store"]


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The ``ChatMessageHistory`` registered under ``session_id``.
    """
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


# Wrap the RAG chain with message-history handling. The key names tell the
# wrapper which input field holds the user message, which prompt variable
# receives the history, and which output field holds the answer.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    output_messages_key="answer",
    history_messages_key="chat_history",
)

# Streamlit app for chat interface
st.title("AI Chat Interface")

# Initialise the per-session state BEFORE anything reads it. Iterating over
# st.session_state.chat_history first would raise AttributeError on a fresh session.
if 'session_id' not in st.session_state:
    st.session_state.session_id = "abc123"
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []

# Display chat history
for message in st.session_state.chat_history:
    if message["role"] == "user":
        st.write(f"User: {message['content']}")
    else:
        st.write(f"AI: {message['content']}")

# Input from the user. A form with clear_on_submit=True is used because a bare
# st.text_input keeps its value across reruns: the "if user_input" branch would
# fire again after every rerun and resubmit the same question forever.
with st.form("chat_form", clear_on_submit=True):
    user_input = st.text_input("User: ", "")
    submitted = st.form_submit_button("Send")

# If the user submitted a non-empty message
if submitted and user_input:
    # Store the user and AI messages
    st.session_state.chat_history.append({"role": "user", "content": user_input})
    # Invoke the chain and store the response
    result = conversational_rag_chain.invoke(
        {"input": user_input},
        config={
            "configurable": {"session_id": st.session_state.session_id}
        }
    )["answer"]
    st.session_state.chat_history.append({"role": "ai", "content": result})
    # Refresh the chat interface so the latest messages are displayed.
    # st.experimental_rerun was removed from recent Streamlit releases in favour
    # of st.rerun; fall back to the old name only on versions that lack st.rerun.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()

Writing app.py
