In [None]:
! pip install langchain-openai langchain-community langchainhub gpt4all langchain-chroma chromadb langchain pypdf

In [None]:
# Clone books
# ! git clone https://github.com/aridiosilva/AI_Books.git

In [None]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

# Load entire directory
# loader = DirectoryLoader("AI_Books/", glob="*.pdf", loader_cls=PyPDFLoader, use_multithreading=True)

# Single-document mode: read one PDF from its URL, with image extraction disabled.
loader = PyPDFLoader(
    file_path="https://arxiv.org/pdf/2103.15348.pdf",
    extract_images=False,
)
# `pages` holds the loaded documents; later cells read `page_content` from each.
pages = loader.load()

In [None]:
# Concatenate the text of every loaded page into one space-separated string.
txt = ' '.join(page.page_content for page in pages)

In [None]:
# Sanity check on what was loaded: number of pages and total character count.
print("Pages: ", len(pages))
print("Text length: ", len(txt))

In [None]:
# Split Text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size 1000 with an overlap of 250 between
# consecutive chunks; start-index tracking is switched on.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=250,
    chunk_size=1000,
)

In [None]:
# Split the loaded pages into overlapping chunks using the splitter configured above.
chunks = text_splitter.split_documents(documents=pages)

In [None]:
# Number of chunks produced by the splitter (shown as the cell's output).
len(chunks)

## Vectorstore

In [None]:
! pip install chromadb

In [None]:
# Remove any previously persisted Chroma store before rebuilding it.
# The path must match `persist_directory` ('docs/chroma/', relative to the working
# directory) -- NOT the root-level "/docs/chroma/". ignore_errors=True keeps the
# "no error if it does not exist" behaviour of `rm -rf`.
import shutil

shutil.rmtree("docs/chroma/", ignore_errors=True)

In [None]:
from langchain_chroma import Chroma
# Relative on-disk location used for the persisted Chroma collection.
persist_directory = "docs/chroma/"

In [None]:
# Embeddings via Azure OpenAI; all configuration is read from environment variables.
from langchain_openai import AzureOpenAIEmbeddings
import os
from getpass import getpass

# SECURITY: the API key must never be stored in the notebook. It is taken from the
# environment and only prompted for (without echo) when it is missing. Any key that
# was ever committed in this file must be treated as leaked and rotated.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("AZURE_OPENAI_API_KEY: ")

# Non-secret settings: notebook defaults that an already-set environment variable overrides.
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://agilefreaks-openai-sweeden.openai.azure.com/")
os.environ.setdefault("AZURE_OPENAI_DEPLOYMENT", "gpt-35-turbo")
os.environ.setdefault("AZURE_OPENAI_DEPLOYMENT_4", "gpt-4o")
os.environ.setdefault("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

# NOTE(review): no deployment or API version is passed here, whereas the chat model
# cell passes both explicitly -- confirm the embeddings client resolves the intended
# embedding deployment.
embedding_function = AzureOpenAIEmbeddings()

# Embed all chunks and store them in a Chroma collection persisted on disk.
vectordb = Chroma.from_documents(
    chunks,
    embedding=embedding_function,
    persist_directory=persist_directory,
)

print(f"Saved chunks to {persist_directory}")

## Prepare DB

In [None]:
# Open the persisted collection, passing the same embedding function that was
# used when the chunks were stored.
db = Chroma(embedding_function=embedding_function, persist_directory=persist_directory)

### Small test

In [None]:
# Print how many records the underlying collection reports.
# NOTE(review): `_collection` is a private attribute of the Chroma wrapper and may
# change between library versions -- confirm when upgrading.
print(db._collection.count())

In [None]:
# Sample question used to smoke-test retrieval against the vector store.
query_text = "How will the community be engaged?"

In [None]:
# Retrieve the k best-matching chunks together with a normalized relevance score.
# similarity_search_with_relevance_scores yields scores where HIGHER means more
# similar, which is what the "below threshold -> no match" check requires.
# (similarity_search_with_score returns a distance where LOWER is better, so the
# same check would reject the closest matches and accept the poorest ones.)
RELEVANCE_THRESHOLD = 0.7

# k is the number of results we want to return
results = db.similarity_search_with_relevance_scores(query_text, k=3)

if not results or results[0][1] < RELEVANCE_THRESHOLD:
    # Either nothing was returned or even the top hit is not relevant enough.
    print("Unable to find matching results.")
else:
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    sources = [doc.metadata.get("source") for doc, _score in results]
    formatted_response = f"Response: {context_text}\nSources: {sources}"
    print(formatted_response)

## RAG

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import os

# Azure chat model: deployment name and API version are read from environment
# variables; temperature is 0, no token limit or timeout is set, and up to two
# retries are allowed.
model = AzureChatOpenAI(
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    max_retries=2,
    timeout=None,
    max_tokens=None,
    temperature=0,
)

# In-memory registry of chat histories, keyed by session id.
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating an empty one on first use.

    Args:
        session_id: Key identifying the conversation in ``store``.

    Returns:
        The history object kept in ``store`` for that session.
    """
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh history.
        history = store[session_id] = ChatMessageHistory()
        return history

# Single-turn RAG chain (no memory): the model answers only from the supplied context.
from operator import itemgetter

PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
DO NOT give irrelevant information that is not in the context.
---
Answer the question based on the above context: {question}
"""

prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

# Create the RAG chain.
# The chain is invoked with a dict {"context": ..., "question": ...}. Each prompt
# variable has to be selected from that dict by key: RunnablePassthrough() would
# forward the WHOLE input dict into both {context} and {question}.
rag_chain = (
    {"context": itemgetter("context"), "question": itemgetter("question")}
    | prompt_template
    | model
    | StrOutputParser()
)

# Optional conversational variant: wraps the RAG chain so that a per-session
# message history (looked up through get_session_history) is maintained.
# NOTE(review): PROMPT_TEMPLATE has no {chat_history} variable, so the stored
# history does not appear to reach the model -- confirm before relying on memory.
conversation_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    history_messages_key="chat_history",
    input_messages_key="question",
)

In [None]:
# Question to answer with the RAG chain.
query_text = "How will the community be engaged?"

In [None]:
# Fetch the three best-matching chunks (as (document, score) pairs) for the query.
results = db.similarity_search_with_score(query=query_text, k=3)

In [None]:
# Merge the retrieved chunks into a single context string, separated by "---" rules.
context_text = "\n\n---\n\n".join(doc.page_content for doc, _ in results)

# Ask the RAG chain to answer the query from that context.
response_text = rag_chain.invoke(dict(context=context_text, question=query_text))

# Report the answer together with the "source" metadata of each retrieved chunk.
sources = [doc.metadata.get("source") for doc, _ in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)