## Imports:

In [None]:
# env:
import os
from dotenv import load_dotenv
load_dotenv()
# Chat:
from operator import itemgetter
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
# History
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.messages import trim_messages
from langchain_core.runnables import RunnableWithMessageHistory, RunnablePassthrough
# Load
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
# Store
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
# Retrieve
from langchain.chains import create_retrieval_chain, create_history_aware_retriever

In [None]:
from IPython.display import Markdown
# from llm import get_response_stream, get_response

In [None]:
# for chunk in get_response("hello", dummy=True):
#     print(chunk, end="", flush=True)

## LLM:

In [None]:
# Size budget shared by the LLM setup, the splitter and the (commented-out) trimmers.
MAX_TOKENS = 16_000      # handed to the LLM as its context size (num_ctx)
SYS_PROMPT_SIZE = 1_000  # assumed
TOTAL_DOC_SIZE = 3_000   # share of the budget reserved for retrieved documents
PER_DOC_TOKENS = 750     # used as the splitter's chunk_size
# How many documents of PER_DOC_TOKENS fit into TOTAL_DOC_SIZE.
DOC_COUNT = TOTAL_DOC_SIZE // PER_DOC_TOKENS

### Ollama - Gemma3:4b:

In [None]:
from langchain_ollama import ChatOllama

# Gemma3 context size -> 128K (131,072)
# Observed resource usage per context size:
#   30k -> 91% RAM, 91% GPU
#   25k -> 82% RAM, 89% GPU
#   15k -> 66% RAM, 87% GPU

# Chat model served by Ollama. num_predict is deliberately left unset
# (the MAX_OUTPUT_TOKENS cap is not applied here).
llm = ChatOllama(
    model="gemma3:latest",  # alternative tried: "gemma3:1b"
    temperature=1,
    num_gpu=35,
    num_ctx=MAX_TOKENS,
)
# Smoke test: one call to confirm the model responds.
llm.invoke("Hii")

### Groq - Llama3:70B:

In [None]:
# from langchain_groq import ChatGroq
# llm = ChatGroq(
#     model="llama-3.3-70b-versatile", temperature="1",
#     max_tokens=MAX_TOKENS, api_key=os.environ.get("GROQ_API_KEY"),
# )
# llm.invoke('hi')

In [None]:
# Markdown(llm.invoke("write a story").content)

## Template:

<div class="alert alert-info">
    <strong>Limitations:</strong> Listed below are some base assumptions that certain LangChain components make.
</div>

- `create_history_aware_retriever` assumes the latest user message is under the key `input`.
- The trimmer (`trim_messages`) assumes the chat history is under the key `messages`.
- `create_stuff_documents_chain` assumes the combined `docs` are under the key `context`.
- To overcome this, you need to use `RunnablePassthrough` or `RunnableMap` and assign those keys and variables accordingly.
- But remember, you need to set this up manually for every variable whose name differs from the default.

- So it's always good to follow the default keys and avoid complexity in chains.

### Chat:

In [None]:
# Prompt for the final answer: system instructions with the retrieved documents
# (`context`), then the prior conversation (`chat_history`), then the latest
# user message (`input`).
template_chat = ChatPromptTemplate.from_messages(
    messages=[
        (
            "system",
            # Adjacent string literals are concatenated by Python into one string,
            # so no "".join(...) is needed.
            "You are a highly knowledgeable and helpful AI assistant.\n"
            "You are provided with the user's chat history and external documents to assist in your response.\n\n"
            "Your task is to:\n"
            "- Accurately and clearly answer the user's latest question.\n"
            "- Incorporate any relevant information from the context documents enclosed below.\n"
            # "- Reference the source(s) whenever applicable.\n"
            "- Use appropriate markdown formatting for clarity and readability (e.g., bullet points, headings, code blocks, tables).\n"
            # The paragraph break goes after the LAST bullet so the task list stays
            # contiguous and the documents section starts on its own paragraph.
            "- If not available in the context, mention that and then answer from your own knowledge.\n\n"
            "Contextual Documents:\n"
            "<CONTEXT>{context}</CONTEXT>",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input} \n\n **Strictly stick to the instructions!**"),
    ]
)
template_chat

In [None]:
# Calculate tokens in this System message and pass rest of the max possible chat history:
# trim_keep = model_context - template_tokens - 250 (safe side)
# template_chat.messages[0].content

### Summarize:

In [None]:
# Prompt that rewrites the conversation plus the latest user message into one
# standalone query; it is passed to create_history_aware_retriever further down.
template_summarize = ChatPromptTemplate.from_messages(
    messages=[
        (
            "system",
            # Adjacent string literals are merged by Python into a single prompt string.
            "You are an expert at summarizing conversations into standalone prompts.\n"
            "You are given a complete chat history, ending with the user's latest message.\n\n"
            "Your task is to:\n"
            "- Understand the entire conversation context.\n"
            "- Identify references in the latest user message that relate to earlier messages.\n"
            "- Create a single clear, concise, and standalone question or prompt.\n"
            "- This final prompt should be fully understandable without needing the prior conversation.\n"
            "- It will be used to retrieve the most relevant documents.\n\n"
            "Only return the rewritten standalone prompt. Do not add explanations or formatting.",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}. \n\n **Make one standalone prompt as asked!**"),
    ]
)
template_summarize

In [None]:
# Calculate tokens in this System message and pass rest of the max possible chat history:
# trim_keep = model_context - template_tokens - (1000tok/doc * n-docs) - 250 (safe side)
# template_summarize.messages

## Chat Message History:

In [None]:
# In-memory registry of chat histories keyed by session id; entries are
# created on first access by get_session_history.
chat_histories = {}

In [None]:
def get_session_history(session_id:str) -> BaseChatMessageHistory:
    """Return the chat history stored for *session_id*, creating it on first use.

    A new, empty ``ChatMessageHistory`` is registered in ``chat_histories`` and
    a creation notice is printed when the session id has not been seen before.
    """
    try:
        return chat_histories[session_id]
    except KeyError:
        pass

    # First request for this session: register a fresh history and report it.
    history = ChatMessageHistory()
    chat_histories[session_id] = history
    print(f"Created chat hist for session id: `{session_id}`")
    return history

get_session_history("abv")

In [None]:
get_session_history("abv")

### Trimmer:
- Due to an unknown issue with the message variable name ("chat_history" vs. "messages"), the trimmer can't be used in this RAG implementation.
- Reason: the trimmer expects "messages".
- But if I use "messages", then for a reason I don't understand the summarizer step does not call the LLM at all; it just does not work and is completely untraceable.
- Still, if you want to implement it, put one `RunnablePassthrough` before the trimmer in the chain to convert `chat_history` → `messages`, and after its response convert the output back to `chat_history`.

In [None]:
# # For summary 15k chat + 1k system and all
# trim_summary = trim_messages(
#     max_tokens=MAX_TOKENS - SYS_PROMPT_SIZE,
#     strategy="last", token_counter=llm, start_on="human",
#     allow_partial=True,  # include_system=True,
# )

# # For chat 10k chat + 5*1k docs + 1k system and all
# trim_chat = trim_messages(
#     max_tokens=MAX_TOKENS - (TOTAL_DOC_SIZE) - SYS_PROMPT_SIZE,
#     strategy="last", token_counter=llm, start_on="human",
#     allow_partial=True,  # include_system=True,
# )

## VectorStore:
### Embeddings:
- Notice that the embeddings are not offloaded to the GPU at all.
- This is done because Ollama keeps repeatedly loading and unloading the embedding model / LLM on each call.
- Even though I have enough memory, I don't know why Ollama loads only one of them at a time.

In [None]:
# Embedding model served by Ollama; num_gpu=0 for the reason given in the notes above.
embeddings = OllamaEmbeddings(
    model="mxbai-embed-large:latest",
    num_gpu=0,
)
embeddings

### Loader:

In [None]:
# Load the sample PDF, with tables extracted as markdown and image extraction enabled.
file = PyMuPDFLoader(
    file_path="../assets/pdf_w_text.pdf",
    extract_tables="markdown",
    extract_images=True,
).load()
file

### Splitter:

In [None]:
# NOTE(review): despite the constant's name, the retriever notes in this
# notebook treat this chunk size as characters rather than tokens — confirm.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=PER_DOC_TOKENS,
)
splitter

### Database:

In [None]:
# Cut the loaded PDF documents into overlapping chunks.
splitted = splitter.split_documents(documents=file)
splitted

In [None]:
# Build the FAISS store straight from the chunks via `from_documents`
# (the plain constructor needs more parameters, so it is not used here).
database = FAISS.from_documents(
    embedding=embeddings,
    documents=splitted,
)
database

In [None]:
# Inspect the first chunk: its raw text (repr makes newlines visible) ...
print(repr(splitted[0].page_content))
# ... and its word count. split() with no argument splits on any run of
# whitespace, so words separated by newlines are counted individually and
# repeated spaces do not yield empty "words" (split(" ") got both wrong).
print(len(splitted[0].page_content.split()))

### Retriever:

- So a 750-character chunk holds approximately 95 words (150 at most).
- To retrieve about 3k tokens, counting each chunk as at most 150, we need 3000 / 150 = 20 chunks.
- So, set `k=20`.

In [None]:
# Plain similarity retriever returning the 20 closest chunks (see the sizing notes above).
retriever = database.as_retriever(
    search_kwargs={"k": 20},
    search_type="similarity",
)
retriever

In [None]:
retriever.invoke("fun")

## Summarizer:

- Old method.
- This is too hard-coded; switch to the retrieval method with `create_stuff_documents_chain` to ingest the documents and get the answer in one single chain call.

In [None]:
# chain = (
#     RunnablePassthrough().assign(messages=itemgetter("messages") | trim_chat)
#     | template_summarize | llm | StrOutputParser())

# summarizer_llm = RunnableWithMessageHistory(
#     runnable=chain,
#     get_session_history=get_session_history,
#     input_messages_key="input",
#     history_messages_key="messages",
# )

In [None]:
# chat_histories[10] = ChatMessageHistory()
# chat_histories[10].messages = [
#     HumanMessage("Hello, I'm Bhushan, What is your name?"),
#     AIMessage("I am an AI assistant. I am not a human like you."),
#     HumanMessage("What is Artificial General Intelligence?"),
#     AIMessage("Artificial General Intelligence (AGI) refers to highly autonomous systems that outperform humans at most economically valuable work."),
# ]
# # )

In [None]:
# summarizer_llm.invoke(
#     input={"input": "So it's not achieved yet?", },
#     config={"configurable": {"session_id": 10}}
# )

In [None]:
# chat_histories[10].messages

## Runnable With History:
- Commented out, as it is unnecessary and not used in the code.
- But keep it, as it can be used in the future.

In [None]:
# chain = (
#     RunnablePassthrough(name="Trim Chat History").assign(messages=itemgetter("messages") | trim_chat)
#     | template_chat | llm | StrOutputParser())

# chat_llm = RunnableWithMessageHistory(
#     runnable=chain,
#     get_session_history=get_session_history,
#     input_messages_key="input",
#     history_messages_key="messages",
# )

In [None]:
# chat_llm.invoke(
#     input={
#         "input": "Hello, I'm Bhushan, What is your name?",
#         "context": "This is some random document which contains some random information."
#     },
#     config={
#         "configurable": {
#             "session_id": 15
#         }
#     }
# )

In [None]:
# chat_llm.invoke(
#     input={
#         "input": "What did we discuss?",
#         "context": "There is no context available for this question."
#     },
#     config={
#         "configurable": {
#             "session_id": 15
#         }
#     }
# )

- If feasible, add an option to paste a link and scrape the whole content from there.

## Chain:

In [None]:
# Make a passthrough which prints variables and passes them to next step
# def print_and_pass(input):
#     print(input)
#     return input

In [None]:
# The pieces are built inside-out; the step numbers give the top-down data flow.

# Step 3: user input + chat history > summarizer template > standalone question > retrieved docs
summarize_chain = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=template_summarize,
)
# Variant with the (commented-out) trimmer in front:
# summarize_chain = trim_summary | create_history_aware_retriever(llm, retriever, template_summarize)

# Step 4: multiple docs > combined > chat template > final output
qa_chain = create_stuff_documents_chain(prompt=template_chat, llm=llm)

# Step 2: input + chat history > [summarizer template > get docs] > [combine > chat template] > output
rag_chain = create_retrieval_chain(
    retriever=summarize_chain,
    combine_docs_chain=qa_chain,
)

# Step 1: the main chain, which loads and stores the per-session history
# through get_session_history.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    history_messages_key="chat_history",
    input_messages_key="input",
    output_messages_key="answer",
)
conversational_rag_chain

## Test:

### Database:

In [None]:
# Seed the store with a few short pet facts for the retrieval tests below.
database.add_documents(
    [
        Document(page_content=fact)
        for fact in (
            "Cats and Dogs are both popular pets.",
            "Cats are independent and low-maintenance pets.",
            "Dogs are loyal and require more attention.",
            "Cats are often seen as aloof and mysterious.",
            "Dogs are known for their loyalty and companionship.",
            "Cats are great for small living spaces.",
            "Cats are NOT AT ALL LOYAL.",
        )
    ],
    embedding=embeddings,
)

In [None]:
database.search(search_type='similarity', query="animals", k=8)

### Summarize Chain:

In [None]:
# Exercise the history-aware retriever alone: the follow-up question only makes
# sense together with the earlier messages about dogs.
summarize_chain.invoke(
    input={
        "chat_history": [
            HumanMessage(content="Hello, I'm Bhushan, What is your name?"),
            AIMessage(content="I am an AI assistant. I am not a human like you."),
            HumanMessage(content="What are ur thoughts on DOGs?"),
            AIMessage(content="Dogs are loyal and require more attention."),
        ],
        "input": "What animal was i talking about? Which one is most common apart from that animal?",
    },
)

### QA - Chain:

In [None]:
# Exercise the answer step alone: the documents are supplied by hand under
# "context" instead of coming from the retriever.
qa_chain.invoke(
    input={
        "chat_history": [
            HumanMessage(content="hi"),
            AIMessage(content="hello"),
            HumanMessage(content="What is RAG?"),
            AIMessage(content="RAG is a technique to combine retrieval and generation."),
        ],
        "context": [Document(page_content="This is some random document which contains some random information.")],
        "input": "Full form of RAG?",
    },
    # config={"configurable": {"session_id": 15}}
)

### RAG Chain:

In [None]:
# Exercise retrieval + answer together. No "context" is passed:
# create_retrieval_chain assigns "context" itself from the retriever output,
# so a hand-supplied value would be overwritten and never reach the prompt.
rag_chain.invoke(
    input={
        "input": "Full form of RAG?",
        "chat_history": [
            HumanMessage("hi"),
            AIMessage("hello"),
            HumanMessage("What is RAG?"),
            AIMessage("RAG is a technique to combine retrieval and generation."),
        ]
    },
    # config={"configurable": {"session_id": 15}}
)

### Conv RAG:

In [None]:
conversational_rag_chain.invoke(
    input={"input":"Hello, I am Bhushan. What abt u?"},
    config={"configurable":{"session_id":120}}
)

In [None]:
conversational_rag_chain.invoke(
    input={"input":"What are popular pets?"},
    config={"configurable":{"session_id":120}}
)

In [None]:
conversational_rag_chain.invoke(
    input={"input":"Describe CATS?"},
    config={"configurable":{"session_id":120}}
)

In [None]:
conversational_rag_chain.invoke(
    input={"input":"1. Are they LOYAL? 2. What do I mean my THEY?"},
    config={"configurable":{"session_id":120}}
)

# Filter data by users:

## Add:

In [None]:
# Facts about the Sun; every document is tagged with session_id "user_1".
sun_docs = [
    Document(page_content=fact, metadata={"session_id": "user_1"})
    for fact in (
        "The Sun is a nearly perfect sphere of hot plasma, at the center of the Solar System.",
        "It is composed primarily of hydrogen (about 74%) and helium (about 24%) by mass.",
        "The Sun's core temperature reaches approximately 15 million degrees Celsius.",
        "Solar flares and sunspots are caused by magnetic field activity on the Sun's surface.",
        "Light from the Sun takes about 8 minutes and 20 seconds to reach Earth.",
    )
]

In [None]:
# Facts about the Moon; every document is tagged with session_id "user_2".
moon_docs = [
    Document(page_content=fact, metadata={"session_id": "user_2"})
    for fact in (
        "The Moon is Earth's only natural satellite and the fifth largest moon in the Solar System.",
        "It is about 1/6th the size of Earth and has a diameter of about 3,474 kilometers.",
        "The Moon's surface is covered with craters, mountains, and flat plains called maria.",
        "The Moon has no atmosphere, which means it cannot support life as we know it.",
        "The Moon's gravitational pull affects Earth's tides.",
    )
]

In [None]:
# Facts about Earth; every document is tagged with session_id "user_3".
earth_docs = [
    Document(page_content=fact, metadata={"session_id": "user_3"})
    for fact in (
        "Earth is the third planet from the Sun and the only known planet to support life.",
        "It has a diameter of about 12,742 kilometers and is composed of rock and metal.",
        "Earth's atmosphere is composed primarily of nitrogen (about 78%) and oxygen (about 21%).",
        "The Earth has one natural satellite, the Moon.",
        "Earth's surface is covered by about 71% water.",
    )
]

In [None]:
# Shared facts; these documents are tagged with session_id "public".
common_docs = [
    Document(page_content=fact, metadata={"session_id": "public"})
    for fact in (
        "Moon revolves around Earth, while Earth revolves around the Sun.",
        "Earth, Sun and Moon all three are part of the Solar System.",
    )
]

## Embed:

In [None]:
# Store each batch in order and keep what add_documents returns for it;
# user_3_docs is handed to database.delete further down.
user_1_docs, user_2_docs, user_3_docs, public_docs = (
    database.add_documents(batch, embedding=embeddings)
    for batch in (sun_docs, moon_docs, earth_docs, common_docs)
)
user_3_docs

## Retrieve with filters:
- https://python.langchain.com/docs/integrations/vectorstores/faiss/#query-directly

In [None]:
# Rebind `retriever` to a score-threshold retriever for the filter experiments;
# chains built earlier keep the retriever object they were created with.
retriever = database.as_retriever(
    search_kwargs={"score_threshold": 0.25, "k": 4},
    search_type="similarity_score_threshold",
)
retriever

In [None]:
retriever.invoke("What is the Sun?", filter={"session_id": "user_1"})

In [None]:
retriever.invoke("What is the Earth?", filter={"session_id": "user_3"})

## Delete:

In [None]:
database.delete(user_3_docs)

In [None]:
retriever.invoke("What is the Earth?", filter={"session_id": "user_3"})

## Multiple Conditional Filters:
- https://github.com/langchain-ai/langchain/discussions/20202

In [None]:
# Retrieve documents tagged for user_2 or tagged as public.
retriever.invoke(
    "Sun or Moon or Earth?",
    filter={"$or": [{"session_id": "user_2"}, {"session_id": "public"}]},
)

> So basically:
- When a user uploads something, assign it an id in the metadata.
- Save the ids of the embedded docs per user in some database.
- Use the filter to restrict retrieval to the user's id plus the public docs.
- Once done, use the list of ids from the database to delete the user's data.
- An in-memory SQLite database may be the best option for this.

# Testing:
`Note:`
- So far in this file I have used 'session_id' as the metadata key.
- But in the project it is 'user_id'.

## Loading FAISS (disk) and checking all docs:

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
# Same embedding model as above, this time without the num_gpu override.
embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")

# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# the stored index files — only load folders this project wrote itself.
db = FAISS.load_local(
    embeddings=embeddings,
    folder_path="./user_faiss",
    index_name="index",
    allow_dangerous_deserialization=True,
)

In [None]:
# `k` has to be a real keyword argument. Wrapping it as kwargs={"k": 5} only
# passes an unused keyword literally named "kwargs", leaving k at its default.
db.similarity_search(
    query="a",
    k=5,
    filter={
        "$or": [
            {"user_id": "nervous_nerd"},
            {"user_id": "public"}
        ]
    },
)

In [None]:
db.similarity_search(
    query="ballot",
    filter={"user_id": "nervous_nerd"}
    # Optional extras go in as plain keyword arguments, e.g.: k=5, score_threshold=0
)

In [None]:
db.similarity_search("")

## Building As retriever with filters:
+ `The issue is that we are unable to set a filter while invoking the RAG chain.`

- Tried, but failed:
    - https://github.com/langchain-ai/langchain/issues/9195#issuecomment-1810893811
- Working Solution:
    - https://github.com/langchain-ai/langchain/issues/9195#issuecomment-2095196865

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")

# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# the stored index files — only load folders this project wrote itself.
db = FAISS.load_local(
    embeddings=embeddings,
    folder_path="./user_faiss",
    index_name="index",
    allow_dangerous_deserialization=True,
)

# Default retriever over the loaded store; made configurable in the next cell.
retriever = db.as_retriever()

In [None]:
from langchain_core.runnables import ConfigurableField
# Expose the retriever's `search_kwargs` as a configurable field, so it can be
# supplied per call under config["configurable"]["search_kwargs"].
configurable_retriever = retriever.configurable_fields(
    search_kwargs=ConfigurableField(
        description="The search kwargs to use",
        name="Search Kwargs",
        id="search_kwargs",
    ),
)

In [None]:
# Per-call configuration for configurable_retriever: overrides the retriever's
# search_kwargs. Only real search arguments belong here; "search_type" is a
# setting of the retriever itself, not a search kwarg, so it is not listed.
config = {
    "configurable": {
        "search_kwargs": {
            "k": 5,
            # And here comes the hero:
            "filter": {
                "$or": [
                    {"user_id": "curious_cat"},
                    {"user_id": "public"}
                ]
            },
        }
    }
}

In [None]:
configurable_retriever.invoke(
    input="What is the Sun?",
    config=config
)

<div class="alert alert-warning">
    <h1>Important:</h1> 
    <!-- <h1><strong>Important:</strong></h1>  -->
</div>

- Just figured this out:
- If you have an LLM call in the chain but it is still not working, the possible reason is the `ChatPromptTemplate`.
- If the history is empty, the template is skipped,
- and maybe, as a result, all further calls as well!
- So, if the LLM is not getting called, try passing some history manually.

## My issue
- I was using "messages" as the key for the chat history,
- because the trimmer expects the "messages" key as input.
- But somehow the prompt template was not able to use the "messages" key, even though it was set explicitly like that.
- Once it was replaced with "chat_history", it worked.
- Also, for the output always use the "answer" key (when using `create_history_aware_retriever` in the conversational RAG chain).