## Imports:

In [None]:
# Standard library:
from collections.abc import Generator
from operator import itemgetter
# Chat:
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
# History
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.messages import trim_messages
from langchain_core.runnables import RunnableWithMessageHistory, RunnablePassthrough
# Load
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
# Store
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
# Retrieve
from langchain.chains import create_retrieval_chain, create_history_aware_retriever

In [None]:
from IPython.display import Markdown
from llm import get_response_stream, get_response

In [None]:
# Smoke-test the imported `get_response` helper: iterate over whatever it
# produces for the prompt and print the pieces back-to-back on one line.
# NOTE(review): `dummy=True` presumably returns a canned reply without calling
# a real model — confirm in `llm.py`.
for chunk in get_response("hello", dummy=True):
    print(chunk, end="", flush=True)

## LLM:

In [None]:
from langchain_ollama import ChatOllama
# Gemma3 context size -> 128K (1,31,072)
# 30k -> 91% RAM, 91% GPU
# 25k -> 82% RAM, 89% GPU
# 15k -> 66% RAM, 87% GPU

# Local Gemma 3 chat model served through Ollama; the chains built below use
# this instance both for generation and for token counting in the trimmers.
llm = ChatOllama(
    model="gemma3:latest",
    # `temperature` is a float setting; pass a number rather than the string "1".
    temperature=1.0,
    # num_predict=MAX_OUTPUT_TOKENS,
    # NOTE(review): presumably the number of layers offloaded to the GPU — confirm.
    num_gpu=35,
    # Context window (in tokens) requested from Ollama; picked from the
    # RAM/GPU usage measurements listed in the comments above.
    num_ctx=20000,
)

In [None]:
# Markdown(llm.invoke("write a story").content)

## Template:

<div class="alert alert-info">
    <strong>Limitations:</strong> Listed below are some default assumptions built into certain LangChain components.
</div>

- `CreateHistoryAwareRetriever` assumes the latest user message is under the key `input` (and it looks for the chat history under the key `chat_history` when deciding whether to rewrite the question)
- `Trimmer` assumes the `ChatHistory` key to be `messages`
- `CreateStuffDocumentChain` expects the combined `docs` under the key `context`
- To overcome this, you need to use `RunnablePassthrough` or `RunnableMap` and assign those keys and variables accordingly.
- But remember, you need to set these mappings manually for every variable whose name differs from the default.

- So it's always good to follow the default keys and avoid complexity in chains.

### Chat:

In [None]:
# template_chat = ChatPromptTemplate.from_messages(
#     messages=[
#         SystemMessage(
#             "You are a helpful assistant. You answer the question asked based on the chat history and also the Documents attached in context. Answer factually and clearly. State the source in answer wherever possible. Use various markdown features in response. \n<CONTEXT>\n{context}\n</CONTEXT>"),
#         MessagesPlaceholder(variable_name="messages"),
#         HumanMessage("{input}")
#     ]
# )
# template_chat

In [None]:
# Prompt for the answering step: system instructions with the retrieved
# documents injected into {context}, followed by the chat history
# ("messages") and finally the latest user message ("input").
template_chat = ChatPromptTemplate.from_messages(
    messages=[
        (
            "system",
            # Adjacent string literals are concatenated by the parser, so the
            # previous "".join([...]) wrapper (a one-element list) was a no-op.
            (
                "You are a highly knowledgeable and helpful AI assistant.\n"
                "You are provided with the user's chat history and external documents to assist in your response.\n\n"
                "Your task is to:\n"
                "- Accurately and clearly answer the user's latest question.\n"
                "- Incorporate any relevant information from the context documents enclosed below.\n"
                # "- Reference the source(s) whenever applicable.\n"
                "- Use appropriate markdown formatting for clarity and readability (e.g., bullet points, headings, code blocks, tables).\n\n"
                "Contextual Documents:\n"
                "<CONTEXT>{context}</CONTEXT>"
            ),
        ),
        MessagesPlaceholder(variable_name="messages"),
        ("human", "{input}"),
    ]
)
template_chat

In [None]:
# Calculate tokens in this System message and pass rest of the max possible chat history:
# trim_keep = model_context - template_tokens - 250 (safe side)
# template_chat.messages[0].content

### Summarize:

In [None]:
# template_summarize = ChatPromptTemplate.from_messages(
#     messages=[
#         SystemMessage(
#             "You are a Summarizing expert. You are given with a complete chat history and the latest user message in end of it. The latest message might have some content which refers to some part in history. You have to compile everything and return a single prompt, which will have a standalone question which can be completely understood without any chat history. So, give me a single prompt which will be helpful in retrieving the most relevant docs to latest message."),
#         MessagesPlaceholder(variable_name="messages"),
#         HumanMessage("{input}")
#     ]
# )
# template_chat

In [None]:
# Prompt for the question-rewriting step: the model sees the chat history
# ("messages") plus the latest user message ("input") and must answer with a
# single standalone prompt that is later used for document retrieval.
template_summarize = ChatPromptTemplate.from_messages(
    messages=[
        (
            "system",
            # Adjacent string literals are concatenated by the parser, so the
            # previous "".join([...]) wrapper (a one-element list) was a no-op.
            (
                "You are an expert at summarizing conversations into standalone prompts.\n"
                "You are given a complete chat history, ending with the user's latest message.\n\n"
                "Your task is to:\n"
                "- Understand the entire conversation context.\n"
                "- Identify references in the latest user message that relate to earlier messages.\n"
                "- Create a single clear, concise, and standalone question or prompt.\n"
                "- This final prompt should be fully understandable without needing the prior conversation.\n"
                "- It will be used to retrieve the most relevant documents.\n\n"
                "Only return the rewritten standalone prompt. Do not add explanations or formatting."
            ),
        ),
        MessagesPlaceholder(variable_name="messages"),
        # The trailing reminder nudges the model to rewrite instead of answering.
        ("human", "{input}. \n\n **Make one standalone prompt as asked!**"),
    ]
)
template_summarize

In [None]:
# Calculate tokens in this System message and pass rest of the max possible chat history:
# trim_keep = model_context - template_tokens - (1000tok/doc * n-docs) - 250 (safe side)
# template_summarize.messages

## Chat Message History:

In [None]:
# In-memory registry of chat histories, keyed by session id; it is filled
# lazily by `get_session_history` and seeded by hand in some cells below.
chat_histories = {}

In [None]:
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history registered for `session_id`.

    Histories are kept in the module-level `chat_histories` dict. When the
    id has not been seen before, an empty `ChatMessageHistory` is created,
    stored under that id and returned.
    """
    try:
        return chat_histories[session_id]
    except KeyError:
        # First request for this session: register an empty history and
        # report the creation on stdout.
        fresh_history = ChatMessageHistory()
        chat_histories[session_id] = fresh_history
        print(f"Created chat hist for session id: `{session_id}`")
        return fresh_history

get_session_history("abv")

In [None]:
get_session_history("abv")

### Trimmer:

In [None]:
from langchain_core.messages import trim_messages

# Options shared by both trimmers: keep the most recent messages, count
# tokens with the chat model, make the kept window start on a human message
# and allow a message to be cut partially to fit the budget.
_TRIM_OPTIONS = {
    "strategy": "last",
    "token_counter": llm,
    "start_on": "human",
    "allow_partial": True,
    # "include_system": True,
}

# Summary step: 15k tokens of chat + ~1k for the system prompt and the rest.
trim_summary = trim_messages(max_tokens=15000, **_TRIM_OPTIONS)

# Chat step: 10k tokens of chat + 5 * 1k of documents + ~1k for the system
# prompt and the rest.
trim_chat = trim_messages(max_tokens=10000, **_TRIM_OPTIONS)

## VectorStore:
### Embeddings:

In [None]:
embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")
embeddings

### Loader:

In [None]:
# Load the sample PDF into a list of documents, asking the loader to render
# tables as markdown and to also process embedded images.
file = PyMuPDFLoader(file_path="./assets/pdf_w_text.pdf", extract_tables='markdown', extract_images=True).load()
file

### Splitter:

In [None]:
# Text splitter producing chunks of at most 750 units with a 150-unit overlap
# between neighbouring chunks (the retriever notes below treat the unit as
# characters).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=750, chunk_overlap=150,
)
splitter

### Database:

In [None]:
splitted = splitter.split_documents(file)
splitted

In [None]:
# Build the FAISS vector store from all split chunks in one call, embedding
# them with `embeddings`.
# NOTE(review): the earlier remark here ("needs 4 param ... adding one doc
# manually") does not match the code — presumably it referred to the bare
# `FAISS(...)` constructor; confirm and drop if stale.
database = FAISS.from_documents(documents=splitted, embedding=embeddings)
database

In [None]:
print(repr(splitted[0].page_content))
print(len(splitted[0].page_content.split(" ")))

### Retriever:

- For 750-character chunks, there are approximately 95 words per chunk (max 150).
- To retrieve about 3k tokens, we need 3k / 150 = 20 chunks (counting roughly one token per word).
- So, set k=20.

In [None]:
# Expose the vector store as a retriever that returns the 20 most similar
# chunks per query (plain similarity search, k=20).
retriever = database.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 20}
)
retriever

In [None]:
retriever.invoke("fun")

## Summarizer:

- Old method.
- This is too hard-coded; switch to the retrieval method that uses `create_stuff_documents_chain` to ingest the documents and produce the answer in a single chain call.

In [None]:
# Standalone-question generator: trim the stored history, fill the
# summarizing prompt and return the model's text.
# The history is trimmed with `trim_summary` (the budget defined for this
# step) instead of `trim_chat`, whose smaller budget reserves room for
# retrieved documents that this prompt never receives.
# `assign` is a classmethod of `RunnablePassthrough`, so it is called on the
# class directly rather than on a throwaway instance.
chain = (
    RunnablePassthrough.assign(messages=itemgetter("messages") | trim_summary)
    | template_summarize
    | llm
    | StrOutputParser()
)

# Wrap the chain so that the history of the session given in the call config
# is injected under "messages", with the user text read from "input".
summarizer_llm = RunnableWithMessageHistory(
    runnable=chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    history_messages_key="messages",
)

In [None]:
# Seed session 10 with a short conversation so that the follow-up question
# asked in the next cell has earlier turns to refer back to.
chat_histories[10] = ChatMessageHistory(
    messages=[
        HumanMessage("Hello, I'm Bhushan, What is your name?"),
        AIMessage("I am an AI assistant. I am not a human like you."),
        HumanMessage("What is Artificial General Intelligence?"),
        AIMessage("Artificial General Intelligence (AGI) refers to highly autonomous systems that outperform humans at most economically valuable work."),
    ]
)

In [None]:
# Ask a follow-up that only makes sense together with the seeded history;
# the session id in `config` selects which stored history is used.
summarizer_llm.invoke(
    input={"input": "So it's not achieved yet?", },
    config={"configurable": {"session_id": 10}}
)

In [None]:
chat_histories[10].messages

## Chain:

In [None]:
# 3 User Input + Chat History > Summarizer Template > Standalone Que > Get Docs
summarize_chain = create_history_aware_retriever(llm, retriever, template_summarize)

# 4 Multiple Docs > Combine All > Chat Template > Final Output
qa_chain = create_stuff_documents_chain(llm=llm, prompt=template_chat)

# 2 Input + Chat History > [ `Summarizer Template` > `Get Docs` ] > [ `Combine` > `Chat Template` ] > Output
# `create_history_aware_retriever` decides whether to rewrite the question by
# checking the `chat_history` key, while this notebook stores the history
# under `messages`. Mirror the history into `chat_history` first; otherwise
# the retriever always behaves as if the conversation were empty and sends
# the raw user input straight to the vector store, never using
# `template_summarize`.
rag_chain = (
    RunnablePassthrough.assign(chat_history=itemgetter("messages"))
    | create_retrieval_chain(summarize_chain, qa_chain)
)

# 1 Final main chain:
# Loads the session history into "messages", reads the user text from "input"
# and records the text found under "answer" as the assistant's reply.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    history_messages_key="messages",
    output_messages_key="answer",
)
conversational_rag_chain

In [None]:
# First turn of a new session (id 120): `get_session_history` creates the
# history for this id on demand.
conversational_rag_chain.invoke(
    input={"input":"Hello, I am Bhushan. What abt u?"},
    config={"configurable":{"session_id":120}}
)

## Runnable With History:

In [None]:
# Plain chat chain (no retrieval): trim the history, fill the chat prompt and
# return the model's text. The caller must supply "context" manually.
# `RunnablePassthrough.assign` is a classmethod, so a name given to a
# throwaway `RunnablePassthrough(name=...)` instance is silently dropped;
# attach the label to the resulting step with `with_config` instead.
chain = (
    RunnablePassthrough.assign(
        messages=itemgetter("messages") | trim_chat
    ).with_config(run_name="Trim Chat History")
    | template_chat
    | llm
    | StrOutputParser()
)

# Wrap the chain so that the history of the session given in the call config
# is injected under "messages", with the user text read from "input".
chat_llm = RunnableWithMessageHistory(
    runnable=chain,
    get_session_history=get_session_history,
    input_messages_key="input",
    history_messages_key="messages",
)

In [None]:
# First turn of session 15. "context" is passed by hand because this chain
# has no retriever to fill the {context} slot of `template_chat`.
chat_llm.invoke(
    input={
        "input": "Hello, I'm Bhushan, What is your name?",
        "context": "This is some random document which contains some random information."
    },
    config={
        "configurable": {
            "session_id": 15
        }
    }
)

In [None]:
# Second turn of the same session (15): checks that the earlier exchange is
# available to the model through the stored history.
chat_llm.invoke(
    input={
        "input": "What did we discuss?",
        "context": "There is no context available for this question."
    },
    config={
        "configurable": {
            "session_id": 15
        }
    }
)

- TODO: add an option to paste a link and scrape the whole content from there.

In [None]:
def get_llm_response_stream(
    prompt: str, session_id: str = "default"
) -> Generator[str, None, None]:
    """Stream the LLM's reply to `prompt` as text chunks.

    The provider, model and assistant name are read from `st.session_state`.
    The conversation is loaded from, and recorded to, the history returned by
    `get_session_history(session_id)`.

    NOTE(review): `st` (presumably `streamlit`) is not imported in the visible
    part of this file — it must be in scope before this function is used.

    Args:
        prompt: The latest user message.
        session_id: Key of the chat history to use for this conversation.

    Yields:
        Chunks of the model's answer as plain strings.

    Raises:
        ValueError: If no provider or model is selected, or if the selected
            provider is not one of "OpenAI", "Groq" or "Ollama".
    """
    # Chat Prompt Template:
    template = ChatPromptTemplate.from_messages(
        messages=[
            ("system", "You are a helpful assistant '{llm_name}' who responds to questions in not more than 20 sentences. You can use markdown and code blocks to format your answers. You can also use emojis to make your answers more engaging. Please be concise and clear in your responses."),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{new_input}")
        ]
    )

    # Ensure model and provider are selected:
    if not st.session_state.provider:
        raise ValueError(
            "Provider not selected, please select a provider first.")

    if not st.session_state.model:
        raise ValueError("Model not selected, please select a model first.")

    # Set-up LLM. Provider packages are imported lazily so that only the
    # selected provider's package has to be installed.
    provider = st.session_state.provider

    if provider == "OpenAI":
        from langchain_openai import ChatOpenAI
        llm = ChatOpenAI(
            model=st.session_state.model, api_key=st.secrets.OpenAI.API_KEY)

    elif provider == "Groq":
        from langchain_groq import ChatGroq
        llm = ChatGroq(
            model=st.session_state.model, api_key=st.secrets.Groq.API_KEY)

    elif provider == "Ollama":
        from langchain_ollama import ChatOllama
        llm = ChatOllama(model=st.session_state.model)

    else:
        # Fail here with a clear message instead of continuing with
        # `llm = None` and crashing later while building the chain.
        st.error("Some un-expected error occurred...", icon="🤖")
        raise ValueError(f"Unsupported provider: {provider!r}")

    # Output parser:
    parser = StrOutputParser()

    # Trimmer: keep the most recent ~2000 tokens of history, starting on a
    # human message; tokens are counted with the selected model.
    trimmer = trim_messages(
        max_tokens=2000, strategy="last",
        token_counter=llm, include_system=False,
        allow_partial=True, start_on=HumanMessage
    )

    # Chain them all. The trimmer returns a list of messages, so its result
    # can replace "chat_history" directly without an intermediate key.
    chain = (
        RunnablePassthrough.assign(
            chat_history=itemgetter("chat_history") | trimmer)
        | template
        | llm
        | parser
    )

    llm_with_history = RunnableWithMessageHistory(
        runnable=chain,
        get_session_history=get_session_history,
        input_messages_key="new_input",
        history_messages_key="chat_history",
    )

    # Run the chain (streaming). `RunnableWithMessageHistory` needs the
    # session id in the call config to look up the history; without it the
    # call is rejected.
    yield from llm_with_history.stream(
        input={
            "new_input": prompt,
            "llm_name": st.session_state.name,
        },
        config={"configurable": {"session_id": session_id}},
    )