##### IMPORT

In [6]:
import os
import json
import tqdm 
import pandas as pd
from operator import itemgetter
from IPython.display import display, HTML, Markdown

from langchain_groq import ChatGroq
from langchain_cerebras import ChatCerebras

from langchain_openai import AzureChatOpenAI
from langchain_community.vectorstores import Chroma, FAISS
from langchain_community.document_loaders import PyPDFLoader, PDFPlumberLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
    AIMessage,
    trim_messages
)
from langchain_core.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder
)
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

from dotenv import load_dotenv
load_dotenv()
os.environ['HF_TOKEN'] = os.getenv("HF_TOKEN")
os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")
os.environ['CEREBRAS_API_KEY'] = os.getenv("CEREBRAS_API_KEY")
os.environ['ENDPOINT_URL'] = os.getenv("ENDPOINT_URL")
os.environ['AZURE_OPENAI_API_KEY'] = os.getenv("AZURE_OPENAI_API_KEY")
os.environ['AZURE_OPENAI_API_VERSION'] = os.getenv("AZURE_OPENAI_API_VERSION")

In [7]:
sample_questions = [
    "Current revenue model - Fixed Fee/Time & Material Contract/Volume based Invoicing (Please select multiple option if required)",
    "Are we required to share FTE details at the time of invoicing?",
    "Does GEP have a negotiated rate card with the client?",
    "Have we built any year-over-year efficiencies into the solution/SOW? If yes, what are the committed efficiency targets?",
    "What are the start and end dates of this SOW",
    "what is the renewal clause",
    "Is there Termination for convenience",
    "Is travel billable or not",
    "what are the major SLA's",
    "is there clause for fees at risk",
    "What are the payment terms",
    "is there Clause for COLA",
    "what is the category of work we are doing",
    "What is the credit period of the contract",
    "what is the invoicing schedule"
]

In [11]:
embeddings = HuggingFaceEmbeddings(model_name="gemini-embedding-001")
# vectordb=Chroma(persist_directory="./chroma_db", embedding_function=embeddings)

No sentence-transformers model found with name sentence-transformers/gemini-embedding-001. Creating a new one with mean pooling.


OSError: sentence-transformers/gemini-embedding-001 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

##### INGESTION - SPLITTING - EMBEDDINGS - PROMPT

In [10]:
BASE_DIR='./FW__Contracts'
doc_list=os.listdir(BASE_DIR)
doc0_name=doc_list[0]
doc0=PDFPlumberLoader(BASE_DIR+'/'+doc0_name).load()

text_splitter=RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
f_docs=text_splitter.split_documents(doc0)

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
vectordb=Chroma.from_documents(documents=f_docs,embedding=embeddings, persist_directory="./chroma_db")

vectordb.similarity_search(sample_questions[0])
#? now give the above as context to llm ChatGroq Model
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question strictly based on the provided context. Keep the response highly brief, short, and crisp.:
    <context>
    {context}
    </context>
    """)

llm = ChatCerebras(model="gpt-oss-120b", cerebras_api_key=os.environ['CEREBRAS_API_KEY'])
# llm = ChatGroq(model="Gemma2-9b-It", groq_api_key=os.environ['GROQ_API_KEY'])

retrieval_chain=create_retrieval_chain(vectordb.as_retriever(), create_stuff_documents_chain(llm, prompt))

#* OUTPUT FORMAT
# results = []
# for i in tqdm.tqdm(range(len(sample_questions))):
#     query = sample_questions[i]
#     response = retrieval_chain.invoke({"input": query})
#     results.append([query, response['answer']])
# df = pd.DataFrame(results, columns=["Question", "Answer"])
# with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
#     display(df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
#         [{'selector': 'th', 'props': [('text-align', 'left')]}]
    # ))

Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

##### FULL_CHAIN - 1Doc

In [8]:
llm = ChatGroq(model="Gemma2-9b-It", groq_api_key=os.environ["GROQ_API_KEY"])
prompt = ChatPromptTemplate.from_messages([
    ("system",
     """Answer the following question strictly based on the provided context.
     Keep the response highly brief, short, and crisp.:
     <context>
     {context}
     </context>"""
    ),
    MessagesPlaceholder(variable_name="messages")
])

retrieval_chain = create_retrieval_chain(vectordb.as_retriever(), create_stuff_documents_chain(llm, prompt))
store = {}
def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

trimmer = trim_messages(
    max_tokens=45,
    strategy="last",
    token_counter=llm,
    include_system=True,
    allow_partial=False,
    start_on="human"
)

chain = (
    RunnablePassthrough.assign(messages=itemgetter("messages") | trimmer)
    | retrieval_chain
    | (lambda x: {"output": x["answer"], **x})
)

with_message_history = RunnableWithMessageHistory(
    chain,
    get_session_history,
    input_messages_key="messages"
)

config = {"configurable": {"session_id": "test_rag1"}}
query = "which are the 2 parties between which the contract is signed?"
response = with_message_history.invoke(
    {
        "messages": [HumanMessage(content=query)],
        "input": query,
    },
    config=config,
)

print(response["answer"])

WestRock and Supplier 



In [9]:
query = "what was the previous question I asked?"
response = with_message_history.invoke(
    {
        "messages": [HumanMessage(content=query)],
        "input": query,
    },
    config=config,
)

print(response["answer"])

You asked: which are the 2 parties between which the contract is signed? 



Let me know if you have any other questions! 😊



In [11]:
query = "Can you give an appropriate name for this whole contract? like it should be a name of a person or a company"
response = with_message_history.invoke(
    {
        "messages": [HumanMessage(content=query)],
        "input": query,
    },
    config=config,
)

print(response["answer"])

WestRock Supplier Contract Template Framework  



##### EVALUATION

In [6]:
results = []
config = {"configurable": {"session_id": "batch_test"}}

for query in tqdm.tqdm(sample_questions):
    response = with_message_history.invoke(
        {
            "messages": [HumanMessage(content=query)],
            "input": query,
        },
        config=config,
    )
    results.append([query, response["output"]])  
    
df = pd.DataFrame(results, columns=["Question", "Answer"])
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(
        df.style.set_properties(**{'text-align': 'left'})
        .set_table_styles([{'selector': 'th', 'props': [('text-align', 'left')]}])
    )

  0%|          | 0/15 [00:00<?, ?it/s]Error in RootListenersTracer.on_chain_end callback: KeyError('output')
  0%|          | 0/15 [00:00<?, ?it/s]


KeyError: 'output'