In [52]:
import os
import warnings

from langchain.chat_models import init_chat_model
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.schema.runnable import RunnableMap,RunnableLambda
from langchain.vectorstores import FAISS
from langchain_core.language_models import chat_models
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Assigning os.environ["GROQ_API_KEY"] to its own os.getenv() value does nothing
# when the key is set and raises TypeError when it is not (os.environ only
# accepts str values, and os.getenv returns None for a missing variable).
# Only check for presence, and say clearly what is missing.
if not os.getenv("GROQ_API_KEY"):
    warnings.warn(
        "GROQ_API_KEY is not set; export it before running the cell that creates the Groq chat model."
    )

In [53]:
class ThresholdSematicChunker:
    """Group consecutive lines of a text into chunks by embedding similarity.

    The text is split on newlines and every non-blank line is embedded. A line
    is appended to the current chunk when its cosine similarity to the line
    immediately before it is at least ``threshold``; otherwise the current
    chunk is closed and the line starts a new one.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", threshold=0.7):
        """Create the chunker.

        Args:
            model_name: Model name passed to ``SentenceTransformer``.
            threshold: Minimum cosine similarity between two adjacent lines
                for them to be kept in the same chunk.
        """
        self.model = SentenceTransformer(model_name)
        self.threshold = threshold

    def split(self, text: str):
        """Split ``text`` into similarity-based chunks.

        Args:
            text: Input text; each non-blank line is treated as one unit.

        Returns:
            A list of strings. Each string is the lines of one chunk, stripped
            and joined with a single space. The list is empty when ``text``
            contains no non-blank lines.
        """
        sentences = [line.strip() for line in text.split("\n") if line.strip()]
        if not sentences:
            # Nothing to embed; indexing sentences[0] below would raise IndexError.
            return []

        embedding = self.model.encode(sentences)
        chunks = []
        current_chunks = [sentences[0]]
        for i in range(1, len(sentences)):
            # Only adjacent lines are compared, not a line against the whole chunk.
            sim = cosine_similarity([embedding[i - 1]], [embedding[i]])[0][0]

            if sim >= self.threshold:
                current_chunks.append(sentences[i])
            else:
                chunks.append(" ".join(current_chunks))
                current_chunks = [sentences[i]]
        # Flush the chunk that was still open when the loop ended.
        chunks.append(" ".join(current_chunks))
        return chunks

    def split_doc(self, docs):
        """Split a single ``Document`` into one ``Document`` per chunk.

        Args:
            docs: A document exposing ``page_content`` and ``metadata``.

        Returns:
            A list of ``Document`` objects, one per chunk of
            ``docs.page_content``, each constructed with a copy of
            ``docs.metadata``.
        """
        return [
            # Pass a fresh dict per chunk instead of handing the source
            # document's metadata object to every chunk.
            Document(page_content=chunk, metadata=dict(docs.metadata))
            for chunk in self.split(docs.page_content)
        ]




In [54]:
from importlib import metadata


# Demo input: three lines about LangChain followed by three lines about the
# Eiffel Tower / France. The leading and trailing spaces are part of the text.
sample_text = (
    " Langchain is a framework for building application with llms\n"
    "Langchain provides modular abstractions to combine LLMs with tools like OpenAI, Groq, Huggingface, etc\n"
    "You can create chains,agents,tools,flows,etc\n"
    "the eiffel tower is a tower\n"
    "the eiffel tower is in Paris\n"
    "France is a popular tourist destination. "
)
# Wrap the raw text in a Document; no metadata is supplied.
doc = Document(page_content=sample_text)
doc

Document(metadata={}, page_content=' Langchain is a framework for building application with llms\nLangchain provides modular abstractions to combine LLMs with tools like OpenAI, Groq, Huggingface, etc\nYou can create chains,agents,tools,flows,etc\nthe eiffel tower is a tower\nthe eiffel tower is in Paris\nFrance is a popular tourist destination. ')

In [55]:
# Build the chunker with an explicit similarity threshold and split the demo
# document into one Document per chunk.
chunker = ThresholdSematicChunker(threshold=0.7)
chunks = chunker.split_doc(doc)
chunks

[Document(metadata={}, page_content='Langchain is a framework for building application with llms Langchain provides modular abstractions to combine LLMs with tools like OpenAI, Groq, Huggingface, etc'),
 Document(metadata={}, page_content='You can create chains,agents,tools,flows,etc'),
 Document(metadata={}, page_content='the eiffel tower is a tower the eiffel tower is in Paris'),
 Document(metadata={}, page_content='France is a popular tourist destination.')]

In [56]:
import os
# os.environ[...] = os.getenv(...) is a no-op when the key is set and raises
# TypeError when it is missing (os.getenv returns None), so only check for
# presence and report what is missing.
if not os.getenv("OPENAI_API_KEY"):
    warnings.warn(
        "OPENAI_API_KEY is not set; export it before creating OpenAIEmbeddings."
    )

# Embed the chunk Documents and index them in a FAISS vector store, then
# expose the store through the retriever interface used by the RAG chain.
embedding = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(chunks, embedding)
retriever = vectorstore.as_retriever()

In [57]:
# Prompt with two placeholders: the retrieved context and the user's question.
template = (
    "Answer the question based on following context:\n"
    "{context}\n"
    "Question:{question}\n"
)
prompt = PromptTemplate.from_template(template)
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on following context:\n{context}\nQuestion:{question}\n')

In [59]:
from langchain_core.runnables import RunnableMap
from langchain_core.output_parsers import StrOutputParser

llm = init_chat_model(model="groq:gemma2-9b-it", temperature=0.4)


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    The retriever returns Document objects. Interpolating that list directly
    into the prompt would insert its Python repr (``[Document(metadata=...,
    page_content=...)]``) rather than the text itself.
    """
    return "\n\n".join(d.page_content for d in docs)


# Pipeline: build {context, question} from the input dict, fill the prompt,
# call the chat model, and reduce the model message to a plain string.
rag_chain = (
    RunnableMap({
        "context": lambda x: _format_docs(retriever.invoke(x["question"])),
        "question": lambda x: x["question"],
    })
    | prompt
    | llm
    | StrOutputParser()
)
query = {"question": "what is python?"}
result = rag_chain.invoke(query)
result

'The provided text does not contain the answer to what Python is. \n\nThe documents discuss Langchain, a framework for building applications with large language models (LLMs), and provide some information about the Eiffel Tower and France.  There is no mention of Python. \n'