In [44]:
from langchain.embeddings import HuggingFaceEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import Chroma
from langchain.vectorstores import VectorStore 


from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
#from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.memory import ConversationBufferMemory
#retriever personalizado
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from typing import List

#from langchain_community.embeddings.ollama import OllamaEmbeddings

from langchain.prompts import PromptTemplate
#from langchain_core.prompts import ChatPromptTemplate
#from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser



In [7]:
#llm = ChatOllama(model = 'llama3', format = 'json', temperature=0)
llm = Ollama(base_url="http://localhost:11434",
             model="mistral",
             verbose=True,  
             callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))

hugg_embeddings = HuggingFaceEmbeddings(model_name= "mxbai-embed-large-v1")

In [3]:
myFile = 'annualreport.pdf'
loader = PyPDFLoader("DATA/" + myFile)

data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len
    )

all_splits = text_splitter.split_documents(data)

In [4]:
vectorstore = Chroma.from_documents(
        documents=all_splits,
        embedding=hugg_embeddings
    )

In [5]:
#vectorstore.get(include=['embeddings'])['embeddings']
query = 'Who is Haroon Sheikh?'

vectorstore.similarity_search_by_vector(hugg_embeddings.embed_query(query))

[Document(page_content='sector and one of the UK’s leading \nentrepreneurs and philanthropists. Along \nwith his brother Farouq, he co-founded \nCareTech. As Group CEO he actively leads \nthe day-to-day running of the Group \nand its international expansion, and has \nbeen instrumental in assembling a highly \ntalented leadership team, to support \nthe continued growth of the business. \nHaroon brings commercial acumen, \nrelated industry experience and property \nknowledge. He has a deep commitment', metadata={'page': 40, 'source': 'DATA/annualreport.pdf'}),
 Document(page_content='facilitates a culture of openness and debate.\nGroup Chief Executive Officer\nHaroon Sheikh is the Group CEO and \naccountable to the Board for the day-to-day \nrunning of the Group and management of  \nthe strategic plan.\nHaroon Sheikh is responsible for the following:\n –Executive leadership of the Group’s \nbusiness on a day-to-day basis.\n –Developing the overall commercial \nobjectives, and proposing 

In [55]:
template = """
    You are a knowledgeable chatbot, here to help with questions of the user.
    Your tone should be professional and informative.

    Context: {context}
    History: {history}

    User: {question}
    Chatbot:
    It is mandatorian that only if the answer is not in the context, answer "I have not enough context in order to answer this" and stop the answer.
    Try to use the memory context in the answer only if the question mentions it.
"""

prompt = PromptTemplate(
    input_variables=["history", "context", "question"],
    template=template,
)

memory = ConversationBufferMemory(
    memory_key="history",
    return_messages=True,
    input_key="question"
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
    verbose=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": prompt,
        "memory": memory,
    }
)

result = qa_chain({"query": query})
result



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are a knowledgeable chatbot, here to help with questions of the user.
    Your tone should be professional and informative.

    Context: sector and one of the UK’s leading 
entrepreneurs and philanthropists. Along 
with his brother Farouq, he co-founded 
CareTech. As Group CEO he actively leads 
the day-to-day running of the Group 
and its international expansion, and has 
been instrumental in assembling a highly 
talented leadership team, to support 
the continued growth of the business. 
Haroon brings commercial acumen, 
related industry experience and property 
knowledge. He has a deep commitment

facilitates a culture of openness and debate.
Group Chief Executive Officer
Haroon Sheikh is the Group CEO and 
accountable to the Board for the day-to-day 
running of the Group and management of  
the

{'query': 'Who is Haroon Sheikh?',
 'result': " Haroon Sheikh is the Group CEO of CareTech, a leading healthcare and support services company based in the UK. He is responsible for the day-to-day running of the business and its international expansion. Haroon brings commercial acumen, industry experience, and property knowledge to his role. He is also a Founder Trustee and Chairman of the CareTech Charitable Foundation, and holds various other roles such as Patron and Enterprise Fellow of the Prince's Trust. Haroon has a deep commitment to delivering high-quality care and support to people with complex needs. He and his brother Farouq co-founded CareTech in 2004 and were recipients of the Coutts Family Business Prize in 2008."}

In [45]:
prompt_1 = PromptTemplate(
    template="""You are a grader assessing relevance of a retrieved document to a user question. \n 
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n
    If the document contains keywords related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score' and no premable or explaination.""",
    input_variables=["question", "document"],
)

retrieval_grader = prompt_1 | llm | JsonOutputParser()

question = " 'Who is Haroon Sheikh?'"

docs = vectorstore.as_retriever().get_relevant_documents(question)
doc_txt = docs[0].page_content
print(query)
print('---------------------------------------------------------')
print(doc_txt)
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))
print('---------------------------------------------------------------')
doc_fake ='Mi primo pedro es muy listo y sabe trabajar con muchas tecnologías'
print(doc_fake)
print(retrieval_grader.invoke({"question": question, "document": doc_fake}))


Who is Haroon Sheikh?
---------------------------------------------------------
sector and one of the UK’s leading 
entrepreneurs and philanthropists. Along 
with his brother Farouq, he co-founded 
CareTech. As Group CEO he actively leads 
the day-to-day running of the Group 
and its international expansion, and has 
been instrumental in assembling a highly 
talented leadership team, to support 
the continued growth of the business. 
Haroon brings commercial acumen, 
related industry experience and property 
knowledge. He has a deep commitment
 {"score": "yes"}{'score': 'yes'}
---------------------------------------------------------------
Mi primo pedro es muy listo y sabe trabajar con muchas tecnologías
 {
"score": "no"
}{'score': 'no'}


In [46]:
#vectorstore.as_retriever(search_kwargs={"score_threshold": 200,"k": 4})

def eval_documents(retrieval, question):
    valid_docs = []
    docs = retrieval.get_relevant_documents(question)

    for d in docs: 
        score = retrieval_grader.invoke({"question": question, "document": d.page_content})
        if score['score'] == "yes":
            valid_docs.append(d)
        else:
            continue
    return valid_docs


In [47]:
vectorstore.as_retriever(search_kwargs={"k": 1}).get_relevant_documents(query)

[Document(page_content='sector and one of the UK’s leading \nentrepreneurs and philanthropists. Along \nwith his brother Farouq, he co-founded \nCareTech. As Group CEO he actively leads \nthe day-to-day running of the Group \nand its international expansion, and has \nbeen instrumental in assembling a highly \ntalented leadership team, to support \nthe continued growth of the business. \nHaroon brings commercial acumen, \nrelated industry experience and property \nknowledge. He has a deep commitment', metadata={'page': 40, 'source': 'DATA/annualreport.pdf'})]

In [48]:
eval_documents(vectorstore.as_retriever(search_kwargs={"k": 4}),'madre mia')

 { "score": "no" } { "score": "no" } {
"score": "no"
} {
"score": "no"
}

[]

In [49]:
vectorstore.as_retriever().get_relevant_documents('madre mia')

[Document(page_content='LogBUK. REHAVISTA is Germany’s largest \nprovider of augmentative and alternative \ncommunication (‘AAC’) products and \nservices. LogBUK is a subsidiary company to \nREHAVISTA, providing independent speech \nand language therapy to help AAC users \nachieve the best outcomes through specialist \nclinical support. Smartbox paid €10m in cash \non completion, funded by the Group’s debt \nfacility and post completion, CareTech will \nown 83% of Smartbox with the remaining', metadata={'page': 46, 'source': 'DATA/annualreport.pdf'}),
 Document(page_content='achieve the best outcomes through specialist \nclinical support.\nREHAVISTA’s reach and expertise is \nunparalleled in Germany, estimated to be the \nsecond largest funded AAC market globally \nafter the US. With its deep knowledge of \nassistive technology and established routes to \nmarket, this acquisition provides a significant \nopportunity for Smartbox to expand the \nproducts and services available in German

In [32]:
#Retriever personalizado
#Prueba se puede borrar
class ToyRetriever(BaseRetriever):
   
    documents: List[Document]
    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Sync implementations for retriever."""
        return self.documents
    

In [38]:
#Prueba se puede borrar

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=ToyRetriever(documents = eval_documents(vectorstore.as_retriever(search_kwargs={"k": 3}),query)),
    verbose=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": prompt,
        "memory": memory,
    }
)

result = qa_chain({"query": query})
result

 {
"score": "yes"
} {
"score": "yes"
} {"score": "yes"}

[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are a knowledgeable chatbot, here to help with questions of the user.
    Your tone should be professional and informative.

    Context: sector and one of the UK’s leading 
entrepreneurs and philanthropists. Along 
with his brother Farouq, he co-founded 
CareTech. As Group CEO he actively leads 
the day-to-day running of the Group 
and its international expansion, and has 
been instrumental in assembling a highly 
talented leadership team, to support 
the continued growth of the business. 
Haroon brings commercial acumen, 
related industry experience and property 
knowledge. He has a deep commitment

facilitates a culture of openness and debate.
Group Chief Executive Officer
Haroon Sheikh is the Group CEO and 
accountable to the Board for the d

{'query': 'Who is Haroon Sheikh?',
 'result': " I have not enough context in order to answer who Haroon Sheikh was before the given context. However, according to the provided context, Haroon Sheikh is the Group CEO of CareTech, a leading UK care and support services provider. He is responsible for the day-to-day running of the business and its international expansion, as well as developing the overall commercial objectives and strategy in conjunction with the Board. Haroon also has a deep commitment to delivering high-quality care and support to people with complex needs, and is a Founder Trustee and Chairman of the CareTech Charitable Foundation. He is a graduate of the University of London and has extensive industry experience and property knowledge. Haroon is also Patron and Enterprise Fellow of the Prince's Trust and a member of the UK Advisory Council of the British Asian Trust. Haroon and his brother Farouq were winners of the Coutts Family Business Prize in 2008."}

In [51]:
#ok
class CustomRetriever(BaseRetriever):
    
    vs: VectorStore

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
        ) -> List[Document]:
        return eval_documents(self.vs.as_retriever(), query)

In [52]:
#ok
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=CustomRetriever(vs = vectorstore),
    verbose=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": prompt,
        "memory": memory,
    }
)

result = qa_chain({"query": query})
result



[1m> Entering new RetrievalQA chain...[0m
 {"score": "yes"} {"score": "yes"} {"score": "yes"} {
"score": "yes"
}

[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are a knowledgeable chatbot, here to help with questions of the user.
    Your tone should be professional and informative.

    Context: sector and one of the UK’s leading 
entrepreneurs and philanthropists. Along 
with his brother Farouq, he co-founded 
CareTech. As Group CEO he actively leads 
the day-to-day running of the Group 
and its international expansion, and has 
been instrumental in assembling a highly 
talented leadership team, to support 
the continued growth of the business. 
Haroon brings commercial acumen, 
related industry experience and property 
knowledge. He has a deep commitment

facilitates a culture of openness and debate.
Group Chief Executive Officer
Haroon Sheikh is the Group CEO and 
accountable to the 

{'query': 'Who is Haroon Sheikh?',
 'result': ' Haroon Sheikh is the Group CEO of CareTech, a leading care and support services provider in the UK. He is responsible for the day-to-day running of the business and its international expansion. Haroon brings commercial acumen, industry experience, and property knowledge to his role. He also facilitates a culture of openness and debate within the organization. Haroon is a graduate of the University of London and serves as the Chairman of the Trustees for the CareTech Charitable Foundation. Additionally, he has received several awards and recognitions for his contributions to business and philanthropy.'}

In [58]:
result = qa_chain({"query": 'Que es CareTech?'})
result



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are a knowledgeable chatbot, here to help with questions of the user.
    Your tone should be professional and informative.

    Context: We have established our new 
CareTech Technology division 
to spearhead our technology 
and innovation agenda.
 
CareTech aims to become a digital leader 
in the disability and specialist social care 
sectors by developing an end-to-end 
pathway of innovative services that blend 
care and technology. Building on our 
maiden investment in Smartbox, we will 
accelerate the pace of our investment 
and development of innovative digital 
solutions to reach more people around

CareTech has 16 years on the public markets 
and grown to over 550 services supporting 
close to 5,000 service users. The business has 
increased its range of services to offer a broad 
Care Pathw

{'query': 'Que es CareTech?',
 'result': ' CareTech is a leading healthcare and support services company based in the UK. It was co-founded by Haroon Sheikh and his brother Farouq in 2004, and has since grown to over 550 services supporting close to 5,000 service users. CareTech aims to become a digital leader in the disability and specialist social care sectors by developing innovative digital solutions. The company offers high-quality services with a strong ethical and values-based approach, and has upper quartile ratings for both CQC and Ofsted. CareTech works closely with regulators and commissioners across England, Scotland, and Wales.'}