In [4]:
from langchain_community.document_loaders import UnstructuredPDFLoader, PyPDFLoader, PyMuPDFLoader
import os

#pdf2image
#pdfminer

In [5]:
rute = os.path.join("..", "..", "Data", "input", "billionaires_page.pdf")
rute

'..\\..\\Data\\input\\billionaires_page.pdf'

In [10]:
test2 = PyPDFLoader(rute).load()
test2[0].metadata

{'source': '..\\..\\Data\\input\\billionaires_page.pdf', 'page': 0}

In [14]:
test3 = PyMuPDFLoader(rute).load()
test3[0].metadata

{'source': '..\\..\\Data\\input\\billionaires_page.pdf',
 'file_path': '..\\..\\Data\\input\\billionaires_page.pdf',
 'page': 0,
 'total_pages': 33,
 'format': 'PDF 1.4',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
 'producer': 'Skia/PDF m114',
 'creationDate': "D:20230702063129+00'00'",
 'modDate': "D:20230702063129+00'00'",
 'trapped': ''}

In [13]:
from langchain_community.document_loaders import MathpixPDFLoader
loader = MathpixPDFLoader(rute)
data = loader.load()
data[0].page_content

ValueError: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.

### PARENT DOCUMENT RETIEVE

In [1]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_openai import OpenAIEmbeddings

from langchain_community.vectorstores import FAISS 

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

from langchain_community.document_loaders import BSHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import os

In [2]:
def embedding(model_name): 
  encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
  if model_name == "text-embedding-3-large":
     return OpenAIEmbeddings(model = model_name)
  return HuggingFaceBgeEmbeddings(model_name = model_name, encode_kwargs = encode_kwargs)

def data_loader(uploaded_file):
  if uploaded_file[-4:] == 'html': 
    loader = BSHTMLLoader(uploaded_file, open_encoding='utf-8')
    doc = loader.load()
  elif uploaded_file[-3:] == "pdf":
    loader = PyMuPDFLoader(uploaded_file)
    doc = loader.load()
  return doc 

In [3]:
documents = []

for uploaded_file in ['Create a fulfilment to authorize a service _ Healthanea Documentation.html', 'Create and retrieve DHP user IDs _ Healthanea Documentation.html']:
    file_path = os.path.join('..', '..', 'Data', 'input', uploaded_file)
    doc = data_loader(file_path)
    documents.append(doc)

documents

[[Document(page_content='\n\n\n\nCreate a fulfilment to authorize a service | Healthanea Documentation \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© DHP SAS\n Version: 160211a-facd43ecf \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate a fulfilment to authorize a service\nSee also the API specification (Swagger) and Error management\n\nSee also:\n\nConcept: Service fulfilment\nAPI documentation: The Fulfilment API\n\nProducer Channel role: You use the Fulfilment API to create a fulfilment instance with which you authorize Service Providers to offer healthcare services to end-users.\nService role: You use the Fulfilment API to register actions related to your service\'s lifecycle within the context of the fulfilment instance. These can include actions performed by the end-user but also the read and write actions applied to the end-user\'s health data.\nThe Fulfilment API tracks the actions performed by the health services and the end-

In [16]:
def ParentDocument(documents, embeddings, k = 1):
  """The ParentDocumentRetriever strikes that balance by splitting and storing small chunks of data. 
  During retrieval, it first fetches the small chunks but then looks up the parent ids for those chunks and returns those larger documents."""

  parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000)
  child_splitter = RecursiveCharacterTextSplitter(chunk_size = 400)
  vector = FAISS.from_texts([doc[0].page_content for doc in documents], embeddings)
  store = InMemoryStore()

  retriever = ParentDocumentRetriever(
    vectorstore = vector,
    docstore = store, 
    child_splitter = child_splitter,
    parent_splitter = parent_splitter,
    search_kwargs = {'k': k}
  )
  for doc in documents:
    retriever.add_documents(doc, ids = None)
  return retriever 

In [17]:
embeddings = embedding('BAAI/bge-small-en-v1.5')

pr = ParentDocument(documents, embeddings)
pr



ParentDocumentRetriever(vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000019A76A52270>, docstore=<langchain_core.stores.InMemoryBaseStore object at 0x0000019A76A53DD0>, search_kwargs={'k': 1}, child_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x0000019A76A52360>, parent_splitter=<langchain_text_splitters.character.RecursiveCharacterTextSplitter object at 0x0000019A1F5F9160>)

In [18]:
pr.invoke("How can I get the properties of a fulfilment?")

[Document(page_content='In the request URL, you must include a fulfilmentId, the identifier of the fulfilment instance to which to add the context properties.\nYou must also include a request body that contains a properties object containing one or more key-value pairs of name and value.  An example of a request body is the following:\n{\n  "properties": [\n    {\n      "name": "{{name1}}",\n      "value": "{{value}}"\n    },\n    {\n      "name": "{{name2}}",\n      "value": "{{value}}"\n    }\n  ]\n}\n\nIf the request is successful, Healthanea returns a 202 HTTP status code confirming that the context information was added to the fulfilment instance.\n4 - Get the properties of the fulfilment instance\n\n\nRoles:\nProducer Channel & Service\n\n\nHealthanea API:\nFulfilment\n\n\nMandatory/optional:\nOptional\n\n\nIf you do not have one already, get a token for the Producer Channel role (see Authentication & tokens).\nUse a GET on the fulfilment/instance/{fulfilmentId} endpoint to read 

In [3]:
loader = BSHTMLLoader('../../Data/input/Create a fulfilment to authorize a service _ Healthanea Documentation.html', open_encoding = 'utf-8')
documents = loader.load()
documents

[Document(page_content='\n\n\n\nCreate a fulfilment to authorize a service | Healthanea Documentation \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© DHP SAS\n Version: 160211a-facd43ecf \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate a fulfilment to authorize a service\nSee also the API specification (Swagger) and Error management\n\nSee also:\n\nConcept: Service fulfilment\nAPI documentation: The Fulfilment API\n\nProducer Channel role: You use the Fulfilment API to create a fulfilment instance with which you authorize Service Providers to offer healthcare services to end-users.\nService role: You use the Fulfilment API to register actions related to your service\'s lifecycle within the context of the fulfilment instance. These can include actions performed by the end-user but also the read and write actions applied to the end-user\'s health data.\nThe Fulfilment API tracks the actions performed by the health services and the end-u

In [4]:
embeddings = embedding('BAAI/bge-small-en-v1.5')



In [12]:
def ParentDocument(documents, embeddings, k):
  """The ParentDocumentRetriever strikes that balance by splitting and storing small chunks of data. 
  During retrieval, it first fetches the small chunks but then looks up the parent ids for those chunks and returns those larger documents."""

  parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000)
  child_splitter = RecursiveCharacterTextSplitter(chunk_size = 400)
  vector = FAISS.from_texts([doc.page_content for doc in documents], embeddings)
  store = InMemoryStore()

  retriever = ParentDocumentRetriever(
    vectorstore = vector,
    docstore = store, 
    child_splitter = child_splitter,
    parent_splitter = parent_splitter,
    search_kwargs = {'k': k}
  )
  retriever.add_documents(documents, ids = None)
  return retriever 

In [13]:
retriever = ParentDocument(documents, embeddings, 1).invoke("How can I get the properties of a fulfilment?")
retriever

[Document(page_content='In the request URL, you must include a fulfilmentId, the identifier of the fulfilment instance to which to add the context properties.\nYou must also include a request body that contains a properties object containing one or more key-value pairs of name and value.  An example of a request body is the following:\n{\n  "properties": [\n    {\n      "name": "{{name1}}",\n      "value": "{{value}}"\n    },\n    {\n      "name": "{{name2}}",\n      "value": "{{value}}"\n    }\n  ]\n}\n\nIf the request is successful, Healthanea returns a 202 HTTP status code confirming that the context information was added to the fulfilment instance.\n4 - Get the properties of the fulfilment instance\n\n\nRoles:\nProducer Channel & Service\n\n\nHealthanea API:\nFulfilment\n\n\nMandatory/optional:\nOptional\n\n\nIf you do not have one already, get a token for the Producer Channel role (see Authentication & tokens).\nUse a GET on the fulfilment/instance/{fulfilmentId} endpoint to read 

In [14]:
len(retriever[0].page_content)

1991

In [6]:
parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size = 400)

In [8]:
vector = FAISS.from_texts(documents[0].page_content, embeddings)
store = InMemoryStore()

In [10]:
retriever = ParentDocumentRetriever(
  vectorstore = vector,
  docstore = store, 
  child_splitter = child_splitter,
  parent_splitter = parent_splitter,
  search_kwargs = {'k': 1}
)

In [11]:
retriever.add_documents(documents, ids = None)

In [19]:
pr2 = vector.similarity_search("How can I get the properties of a fulfilment?")
pr2[0].page_content

'{\n  "properties": [\n    {\n      "name": "{{name1}}",\n      "value": "{{value}}"\n    },\n    {\n      "name": "{{name2}}",\n      "value": "{{value}}"\n    }\n  ]\n}'

In [20]:
len(pr2[0].page_content)

158

In [15]:
pr = retriever.invoke("How can I get the properties of a fulfilment?")
pr[0].page_content

'In the request URL, you must include a fulfilmentId, the identifier of the fulfilment instance to which to add the context properties.\nYou must also include a request body that contains a properties object containing one or more key-value pairs of name and value.  An example of a request body is the following:\n{\n  "properties": [\n    {\n      "name": "{{name1}}",\n      "value": "{{value}}"\n    },\n    {\n      "name": "{{name2}}",\n      "value": "{{value}}"\n    }\n  ]\n}\n\nIf the request is successful, Healthanea returns a 202 HTTP status code confirming that the context information was added to the fulfilment instance.\n4 - Get the properties of the fulfilment instance\n\n\nRoles:\nProducer Channel & Service\n\n\nHealthanea API:\nFulfilment\n\n\nMandatory/optional:\nOptional\n\n\nIf you do not have one already, get a token for the Producer Channel role (see Authentication & tokens).\nUse a GET on the fulfilment/instance/{fulfilmentId} endpoint to read the properties of an ex

In [16]:
len(pr[0].page_content)

1991

#### MultiQuery Retriever

In [1]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from langchain_community.vectorstores import FAISS 


from langchain_community.document_loaders import BSHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import os

os.environ['GROQ_API_KEY'] = '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'

In [2]:
def embedding(model_name): 
  encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
  if model_name == "text-embedding-3-large":
     return OpenAIEmbeddings(model = model_name)
  return HuggingFaceBgeEmbeddings(model_name = model_name, encode_kwargs = encode_kwargs)

def data_loader(uploaded_file):
  if uploaded_file[-4:] == 'html': 
    loader = BSHTMLLoader(uploaded_file, open_encoding='utf-8')
    doc = loader.load()
  elif uploaded_file[-3:] == "pdf":
    loader = PyMuPDFLoader(uploaded_file)
    doc = loader.load()
  return doc 

In [3]:
loader = BSHTMLLoader('../../Data/input/Create a fulfilment to authorize a service _ Healthanea Documentation.html', open_encoding = 'utf-8')
documents = loader.load()
documents

[Document(page_content='\n\n\n\nCreate a fulfilment to authorize a service | Healthanea Documentation \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© DHP SAS\n Version: 160211a-facd43ecf \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate a fulfilment to authorize a service\nSee also the API specification (Swagger) and Error management\n\nSee also:\n\nConcept: Service fulfilment\nAPI documentation: The Fulfilment API\n\nProducer Channel role: You use the Fulfilment API to create a fulfilment instance with which you authorize Service Providers to offer healthcare services to end-users.\nService role: You use the Fulfilment API to register actions related to your service\'s lifecycle within the context of the fulfilment instance. These can include actions performed by the end-user but also the read and write actions applied to the end-user\'s health data.\nThe Fulfilment API tracks the actions performed by the health services and the end-u

In [4]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1800)
chunks = splitter.split_documents(documents)
chunks

[Document(page_content="Create a fulfilment to authorize a service | Healthanea Documentation \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© DHP SAS\n Version: 160211a-facd43ecf \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate a fulfilment to authorize a service\nSee also the API specification (Swagger) and Error management\n\nSee also:\n\nConcept: Service fulfilment\nAPI documentation: The Fulfilment API\n\nProducer Channel role: You use the Fulfilment API to create a fulfilment instance with which you authorize Service Providers to offer healthcare services to end-users.\nService role: You use the Fulfilment API to register actions related to your service's lifecycle within the context of the fulfilment instance. These can include actions performed by the end-user but also the read and write actions applied to the end-user's health data.\nThe Fulfilment API tracks the actions performed by the health services and the end-user, thus 

In [5]:
embeddings = embedding('BAAI/bge-small-en-v1.5')



In [6]:
vector = FAISS.from_documents(chunks, embeddings)
retirever = vector.as_retriever(search_kwargs = {"k": 5})

In [8]:
question = "How can I get the properties of a fulfilment?"
llm = ChatGroq(temperature = 0)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever = vector.as_retriever(), llm = llm
)

In [9]:
# Set logging for the queries
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [10]:
unique_docs = retriever_from_llm.invoke(question)
len(unique_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. "What are the attributes of a given fulfillment?"', '2. "Could you provide the details of a specific fulfillment\'s properties?"', '3. "How can I retrieve the information related to a particular fulfillment?"']


5

In [11]:
unique_docs

[Document(page_content='In the request URL, you must include a fulfilmentId, the identifier of the fulfilment instance to which to add the context properties.\nYou must also include a request body that contains a properties object containing one or more key-value pairs of name and value.  An example of a request body is the following:\n{\n  "properties": [\n    {\n      "name": "{{name1}}",\n      "value": "{{value}}"\n    },\n    {\n      "name": "{{name2}}",\n      "value": "{{value}}"\n    }\n  ]\n}\n\nIf the request is successful, Healthanea returns a 202 HTTP status code confirming that the context information was added to the fulfilment instance.\n4 - Get the properties of the fulfilment instance\n\n\nRoles:\nProducer Channel & Service\n\n\nHealthanea API:\nFulfilment\n\n\nMandatory/optional:\nOptional\n\n\nIf you do not have one already, get a token for the Producer Channel role (see Authentication & tokens).\nUse a GET on the fulfilment/instance/{fulfilmentId} endpoint to read 

#### Reranker Methods

In [12]:
from langchain_community.document_transformers import LongContextReorder

reordering = LongContextReorder()
reordered_docs = reordering.transform_documents(unique_docs)
reordered_docs

[Document(page_content='In the request URL, you must include a fulfilmentId, the identifier of the fulfilment instance to which to add the context properties.\nYou must also include a request body that contains a properties object containing one or more key-value pairs of name and value.  An example of a request body is the following:\n{\n  "properties": [\n    {\n      "name": "{{name1}}",\n      "value": "{{value}}"\n    },\n    {\n      "name": "{{name2}}",\n      "value": "{{value}}"\n    }\n  ]\n}\n\nIf the request is successful, Healthanea returns a 202 HTTP status code confirming that the context information was added to the fulfilment instance.\n4 - Get the properties of the fulfilment instance\n\n\nRoles:\nProducer Channel & Service\n\n\nHealthanea API:\nFulfilment\n\n\nMandatory/optional:\nOptional\n\n\nIf you do not have one already, get a token for the Producer Channel role (see Authentication & tokens).\nUse a GET on the fulfilment/instance/{fulfilmentId} endpoint to read 

In [16]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_community.document_compressors import JinaRerank
#from langchain_community.document_compressors.rankllm_rerank import RankLLMRerank

os.environ['JINA_API_KEY'] = 'jina_eba96965c69b4fd894f57138b933d4195yirZCsUcmqh6-7DeqSi1Uu9DaHg'

In [17]:
compressor = JinaRerank(top_n = 5)
compression_retriever = ContextualCompressionRetriever(
    base_compressor = compressor, base_retriever = retirever, 
)

compressed_docs = compression_retriever.invoke(
    "How can I get the properties of a fulfilment?"
)
compressed_docs

  warn_deprecated(


[Document(page_content='In the request URL, you must include a fulfilmentId, the identifier of the fulfilment instance to which to add the context properties.\nYou must also include a request body that contains a properties object containing one or more key-value pairs of name and value.  An example of a request body is the following:\n{\n  "properties": [\n    {\n      "name": "{{name1}}",\n      "value": "{{value}}"\n    },\n    {\n      "name": "{{name2}}",\n      "value": "{{value}}"\n    }\n  ]\n}\n\nIf the request is successful, Healthanea returns a 202 HTTP status code confirming that the context information was added to the fulfilment instance.\n4 - Get the properties of the fulfilment instance\n\n\nRoles:\nProducer Channel & Service\n\n\nHealthanea API:\nFulfilment\n\n\nMandatory/optional:\nOptional\n\n\nIf you do not have one already, get a token for the Producer Channel role (see Authentication & tokens).\nUse a GET on the fulfilment/instance/{fulfilmentId} endpoint to read 

In [18]:
from langchain.retrievers.document_compressors import FlashrankRerank

compressor = FlashrankRerank(model = 'ms-marco-MiniLM-L-12-v2', top_n = 5)
compression_retriever = ContextualCompressionRetriever(
    base_compressor = compressor, base_retriever = retirever
)


In [15]:
%%time
reranked_docs = compression_retriever.invoke('How can I get the properties of a fulfilment?')
reranked_docs

Running pairwise ranking..
CPU times: total: 7.31 s
Wall time: 1.92 s


[Document(page_content='In the request URL, you must include a fulfilmentId, the identifier of the fulfilment instance whose properties you want to retrieve.\nIf the request is successful, Healthanea returns a 200 HTTP status code with a response payload containing the properties of the fulfilment instance in the following format:\n{\n  "id": "{{id}}",\n  "producerId": "{{producerId}}",\n  "producerChannelId": "{{producerChannelId}}",\n  "serviceId": "{{serviceId}}",\n  "serviceProviderId": "{{serviceProviderId}}",\n  "journeyId": "{{journeyId}}",\n  "deviceId": "{{deviceId}}",\n  "dhpUserId": "{{dhpUserId}}",\n  "timestamp": "{{timestamp}}",\n  "expirationTimestamp": "{{expirationTimestamp}}",\n  "dataExchangeCompletionActionTypeIds": [\n    "{{dataExchangeCompletionActionTypeId}}"\n  ]\n}\n\n5 - Get the payload of an action in a fulfilment instance\n\n\nRoles:\nProducer Channel & Service\n\n\nHealthanea API:\nFulfilment\n\n\nMandatory/optional:\nOptional\n\n\nIf you do not have one a