### DB test


In [1]:
import os
import chromadb
from langchain_chroma import Chroma
from langchain_core.documents import Document
import yaml
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch


In [4]:
with open('config.yaml') as f:
    config = yaml.safe_load(f)

os.environ['OPENAI_API_KEY'] = config['API_key']

In [3]:
df = pd.read_parquet(config['parquet_path'])
raw_documents = df[['Historico', 'Unidade', 'ElemDespesaTCE', 'Credor', 'Vlr_Empenhado']]


In [4]:
elems = pd.unique(raw_documents['ElemDespesaTCE'])
index = 7
mask = (raw_documents['ElemDespesaTCE'] == elems[index]).values
X_grouped = raw_documents.iloc[mask]

In [None]:
samples = X_grouped.astype(str).agg(', '.join, axis=1) # adicionar uma virgula para cada atributo

In [6]:
samples.iloc[0]

'TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 0.0'

In [7]:
print(len(samples))

4157


In [8]:
type(samples)

pandas.core.series.Series

In [5]:
# # Load a pre-trained transformer model for embeddings
model_name = config['embedding_model']
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [6]:
def create_embeddings(samples, batch_size=64):
    all_embeddings = []

    for i in range(0, len(samples), batch_size):
        batch = samples[i:i+batch_size].tolist()
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling
            all_embeddings.append(embeddings)

    return torch.cat(all_embeddings, dim=0).numpy()

In [11]:
embeddings = create_embeddings(samples)

### Initializing Chromadb and collection

In [12]:
persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("index_7_collection")

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="index_7_collection",
    embedding_function=model,
    persist_directory=config['vector_store_dir']
)

In [13]:
documents = [
    Document(
        page_content=row,
        # metadata={
        #     'Cluster': cluster[_]
        # }
    )
    for _, row in enumerate(samples)
]

assert len(documents) == len(samples), "Mismatch between documents and embeddings!"

In [14]:
BATCH_SIZE = 5461
for i in range(0, len(documents), BATCH_SIZE):
    if i + BATCH_SIZE > len(documents):
        batch_docs = documents[i:]
    else:
        batch_docs = documents[i:i + BATCH_SIZE]
    collection.add(
        documents=[doc.page_content for doc in batch_docs],
        # metadatas=[doc.metadata for doc in batch_docs],
        embeddings=embeddings[i:i + BATCH_SIZE].tolist(),
        ids=[f"doc_{j}" for j in range(i, i + len(batch_docs))]
    )

In [15]:
collection

Collection(name=index_7_collection)

In [16]:
samples.iloc[0]

'TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 0.0'

### Query

In [17]:
query_str = "TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 0.0"

embed_query = create_embeddings(pd.Series(query_str))[0]

In [18]:
type(embed_query)

numpy.ndarray

In [19]:
results = vector_store_from_client.similarity_search_by_vector(
    embedding=embed_query,
    k=2,
    # filter={"Cluster": "5"},
)
print(results)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

[Document(id='doc_0', metadata={}, page_content='TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 0.0'), Document(id='doc_6', metadata={}, page_content='MM 0529 2018   TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE  DO MES DE JULHO 2018  PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 45545.65')]
* TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 0.0 [{}]
* MM 0529 2018   TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE  DO MES DE JULHO 2018  PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 45545.65 [{}]


### Inspecting DB

In [20]:
collection = persistent_client.get_collection("index_7_collection")


In [21]:
results = collection.get(include=['documents'], limit=1)
print(results['documents'])

['TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 0.0']


In [22]:
results = collection.get(include=["metadatas", "embeddings"], limit=1)
print(results["metadatas"])
print(type(results["embeddings"]))


[None]
<class 'numpy.ndarray'>


### Accessing it from the config['vector_store_dir']

Restart kernel of notebook and skip all previous steps

In [7]:
from chromadb import PersistentClient
from langchain_community.vectorstores import Chroma

# Define where your DB was saved
persist_dir = 'chroma/'

# Reconnect to the persisted DB
persistent_client = PersistentClient(path='chroma/')


# Reconnect to the vector store
vector_store = Chroma(
    client=persistent_client,
    collection_name="my_collection",
    embedding_function=model,
    persist_directory=persist_dir,
)


  vector_store = Chroma(


In [8]:
collection = persistent_client.get_collection("my_collection")
print(collection.count())


4157


In [21]:
query_str = "TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL"

embed_query = create_embeddings(pd.Series(query_str))[0]

In [10]:
results = vector_store.similarity_search_by_vector(
    embedding=embed_query,
    k=2,
    # filter={"Cluster": "5"},
)
print(results)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

[Document(metadata={}, page_content='PAGAMENTO DE DESPESA REFERENTE A FGTS DOS FUNCIONARIOS DA SECRETARIA DE SAUDE   MES OUTUBRO 2018   PROC 2018002796, PREFEITURA ANGRA DOS REIS, CONTRIBUICAO PARA O FGTS, CAIXA ECONOMICA FEDERAL'), Document(metadata={}, page_content='PAGAMENTO DE DESPESA REFERENTE A FGTS DOS FUNCIONARIOS DA SECRETARIA DE SAUDE   MES NOVEMBRO 2018   PROC 2018002796, PREFEITURA ANGRA DOS REIS, CONTRIBUICAO PARA O FGTS, CAIXA ECONOMICA FEDERAL')]
* PAGAMENTO DE DESPESA REFERENTE A FGTS DOS FUNCIONARIOS DA SECRETARIA DE SAUDE   MES OUTUBRO 2018   PROC 2018002796, PREFEITURA ANGRA DOS REIS, CONTRIBUICAO PARA O FGTS, CAIXA ECONOMICA FEDERAL [{}]
* PAGAMENTO DE DESPESA REFERENTE A FGTS DOS FUNCIONARIOS DA SECRETARIA DE SAUDE   MES NOVEMBRO 2018   PROC 2018002796, PREFEITURA ANGRA DOS REIS, CONTRIBUICAO PARA O FGTS, CAIXA ECONOMICA FEDERAL [{}]


In [18]:
string = results[0].page_content
metadata = results[0].metadata
parts = string.split(',')

In [17]:
parts

['PAGAMENTO DE DESPESA REFERENTE A FGTS DOS FUNCIONARIOS DA SECRETARIA DE SAUDE   MES OUTUBRO 2018   PROC 2018002796',
 ' PREFEITURA ANGRA DOS REIS',
 ' CONTRIBUICAO PARA O FGTS',
 ' CAIXA ECONOMICA FEDERAL']

### filtrar por similaridade manualmente

In [22]:
query_str = "TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT PREFEITURA ANGRA DOS REIS CONTRIBUICAO PARA O FGTS CAIXA ECONOMICA FEDERAL 0.0"

embed_query = create_embeddings(pd.Series(query_str))[0]

In [41]:
collection

Collection(name=my_collection)

In [23]:
results = collection.query(
    query_embeddings=[embed_query],
    n_results=1000,  # or total documents if small
    include=["distances", "documents", "metadatas"]
)


In [29]:
results['distances'][0] # Em L2 (Euclidean) distance, valores menores significam maior similaridade.

[1.3585524559020996,
 1.3799680471420288,
 1.4081311225891113,
 1.5090551376342773,
 1.5266366004943848,
 1.5677380561828613,
 1.6398979425430298,
 1.6398979425430298,
 1.6398979425430298,
 1.6398979425430298,
 1.7164099216461182,
 1.735884428024292,
 1.7554306983947754,
 1.812682867050171,
 1.907652497291565,
 1.9175355434417725,
 1.9175355434417725,
 1.9244868755340576,
 1.9807887077331543,
 2.007307291030884,
 2.007307291030884,
 2.007307291030884,
 2.007307291030884,
 2.050076484680176,
 2.050076484680176,
 2.104020595550537,
 2.125044822692871,
 2.1446144580841064,
 2.1565585136413574,
 2.205500602722168,
 2.2405292987823486,
 2.2475247383117676,
 2.2583467960357666,
 2.2776591777801514,
 2.3021745681762695,
 2.3024208545684814,
 2.3048248291015625,
 2.3084044456481934,
 2.3318092823028564,
 2.339801788330078,
 2.339801788330078,
 2.339801788330078,
 2.339801788330078,
 2.339801788330078,
 2.3428969383239746,
 2.3485217094421387,
 2.349231004714966,
 2.3499574661254883,
 2.3518998

In [30]:
threshold = 1.5

filtered = [
    {
        "document": doc,
        "metadata": meta,
        "distance": dist
    }
    for doc, meta, dist in zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0]
    )
    if dist <= threshold
]

In [43]:
filtered

[{'document': 'PAGAMENTO DE DESPESA REFERENTE A FGTS DOS FUNCIONARIOS DA SECRETARIA DE SAUDE   MES OUTUBRO 2018   PROC 2018002796, PREFEITURA ANGRA DOS REIS, CONTRIBUICAO PARA O FGTS, CAIXA ECONOMICA FEDERAL',
  'metadata': None,
  'distance': 1.3585524559020996},
 {'document': 'PAGAMENTO DE DESPESA REFERENTE A FGTS DOS FUNCIONARIOS DA SECRETARIA DE SAUDE   MES NOVEMBRO 2018   PROC 2018002796, PREFEITURA ANGRA DOS REIS, CONTRIBUICAO PARA O FGTS, CAIXA ECONOMICA FEDERAL',
  'metadata': None,
  'distance': 1.3799680471420288},
 {'document': 'TRATA SE DE DESPESA COM PAGAMENTO DE FGTS DOS SERVIDORES DA SAUDE NO MES DE JANEIRO DE 2018  RAT, PREFEITURA ANGRA DOS REIS, CONTRIBUICAO PARA O FGTS, CAIXA ECONOMICA FEDERAL',
  'metadata': None,
  'distance': 1.4081311225891113}]

In [40]:
filtered[0]['metadata']