In [1]:
import pandas as pd
import numpy as np
import os
import json
import random
from tqdm import tqdm

In [2]:
from src.f_utils.embedding_utils import load_embeddings

# Dados

## dataset

In [3]:
val_dataset = load_embeddings('artifacts/datasets/balanced_validation_dataset.npy')

‚úÖ Embeddings carregados com sucesso!
üìä Formato dos dados: <class 'numpy.ndarray'>
üìä Shape: (2279,)


In [4]:
val_dataset[0]

{'study_index': 0,
 'study_id': 's50414267',
 'patient_id': 'p10000032',
 'files_images': ['02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
  '174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'],
 'embedding_images': [array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
          0.06212181,  0.01484995], dtype=float32),
  array([-0.03161281,  0.03163843,  0.00890976, ..., -0.01544448,
          0.05352064,  0.01552592], dtype=float32)],
 'file_text': 's50414267.txt',
 'embedding_text': array([-0.01413504,  0.01464215, -0.02659141, ..., -0.03464624,
        -0.07638288, -0.01550421], dtype=float32),
 'study_labels': ['No Finding']}

## labels

In [5]:
df_labels = pd.read_csv("../dados/mimic/mimic-cxr-2.0.0-chexpert.csv")
df_labels.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [6]:
val_study_ids = [study['study_id'] for study in val_dataset]
# extrair 's' de 'study_id'
val_study_ids = [int(study_id[1:]) for study_id in val_study_ids]

# filtrar df_labels por val_study_ids
df_labels_val = df_labels[df_labels['study_id'].isin(val_study_ids)]
df_labels_val.shape

(2279, 16)

## Vector Store

In [7]:
import faiss

# Load the FAISS index
index = faiss.read_index('artifacts/vector_store/faiss_img.index')

# Get the study indices from validation dataset
val_study_indices = [item['study_index'] for item in val_dataset]

# Create a new index with only the validation data
# First, we need to get the vectors corresponding to the validation indices
val_vectors = []
for study_idx in val_study_indices:
    # Reconstruct the vector from the original index
    vector = index.reconstruct(study_idx)
    val_vectors.append(vector)

# Convert to numpy array
val_vectors = np.array(val_vectors)

# Create new FAISS index with filtered data
filtered_index = faiss.IndexFlatIP(val_vectors.shape[1])  # or IndexFlatL2 depending on your original index
filtered_index.add(val_vectors)

# Save the filtered index
# faiss.write_index(filtered_index, 'artifacts/vector_store/faiss_validation_filtered.index')

In [8]:
filtered_index.ntotal

2279

## IDs

In [9]:
ids = load_embeddings('artifacts/embeddings/study_ids.npy')

‚úÖ Embeddings carregados com sucesso!
üìä Formato dos dados: <class 'numpy.ndarray'>
üìä Shape: (227835,)


In [10]:
ids[0]

's50414267'

# Avalia√ß√£o

O que queremos ver?  
  
1) jaccard indice: interseccao labels / uniao labels do estudo de entrada vs. estudo recuperado
    - proporcao de estudos recuperados em que jaccard = 1 (deu match em todas as labels)
    - m√©dia do indice de jaccard
2) recall/f1 sei l√°:
    - cria df_gabarito: todos os casos relevantes - em que todos os labels batem ou em que pelo menos 1 bate??
        - df_gabarito √© o graund truth (GT) = casos relevantes
        - casos retornados pelo rag = RC (rag cases) = predicted
    - dos casos retornados pelo sistema rag quantos estao em df_gabarito?
        - precision: casos relevantes retornados/casos retornados
            - (RC intersecao com GT) / (all RC)
        - recall: casos relevantes retornados/casos relevantes
             - (RC intersecao com GT) / (all GT)
3) m√©tricas por classe?


Multiclasse:  
Os valores de precis√£o e recall tamb√©m podem ser calculados para problemas de classifica√ß√£o com mais de duas classes. Para obter a **precis√£o** para uma determinada classe, *dividimos o n√∫mero de verdadeiros positivos pelo vi√©s do classificador em rela√ß√£o a essa classe (n√∫mero de vezes que o classificador previu a classe)*. Para calcular o **recall** para uma determinada classe, *dividimos o n√∫mero de verdadeiros positivos pela preval√™ncia dessa classe (n√∫mero de vezes que a classe ocorre na amostra de dados)*.

eu pensei em definir as metricas como:
- `recall@k`: casos relevantes retornados nos top k/casos relevantes
- `precision@k`: casos relevantes retornados/casos retornados
- `jaccard_1@k`: propor√ßao dos top k casos em que jaccard=1
- `jaccard@k`: media dos jaccard pros top k - qu√£o similares realmente s√£o os casos retornados
- `NDGC@k`: Normalized Discounted Cumulative Gain (Ganho Cumulativo Descontado Normalizado) - avaliar o qu√£o bem um sistema classifica os itens por relev√¢ncia para um usu√°rio

In [11]:
from src.f_utils.evaluation import *
from src.f_utils.rag_search import search_relevant_cases
import torch

In [12]:
val_dataset[0]

{'study_index': 0,
 'study_id': 's50414267',
 'patient_id': 'p10000032',
 'files_images': ['02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
  '174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'],
 'embedding_images': [array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
          0.06212181,  0.01484995], dtype=float32),
  array([-0.03161281,  0.03163843,  0.00890976, ..., -0.01544448,
          0.05352064,  0.01552592], dtype=float32)],
 'file_text': 's50414267.txt',
 'embedding_text': array([-0.01413504,  0.01464215, -0.02659141, ..., -0.03464624,
        -0.07638288, -0.01550421], dtype=float32),
 'study_labels': ['No Finding']}

In [13]:
filtered_index

<faiss.swigfaiss.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x7b0cf87d37b0> >

## teste caso √∫nico

In [15]:
study

{'study_index': 0,
 'study_id': 's50414267',
 'patient_id': 'p10000032',
 'files_images': ['02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
  '174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'],
 'embedding_images': [array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
          0.06212181,  0.01484995], dtype=float32),
  array([-0.03161281,  0.03163843,  0.00890976, ..., -0.01544448,
          0.05352064,  0.01552592], dtype=float32)],
 'file_text': 's50414267.txt',
 'embedding_text': array([-0.01413504,  0.01464215, -0.02659141, ..., -0.03464624,
        -0.07638288, -0.01550421], dtype=float32),
 'study_labels': ['No Finding']}

In [17]:
# para cada item de val_dataset, calcular os top k
# k=2

study = val_dataset[0]

# Get the study index from the original index
study_index = study['study_index']
print(study_index)

# Get the study vector from the original index
study_vector = study['embedding_images'] # ajustei a fun√ß√£o para pular o primeiro caso

emb_images = [torch.from_numpy(img) for img in study_vector]
# Stacking para [N, D]
emb_images = torch.stack(emb_images)  # [num_imagens, embedding_dim]

# Pooling (m√©dia) ao longo das imagens
emb_pool = emb_images.mean(dim=0)  # [embedding_dim]
emb_pool = emb_pool / emb_pool.norm(dim=-1, keepdim=True)
print(emb_pool)

# Search the filtered index for the top k most similar vectors
k = 5
estudos, idxs = search_relevant_cases(emb_pool, filtered_index, ids, k)
print(estudos) # study_ids
print(idxs)    # study_indices

# Get the labels of the top k most similar vectors
study_top_k = []
for i in idxs:
    study_top_k.append({
        'study_id': val_dataset[i]['study_index'],
        'study_id': val_dataset[i]['study_id'],
        'labels': val_dataset[i]['study_labels']
    })
print(study_top_k)

list_set_top_k = [(set(study['labels'])) for study in study_top_k]
print(list_set_top_k)

# Get the ground truth labels for the study
ground_truth_labels = set(study['study_labels'])
print(ground_truth_labels)

# Create a list of sets with all the labels from all the studys with at least one similar label
from src.f_utils.mimic_labels import _get_gabarito_any

gabarito = _get_gabarito_any(int(study['study_id'].replace('s', '')), df_labels_val)

# Criar mapeamento study_id -> labels
binary_cols = [col for col in df_labels_val.columns if col not in ['subject_id', 'study_id']]
gabarito_list_set = []
for _, row in gabarito.iterrows():
    # Pegar labels onde o valor √© 1
    labels = [col for col in binary_cols if row[col] == 1.0]
    gabarito_list_set.append(set(labels))

# Evalueate the results
evaluate_single_query(ground_truth_labels, list_set_top_k, gabarito_list_set, k)

0
tensor([-0.0283,  0.0335,  0.0084,  ..., -0.0148,  0.0584,  0.0153])
['s55135339', 's53583135', 's58230749', 's58051413', 's50867638']
[1351 1370  384 1872  803]
[{'study_id': 's54771176', 'labels': ['No Finding']}, {'study_id': 's51620571', 'labels': ['No Finding']}, {'study_id': 's54440894', 'labels': ['No Finding']}, {'study_id': 's59406568', 'labels': ['No Finding']}, {'study_id': 's55538863', 'labels': ['No Finding']}]
[{'No Finding'}, {'No Finding'}, {'No Finding'}, {'No Finding'}, {'No Finding'}]
{'No Finding'}


{'precision@k': 1.0,
 'recall@k': 0.006622516556291391,
 'jaccard_1@k': 1.0,
 'jaccard@k': 1.0,
 'ndcg@k': 1.0}

## para todo o val_dataset

In [18]:
# para cada item de val_dataset, calcular os top k
# k=2

k=5

# queries = [] # lista de sets de labels para cada estudo de consulta
# retrived = [] # lista de listas de sets (resultados por query) retrieved[i][j] = labels do resultado j da query i

results = [] # Salva todos os resultados nessa lista

for study in tqdm(val_dataset):

    # Get the embedding vector for the study
    study_vector = study['embedding_images'] # ajustei a fun√ß√£o para pular o primeiro caso
    
    # Get the study vector from the original index
    study_vector = study['embedding_images'] # ajustei a fun√ß√£o para pular o primeiro caso

    emb_images = [torch.from_numpy(img) for img in study_vector]
    # Stacking para [N, D]
    emb_images = torch.stack(emb_images)  # [num_imagens, embedding_dim]

    # Pooling (m√©dia) ao longo das imagens
    emb_pool = emb_images.mean(dim=0)  # [embedding_dim]
    emb_pool = emb_pool / emb_pool.norm(dim=-1, keepdim=True)

    # Search the filtered index for the top k most similar vectors
    estudos, idxs = search_relevant_cases(emb_pool, filtered_index, ids, k)

    # Cria o gabarito para mandar pra fun√ß√£o
    gabarito = _get_gabarito_any(int(study['study_id'].replace('s', '')), df_labels_val)

    # Criar mapeamento study_id -> labels
    binary_cols = [col for col in df_labels_val.columns if col not in ['subject_id', 'study_id']]
    gabarito_list_set = []
    for _, row in gabarito.iterrows():
        # Pegar labels onde o valor √© 1
        labels = [col for col in binary_cols if row[col] == 1.0]
        gabarito_list_set.append(set(labels))

    
    # Get the labels of the top k most similar vectors
    study_top_k = []
    for i in idxs:
        study_top_k.append({
            'study_id': val_dataset[i]['study_index'],
            'study_id': val_dataset[i]['study_id'],
            'labels': val_dataset[i]['study_labels']
        })

    ground_truth_labels = set(study['study_labels'])
    list_set_top_k = [(set(s['labels'])) for s in study_top_k]

    metrics = evaluate_single_query(ground_truth_labels, list_set_top_k, gabarito_list_set, k)

    results.append(metrics)

# M√©dia final das m√©tricas
mean_metrics = {
    m: np.mean([res[m] for res in results])
    for m in results[0]
}

# results,mean_metrics = evaluate_dataset(queries, retrived, k=k) # avaliar o dataset inteiro

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2279/2279 [02:43<00:00, 13.90it/s]


In [19]:
mean_metrics

{'precision@k': 0.35410267661254935,
 'recall@k': 0.0021742397563200857,
 'jaccard_1@k': 0.27283896445809563,
 'jaccard@k': 0.3062467874380994,
 'ndcg@k': 0.41437501059103277}

In [20]:
results[0]

{'precision@k': 1.0,
 'recall@k': 0.006622516556291391,
 'jaccard_1@k': 1.0,
 'jaccard@k': 1.0,
 'ndcg@k': 1.0}