In [38]:
import pandas as pd
import numpy as np
import os
import json
import random
from tqdm import tqdm

In [39]:
from src.f_utils.embedding_utils import load_embeddings

In [40]:
# Read the label CSV (mimic-cxr-2.0.0-chexpert.csv) into a DataFrame. Later cells
# treat every column other than subject_id / study_id as a label column.
df_labels = pd.read_csv("../dados/mimic/mimic-cxr-2.0.0-chexpert.csv")
# Preview the first rows as this cell's displayed output.
df_labels.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [41]:
import faiss

# Load the prebuilt FAISS index from disk; it is the search structure passed to
# search_relevant_cases in the evaluation loop.
index = faiss.read_index('artifacts/vector_store/faiss_img_aux.index')

# Study identifiers passed to search_relevant_cases together with the index.
# The recorded output reports shape (227835,).
# NOTE(review): presumably row i of the index corresponds to ids[i] — confirm.
ids = load_embeddings('artifacts/embeddings/study_ids.npy')

✅ Embeddings carregados com sucesso!
📊 Formato dos dados: <class 'numpy.ndarray'>
📊 Shape: (227835,)


In [42]:
from src.f_utils.evaluation import *
from src.f_utils.rag_search import search_relevant_cases
import torch

In [43]:
# Collapse the label columns to {0, 1}: only an explicit 1 stays positive, every
# other value (missing, 0, -1) becomes 0. The identifier columns are left alone.
df_labels.fillna(0, inplace=True)
binary_cols = [name for name in df_labels.columns if name not in ('subject_id', 'study_id')]
for label_name in binary_cols:
    is_positive = df_labels[label_name] == 1
    df_labels[label_name] = is_positive.astype(int)

In [44]:
from src.f_utils.mimic_labels import _get_gabarito_any, _get_gabarito

In [45]:
from src.f_utils.embedding_utils import load_embeddings, extract_embedding_single_study, extract_embeddings_from_img, extract_embeddings_from_text, _extract_findings

In [46]:
# Per-image embedding records (recorded output: shape (377110,)).
# build_study_dataset reads each entry through the keys 'study_id', 'image_name',
# 'embedding_image' and 'patient_id'.
emb_per_image = load_embeddings("artifacts/img_embeddings/embeddings_per_image.npy")

✅ Embeddings carregados com sucesso!
📊 Formato dos dados: <class 'numpy.ndarray'>
📊 Shape: (377110,)


In [47]:
# Image metadata array (recorded output: shape (377110, 3)).
# NOTE(review): no other visible cell reads `metadata`; the meaning of its three
# columns is not shown here — confirm before relying on it.
metadata = load_embeddings("artifacts/img_embeddings/image_metadata.npy")

✅ Embeddings carregados com sucesso!
📊 Formato dos dados: <class 'numpy.ndarray'>
📊 Shape: (377110, 3)


In [48]:
def build_study_dataset(
        emb_per_image, 
        study_ids_path="artifacts/embeddings/study_ids.npy", 
        patient_ids_path="artifacts/embeddings/patient_ids.npy",
        text_emb_path="artifacts/embeddings/e_text.npy",
        labels_path="../dados/mimic/mimic-cxr-2.0.0-chexpert.csv"
    ):
    """
    Group per-image embeddings by study and attach text embeddings and labels.

    The study-id, patient-id and text-embedding files are used as index-aligned
    arrays: position ``i`` in each of them is read for the same study.

    Args:
        emb_per_image: iterable of dicts, one per image, read through the keys
            ``study_id``, ``image_name`` and ``embedding_image``.
        study_ids_path: path of the file with the study IDs.
        patient_ids_path: path of the file with the patient IDs.
        text_emb_path: path of the file with the text embeddings.
        labels_path: path of the CSV with the CheXpert labels.

    Returns:
        List of dicts, one per study present in both ``emb_per_image`` and the
        study-id file, in order of first appearance in ``emb_per_image``. Each
        dict has the keys ``study_index``, ``study_id``, ``patient_id``,
        ``files_images``, ``embedding_images``, ``file_text``,
        ``embedding_text`` and ``study_labels`` (names of the label columns
        whose value is exactly 1; empty when the study has no row in the CSV).
    """
    from collections import defaultdict

    # Arrays aligned by index.
    study_ids = load_embeddings(study_ids_path)
    patient_ids = load_embeddings(patient_ids_path)
    text_embeddings = load_embeddings(text_emb_path)

    # Label table; every column except the two identifiers is a label.
    df_labels = pd.read_csv(labels_path)
    label_columns = [col for col in df_labels.columns if col not in ['subject_id', 'study_id']]

    # study_id -> names of the positive labels. One vectorised comparison
    # replaces a per-row iterrows() walk; as before, a later row for the same
    # study_id overwrites an earlier one.
    positive_mask = df_labels[label_columns].eq(1.0).to_numpy()
    study_labels_dict = {}
    for raw_study_id, row_mask in zip(df_labels['study_id'].tolist(), positive_mask):
        study_key = 's' + str(int(raw_study_id))
        study_labels_dict[study_key] = [
            col for col, is_positive in zip(label_columns, row_mask) if is_positive
        ]

    # study_id -> position in the index-aligned arrays.
    study_id_to_index = {sid: idx for idx, sid in enumerate(study_ids)}

    # Collect the image files and embeddings of each study.
    studies_dict = defaultdict(lambda: {
        'files_images': [],
        'embedding_images': []
    })
    for img_data in emb_per_image:
        grouped = studies_dict[img_data['study_id']]
        grouped['files_images'].append(img_data['image_name'])
        grouped['embedding_images'].append(img_data['embedding_image'])

    # Assemble the final per-study records.
    result = []
    for study_id, data in studies_dict.items():
        if study_id not in study_id_to_index:
            # Images whose study is absent from the study-id file are skipped.
            print(f"⚠️ Study ID {study_id} não encontrado nos arquivos de embeddings")
            continue

        study_idx = study_id_to_index[study_id]

        result.append({
            "study_index": study_idx,
            "study_id": study_id,
            "patient_id": patient_ids[study_idx],
            "files_images": data['files_images'],
            "embedding_images": data['embedding_images'],
            "file_text": f"{study_id}.txt",
            "embedding_text": text_embeddings[study_idx],
            "study_labels": study_labels_dict.get(study_id, [])  # empty list when unlabeled
        })

    print(f"✅ {len(result)} estudos processados")
    return result

In [49]:
# Build one record per study from the per-image embeddings, using the default
# artifact paths of build_study_dataset.
studies_dataset = build_study_dataset(emb_per_image)
print(f"\n\nTotal de estudos: {len(studies_dataset)}")
# Prints the whole first record, embedding arrays included (numpy abbreviates them).
print(f"Exemplo: {studies_dataset[0]}")

✅ Embeddings carregados com sucesso!
📊 Formato dos dados: <class 'numpy.ndarray'>
📊 Shape: (227835,)
✅ Embeddings carregados com sucesso!
📊 Formato dos dados: <class 'numpy.ndarray'>
📊 Shape: (227835,)
✅ Embeddings carregados com sucesso!
📊 Formato dos dados: <class 'numpy.ndarray'>
📊 Shape: (227835, 1152)
✅ 227835 estudos processados


Total de estudos: 227835
Exemplo: {'study_index': 0, 'study_id': 's50414267', 'patient_id': 'p10000032', 'files_images': ['02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg', '174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'], 'embedding_images': [array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
        0.06212181,  0.01484995], dtype=float32), array([-0.03161281,  0.03163843,  0.00890976, ..., -0.01544448,
        0.05352064,  0.01552592], dtype=float32)], 'file_text': 's50414267.txt', 'embedding_text': array([-0.01413504,  0.01464215, -0.02659141, ..., -0.03464624,
       -0.07638288, -0.01550421], dtype=float3

In [50]:
## AQUI PODEMOS TESTAR VARIOS ALPHAS
def calculate_study_embedding(emb_text, emb_images, alpha=0.5):
    """
    Fuse a text embedding with the mean-pooled image embeddings of one study.

    The image embeddings are averaged and L2-normalised, blended with the text
    embedding as ``alpha * text + (1 - alpha) * pooled_images``, and the blend
    is L2-normalised again.

    Args:
        emb_text: text embedding, a numpy array or torch tensor of shape [D].
        emb_images: non-empty sequence of image embeddings of shape [D]; numpy
            arrays and torch tensors may be mixed.
        alpha: weight of the text embedding; ``1 - alpha`` weights the images.

    Returns:
        numpy array of shape [D] holding the normalised study embedding.

    Raises:
        ValueError: if ``emb_images`` is empty.
    """
    if len(emb_images) == 0:
        raise ValueError("emb_images must contain at least one image embedding")

    # as_tensor passes tensors through and wraps numpy arrays without copying,
    # so each element is converted on its own instead of trusting element 0.
    emb_text = torch.as_tensor(emb_text)
    image_stack = torch.stack([torch.as_tensor(img) for img in emb_images])  # [N, D]

    # Mean pooling across the images, then unit length. normalize() divides by
    # max(norm, eps), so an all-zero vector stays zero instead of turning into NaN.
    emb_pool = torch.nn.functional.normalize(image_stack.mean(dim=0), dim=-1)

    # Weighted blend of the two modalities, renormalised to unit length.
    e_study = alpha * emb_text + (1 - alpha) * emb_pool
    e_study = torch.nn.functional.normalize(e_study, dim=-1)

    return e_study.detach().cpu().numpy()  # numpy, as FAISS expects

In [53]:
# For every study, retrieve its top-k neighbours from the FAISS index and score
# the retrieved label sets against the study's own labels.
k = 5

results = []  # one metrics dict per query study
y_true = []   # query label vector, repeated once per retrieved neighbour
y_pred = []   # label vector of the matching retrieved neighbour
binary_cols = [col for col in df_labels.columns if col not in ['subject_id', 'study_id']]

# study_id -> label vector, built once so the loop does dictionary lookups
# instead of a full boolean scan of df_labels for every (query, neighbour) pair.
# setdefault keeps the first row of a repeated study_id, matching the earlier
# `.loc[...].values.tolist()[0]` lookup.
label_vectors = {}
for labelled_id, label_row in zip(df_labels['study_id'].tolist(),
                                  df_labels[binary_cols].values.tolist()):
    label_vectors.setdefault(labelled_id, label_row)


def _label_vector(study_id):
    """Return the label vector for an 's<digits>' study id, or None if it has none."""
    try:
        return label_vectors.get(int(study_id.replace('s', '')))
    except (AttributeError, TypeError, ValueError):
        return None


for study in tqdm(studies_dataset):

    # The query is the embedding of the study's first image only.
    # Original author's note: the search helper was adjusted to skip the first
    # hit (presumably the query itself) — not verifiable from this cell.
    study_vector = study['embedding_images'][0]

    # Top-k most similar entries of the index.
    estudos, idxs = search_relevant_cases(study_vector, index, ids, k)

    # Answer key handed to the metric function; only its row count is used here.
    # Real failures still fall back to the whole table, as before, but
    # KeyboardInterrupt is no longer swallowed.
    try:
        gabarito = _get_gabarito(int(study['study_id'].replace('s', '')), df_labels)
    except Exception:
        gabarito = df_labels

    gabarito_list_set = np.repeat(0, gabarito.shape[0])

    # Labels of the retrieved studies.
    # NOTE(review): this indexes studies_dataset with the positions returned by
    # the search, i.e. it assumes studies_dataset[i]['study_index'] == i — confirm.
    study_top_k = [
        {
            # This key was previously spelled 'study_id' twice, so the index was lost.
            'study_index': studies_dataset[i]['study_index'],
            'study_id': studies_dataset[i]['study_id'],
            'labels': studies_dataset[i]['study_labels']
        }
        for i in idxs
    ]

    ground_truth_labels = set(study['study_labels'])
    list_set_top_k = [set(s['labels']) for s in study_top_k]

    metrics = evaluate_single_query(ground_truth_labels, list_set_top_k, gabarito_list_set, k)
    results.append(metrics)

    # One (y_true, y_pred) row per neighbour; pairs with a missing label row on
    # either side are skipped, as before. The query lookup is loop-invariant.
    query_labels = _label_vector(study['study_id'])
    if query_labels is not None:
        for s in study_top_k:
            neighbour_labels = _label_vector(s['study_id'])
            if neighbour_labels is None:
                continue
            y_true.append(list(query_labels))
            y_pred.append(list(neighbour_labels))

# Final mean of every metric over all queries (empty when nothing was evaluated).
mean_metrics = {
    m: np.mean([res[m] for res in results])
    for m in results[0]
} if results else {}

100%|██████████| 227835/227835 [2:12:04<00:00, 28.75it/s]  


In [54]:
# Display the per-metric means computed over all query studies.
mean_metrics

{'precision@k': 0.4222213443939692,
 'recall@k': 0.02084429653845491,
 'jaccard_1@k': 0.14184826738648582,
 'jaccard@k': 0.23704875518918578,
 'ndcg@k': 0.6011642801744962}

In [55]:
from sklearn.metrics import classification_report
# Per-label precision/recall over all (query, neighbour) pairs: the query's label
# vector is the reference and the retrieved study's label vector the prediction.
report_text = classification_report(y_true, y_pred, target_names=binary_cols)
print(report_text)

                            precision    recall  f1-score   support

               Atelectasis       0.25      0.25      0.25    229037
              Cardiomegaly       0.25      0.24      0.24    224223
             Consolidation       0.07      0.07      0.07     53885
                     Edema       0.18      0.17      0.18    135086
Enlarged Cardiomediastinum       0.05      0.05      0.05     35894
                  Fracture       0.03      0.03      0.03     21949
               Lung Lesion       0.04      0.04      0.04     31419
              Lung Opacity       0.25      0.25      0.25    257619
                No Finding       0.42      0.44      0.43    377260
          Pleural Effusion       0.31      0.30      0.31    271494
             Pleural Other       0.02      0.02      0.02     10055
                 Pneumonia       0.08      0.08      0.08     82776
              Pneumothorax       0.07      0.07      0.07     51789
           Support Devices       0.45      0.45

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [36]:
import os

# Persist the retrieval metrics and the paired label vectors of this run.
#
# The directory that gets created has to be the one that is written to. The
# earlier version created "artifacts/resultados_aux" while saving under
# "artifacts/resultados", which fails when the latter does not exist yet.
# NOTE(review): the file locations are kept as they were, because later cells
# load from "artifacts/resultados". If "artifacts/resultados_aux" was the intended
# target, change RESULTS_DIR — these saves overwrite any earlier run stored here.
RESULTS_DIR = "artifacts/resultados"
os.makedirs(RESULTS_DIR, exist_ok=True)

# `results` is a list of dicts, so numpy stores it as a pickled object array.
np.save(f"{RESULTS_DIR}/results_img.npy", results)
np.save(f"{RESULTS_DIR}/y_pred_img.npy", y_pred)
np.save(f"{RESULTS_DIR}/y_true_img.npy", y_true)

In [57]:
# Reload the persisted per-query metrics.
# allow_pickle=True is needed because the file holds an object array of dicts;
# only load files produced by this project, since unpickling runs arbitrary code.
results_all = np.load("artifacts/resultados/results_img.npy", allow_pickle = True)

# Mean of every metric. The metric names come from the loaded data itself, so
# this cell no longer depends on the in-memory `results` of the long loop.
mean_metrics_all = {
    m: np.mean([res[m] for res in results_all])
    for m in results_all[0]
}

mean_metrics_all

{'precision@k': 0.4238119691882282,
 'recall@k': 0.020452094793698875,
 'jaccard_1@k': 0.14752167138499353,
 'jaccard@k': 0.24165068411226814,
 'ndcg@k': 0.5956861368174092}

In [56]:
from sklearn.metrics import classification_report
# Reload the persisted label pairs and print the per-label report for them.
# The two files are read in the opposite order, which does not affect the result.
y_pred_all = np.load("artifacts/resultados/y_pred_img.npy", allow_pickle = True)
y_true_all = np.load("artifacts/resultados/y_true_img.npy", allow_pickle = True)
report_all_text = classification_report(y_true_all, y_pred_all, target_names=binary_cols)
print(report_all_text)

                            precision    recall  f1-score   support

               Atelectasis       0.25      0.24      0.24    229036
              Cardiomegaly       0.25      0.23      0.24    224222
             Consolidation       0.06      0.06      0.06     53886
                     Edema       0.18      0.17      0.17    135087
Enlarged Cardiomediastinum       0.04      0.04      0.04     35894
                  Fracture       0.02      0.02      0.02     21950
               Lung Lesion       0.03      0.03      0.03     31419
              Lung Opacity       0.25      0.24      0.25    257614
                No Finding       0.43      0.46      0.44    377262
          Pleural Effusion       0.32      0.29      0.30    271492
             Pleural Other       0.01      0.01      0.01     10054
                 Pneumonia       0.08      0.07      0.08     82773
              Pneumothorax       0.07      0.07      0.07     51789
           Support Devices       0.48      0.44

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [37]:
# Count the queries whose score is strictly positive under each Jaccard variant.
# How the two metrics differ is defined in evaluate_single_query, not in this
# cell; the printed Portuguese messages give the author's reading of them.
at_least_one = sum(1 for query_metrics in results if query_metrics['jaccard_1@k'] > 0)
one_with_label = sum(1 for query_metrics in results if query_metrics['jaccard@k'] > 0)

print(f'Estudos em que pelo menos um retorno tinha exatamente os mesmo labels: {at_least_one}/{len(results)}')
print(f'Estudos em que pelo menos um retorno tinha pelo menos um label igual: {one_with_label}/{len(results)}')

Estudos em que pelo menos um retorno tinha exatamente os mesmo labels: 69774/227835
Estudos em que pelo menos um retorno tinha pelo menos um label igual: 181421/227835
