# Imports

In [2]:
import pandas as pd
import numpy as np

# Funções

In [3]:
from src.f_utils.embedding_utils import load_embeddings, extract_embedding_single_study, extract_embeddings_from_img, extract_embeddings_from_text, _extract_findings

In [4]:
# Load the per-image embeddings artifact through the project helper `load_embeddings`.
emb_per_image = load_embeddings("artifacts/img_embeddings/embeddings_per_image.npy")

‚úÖ Embeddings carregados com sucesso!
üìä Formato dos dados: <class 'numpy.ndarray'>
üìä Shape: (377110,)


In [5]:
# Inspect the first record to see which fields each per-image entry carries.
emb_per_image[0]

{'patient_id': 'p10000032',
 'study_id': 's50414267',
 'image_name': '02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
 'embedding_image': array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
         0.06212181,  0.01484995], dtype=float32)}

In [6]:
# Load the image metadata artifact.
# NOTE(review): `metadata` is not referenced again in the visible cells — confirm it is still needed.
metadata = load_embeddings("artifacts/img_embeddings/image_metadata.npy")

‚úÖ Embeddings carregados com sucesso!
üìä Formato dos dados: <class 'numpy.ndarray'>
üìä Shape: (377110, 3)


In [7]:
# Inspect the first metadata row.
metadata[0]

array(['p10000032', 's50414267',
       '02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg'], dtype=object)

dict_final = {
    "study_index": índice do estudo,
    "study_id": id do estudo,
    "patient_id": id do paciente,
    "files_images": [lista nomes de arquivos das imagens],
    "embedding_images": [lista de embeddings das imagens para cada arquivo],
    "file_text": arquivo do laudo,
    "embedding_text": embedding do laudo,
    "study_labels": [lista de labels CheXpert positivas (valor 1) do estudo]
}

In [35]:
def build_study_dataset(
        emb_per_image,
        study_ids_path="artifacts/embeddings/study_ids.npy",
        patient_ids_path="artifacts/embeddings/patient_ids.npy",
        text_emb_path="artifacts/embeddings/e_text.npy",
        labels_path="../dados/mimic/mimic-cxr-2.0.0-chexpert.csv"
    ):
    """
    Group per-image embeddings by study and attach text embeddings and labels.

    Args:
        emb_per_image: iterable of dicts, one per image. Each dict is read through
            the keys 'study_id', 'image_name' and 'embedding_image'.
        study_ids_path: file with the study ids; position i in this file is
            treated as the index of that study.
        patient_ids_path: file with patient ids, indexed by the same study index.
        text_emb_path: file with text embeddings, indexed by the same study index.
        labels_path: CSV with a 'study_id' column, a 'subject_id' column and one
            column per label.

    Returns:
        A list with one dict per study, in order of first appearance in
        `emb_per_image`. Each dict has the keys 'study_index', 'study_id',
        'patient_id', 'files_images', 'embedding_images', 'file_text',
        'embedding_text' and 'study_labels'. 'study_labels' lists the label
        columns whose CSV value equals 1 for that study, and is empty when the
        study has no CSV row. Studies missing from `study_ids_path` are
        reported with a printed warning and left out.
    """
    # The three arrays are aligned by position: index i describes the same study in each.
    study_ids = load_embeddings(study_ids_path)
    patient_ids = load_embeddings(patient_ids_path)
    text_embeddings = load_embeddings(text_emb_path)

    df_labels = pd.read_csv(labels_path)

    # Every column other than the two identifiers is a label column.
    label_columns = [col for col in df_labels.columns if col not in ('subject_id', 'study_id')]

    # Build study_id -> positive labels with one vectorized comparison instead of
    # iterating the frame row by row. Blank cells compare unequal to 1 and are
    # therefore not listed, as is any other value.
    positive_mask = (df_labels[label_columns] == 1.0).to_numpy()
    # Image records use ids of the form 's<integer>', so rebuild that key from the CSV.
    study_keys = ['s' + str(int(raw_id)) for raw_id in df_labels['study_id']]
    study_labels_dict = {
        key: [col for col, is_positive in zip(label_columns, row_mask) if is_positive]
        for key, row_mask in zip(study_keys, positive_mask)
    }

    # If a study id repeats, the last position wins.
    study_id_to_index = {sid: idx for idx, sid in enumerate(study_ids)}

    # Group image file names and embeddings by study, keeping first-seen order.
    # The patient id is taken from `patient_ids` below, so it is not copied here.
    studies_dict = {}
    for img_data in emb_per_image:
        group = studies_dict.get(img_data['study_id'])
        if group is None:
            group = {'files_images': [], 'embedding_images': []}
            studies_dict[img_data['study_id']] = group
        group['files_images'].append(img_data['image_name'])
        group['embedding_images'].append(img_data['embedding_image'])

    result = []
    for study_id, data in studies_dict.items():
        # Without an index there is no text embedding or patient id to attach.
        if study_id not in study_id_to_index:
            print(f"‚ö†Ô∏è Study ID {study_id} n√£o encontrado nos arquivos de embeddings")
            continue

        study_idx = study_id_to_index[study_id]
        result.append({
            "study_index": study_idx,
            "study_id": study_id,
            "patient_id": patient_ids[study_idx],
            "files_images": data['files_images'],
            "embedding_images": data['embedding_images'],
            "file_text": f"{study_id}.txt",
            "embedding_text": text_embeddings[study_idx],
            "study_labels": study_labels_dict.get(study_id, [])  # empty list when the study has no CSV row
        })

    print(f"‚úÖ {len(result)} estudos processados")
    return result

In [36]:
# Build the per-study dataset; every path argument is left at the function's default.
studies_dataset = build_study_dataset(emb_per_image)
print(f"\n\nTotal de estudos: {len(studies_dataset)}")
# NOTE: this prints the entire first record, embedding arrays included.
print(f"Exemplo: {studies_dataset[0]}")

‚úÖ Embeddings carregados com sucesso!
üìä Formato dos dados: <class 'numpy.ndarray'>
üìä Shape: (227835,)
‚úÖ Embeddings carregados com sucesso!
üìä Formato dos dados: <class 'numpy.ndarray'>
üìä Shape: (227835,)
‚úÖ Embeddings carregados com sucesso!
üìä Formato dos dados: <class 'numpy.ndarray'>
üìä Shape: (227835, 1152)
‚úÖ 227835 estudos processados


Total de estudos: 227835
Exemplo: {'study_index': 0, 'study_id': 's50414267', 'patient_id': 'p10000032', 'files_images': ['02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg', '174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'], 'embedding_images': [array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
        0.06212181,  0.01484995], dtype=float32), array([-0.03161281,  0.03163843,  0.00890976, ..., -0.01544448,
        0.05352064,  0.01552592], dtype=float32)], 'file_text': 's50414267.txt', 'embedding_text': array([-0.01413504,  0.01464215, -0.02659141, ..., -0.03464624,
       -0.07638288, -0.01550421], dtype=float3

In [37]:
# Show the first study record.
studies_dataset[0]

{'study_index': 0,
 'study_id': 's50414267',
 'patient_id': 'p10000032',
 'files_images': ['02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
  '174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'],
 'embedding_images': [array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
          0.06212181,  0.01484995], dtype=float32),
  array([-0.03161281,  0.03163843,  0.00890976, ..., -0.01544448,
          0.05352064,  0.01552592], dtype=float32)],
 'file_text': 's50414267.txt',
 'embedding_text': array([-0.01413504,  0.01464215, -0.02659141, ..., -0.03464624,
        -0.07638288, -0.01550421], dtype=float32),
 'study_labels': ['No Finding']}

In [11]:
import random

# Draw a simple random sample holding 1% of the studies to use for validation.
# The global RNG is seeded first so the same studies are drawn on every run.
random.seed(42)

sample_size = int(len(studies_dataset) * 0.01)
print(f"Tamanho da amostra de valida√ß√£o: {sample_size}")

# Sampling is without replacement, so no study appears twice.
validation_dataset = random.sample(studies_dataset, k=sample_size)

print(f"Dataset completo: {len(studies_dataset)} estudos")
print(f"Dataset de valida√ß√£o: {len(validation_dataset)} estudos")
print(f"Primeiro item da valida√ß√£o: {validation_dataset[0]['study_id']}")

Tamanho da amostra de valida√ß√£o: 2278
Dataset completo: 227835 estudos
Dataset de valida√ß√£o: 2278 estudos
Primeiro item da valida√ß√£o: s57812169


In [12]:
# Load the CheXpert label table and preview its first rows.
df_labels = pd.read_csv("../dados/mimic/mimic-cxr-2.0.0-chexpert.csv")
df_labels.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [13]:
# Number of rows and columns in the label table.
df_labels.shape

(227827, 16)

In [14]:
# Label columns are every column except the two identifiers. They are selected by
# name rather than by position, so a reordered or extended CSV cannot silently
# shift the slice; a missing identifier column raises KeyError instead.
labels = df_labels.columns.drop(['subject_id', 'study_id'])
len(labels)

14

In [15]:
# Contar quantos estudos t√™m cada label como positiva (valor 1)
# For each label, count the studies where it is marked positive (value 1).
label_counts = {label: (df_labels[label] == 1.0).sum() for label in labels}

# Tabulate the counts with their share of all studies, most frequent label first.
df_counts = pd.DataFrame(list(label_counts.items()), columns=['Label', 'Count'])
df_counts = (
    df_counts
    .assign(Percentage=(df_counts['Count'] / len(df_labels)) * 100)
    .sort_values('Count', ascending=False)
)

print("Contagem de labels positivas (valor = 1):")
print(df_counts)
print(f"\nTotal de estudos no dataset: {len(df_labels)}")

Contagem de labels positivas (valor = 1):
                         Label  Count  Percentage
8                   No Finding  75455   33.119428
13             Support Devices  66558   29.214272
9             Pleural Effusion  54300   23.833874
7                 Lung Opacity  51525   22.615844
0                  Atelectasis  45808   20.106484
1                 Cardiomegaly  44845   19.683795
3                        Edema  27018   11.858998
11                   Pneumonia  16556    7.266917
2                Consolidation  10778    4.730783
12                Pneumothorax  10358    4.546432
4   Enlarged Cardiomediastinum   7179    3.151075
6                  Lung Lesion   6284    2.758233
5                     Fracture   4390    1.926901
10               Pleural Other   2011    0.882687

Total de estudos no dataset: 227827


## Dataset de validação com estratificação multilabel (scikit-multilearn)

In [17]:
from skmultilearn.model_selection import iterative_train_test_split

In [18]:
# The study ids are the only "feature" handed to the splitter.
x = df_labels['study_id']

# Binarise all label columns at once: 1 where the cell equals 1, 0 for anything
# else (blank cells included), stored as int64.
y = (df_labels.drop(columns=['subject_id', 'study_id']) == 1).astype('int64')

In [23]:
# Convert both inputs to 2-D NumPy arrays for the split.
x_array = x.to_numpy().reshape(-1, 1)  # (n_samples, 1)
y_array = y.to_numpy()                 # (n_samples, n_labels)

In [24]:
# Seed NumPy's global RNG so the split, and the validation set saved from it, is
# the same on every Restart & Run All.
# NOTE(review): scikit-multilearn's iterative stratification appears to draw its
# random tie-breaks from np.random — confirm against the installed version.
np.random.seed(42)

# Hold out 1% of the studies, stratified on the label matrix.
# Note the return order: X_train, y_train, X_test, y_test.
X_train, y_train, X_test, y_test = iterative_train_test_split(x_array, y_array, test_size = 0.01)

In [26]:
# Report the size of each partition and the shape of the training label matrix.
print(f"Train set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Labels shape: {y_train.shape}")

Train set: 225548 samples
Test set: 2279 samples
Labels shape: (225548, 14)


In [28]:
# Keep only the label rows whose study_id ended up in the test partition.
df_labels_val = df_labels.loc[df_labels['study_id'].isin(X_test.ravel())]
df_labels_val.shape

(2279, 16)

In [29]:
# Compare how often each label is positive in the full label table and in the validation subset.
print("=== COMPARA√á√ÉO: Dataset Original vs Dataset de Valida√ß√£o ===\n")

original_counts = {}
val_counts = {}

# One pass per label fills both summaries: positive count and its percentage of the frame's rows.
for label in labels:
    for frame, summary in ((df_labels, original_counts), (df_labels_val, val_counts)):
        positives = (frame[label] == 1).sum()
        summary[label] = {'count': positives, 'percentage': (positives / len(frame)) * 100}

# One row per label; Diff_Pct is validation minus original, in percentage points.
comparison_data = [
    {
        'Label': label,
        'Original_Count': original_counts[label]['count'],
        'Original_Pct': original_counts[label]['percentage'],
        'Validation_Count': val_counts[label]['count'],
        'Validation_Pct': val_counts[label]['percentage'],
        'Diff_Pct': val_counts[label]['percentage'] - original_counts[label]['percentage']
    }
    for label in labels
]

df_comparison = pd.DataFrame(comparison_data)

print(f"Dataset Original: {len(df_labels):,} estudos")
print(f"Dataset Valida√ß√£o: {len(df_labels_val):,} estudos")
print(f"Propor√ß√£o da valida√ß√£o: {(len(df_labels_val)/len(df_labels))*100:.2f}%\n")

print("Distribui√ß√£o por Labels:")
print("-" * 80)
for _, row in df_comparison.iterrows():
    print(f"{row['Label']:25} | Original: {row['Original_Count']:6,} ({row['Original_Pct']:5.1f}%) | Valida√ß√£o: {row['Validation_Count']:4,} ({row['Validation_Pct']:5.1f}%) | Diff: {row['Diff_Pct']:+5.1f}%")

print(f"\n{'Total de labels positivas:':25} | Original: {df_comparison['Original_Count'].sum():6,} | Valida√ß√£o: {df_comparison['Validation_Count'].sum():4,}")

=== COMPARA√á√ÉO: Dataset Original vs Dataset de Valida√ß√£o ===

Dataset Original: 227,827 estudos
Dataset Valida√ß√£o: 2,279 estudos
Propor√ß√£o da valida√ß√£o: 1.00%

Distribui√ß√£o por Labels:
--------------------------------------------------------------------------------
Atelectasis               | Original: 45,808 ( 20.1%) | Valida√ß√£o:  458 ( 20.1%) | Diff:  -0.0%
Cardiomegaly              | Original: 44,845 ( 19.7%) | Valida√ß√£o:  448 ( 19.7%) | Diff:  -0.0%
Consolidation             | Original: 10,778 (  4.7%) | Valida√ß√£o:  114 (  5.0%) | Diff:  +0.3%
Edema                     | Original: 27,018 ( 11.9%) | Valida√ß√£o:  270 ( 11.8%) | Diff:  -0.0%
Enlarged Cardiomediastinum | Original:  7,179 (  3.2%) | Valida√ß√£o:   78 (  3.4%) | Diff:  +0.3%
Fracture                  | Original:  4,390 (  1.9%) | Valida√ß√£o:   44 (  1.9%) | Diff:  +0.0%
Lung Lesion               | Original:  6,284 (  2.8%) | Valida√ß√£o:   63 (  2.8%) | Diff:  +0.0%
Lung Opacity              | Origina

In [38]:
# Ids selected by the stratified split, rewritten in the 's<integer>' form used by studies_dataset.
val_study_ids = {'s' + str(int(study_id)) for study_id in df_labels_val['study_id']}

# Keep, in their original order, the studies whose id belongs to the validation split.
balanced_validation_dataset = [
    study for study in studies_dataset if study['study_id'] in val_study_ids
]

print(f"‚úÖ Dataset de valida√ß√£o do sklearn criado: {len(balanced_validation_dataset)} estudos")
print(f"Dataset original (sklearn split): {len(df_labels_val)} estudos")
print(f"Estudos encontrados no studies_dataset: {len(balanced_validation_dataset)}")

# A positive difference means some split ids have no matching study record.
missing_studies = len(df_labels_val) - len(balanced_validation_dataset)
if missing_studies > 0:
    print(f"‚ö†Ô∏è {missing_studies} estudos n√£o foram encontrados no studies_dataset")

‚úÖ Dataset de valida√ß√£o do sklearn criado: 2279 estudos
Dataset original (sklearn split): 2279 estudos
Estudos encontrados no studies_dataset: 2279


In [39]:
import os

# Persist balanced_validation_dataset under artifacts/datasets.

# Create the output directory if it does not exist yet.
os.makedirs("artifacts/datasets", exist_ok=True)

# Save the balanced validation dataset.
# NOTE(review): the value is a list of dicts, so NumPy presumably stores it as a
# pickled object array; loading it back would then need allow_pickle=True — confirm.
np.save("artifacts/datasets/balanced_validation_dataset.npy", balanced_validation_dataset)

print(f"‚úÖ Dataset de valida√ß√£o balanceado salvo em 'artifacts/datasets/balanced_validation_dataset.npy'")
print(f"Total de estudos salvos: {len(balanced_validation_dataset)}")

‚úÖ Dataset de valida√ß√£o balanceado salvo em 'artifacts/datasets/balanced_validation_dataset.npy'
Total de estudos salvos: 2279


In [40]:
# Show the first record of the balanced validation set.
balanced_validation_dataset[0]

{'study_index': 0,
 'study_id': 's50414267',
 'patient_id': 'p10000032',
 'files_images': ['02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
  '174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg'],
 'embedding_images': [array([-0.02441736,  0.03465954,  0.00766411, ..., -0.01386673,
          0.06212181,  0.01484995], dtype=float32),
  array([-0.03161281,  0.03163843,  0.00890976, ..., -0.01544448,
          0.05352064,  0.01552592], dtype=float32)],
 'file_text': 's50414267.txt',
 'embedding_text': array([-0.01413504,  0.01464215, -0.02659141, ..., -0.03464624,
        -0.07638288, -0.01550421], dtype=float32),
 'study_labels': ['No Finding']}

In [41]:
# Show the second record of the balanced validation set.
balanced_validation_dataset[1]

{'study_index': 2,
 'study_id': 's53911762',
 'patient_id': 'p10000032',
 'files_images': ['68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714.jpg',
  'fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818.jpg'],
 'embedding_images': [array([-0.02708424,  0.03832414,  0.00759004, ..., -0.01247607,
          0.04935698,  0.01244616], dtype=float32),
  array([-0.02862299,  0.03022767,  0.00633971, ..., -0.01368767,
          0.04439005,  0.00473679], dtype=float32)],
 'file_text': 's53911762.txt',
 'embedding_text': array([ 0.00691437,  0.02525791, -0.01365044, ..., -0.04275578,
        -0.04552101, -0.01475061], dtype=float32),
 'study_labels': ['No Finding']}