In [2]:
from src.utils.consts import CLEARED_DATASET
from src.utils.consts import TF_RECORD_DATASET

# Packages Imports
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from PIL import Image

# Types
from typing import Dict, List

In [3]:
# Load Data
# Resolve every input location under the dataset root first, then read the
# two CSV tables into DataFrames.
images_path = f"{CLEARED_DATASET}/images"
data_entry_path = f"{CLEARED_DATASET}/Data_Entry_2017.csv"
bounding_box_path = f"{CLEARED_DATASET}/BBox_List_2017.csv"

data_entry_df = pd.read_csv(data_entry_path, sep=",")
bounding_box_df = pd.read_csv(bounding_box_path, sep=",")

In [4]:
def build_labels_map(data_entry_df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive the label-to-index lookup table from the "Finding Labels" column.

    Every entry of the column is split on "|", missing values are discarded,
    and the distinct labels are numbered from 0 in sorted order.

    Returns a DataFrame with the columns "Label" and "Index", one row per
    distinct label.
    """
    per_label = data_entry_df["Finding Labels"].str.split("|").explode().dropna()
    ordered_labels = sorted(per_label.unique())
    return pd.DataFrame(
        [(label, position) for position, label in enumerate(ordered_labels)],
        columns=["Label", "Index"],
    )

In [5]:
# Store Labels Mapping
from src.data.labels_utils import get_labels_dict

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing (a no-op when it is already there).
os.makedirs(TF_RECORD_DATASET, exist_ok=True)
label_mappings_path = f"{TF_RECORD_DATASET}/label_mappings.csv"

label_df = build_labels_map(data_entry_df)
label_df.to_csv(label_mappings_path, index=False)

# Read the mapping back from the file that was just written.
labels_dict = get_labels_dict(label_mappings_path)
print(labels_dict)

{'Atelectasis': 0, 'Cardiomegaly': 1, 'Consolidation': 2, 'Edema': 3, 'Effusion': 4, 'Emphysema': 5, 'Fibrosis': 6, 'Hernia': 7, 'Infiltration': 8, 'Mass': 9, 'No Finding': 10, 'Nodule': 11, 'Pleural_Thickening': 12, 'Pneumonia': 13, 'Pneumothorax': 14}


In [6]:
from src.data.data_entry_utils import extract_patient_info, extract_image_info
from src.data.bounding_box_utils import extract_bbox_data
from src.data.image_utils import get_image_bytes
from src.data.labels_utils import encode_labels

from src.model.tensorflow_utils import to_bytes_feature, to_int64_feature, to_float_feature

def create_tf_example(image_path: str, data_entry_df: pd.DataFrame, bounding_boxes_df: pd.DataFrame):
    """
    Build a `tf.train.Example` for one image from its bytes and its metadata.

    The image is looked up in `data_entry_df` by file name (the "Image Index"
    column). Patient, image and bounding-box fields are produced by the
    project helpers imported from `src.data`. Label encoding reads the
    notebook-level `labels_dict`, which must be defined before this is called.

    Args:
        image_path: Path to the image file; its base name is the lookup key.
        data_entry_df: Table with an "Image Index" column.
        bounding_boxes_df: Table handed to `extract_bbox_data` together with
            the image file name.

    Returns:
        The populated `tf.train.Example`, or None (after printing a message)
        when the image has no row in `data_entry_df` or when
        `get_image_bytes` returns None.
    """
    image_index = os.path.basename(image_path)
    row = data_entry_df[data_entry_df["Image Index"] == image_index]

    if row.empty:
        print(f"Skipping: {image_index} not found in DataEntry.csv")
        return None

    image_bytes = get_image_bytes(image_path)
    if image_bytes is None:
        print(f"Skipping missing file: {image_path}")
        return None

    # If several rows share the file name, only the first one is used.
    entry = row.iloc[0]

    # Patient Data
    patient_info = extract_patient_info(entry)

    # Image Data
    image_info = extract_image_info(entry)
    finding_labels = image_info["finding_labels"]
    encoded_labels = encode_labels(finding_labels, labels_dict)

    # Bounding Box Data
    bbox_data = extract_bbox_data(bounding_boxes_df, image_index)
    # Treat an explicit None like a missing key, the same way x/y/w/h fall
    # back to an empty list below; otherwise `.encode()` would fail on None.
    bbox_data_label = bbox_data.get("bbox_finding_label") or ""
    encoded_bbox_data_label = encode_labels(bbox_data_label, labels_dict)

    feature = {
        "image": to_bytes_feature(image_bytes),
        "image_index": to_bytes_feature(image_index.encode()),
        "encoded_finding_labels": to_int64_feature(encoded_labels),
        "finding_labels_array": tf.train.Feature(bytes_list=tf.train.BytesList(value=[l.encode() for l in finding_labels])),
        "encoded_bbox_finding_label": to_int64_feature(encoded_bbox_data_label),
        "bbox_finding_label": to_bytes_feature(bbox_data_label.encode()),
        # Bounding-box coordinates: empty when absent or None.
        "x": to_float_feature(bbox_data.get("x", []) or []),
        "y": to_float_feature(bbox_data.get("y", []) or []),
        "w": to_float_feature(bbox_data.get("w", []) or []),
        "h": to_float_feature(bbox_data.get("h", []) or []),
        "patient_id": to_int64_feature([patient_info["patient_id"]]),
        "patient_age": to_int64_feature([patient_info["patient_age"]]),
        "patient_gender": to_bytes_feature(patient_info["patient_gender"].encode()),
        "view_position": to_bytes_feature(image_info["view_position"].encode()),
        "image_width": to_float_feature([image_info["image_width"]]),
        "image_height": to_float_feature([image_info["image_height"]]),
    }

    return tf.train.Example(features=tf.train.Features(feature=feature))

In [7]:
# Full path of every entry found in the images directory.
image_files = {os.path.join(images_path, entry) for entry in os.listdir(images_path)}

In [8]:
# Define TFRecord output file
tfrecord_filename = f"{TF_RECORD_DATASET}/chest_xray_data.tfrecord"

# Serialize one Example per image into a single TFRecord file. The paths are
# iterated in sorted order so the record order is the same on every run:
# image_files is a set of strings, and its iteration order depends on the
# per-process hash seed.
with tf.io.TFRecordWriter(tfrecord_filename) as writer:
    for image_path in tqdm(sorted(image_files)):
        example = create_tf_example(image_path, data_entry_df, bounding_box_df)
        # create_tf_example returns None for images it skipped.
        if example is not None:
            writer.write(example.SerializeToString())

print(f"TFRecord saved as {tfrecord_filename}")

100%|██████████████████████████████████| 102663/102663 [08:27<00:00, 202.33it/s]

TFRecord saved as /home/piotr/Pulpit/codebook/studies/bachelor-thesis/datasets/tfrecord-dataset/nih-dataset/chest_xray_data.tfrecord



