In [2]:
import sys ,os, importlib
p = os.path.abspath('../')
if p not in sys.path:
    sys.path.append(p)

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
import configparser
import argparse
from pathlib import Path

from tqdm.notebook import tqdm
from skimage.draw import polygon
import pydicom
from dpipe.io import save
from dicom_csv import (expand_volumetric, drop_duplicated_instances, 
                       drop_duplicated_slices, order_series, stack_images, 
                       get_slice_locations, get_pixel_spacing, get_tag, join_tree)

import utils as Utils
import data_processing.data_processing_functions as DP

importlib.reload(Utils)
importlib.reload(DP)

# Load the project configuration. ConfigParser.read() skips files it cannot
# find, so a missing ../config.ini only shows up later as a missing key.
config = configparser.ConfigParser()
config.read("../config.ini")

# Do not truncate long cell values (e.g. file paths) when displaying frames.
pd.set_option('display.max_colwidth', None)

# Record which interpreter executed the notebook.
for interpreter_detail in (sys.executable, sys.version, sys.version_info):
    print(interpreter_detail)

/media/my_ftp/TFTs/amoure/TFM_MUIT/tfm-muit-venv/bin/python
3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)


In [3]:
# Size the export loops resize every slice to.
TARGET_SIZE = (512, 512)

# Root of the processed data (read from config.ini) and the segmentation tree below it.
PROCESSED_DATA_PATH = config["PATHS"]["PROCESSED_DATA_PATH"]
SEGMENTATION_PROCESSED_DATA_PATH = os.path.join(PROCESSED_DATA_PATH, "segmentation")

# Files and folders inside the segmentation tree.
METADATA_PATH = os.path.join(SEGMENTATION_PROCESSED_DATA_PATH, "metadata.csv")
PROCESSED_RGB_IMAGES_PATH = os.path.join(SEGMENTATION_PROCESSED_DATA_PATH, "rgb_images")
PROCESSED_IMAGES_FULL_RES_PATH = os.path.join(SEGMENTATION_PROCESSED_DATA_PATH, "full_res_images")
PROCESSED_LUNG_MASKS_PATH = os.path.join(SEGMENTATION_PROCESSED_DATA_PATH, "masks")
PROCESSED_INFECTION_PATH = os.path.join(SEGMENTATION_PROCESSED_DATA_PATH, "infection")
POOL_RESULTS_PATH = os.path.join(SEGMENTATION_PROCESSED_DATA_PATH, "pool_results")

# Create the output folders. PROCESSED_IMAGES_FULL_RES_PATH is intentionally
# left out: its creation was disabled in the original cell.
for output_dir in (
    PROCESSED_DATA_PATH,
    SEGMENTATION_PROCESSED_DATA_PATH,
    PROCESSED_RGB_IMAGES_PATH,
    POOL_RESULTS_PATH,
    PROCESSED_LUNG_MASKS_PATH,
    PROCESSED_INFECTION_PATH,
):
    os.makedirs(output_dir, exist_ok=True)

# Echo the resolved locations so the run is self-describing.
for resolved_path in (
    SEGMENTATION_PROCESSED_DATA_PATH,
    PROCESSED_IMAGES_FULL_RES_PATH,
    PROCESSED_RGB_IMAGES_PATH,
    PROCESSED_LUNG_MASKS_PATH,
    PROCESSED_INFECTION_PATH,
    METADATA_PATH,
):
    print(resolved_path)

/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/processed/segmentation
/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/processed/segmentation/full_res_images
/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/processed/segmentation/rgb_images
/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/processed/segmentation/masks
/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/processed/segmentation/infection
/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/processed/segmentation/metadata.csv


# RGB Images

## COVID_SEG_1
http://medicalsegmentation.com/covid19/

**COVID_SEG_1 provides separate right and left lung masks, so they must be merged into a single full-lung mask.**

In [3]:
# Locations of the COVID_SEG_1 files, all relative to the dataset root taken
# from config.ini.
COVID_SEG_1_PATH = config["PATHS"]["COVID_SEG_1_PATH"]

# NOTE: "TRAINIG" is a typo, kept because later cells reference this exact name.
COVID_SEG_1_TRAINIG_IMAGES_PATH = os.path.join(COVID_SEG_1_PATH, "tr_im.nii")
COVID_SEG_1_TRAINING_LUNG_MASKS_PATH = os.path.join(COVID_SEG_1_PATH, "tr_lungmasks_updated.nii")
COVID_SEG_1_TRAINING_INFECTION_MASKS_PATH = os.path.join(COVID_SEG_1_PATH, "tr_mask.nii")
COVID_SEG_1_METADATA_PATH = os.path.join(COVID_SEG_1_PATH, "Test-Images-Clinical-Details.csv")

In [4]:
# Read the three COVID_SEG_1 volumes (CT, infection masks, lung masks) in that order.
training_images_nii, training_infection_masks_nii, training_lung_masks_nii = (
    Utils.read_nii(nii_path)
    for nii_path in (
        COVID_SEG_1_TRAINIG_IMAGES_PATH,
        COVID_SEG_1_TRAINING_INFECTION_MASKS_PATH,
        COVID_SEG_1_TRAINING_LUNG_MASKS_PATH,
    )
)

# Print the shapes so a mismatch between image and mask volumes is visible.
for volume in (training_images_nii, training_lung_masks_nii, training_infection_masks_nii):
    print(volume.shape)

(512, 512, 100)
(512, 512, 100)
(512, 512, 100)


In [5]:
# COVID_SEG_1: export each slice of the training volume as PNG files (CT image,
# infection mask, lung mask) and (re)create metadata.csv with one row per slice.
metadata_list = []
png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]
n_slices = training_images_nii.shape[2]

# enumerate(range(n)) yields (i, i): scan_index always equals slice_index here,
# i.e. every slice is recorded as its own scan.
for scan_index, slice_index in tqdm(enumerate(range(n_slices)), total=n_slices):
    ct = training_images_nii[:, :, slice_index]
    infection_mask = training_infection_masks_nii[:, :, slice_index]
    lung_mask = training_lung_masks_nii[:, :, slice_index]

    processed_ct_image = Utils.to_rgb(ct, norm=True, range_255=True, clip=True)
    processed_infection_mask = Utils.to_uint8(DP.generate_covid_seg_1_infection_mask(infection_mask))
    processed_lung_mask = Utils.to_uint8(DP.generate_covid_seg_1_lung_mask(lung_mask, rgb=True))

    # Flags are computed on the masks before any resizing.
    has_infection = Utils.has_true_label(processed_infection_mask)
    has_lung_mask = Utils.has_true_label(processed_lung_mask)
    infection_str = "w_infection" if has_infection else "no_infection"

    # The three outputs share one file name and differ only in their folder.
    file_name = f"covid_seg_1_{scan_index}_{slice_index}_covid_{infection_str}.png"
    ct_slice_path = os.path.join(PROCESSED_RGB_IMAGES_PATH, file_name)
    infection_mask_path = os.path.join(PROCESSED_INFECTION_PATH, file_name)
    lung_mask_path = os.path.join(PROCESSED_LUNG_MASKS_PATH, file_name)

    # numpy shape is (rows, cols) = (height, width); cv2.resize takes (width, height).
    height, width = processed_ct_image.shape[:2]
    if (width, height) != TARGET_SIZE:
        processed_ct_image = cv2.resize(processed_ct_image, TARGET_SIZE, interpolation=cv2.INTER_AREA)
        # Masks carry labels, not intensities: nearest-neighbour keeps the
        # original label values, whereas INTER_AREA would average them.
        processed_infection_mask = cv2.resize(processed_infection_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)
        processed_lung_mask = cv2.resize(processed_lung_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

    # cv2.imwrite reports failure through its return value; check it explicitly
    # instead of inside an assert (asserts are stripped under `python -O`).
    for output_path, output_image in (
        (ct_slice_path, processed_ct_image),
        (infection_mask_path, processed_infection_mask),
        (lung_mask_path, processed_lung_mask),
    ):
        if not cv2.imwrite(output_path, output_image, png_params):
            raise IOError(f"cv2.imwrite failed for {output_path}")

    metadata_list.append(
        (
            "COVID_SEG_1",
            "covid",
            scan_index,
            slice_index,
            ct_slice_path,
            infection_mask_path,
            lung_mask_path,
            processed_ct_image.shape[0],
            processed_ct_image.shape[1],
            has_infection,
            has_lung_mask,
        )
    )

metadata_df = pd.DataFrame(
    metadata_list,
    columns=["dataset", "label", "scan_index", "slice_index", "ct_slice_path", "infection_mask_path", "lung_mask_path", "height", "width", "has_infection", "has_lung_mask"],
)
# mode='w' starts metadata.csv from scratch (with header); the cells for the
# other datasets append to it.
metadata_df.to_csv(METADATA_PATH, mode='w', index=False, sep=";")

  0%|          | 0/100 [00:00<?, ?it/s]

## COVID_SEG_2
http://medicalsegmentation.com/covid19/

In [5]:
# Locations of the COVID_SEG_2 sub-folders, relative to the dataset root taken
# from config.ini.
# NOTE: "TRAINIG" is a typo, kept so the constant keeps its original name.
COVID_SEG_2_PATH = config["PATHS"]["COVID_SEG_2_PATH"]
COVID_SEG_2_TRAINIG_IMAGES_PATH = os.path.join(COVID_SEG_2_PATH, "rp_im")
COVID_SEG_2_TRAINING_LUNG_MASKS_PATH = os.path.join(COVID_SEG_2_PATH, "rp_lung_msk")
COVID_SEG_2_TRAINING_INFECTION_MASKS_PATH = os.path.join(COVID_SEG_2_PATH, "rp_msk")

In [6]:
# COVID_SEG_2: export every slice of every scan as PNG files (CT image,
# infection mask, lung mask) and append one metadata row per slice to
# metadata.csv.
covid_seg_2_metadata_df = DP.read_covid_seg_2_folder_structure(COVID_SEG_2_PATH)
metadata_list = []
png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]

for scan_index in tqdm(range(covid_seg_2_metadata_df.shape[0]), total=covid_seg_2_metadata_df.shape[0]):
    # Positional lookup, so the loop does not depend on the frame's index labels.
    scan_row = covid_seg_2_metadata_df.iloc[scan_index]
    ct_nii = Utils.read_nii(scan_row["image"])
    lung_masks_nii = Utils.read_nii(scan_row["lung_mask"])
    infection_masks_nii = Utils.read_nii(scan_row["infection_mask"])

    for slice_index in range(ct_nii.shape[2]):
        ct_slice = ct_nii[:, :, slice_index]
        lung_mask = lung_masks_nii[:, :, slice_index]
        infection_mask = infection_masks_nii[:, :, slice_index]

        processed_ct_image = Utils.to_rgb(ct_slice, norm=True, range_255=True, clip=True)
        processed_infection_mask = Utils.to_uint8(infection_mask)
        processed_lung_mask = Utils.to_uint8(DP.generate_covid_seg_2_lung_mask(lung_mask, rgb=True))

        # Flags are computed on the masks before any resizing.
        has_infection = Utils.has_true_label(processed_infection_mask)
        has_lung_mask = Utils.has_true_label(processed_lung_mask)
        infection_str = "w_infection" if has_infection else "no_infection"

        # numpy shape is (rows, cols) = (height, width); cv2.resize takes (width, height).
        height, width = processed_ct_image.shape[:2]
        if (width, height) != TARGET_SIZE:
            processed_ct_image = cv2.resize(processed_ct_image, TARGET_SIZE, interpolation=cv2.INTER_AREA)
            # Masks carry labels, not intensities: nearest-neighbour keeps the
            # original label values, whereas INTER_AREA would average them.
            processed_infection_mask = cv2.resize(processed_infection_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)
            processed_lung_mask = cv2.resize(processed_lung_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

        # The three outputs share one file name and differ only in their folder.
        file_name = f"covid_seg_2_{scan_index}_{slice_index}_covid_{infection_str}.png"
        ct_slice_path = os.path.join(PROCESSED_RGB_IMAGES_PATH, file_name)
        lung_mask_path = os.path.join(PROCESSED_LUNG_MASKS_PATH, file_name)
        infection_mask_path = os.path.join(PROCESSED_INFECTION_PATH, file_name)

        # cv2.imwrite reports failure through its return value; check it
        # explicitly instead of inside an assert (stripped under `python -O`).
        for output_path, output_image in (
            (ct_slice_path, processed_ct_image),
            (infection_mask_path, processed_infection_mask),
            (lung_mask_path, processed_lung_mask),
        ):
            if not cv2.imwrite(output_path, output_image, png_params):
                raise IOError(f"cv2.imwrite failed for {output_path}")

        metadata_list.append(
            (
                "COVID_SEG_2",
                "covid",
                scan_index,
                slice_index,
                ct_slice_path,
                infection_mask_path,
                lung_mask_path,
                processed_ct_image.shape[0],
                processed_ct_image.shape[1],
                has_infection,
                has_lung_mask,
            )
        )

# The append below had been commented out, so the rows collected above were
# thrown away (the next dataset cell resets metadata_list) even though the final
# metadata check expects COVID_SEG_2 entries.
# NOTE: append mode means re-running this cell without first regenerating
# metadata.csv (COVID_SEG_1 cell) duplicates these rows.
metadata_df = pd.DataFrame(
    metadata_list,
    columns=["dataset", "label", "scan_index", "slice_index", "ct_slice_path", "infection_mask_path", "lung_mask_path", "height", "width", "has_infection", "has_lung_mask"],
)
metadata_df.to_csv(METADATA_PATH, mode='a', index=False, sep=";", header=False)

  0%|          | 0/9 [00:00<?, ?it/s]

## COVID_CT_JUNMA
https://zenodo.org/record/3757476#.ZClbFy8RphF

In [7]:
# Index the COVID_CT_JunMa dataset folder (root taken from config.ini) and show
# the size of the resulting listing.
COVID_CT_JunMa_PATH = config["PATHS"]["COVID_CT_JunMa_PATH"]
covid_ct_junma_metadata_df = DP.read_covid_ct_junma_folder_structure(COVID_CT_JunMa_PATH)
covid_ct_junma_metadata_df.shape

(19, 4)

In [7]:
# COVID_CT_JunMa: export every slice of every scan as PNG files (CT image,
# infection mask, lung mask) and append one metadata row per slice to
# metadata.csv.
metadata_list = []
png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]

for scan_index in tqdm(range(covid_ct_junma_metadata_df.shape[0]), total=covid_ct_junma_metadata_df.shape[0]):
    # Positional lookup, so the loop does not depend on the frame's index labels.
    scan_row = covid_ct_junma_metadata_df.iloc[scan_index]
    ct_nii = Utils.read_nii(scan_row["image"])
    lung_masks_nii = Utils.read_nii(scan_row["lung_mask"])
    infection_masks_nii = Utils.read_nii(scan_row["infection_mask"])

    for slice_index in range(ct_nii.shape[2]):
        ct_slice = ct_nii[:, :, slice_index]
        lung_mask = lung_masks_nii[:, :, slice_index]
        infection_mask = infection_masks_nii[:, :, slice_index]

        processed_ct_image = Utils.to_rgb(ct_slice, norm=True, range_255=True, clip=True)
        processed_infection_mask = Utils.to_uint8(infection_mask)  # Only 0 or 1 values
        processed_lung_mask = Utils.to_uint8(DP.generate_covid_ct_junma_lung_mask(lung_mask, rgb=True))

        # Flags are computed on the masks before any resizing.
        has_infection = Utils.has_true_label(processed_infection_mask)
        has_lung_mask = Utils.has_true_label(processed_lung_mask)
        infection_str = "w_infection" if has_infection else "no_infection"

        # numpy shape is (rows, cols) = (height, width); cv2.resize takes (width, height).
        height, width = processed_ct_image.shape[:2]
        if (width, height) != TARGET_SIZE:
            processed_ct_image = cv2.resize(processed_ct_image, TARGET_SIZE, interpolation=cv2.INTER_AREA)
            # Masks carry labels, not intensities: nearest-neighbour keeps the
            # original label values, whereas INTER_AREA would average them.
            processed_infection_mask = cv2.resize(processed_infection_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)
            processed_lung_mask = cv2.resize(processed_lung_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

        # The three outputs share one file name and differ only in their folder.
        file_name = f"covid_ct_junma_{scan_index}_{slice_index}_covid_{infection_str}.png"
        ct_slice_path = os.path.join(PROCESSED_RGB_IMAGES_PATH, file_name)
        lung_mask_path = os.path.join(PROCESSED_LUNG_MASKS_PATH, file_name)
        infection_mask_path = os.path.join(PROCESSED_INFECTION_PATH, file_name)

        # cv2.imwrite reports failure through its return value; check it
        # explicitly instead of inside an assert (stripped under `python -O`).
        for output_path, output_image in (
            (ct_slice_path, processed_ct_image),
            (infection_mask_path, processed_infection_mask),
            (lung_mask_path, processed_lung_mask),
        ):
            if not cv2.imwrite(output_path, output_image, png_params):
                raise IOError(f"cv2.imwrite failed for {output_path}")

        metadata_list.append(
            (
                "COVID_CT_JunMa",
                "covid",
                scan_index,
                slice_index,
                ct_slice_path,
                infection_mask_path,
                lung_mask_path,
                processed_ct_image.shape[0],
                processed_ct_image.shape[1],
                has_infection,
                has_lung_mask,
            )
        )

# The append below had been commented out, so the rows collected above were
# thrown away (the next dataset cell resets metadata_list) even though the final
# metadata check expects COVID_CT_JunMa entries.
# NOTE: append mode means re-running this cell without first regenerating
# metadata.csv (COVID_SEG_1 cell) duplicates these rows.
metadata_df = pd.DataFrame(
    metadata_list,
    columns=["dataset", "label", "scan_index", "slice_index", "ct_slice_path", "infection_mask_path", "lung_mask_path", "height", "width", "has_infection", "has_lung_mask"],
)
metadata_df.to_csv(METADATA_PATH, mode='a', index=False, sep=";", header=False)

  0%|          | 0/19 [00:00<?, ?it/s]

## MIDRC RICORD 1A

In [2]:
# MIDRC RICORD 1A: raw dataset root (from config.ini) and the folder holding its
# pre-generated .npy volumes plus their metadata table.
# This cell needs `config` and PROCESSED_DATA_PATH from the setup cells.
MIDRC_RICORD_1A_PATH = config["PATHS"]["MIDRC_RICORD_1A_PATH"]
PROCESSED_MIDRC_RICORD_1A_PATH = f"{PROCESSED_DATA_PATH}/MIDRC_RICORD_1A_npy/"
PROCESSED_MIDRC_METADATA_PATH = f"{PROCESSED_MIDRC_RICORD_1A_PATH}expanded_meta.csv"

# Comment if already crawled and generated .npy files
#create_midrc_dataset(MIDRC_RICORD_1A_PATH, PROCESSED_MIDRC_RICORD_1A_PATH, PROCESSED_MIDRC_RICORD_1A_PATH + "/joined.csv")

# One row per scan; the export loop reads the 'CT' and 'mask' columns.
meta_df = pd.read_csv(PROCESSED_MIDRC_METADATA_PATH)
print(meta_df.shape)

NameError: name 'config' is not defined

In [11]:
# MIDRC RICORD 1A: export every slice of every scan as PNG files (CT image and
# infection mask; this dataset has no lung masks here) and append one metadata
# row per slice to metadata.csv.
metadata_list = []
png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]

for scan_index in tqdm(range(len(meta_df)), total=len(meta_df)):
    scan_row = meta_df.iloc[scan_index]
    ct = np.load(scan_row['CT'])
    mask = np.load(scan_row['mask'])

    for slice_index in range(ct.shape[2]):
        ct_slice = ct[:, :, slice_index]
        infection_mask_slice = mask[:, :, slice_index]

        processed_ct_image = Utils.to_rgb(ct_slice, norm=True, range_255=True, clip=True)
        processed_infection_mask = Utils.to_uint8(infection_mask_slice)

        # Flag is computed on the mask before any resizing.
        has_infection = Utils.has_true_label(processed_infection_mask)
        infection_str = "w_infection" if has_infection else "no_infection"

        # No lung mask is exported for this dataset.
        has_lung_mask = False
        lung_mask_path = ""

        # numpy shape is (rows, cols) = (height, width); cv2.resize takes (width, height).
        height, width = processed_ct_image.shape[:2]
        if (width, height) != TARGET_SIZE:
            processed_ct_image = cv2.resize(processed_ct_image, TARGET_SIZE, interpolation=cv2.INTER_AREA)
            # Masks carry labels, not intensities: nearest-neighbour keeps the
            # original label values, whereas INTER_AREA would average them.
            processed_infection_mask = cv2.resize(processed_infection_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

        # Both outputs share one file name and differ only in their folder.
        file_name = f"midrc_ricord_1a_{scan_index}_{slice_index}_covid_{infection_str}.png"
        ct_slice_path = os.path.join(PROCESSED_RGB_IMAGES_PATH, file_name)
        infection_mask_path = os.path.join(PROCESSED_INFECTION_PATH, file_name)

        # cv2.imwrite reports failure through its return value; check it
        # explicitly instead of inside an assert (stripped under `python -O`).
        for output_path, output_image in (
            (ct_slice_path, processed_ct_image),
            (infection_mask_path, processed_infection_mask),
        ):
            if not cv2.imwrite(output_path, output_image, png_params):
                raise IOError(f"cv2.imwrite failed for {output_path}")

        metadata_list.append(
            (
                "MIDRC_RICORD_1A",
                "covid",
                scan_index,
                slice_index,
                ct_slice_path,
                infection_mask_path,
                lung_mask_path,
                processed_ct_image.shape[0],
                processed_ct_image.shape[1],
                has_infection,
                has_lung_mask,
            )
        )

metadata_df = pd.DataFrame(
    metadata_list,
    columns=["dataset", "label", "scan_index", "slice_index", "ct_slice_path", "infection_mask_path", "lung_mask_path", "height", "width", "has_infection", "has_lung_mask"],
)
# Appended without header: metadata.csv is created by the COVID_SEG_1 cell.
metadata_df.to_csv(METADATA_PATH, index=False, mode="a", sep=";", header=False)

  0%|          | 0/112 [00:00<?, ?it/s]

## MOSMED

In [12]:
# Index the MOSMED dataset folder (root taken from config.ini).
MOSMED_PATH = config["PATHS"]["MOSMED_PATH"]
print(MOSMED_PATH)

# One row per study; the export loop reads 'study_file' and 'mask_file'.
mosmed_df = DP.read_mosmed_folder_structure(MOSMED_PATH)
print(mosmed_df.shape)

/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/MOSMED
(1110, 4)


In [13]:
# MOSMED: export every slice of every study as PNG files (CT image and
# infection mask; no lung masks) and append one metadata row per slice to
# metadata.csv. Studies without a mask file get an all-zero infection mask.
metadata_list = []
png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]

for scan_index in tqdm(range(len(mosmed_df)), total=len(mosmed_df)):
    scan_row = mosmed_df.iloc[scan_index]
    ct_path = scan_row['study_file']
    mask_path = scan_row['mask_file']

    ct_nii = Utils.read_nii(ct_path)

    # An empty mask path marks a study without an infection annotation.
    has_mask = mask_path != ""
    if has_mask:
        infection_mask_nii = Utils.read_nii(mask_path)
    else:
        infection_mask_nii = np.zeros(ct_nii.shape)

    for slice_index in range(ct_nii.shape[2]):
        ct_slice = ct_nii[:, :, slice_index]
        processed_ct_image = Utils.to_rgb(ct_slice, norm=True, range_255=True, clip=True)
        processed_infection_mask = Utils.to_uint8(infection_mask_nii[:, :, slice_index])

        # No lung mask is exported for this dataset.
        has_lung_mask = False
        lung_mask_path = ""

        # Flag is computed on the mask before any resizing; unannotated
        # studies are never flagged as infected.
        if has_mask:
            has_infection = Utils.has_true_label(processed_infection_mask)
        else:
            has_infection = False
        infection_str = "w_infection" if has_infection else "no_infection"

        # numpy shape is (rows, cols) = (height, width); cv2.resize takes (width, height).
        height, width = processed_ct_image.shape[:2]
        if (width, height) != TARGET_SIZE:
            processed_ct_image = cv2.resize(processed_ct_image, TARGET_SIZE, interpolation=cv2.INTER_AREA)
            # Masks carry labels, not intensities: nearest-neighbour keeps the
            # original label values, whereas INTER_AREA would average them.
            processed_infection_mask = cv2.resize(processed_infection_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

        # Both outputs share one file name and differ only in their folder.
        file_name = f"mosmed_{scan_index}_{slice_index}_covid_{infection_str}.png"
        ct_slice_path = os.path.join(PROCESSED_RGB_IMAGES_PATH, file_name)
        infection_mask_path = os.path.join(PROCESSED_INFECTION_PATH, file_name)

        # cv2.imwrite reports failure through its return value; check it
        # explicitly instead of inside an assert (stripped under `python -O`).
        for output_path, output_image in (
            (ct_slice_path, processed_ct_image),
            (infection_mask_path, processed_infection_mask),
        ):
            if not cv2.imwrite(output_path, output_image, png_params):
                raise IOError(f"cv2.imwrite failed for {output_path}")

        metadata_list.append(
            (
                "MOSMED",
                "covid",
                scan_index,
                slice_index,
                ct_slice_path,
                infection_mask_path,
                lung_mask_path,
                processed_ct_image.shape[0],
                processed_ct_image.shape[1],
                has_infection,
                has_lung_mask,
            )
        )

metadata_df = pd.DataFrame(
    metadata_list,
    columns=["dataset", "label", "scan_index", "slice_index", "ct_slice_path", "infection_mask_path", "lung_mask_path", "height", "width", "has_infection", "has_lung_mask"],
)
# Appended without header: metadata.csv is created by the COVID_SEG_1 cell.
metadata_df.to_csv(METADATA_PATH, index=False, mode="a", sep=";", header=False)

  0%|          | 0/1110 [00:00<?, ?it/s]

## CNCB

We are assuming that the 350 CTs that have been manually segmented by radiologists are not present in the rest of the CNCB subset.
-   TODO: Verify that this assumption holds

In [8]:
# List the CNCB lesion-segmentation slices (one row per slice) and order them
# by scan, then by slice position within the scan.
cncb_lesion_seg_dir = os.path.join(config["PATHS"]["CNCB_COVID_CT_PATH"], "ct_lesion_seg")
cncb_segmentation_df = DP.read_cnbc_folder_structure(cncb_lesion_seg_dir).sort_values(
    ["scan_index", "slice_index"], ascending=[True, True]
)
print(cncb_segmentation_df.shape)
cncb_segmentation_df.head()

(21470, 6)


Unnamed: 0,scan_index,slice_index,image_path,n_slices,mask_path,n_masks
5427,0,0,/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/CNCB_COVID_CT/ct_lesion_seg/image/0/0.jpg,138,,
5475,0,1,/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/CNCB_COVID_CT/ct_lesion_seg/image/0/1.jpg,138,,
5483,0,2,/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/CNCB_COVID_CT/ct_lesion_seg/image/0/2.jpg,138,,
5534,0,3,/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/CNCB_COVID_CT/ct_lesion_seg/image/0/3.jpg,138,,
5490,0,4,/media/my_ftp/BasesDeDatos_Torax_RX_CT/COVID19_CT/CNCB_COVID_CT/ct_lesion_seg/image/0/4.jpg,138,,


In [13]:
# CNCB: export every listed slice as a PNG CT image, write a processed infection
# mask only for slices that really contain an infection, and append one
# metadata row per slice to metadata.csv.
metadata_list = []
png_params = [cv2.IMWRITE_PNG_COMPRESSION, 0]

# Group once instead of re-filtering the whole frame for every scan;
# sort=False keeps the scans in their order of first appearance.
scan_groups = cncb_segmentation_df.groupby("scan_index", sort=False)

for scan_index, slices_df in tqdm(scan_groups, total=scan_groups.ngroups):
    for _, slice_row in slices_df.iterrows():
        slice_index = slice_row["slice_index"]

        # cv2.imread returns None (no exception) when a file cannot be read.
        processed_ct_image = cv2.imread(slice_row["image_path"], cv2.IMREAD_UNCHANGED)
        if processed_ct_image is None:
            raise IOError(f"cv2.imread failed for {slice_row['image_path']}")

        # Slices without a mask carry NaN in 'mask_path'. Reset the mask state
        # for every slice so a mask from a previous slice is never reused.
        source_mask_path = slice_row["mask_path"]
        processed_infection_mask = None
        has_infection = False
        if isinstance(source_mask_path, str) and source_mask_path:
            infection_mask = cv2.imread(source_mask_path, cv2.IMREAD_UNCHANGED)
            if infection_mask is None:
                raise IOError(f"cv2.imread failed for {source_mask_path}")
            processed_infection_mask = DP.generate_cnbc_infection_mask(infection_mask)
            # Keep only slices that really contain an infection after
            # filtering out the lung-field label.
            has_infection = Utils.has_true_label(processed_infection_mask)

        # No lung mask is exported for this dataset.
        has_lung_mask = False
        lung_mask_path = ""
        infection_str = "w_infection" if has_infection else "no_infection"

        # numpy shape is (rows, cols) = (height, width); cv2.resize takes (width, height).
        height, width = processed_ct_image.shape[:2]
        if (width, height) != TARGET_SIZE:
            processed_ct_image = cv2.resize(processed_ct_image, TARGET_SIZE, interpolation=cv2.INTER_AREA)
            if processed_infection_mask is not None:
                # Masks carry labels, not intensities: nearest-neighbour keeps
                # the original label values, whereas INTER_AREA would average them.
                processed_infection_mask = cv2.resize(processed_infection_mask, TARGET_SIZE, interpolation=cv2.INTER_NEAREST)

        file_name = f"cncb_{scan_index}_{slice_index}_covid_{infection_str}.png"
        ct_slice_path = os.path.join(PROCESSED_RGB_IMAGES_PATH, file_name)
        # cv2.imwrite reports failure through its return value; check it
        # explicitly instead of inside an assert (stripped under `python -O`).
        if not cv2.imwrite(ct_slice_path, processed_ct_image, png_params):
            raise IOError(f"cv2.imwrite failed for {ct_slice_path}")

        # This is the only dataset that may not contain infection masks for
        # some slices: the metadata path stays empty unless a processed mask is
        # written, so it never points at an unprocessed source mask.
        infection_mask_path = ""
        if has_infection:
            infection_mask_path = os.path.join(PROCESSED_INFECTION_PATH, file_name)
            if not cv2.imwrite(infection_mask_path, processed_infection_mask, png_params):
                raise IOError(f"cv2.imwrite failed for {infection_mask_path}")

        metadata_list.append(
            (
                "CNCB",
                "covid",
                scan_index,
                slice_index,
                ct_slice_path,
                infection_mask_path,
                lung_mask_path,
                processed_ct_image.shape[0],
                processed_ct_image.shape[1],
                has_infection,
                has_lung_mask,
            )
        )

metadata_df = pd.DataFrame(
    metadata_list,
    columns=["dataset", "label", "scan_index", "slice_index", "ct_slice_path", "infection_mask_path", "lung_mask_path", "height", "width", "has_infection", "has_lung_mask"],
)
# Appended without header: metadata.csv is created by the COVID_SEG_1 cell.
metadata_df.to_csv(METADATA_PATH, index=False, mode="a", sep=";", header=False)

  0%|          | 0/150 [00:00<?, ?it/s]

# Metadata checks

In [4]:
# Reload the combined metadata file and count the exported slices per dataset.
metadata_csv_path = os.path.join(SEGMENTATION_PROCESSED_DATA_PATH, "metadata.csv")
metadata_to_test_df = pd.read_csv(metadata_csv_path, sep=";", low_memory=False)
metadata_to_test_df["dataset"].value_counts()

dataset
MOSMED             46411
CNCB               21470
MIDRC_RICORD_1A    15887
COVID_CT_JunMa      3102
COVID_SEG_2          829
COVID_SEG_1          100
Name: count, dtype: int64