# Compile hemorrhage characteristics from BHSD dataset
### Creates and saves hemorrhage volume (mL) and attenuation (HU) distributions from the BHSD dataset.

Download BHSD if not found in repository:

In [1]:
# download and extract BHSD data:
import os
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

# Download and unpack the BHSD label_192 archive unless the extracted folder
# already exists, so re-running the notebook does not re-download it.
if os.path.isdir('../datasets/BHSD/label_192/'):
    print('BHSD dataset already downloaded.')
else:
    url = 'https://huggingface.co/datasets/WuBiao/BHSD/resolve/main/label_192.zip'
    save_path = '../datasets/BHSD/'

    # Buffer the whole archive in memory: ZipFile needs a seekable file object,
    # which the HTTP response is not. The `with` blocks make sure both the
    # connection and the archive handle are closed, even if extraction fails.
    with urlopen(url) as http_response:
        archive_bytes = BytesIO(http_response.read())

    # NOTE(review): assumes the archive's top-level folder is `label_192/`, which
    # is what the isdir() check above looks for -- confirm against the archive.
    with ZipFile(archive_bytes) as archive:
        archive.extractall(path=save_path)

BHSD dataset already downloaded.


### Create and save a pandas DataFrame with BHSD hemorrhage characteristics:

In [4]:
from pathlib import *
import nibabel as nib
import numpy as np
import pandas as pd
import sys
import skimage as sk
import matplotlib.pyplot as plt

# Build (or load from cache) a table with one row per connected hemorrhage
# cluster in the BHSD ground-truth masks: lesion type, volume, mean/median
# attenuation, voxel spacing and axial extent.
regenerate_spreadsheet = False # set to True to force recomputation even if the cached CSV exists
verbose = False # set to True to print file IDs and per-cluster voxel counts while debugging

BHSD_path = Path('../datasets/BHSD/label_192/') # TODO: move to public location if necessary
csv_path = '../datasets/BHSD/BHSD_hemorrhage_characteristics.csv' # cache location for the table

if not regenerate_spreadsheet:
    try:
        # The cache is written below with the row index as its first column;
        # index_col=0 restores it as the index so the loaded frame has the same
        # columns as a freshly generated one (no stray 'Unnamed: 0' column).
        df = pd.read_csv(csv_path, index_col=0)
        print('Found BHSD spreadsheet')
    except FileNotFoundError:
        # Only a missing cache triggers regeneration; any other failure
        # (corrupt file, interrupt, ...) is surfaced instead of being hidden.
        print('CSV not found, regenerating spreadsheet...')
        regenerate_spreadsheet = True

if regenerate_spreadsheet:
    columns = ['Dataset', 'Data_ID', 'Type', 'Volume_[mL]', 'Mean_HU', 'Median_HU', 'dx', 'dy', 'dz', 'num_slices', 'z_dist']

    # Clusters with this many voxels or fewer are excluded; necessary because
    # manual and automatic segmentation leave some spurious clusters of voxels.
    cluster_threshold = 15

    labels = ["bkg", "EDH", "IPH", "IVH", "SAH", "SDH"] # 0, 1, 2, 3, 4, 5; labels used by BHSD dataset creators

    img_dir = BHSD_path / "images"
    truth_dir = BHSD_path / "ground truths"

    # Rows are collected in a list and turned into a DataFrame once at the end;
    # growing a DataFrame one row at a time copies the whole frame on every insert.
    rows = []

    # The mask for each image is looked up under the same file name in truth_dir.
    label_names = sorted(os.listdir(img_dir))

    for file in label_names:
        if verbose:
            print(str(file))
        img = nib.load(img_dir / file)
        [dx, dy, dz] = img.header['pixdim'][1:4]
        image = img.get_fdata()
        # NOTE(review): assumes pixdim is in millimetres, so mm^3 / 1000 gives mL -- confirm.
        voxel_volume = (dx*dy*dz)/1000

        mask = nib.load(truth_dir / file).get_fdata()

        for label_idx in range(1, len(labels)): # skip background
            hemorrhage_mask = (mask == label_idx) # voxels of this lesion type
            if not hemorrhage_mask.any():
                continue

            # connectivity=1: only face-adjacent voxels belong to the same cluster
            label_mask, num = sk.measure.label(hemorrhage_mask.astype(int), return_num=True, connectivity=1)
            if verbose:
                print('number of ' + str(labels[label_idx]) + ' clusters: ' + str(num))

            # Voxel count of every cluster in a single pass (index 0 is background),
            # so small clusters are rejected without scanning the volume again.
            cluster_sizes = np.bincount(label_mask.ravel(), minlength=num + 1)

            for cluster_idx in range(1, num + 1):
                voxel_count = int(cluster_sizes[cluster_idx])
                if verbose:
                    print('cluster idx: '+str(cluster_idx)+', count: '+str(voxel_count))
                if voxel_count <= cluster_threshold:
                    continue

                cluster = (label_mask == cluster_idx)

                # number of slices along the third array axis that contain this cluster
                num_slices = np.count_nonzero(cluster.any(axis=(0, 1)))

                hemorrhage_volume = voxel_count * voxel_volume

                z_dist = num_slices * dz

                # calculate mean and median HU over the cluster's voxels;
                # boolean indexing returns a copy, so `image` is left untouched
                lesion_hu = image[cluster]
                # NOTE(review): values below -500 are replaced by 0 before averaging
                # (presumably to neutralise air/padding voxels) -- confirm intent.
                lesion_hu[lesion_hu < -500] = 0
                mean_HU = np.mean(lesion_hu)
                median_HU = np.median(lesion_hu)

                rows.append(['BHSD', str(file), labels[label_idx], hemorrhage_volume, mean_HU, median_HU, dx, dy, dz, num_slices, z_dist])

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(csv_path)
    

Found BHSD spreadsheet
