In [1]:
import os
import pandas as pd
import numpy as np
import random
from utils import from_np_array
import json
from monai.data.box_utils import convert_box_mode

In [2]:
# Load the cleaned nodule table. The 'spacing' and 'origin' columns are routed
# through the project helper from_np_array while parsing.
# NOTE(review): presumably this turns the stringified arrays back into numpy
# arrays -- confirm against utils.from_np_array.
content = pd.read_csv(
    '/data/output/hc_clean_nodules3.csv',
    converters=dict.fromkeys(('spacing', 'origin'), from_np_array),
)
print(content.shape)
content.head()

(101, 10)


Unnamed: 0,name,spacing,origin,exam_refined,x_min,y_min,z_min,x_max,y_max,z_max
0,PL032303325116347,"[1.25, 0.703125, 0.703125]","[-239.75, 121.796875, 173.59687805]",False,175,171,62,201,200,76
1,PL061850842690412,"[1.25, 0.703125, 0.703125]","[-241.5, 390.02319336, 255.0231781]",False,455,256,227,487,288,240
2,PL064775189851313,"[1.25, 0.703125, 0.703125]","[-13.5, 372.078125, 234.078125]",False,206,231,195,240,260,208
3,PL070153620449075,"[1.25, 0.703125, 0.703125]","[16.5, 369.02319336, 246.0231781]",False,445,330,222,514,388,247
4,PL075999918976403,"[1.25, 0.703125, 0.703125]","[356.99996948, 308.09777832, 218.09777832]",False,359,208,181,419,266,216


### Based on the bounding boxes found, compute the center and diameter of each nodule

### converting voxel to world coordinates
### reference: https://github.com/xjmxmt/LUNA16/blob/master/full_prep.py#L286

In [3]:
# Build one detection record per CSV row. Each record holds a single box as
# [center_x, center_y, center_z, size_x, size_y, size_z] in world coordinates,
# the relative path of the image, and one label per box.
dataset = []
for row in content.itertuples(index=False):
    # itertuples yields namedtuples, so columns are read by name rather than by
    # position; the cell then stays correct if the CSV column order changes.

    # spacing/origin are reversed so their axis order lines up with the
    # (x, y, z) order of the box columns.
    # NOTE(review): presumably the CSV stores them as (z, y, x) -- confirm.
    origin = np.array(list(reversed(row.origin)))
    spacing = np.array(list(reversed(row.spacing)))

    voxel_min = np.array((row.x_min, row.y_min, row.z_min))
    voxel_max = np.array((row.x_max, row.y_max, row.z_max))

    # Midpoint and unsigned extent of the box. Both are independent of which
    # corner is stored as "min", so a reversed box can no longer yield a
    # shifted center or a negative size.
    voxel_coord_cent = (voxel_min + voxel_max) / 2
    voxel_coord_diam = np.absolute(voxel_max - voxel_min)

    # The z index is negated before the conversion below. Combined with the
    # final "* -1", z ends up as origin + index * spacing, while x and y end up
    # as origin - index * spacing (see the LUNA16 reference linked above).
    voxel_coord_cent[2] = voxel_coord_cent[2] * -1

    world_coord_cent = ((voxel_coord_cent * spacing) - origin) * -1
    world_coord_diam = voxel_coord_diam * spacing

    dataset.append({
        # tolist() yields plain Python floats, which json.dump always accepts.
        "box": [world_coord_cent.tolist() + world_coord_diam.tolist()],
        "image": row.name + '/' + row.name + '.nii.gz',
        "label": [0]
    })

### Data split: held-out test set plus 10 random train/validation resplits

In [4]:
# Split the records into a held-out test set and ten random train/validation
# resplits of the remainder, writing every split to its own JSON file.
# Note: the ten "folds" are independent random resplits of the same
# train/val pool, so their validation sets may overlap.

SPLIT_SEED = 42  # fixed so Restart & Run All reproduces the same split files
test_split_constant = 0.8  # fraction of all records kept for train/val
val_split_constant = 0.8   # fraction of train/val records kept for training

# Private generator: reproducible without touching the global `random` state.
split_rng = random.Random(SPLIT_SEED)

# Shuffle a copy so `dataset` keeps its original order and length; the cell
# can then be re-run and still split the full set of records.
all_files = list(dataset)
split_rng.shuffle(all_files)
print(f'total: {len(all_files)}')

n_train_val = int(len(all_files) * test_split_constant)
first_train_files, test_files = all_files[:n_train_val], all_files[n_train_val:]
print(f'train/val: {len(first_train_files)} test: {len(test_files)}')

# `with` closes the file even if json.dump raises.
with open('/data/output/multi_data/hc_test.json', 'w') as save_file:
    json.dump({"test": test_files}, save_file, indent=5)

for fold_index in range(0, 10):
    # A separate name is used here so the loop never rebinds `dataset`.
    fold_files = first_train_files.copy()
    split_rng.shuffle(fold_files)

    n_train = int(len(fold_files) * val_split_constant)
    train_files, val_files = fold_files[:n_train], fold_files[n_train:]

    if fold_index == 0:
        print(f'train: {len(train_files)} val: {len(val_files)}')

    fold_path = '/data/output/multi_data/hc_train_val' + str(fold_index) + '.json'
    with open(fold_path, 'w') as save_file:
        json.dump(
            {"training": train_files, "validation": val_files},
            save_file,
            indent=5,
        )

total: 101
train/val: 80 test: 21
train: 64 val: 16
