In [1]:
import os
import pandas as pd
import numpy as np
import random
from utils import from_np_array
import json

In [2]:
# Show which artefacts already exist in the output directory.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# displayed listing stable between runs and filesystems.
sorted(os.listdir('/data/output'))

['hc_clean_nodules.csv',
 'hc_nodules.csv',
 'hc_test.json',
 'hc_train_val.json',
 'msd_clean_nodules.csv',
 'msd_nodules.csv']

In [3]:
# Read the cleaned nodule bounding-box table. The 'spacing' and 'origin'
# cells are passed through `from_np_array` while parsing (presumably turning
# their text form back into numeric sequences -- confirm in utils).
content = pd.read_csv(
    '/data/output/hc_clean_nodules.csv',
    converters=dict.fromkeys(('spacing', 'origin'), from_np_array),
)
# Print (rows, columns), then let the notebook render the first rows.
print(content.shape)
content.head()

(101, 10)


Unnamed: 0,name,spacing,origin,exam_refined,x_min,y_min,z_min,x_max,y_max,z_max
0,PL032303325116347,"[1.25, 0.703125, 0.703125]","[-239.75, -237.5, -185.699997]",False,311,312,62,337,341,76
1,PL061850842690412,"[1.0, 0.976562, 0.976562]","[-241.5, -109.0, -244.0]",False,161,302,278,186,328,301
2,PL064775189851313,"[0.75, 0.921875, 0.921875]","[-13.5, -99.0, -237.0]",False,328,313,324,355,337,346
3,PL070153620449075,"[1.0, 0.976562, 0.976562]","[16.5, -130.0, -253.0]",False,140,230,276,195,275,310
4,PL075999918976403,"[1.0000001, 0.902344, 0.902344]","[356.999969, -153.0, -243.0]",False,175,298,222,235,352,272


### Compute the center and diameter of each nodule from its bounding box

In [4]:
# Turn every voxel-space bounding box into one dataset record:
#   {"box": [[cx, cy, cz, dx, dy, dz]], "image": <relative path>, "label": [0]}
# where (cx, cy, cz) is the box center and (dx, dy, dz) its extent, both
# obtained as origin + voxel * spacing.
hc_dataset = []
for hc_bb in content.itertuples(index=False):

    # Box corners in voxel indices, ordered (x, y, z). Columns are read by
    # name rather than by position so a reordered CSV cannot silently
    # mix up the axes.
    voxel_min = np.array((hc_bb.x_min, hc_bb.y_min, hc_bb.z_min))
    voxel_max = np.array((hc_bb.x_max, hc_bb.y_max, hc_bb.z_max))

    # `origin` and `spacing` are flipped so that they line up with the
    # (x, y, z) voxel corners -- presumably they are stored as (z, y, x);
    # TODO confirm against the code that wrote the CSV.
    origin = np.asarray(hc_bb.origin)[::-1]
    spacing = np.asarray(hc_bb.spacing)[::-1]

    world_coord_min = origin + voxel_min*spacing
    world_coord_max = origin + voxel_max*spacing

    world_coord_cent = (world_coord_min + world_coord_max)/2
    # A box size has to be non-negative: take max - min (not min - max), and
    # use the absolute value so an axis with negative spacing is still valid.
    world_coord_diam = np.abs(world_coord_max - world_coord_min)

    hc_dataset.append({
        # .tolist() yields plain Python floats, which json can serialise.
        "box": [[*world_coord_cent.tolist(), *world_coord_diam.tolist()]],
        "image": hc_bb.name + '/303 Unnamed Series.nrrd',
        "label": [0]
    })

### Data split (train / validation / test)

In [5]:
# Split the records into train / validation / test subsets.
#
# A private, seeded RNG shuffles a *copy* of `hc_dataset`, so the split is
# reproducible under Restart & Run All and this cell is idempotent:
# re-running it neither reshuffles nor shrinks `hc_dataset`.
split_seed = 42
shuffled_files = list(hc_dataset)
random.Random(split_seed).shuffle(shuffled_files)
print('total: ' + str(len(shuffled_files)))

# NOTE: despite its name, this is the fraction kept for train/val;
# the remainder becomes the test split.
test_split_constant = 0.8
train_val_count = int(len(shuffled_files)*test_split_constant)
train_val_files = shuffled_files[:train_val_count]
test_files = shuffled_files[train_val_count:]
print('train/val: ' + str(len(train_val_files)) + ' test: ' + str(len(test_files)))

# Fraction of the train/val pool used for training; the rest is validation.
val_split_constant = 0.8
train_count = int(len(train_val_files)*val_split_constant)
train_files = train_val_files[:train_count]
val_files = train_val_files[train_count:]
print('train: ' + str(len(train_files)) + ' val: ' + str(len(val_files)))

total: 101
train/val: 80 test: 21
train: 64 val: 16


### Save the test split as JSON

In [6]:
# Write the held-out test records to disk under the "test" key.
value = {
    "test": test_files
}
# The context manager closes the file even if json.dump raises.
with open('/data/output/hc_test.json', 'w') as save_file:
    json.dump(value, save_file, indent=5)

### Save the training and validation splits as JSON

In [7]:
# Write the training and validation records to a single JSON file, under the
# "training" and "validation" keys.
value = {
    "training": train_files,
    "validation": val_files
}
# The context manager closes the file even if json.dump raises.
with open('/data/output/hc_train_val.json', 'w') as save_file:
    json.dump(value, save_file, indent=5)