In [1]:
import os
import pandas as pd
import numpy as np
import random
from utils import from_np_array
import json

In [2]:
# Show the files available in the output directory. os.listdir returns names
# in arbitrary order, so sort them to keep the displayed listing stable.
sorted(os.listdir('/data/output'))

['hc_clean_nodules.csv',
 'hc_nodules.csv',
 'hc_test.json',
 'hc_train_val.json',
 'msd_clean_nodules.csv',
 'msd_nodules.csv',
 'msd_test.json',
 'msd_train_val.json']

In [3]:
# Read the cleaned MSD nodule table. Every cell of the 'spacing' and 'origin'
# columns is passed through from_np_array while the CSV is parsed.
content = pd.read_csv(
    '/data/output/msd_clean_nodules.csv',
    converters=dict.fromkeys(('spacing', 'origin'), from_np_array),
)
print(content.shape)
content.head()

(63, 10)


Unnamed: 0,name,spacing,origin,exam_refined,x_min,y_min,z_min,x_max,y_max,z_max
0,lung_001,"[1.0, 0.69335938, 0.69335938]","[-305.0, 40.15332031, -182.15332031]",False,113,257,231,139,278,251
1,lung_003,"[1.24565971, 0.9375, 0.9375]","[-377.75, 232.86250305, -252.8999939]",False,386,234,138,402,255,148
2,lung_004,"[1.245471, 0.828125, 0.828125]","[-410.5, 211.171875, -207.8999939]",False,323,314,217,343,337,232
3,lung_005,"[1.0, 0.65429688, 0.65429688]","[-326.0, 0.172851562, -163.172852]",False,116,267,155,164,313,181
4,lung_006,"[0.625, 0.97656202, 0.97656202]","[-360.25, 229.02319336, -250.0]",False,285,186,447,313,213,487


### Compute the center and diameter of each nodule from its bounding box

In [4]:
# Turn every voxel-space bounding box in `content` into one dataset entry whose
# "box" is [center_x, center_y, center_z, size_x, size_y, size_z], computed as
# origin + voxel_index * spacing.
hc_dataset = []
for hc_bb in content.itertuples(index=False):

    # Read columns by name rather than by position so the loop does not depend
    # on the column order of the CSV.
    voxel_min = np.array((hc_bb.x_min, hc_bb.y_min, hc_bb.z_min))
    voxel_max = np.array((hc_bb.x_max, hc_bb.y_max, hc_bb.z_max))

    # origin/spacing are reversed to line up with the (x, y, z) voxel columns
    # (presumably they are stored as (z, y, x) — TODO confirm against the
    # script that wrote the CSV).
    origin = np.asarray(hc_bb.origin, dtype=float)[::-1]
    spacing = np.asarray(hc_bb.spacing, dtype=float)[::-1]

    world_coord_min = origin + voxel_min * spacing
    world_coord_max = origin + voxel_max * spacing

    world_coord_cent = (world_coord_min + world_coord_max) / 2
    # A box size must be non-negative: take max - min (not min - max), and use
    # abs() so the result stays non-negative even if a spacing is negative.
    world_coord_diam = np.abs(world_coord_max - world_coord_min)

    hc_dataset.append({
        # .tolist() yields plain Python floats, which json serializes directly.
        "box": [[*world_coord_cent.tolist(), *world_coord_diam.tolist()]],
        "image": hc_bb.name + '.nii.gz',
        "label": [0]
    })

### Split the data into train, validation and test sets

In [5]:
# Split hc_dataset into train / validation / test lists.
#
# The shuffle uses a dedicated, seeded RNG and returns a new list, so the split
# is reproducible and hc_dataset itself is left untouched; re-running this cell
# therefore always yields the same three lists.
SPLIT_SEED = 0
shuffled_dataset = random.Random(SPLIT_SEED).sample(hc_dataset, k=len(hc_dataset))
print('total: ' + str(len(shuffled_dataset)))

# Fraction of all cases kept for train/val; the remainder becomes the test set.
test_split_constant = 0.8
# Explicit split indices (instead of negative slicing by size) stay correct
# even when the held-out part is empty: lst[:-0] would be [] rather than lst.
train_val_count = int(len(shuffled_dataset) * test_split_constant)
train_val_files = shuffled_dataset[:train_val_count]
test_files = shuffled_dataset[train_val_count:]
print('train/val: ' + str(len(train_val_files)) + ' test: ' + str(len(test_files)))

# Fraction of the train/val cases used for training; the rest is validation.
val_split_constant = 0.8
train_count = int(len(train_val_files) * val_split_constant)
train_files = train_val_files[:train_count]
val_files = train_val_files[train_count:]
print('train: ' + str(len(train_files)) + ' val: ' + str(len(val_files)))

total: 63
train/val: 50 test: 13
train: 40 val: 10


### Save the test split as JSON

In [6]:
# Write the held-out test cases to msd_test.json under the "test" key.
value = {
    "test": test_files
}
# The context manager closes the file even if json.dump raises.
with open('/data/output/msd_test.json', 'w') as save_file:
    json.dump(value, save_file, indent=5)

### Save the train and validation splits as JSON

In [7]:
# Write the training and validation cases to msd_train_val.json under the
# "training" and "validation" keys.
value = {
    "training": train_files,
    "validation": val_files
}
# The context manager closes the file even if json.dump raises.
with open('/data/output/msd_train_val.json', 'w') as save_file:
    json.dump(value, save_file, indent=5)