In [1]:
import os
import pandas as pd
import numpy as np
import random
from utils import from_np_array
import json
from monai.data.box_utils import convert_box_mode

In [2]:
# Load the cleaned nodule table. The 'spacing' and 'origin' cells are run through
# the project helper `from_np_array` while parsing (presumably it turns the stored
# array text back into numeric arrays -- confirm in utils).
array_columns = ('spacing', 'origin')
content = pd.read_csv(
    '/data/output/msd_clean_nodules3.csv',
    converters={column: from_np_array for column in array_columns},
)
print(content.shape)
content.head()

(63, 10)


Unnamed: 0,name,spacing,origin,exam_refined,x_min,y_min,z_min,x_max,y_max,z_max
0,lung_001,"[1.25, 0.703125, 0.703125]","[-305.0, 40.15332031, 172.15332031]",False,371,255,185,392,273,199
1,lung_003,"[1.25, 0.703125, 0.703125]","[-377.75, 232.86250305, 226.1625061]",False,148,314,138,166,338,147
2,lung_004,"[1.25, 0.703125, 0.703125]","[-410.5, 211.171875, 215.2718811]",False,201,371,217,222,396,231
3,lung_005,"[1.25, 0.703125, 0.703125]","[-326.0, 0.172851562, 171.172852]",False,326,250,124,367,289,145
4,lung_006,"[1.25, 0.703125, 0.703125]","[-360.25, 229.02319336, 249.02319336]",False,277,259,224,312,294,244


### Computing the center and diameter of each nodule from its bounding box

In [3]:
# Build one record per row of `content`: a box given as
# [center_0, center_1, center_2, size_0, size_1, size_2] in world units, plus the
# image path and a single class label.
#
# An earlier attempt (world = origin + voxel * spacing for both corners, then
# averaging the corners) was abandoned in favour of the computation below.
msd_dataset = []
for hc_bb in content.itertuples(index=False):
    case_name = hc_bb[0]

    # Stored origin/spacing are used in reverse axis order so that they line up
    # with the (x, y, z) order of the bounding-box columns.
    # presumably the CSV stores them as (z, y, x) -- TODO confirm
    origin = np.asarray(hc_bb[2])[::-1]
    spacing = np.asarray(hc_bb[1])[::-1]

    # Positions 4-6 are the minimum corner, 7-9 the maximum corner (voxel indices).
    voxel_min = np.array(hc_bb[4:7])
    voxel_max = np.array(hc_bb[7:10])
    print(voxel_min, voxel_max)

    # Extent and midpoint of the box, still in voxel units.
    voxel_coord_diam = voxel_max - voxel_min
    voxel_coord_cent = voxel_min + (np.absolute(voxel_min - voxel_max) / 2)
    print(voxel_coord_cent, case_name, origin, spacing)

    # Flip the sign of the third axis before the global negation below. Net effect:
    # axes 0 and 1 become origin - voxel * spacing, axis 2 becomes origin + voxel * spacing.
    # NOTE(review): presumably this matches the orientation of the images -- confirm.
    voxel_coord_cent[2] *= -1

    world_coord_cent = ((voxel_coord_cent * spacing) - origin) * -1
    world_coord_diam = voxel_coord_diam * spacing

    record = {
        "box": [[*world_coord_cent, *world_coord_diam]],
        "image": case_name + '/' + case_name + '.nii.gz',
        "label": [0],
    }
    msd_dataset.append(record)

[371 255 185] [392 273 199]
[381.5 264.  192. ] lung_001 [ 172.15332031   40.15332031 -305.        ] [0.703125 0.703125 1.25    ]
[148 314 138] [166 338 147]
[157.  326.  142.5] lung_003 [ 226.1625061   232.86250305 -377.75      ] [0.703125 0.703125 1.25    ]
[201 371 217] [222 396 231]
[211.5 383.5 224. ] lung_004 [ 215.2718811  211.171875  -410.5      ] [0.703125 0.703125 1.25    ]
[326 250 124] [367 289 145]
[346.5 269.5 134.5] lung_005 [ 1.71172852e+02  1.72851562e-01 -3.26000000e+02] [0.703125 0.703125 1.25    ]
[277 259 224] [312 294 244]
[294.5 276.5 234. ] lung_006 [ 249.02319336  229.02319336 -360.25      ] [0.703125 0.703125 1.25    ]
[320 244 205] [404 332 265]
[362. 288. 235.] lung_009 [ 213.87944031  220.57943726 -350.75      ] [0.703125 0.703125 1.25    ]
[132 102  87] [177 152 119]
[154.5 127.  103. ] lung_010 [ 161.20222473  152.40222168 -283.        ] [0.703125 0.703125 1.25    ]
[362 150 167] [435 211 202]
[398.5 180.5 184.5] lung_014 [ 182.75805664  189.25805664 -377

### Data split (train / validation / test)

In [4]:
# Split the records into train / validation / test lists.
#
# A fixed seed makes the assignment identical on every run; with an unseeded
# shuffle, cases would move between the test set and the training set whenever
# the notebook is re-executed.
split_seed = 42

# Shuffle a copy with a private RNG. `msd_dataset` is neither reordered nor
# rebound here, so re-running this cell always starts from the full dataset.
shuffled_files = list(msd_dataset)
random.Random(split_seed).shuffle(shuffled_files)
print('total: ' + str(len(shuffled_files)))

# Despite the name, this is the fraction KEPT for train+val; the remainder is test.
test_split_constant = 0.8
train_val_count = int(len(shuffled_files) * test_split_constant)
# Slice at an explicit index: `lst[:-k]` / `lst[-k:]` invert when k == 0
# (the first becomes empty and the second becomes the whole list).
train_val_files = shuffled_files[:train_val_count]
test_files = shuffled_files[train_val_count:]
print('train/val: ' + str(len(train_val_files)) + ' test: ' + str(len(test_files)))

# Same scheme again: fraction of the train+val pool kept for training.
val_split_constant = 0.8
train_count = int(len(train_val_files) * val_split_constant)
train_files = train_val_files[:train_count]
val_files = train_val_files[train_count:]
print('train: ' + str(len(train_files)) + ' val: ' + str(len(val_files)))

total: 63
train/val: 50 test: 13
train: 40 val: 10


### saving corresponding json for test

In [5]:
# Write the held-out test records to JSON under the "test" key.
value = {
    "test": test_files
}
# `with` closes the file even if json.dump raises part-way through.
with open('/data/output/msd_test3.json', 'w') as save_file:
    json.dump(value, save_file, indent=5)

### saving corresponding json for train and val

In [6]:
# Write the training and validation records to one JSON file, under the
# "training" and "validation" keys respectively.
value = {
    "training": train_files,
    "validation": val_files
}
# `with` closes the file even if json.dump raises part-way through.
with open('/data/output/msd_train_val3.json', 'w') as save_file:
    json.dump(value, save_file, indent=5)