In [1]:
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

Global variables:

In [15]:
DATA_BASE_PATH = '/data/'
PNG_DIR = 'png_256/'
SEED = 2020
DEBUG_MODE = True

NUM_SUBJ_PER_SPLIT = 50

## Load CSVs

### Load the extracted image meta data

In [3]:
train_img_stats = pd.read_csv(DATA_BASE_PATH + PNG_DIR + 'train_img_stats.csv')
train_img_stats = train_img_stats.set_index('filename')

In [4]:
print(
    "In the training set there are...\n"
    "{} images corresponding to \n"
    "{} studies, \n"
    "{} series, and \n"
    "{} patients.".format(
        train_img_stats.shape[0],
        train_img_stats['study_instance_ID'].unique().size,
        train_img_stats['series_instance_ID'].unique().size,
        train_img_stats['patient_ID'].unique().size)
)

print(
    "That is, {} images per patient on average.".format(
        train_img_stats.shape[0] / train_img_stats['patient_ID'].unique().size)
)

In the training set there are...
752802 images corresponding to 
21744 studies, 
21744 series, and 
18938 patients.
That is, 39.75087126412504 images per patient on average.


This is the Stage 2 training dataset (752802 images, while the Stage 1 training set includes 674262 images).

In [5]:
train_img_stats.head()

Unnamed: 0_level_0,patient_ID,study_instance_ID,series_instance_ID,study_ID,bits_allocated,bits_stored,pixel_representation,window_center,window_width,intercept,...,pixel_perc_50.5,pixel_perc_56.0,pixel_perc_61.5,pixel_perc_67.0,pixel_perc_72.5,pixel_perc_78.0,pixel_perc_83.5,pixel_perc_89.0,pixel_perc_94.5,pixel_perc_100.0
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID_ee9b0a202.png,ID_ea32a910,ID_1790cc3a1f,ID_e151adeb6a,,16,12,0,36,80,-1024,...,-997.0,-988.0,-869.0,-719.0,-204.0,-48.0,2.0,26.0,79.0,1323.0
ID_4166af295.png,ID_72a9811c,ID_4863457d29,ID_02da87fe38,,16,12,0,36,80,-1024,...,-1001.0,-1000.0,-999.0,-998.0,-996.0,-993.0,-961.0,-922.0,-800.0,340.0
ID_9905e72aa.png,ID_6b6db689,ID_59997b389c,ID_07e6ba26b7,,16,16,1,30,80,-1024,...,-1006.0,-1006.0,-1005.0,-1003.0,-1000.0,-978.0,-973.0,-966.0,-922.0,397.0
ID_cb6cbd668.png,ID_8cccd24b,ID_65aec600cf,ID_693d27318e,,16,16,1,30,80,-1024,...,-932.0,-828.0,-65.0,17.0,28.0,33.0,37.0,46.0,257.0,1809.0
ID_41eb170e8.png,ID_819de77c,ID_160028dd74,ID_906ccc12e2,,16,12,0,40,80,-1024,...,-936.0,-913.0,-833.0,-220.0,-82.0,-32.0,10.0,38.0,176.0,1473.0


In [6]:
train_img_stats.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
study_ID,0.0,,,,,,,
bits_allocated,752802.0,16.0,0.0,16.0,16.0,16.0,16.0,16.0
bits_stored,752802.0,14.215972,1.988306,12.0,12.0,16.0,16.0,16.0
pixel_representation,752802.0,0.557064,0.496733,0.0,0.0,1.0,1.0,1.0
window_center,752802.0,35.47335,19.404427,25.0,30.0,36.0,36.0,800.0
window_width,752802.0,93.97396,116.141843,26.0,80.0,80.0,80.0,4095.0
intercept,752802.0,-1016.460623,86.316635,-1024.0,-1024.0,-1024.0,-1024.0,1.0
slope,752802.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
pixel_perc_1.0,752802.0,-1930.243065,946.213481,-31933.0,-3024.0,-2048.0,-1008.0,-948.0
pixel_perc_6.5,752802.0,-1928.105226,947.923986,-21707.705,-3024.0,-2048.0,-1004.0,-784.0


### Load the image labels

In [7]:
pivot_df = pd.read_csv(DATA_BASE_PATH + PNG_DIR + 'pivot_df.csv')
del pivot_df['Unnamed: 0']
pivot_df = pivot_df.set_index('filename')
pivot_df.head()

Unnamed: 0_level_0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ID_000012eaf.png,0,0,0,0,0,0
ID_000039fa0.png,0,0,0,0,0,0
ID_00005679d.png,0,0,0,0,0,0
ID_00008ce3c.png,0,0,0,0,0,0
ID_0000950d7.png,0,0,0,0,0,0


In [8]:
"Out of {} images there are {} IHC positive images.".format(
    pivot_df.shape[0],
    np.array(pivot_df['any'], dtype="int").sum()
)

'Out of 752802 images there are 107933 IHC positive images.'

### Combine the two data frames

In [9]:
print(train_img_stats.shape)
print(pivot_df.shape)

(752802, 30)
(752802, 6)


In [10]:
image_df = train_img_stats.join(pivot_df)
image_df.shape

(752802, 36)

In [11]:
image_df.head()

Unnamed: 0_level_0,patient_ID,study_instance_ID,series_instance_ID,study_ID,bits_allocated,bits_stored,pixel_representation,window_center,window_width,intercept,...,pixel_perc_83.5,pixel_perc_89.0,pixel_perc_94.5,pixel_perc_100.0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID_ee9b0a202.png,ID_ea32a910,ID_1790cc3a1f,ID_e151adeb6a,,16,12,0,36,80,-1024,...,2.0,26.0,79.0,1323.0,0,0,0,0,0,0
ID_4166af295.png,ID_72a9811c,ID_4863457d29,ID_02da87fe38,,16,12,0,36,80,-1024,...,-961.0,-922.0,-800.0,340.0,0,0,0,0,0,0
ID_9905e72aa.png,ID_6b6db689,ID_59997b389c,ID_07e6ba26b7,,16,16,1,30,80,-1024,...,-973.0,-966.0,-922.0,397.0,0,0,0,0,0,0
ID_cb6cbd668.png,ID_8cccd24b,ID_65aec600cf,ID_693d27318e,,16,16,1,30,80,-1024,...,37.0,46.0,257.0,1809.0,0,0,0,0,0,0
ID_41eb170e8.png,ID_819de77c,ID_160028dd74,ID_906ccc12e2,,16,12,0,40,80,-1024,...,10.0,38.0,176.0,1473.0,0,0,0,0,0,0


In [12]:
patient_df = (image_df['any']
              .groupby(image_df.patient_ID)
              .agg(['count', 'sum'])
              .reset_index())
patient_df['any'] = np.array(patient_df['sum'].values > 0, dtype='int')

print(patient_df.describe())
print(patient_df.shape)
patient_df.head()

              count           sum           any
count  18938.000000  18938.000000  18938.000000
mean      39.750871      5.699282      0.404055
std       22.448687     10.040475      0.490721
min       20.000000      0.000000      0.000000
25%       32.000000      0.000000      0.000000
50%       34.000000      0.000000      0.000000
75%       40.000000     10.000000      1.000000
max      548.000000    167.000000      1.000000
(18938, 4)


Unnamed: 0,patient_ID,count,sum,any
0,ID_0002cd41,36,0,0
1,ID_00054f3f,31,0,0
2,ID_0006d192,40,0,0
3,ID_00086119,40,0,0
4,ID_000e5623,67,0,0


So, about 40% of patients are IHC positive.
Each patient has about 40 images on average with a standard deviation of about 22.
The lowest number of images that a patient has is 20.

In [13]:
"{} patients of {} are IHC positive.".format(
    patient_df['any'].values.sum(),
    patient_df.shape[0]
)

'7652 patients of 18938 are IHC positive.'

We will split the dataset into subsets of about `NUM_SUBJ_PER_SPLIT` patients.

In [14]:
num_splits = patient_df.shape[0] // NUM_SUBJ_PER_SPLIT
print(num_splits)
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)
skf_split = skf.split(X=patient_df['patient_ID'].values, y=patient_df['any'].values)

378


In [16]:
group_arr = np.zeros(patient_df.shape[0], dtype="int")
group_size_arr = np.zeros(patient_df.shape[0], dtype="int")

for i, (_, test_idx) in enumerate(skf_split):
    idx = list(test_idx)
    idx_size = len(idx)
    if DEBUG_MODE:
        print(idx, end="")
        print(" | length: {}".format(idx_size))
    group_arr[idx] = i
    group_size_arr[idx] = idx_size
    
patient_df['group'] = group_arr
patient_df['group_size'] = group_size_arr

[93, 400, 808, 1223, 1694, 1760, 2683, 2729, 3170, 3646, 4297, 4305, 4998, 5057, 5556, 6250, 6347, 7065, 7862, 7868, 8337, 8843, 9555, 9738, 10029, 10965, 11352, 11524, 11609, 12065, 12686, 13036, 13284, 13501, 13580, 13617, 13900, 14009, 14230, 14411, 14424, 16392, 17001, 17008, 17163, 17676, 17795, 17849, 18021, 18732, 18916] | length: 51
[178, 222, 321, 1192, 1312, 2107, 2197, 2519, 2557, 2692, 2849, 3365, 3951, 4001, 4257, 5275, 5842, 5974, 6027, 6198, 6423, 6637, 7035, 7123, 7638, 8199, 9298, 9392, 9755, 10284, 10836, 11111, 11590, 11741, 12252, 13751, 13791, 14111, 14152, 14699, 15605, 15652, 16159, 16280, 17611, 17728, 17929, 18035, 18616, 18872, 18879] | length: 51
[262, 1257, 1560, 1564, 2584, 2862, 2902, 3624, 3859, 4096, 4275, 4468, 4756, 4880, 4912, 4976, 5117, 5328, 5849, 6073, 7440, 8484, 8705, 8772, 8884, 9013, 10010, 10525, 10911, 11675, 12288, 12652, 13013, 13371, 13620, 14352, 14357, 14366, 14919, 15159, 15330, 15373, 15402, 15912, 16139, 16566, 16719, 17635, 17730, 1

In [18]:
# sanity check
if DEBUG_MODE:
    pd.set_option('display.max_rows', 150)
    print(patient_df.sort_values(by=['group', 'any'])[:150])
else:
    print(patient_df.sort_values(by='group'))

        patient_ID  count  sum  any  group  group_size
93     ID_013cf2a2     44    0    0      0          51
808    ID_0a848db2     48    0    0      0          51
1694   ID_16d5c059     35    0    0      0          51
3170   ID_29f7dd66     80    0    0      0          51
3646   ID_30a5abba     34    0    0      0          51
4305   ID_39d9bb97     28    0    0      0          51
5057   ID_43f45c2b     33    0    0      0          51
5556   ID_4a9f6617     40    0    0      0          51
6250   ID_542aa80e     40    0    0      0          51
6347   ID_55a2ce03     28    0    0      0          51
7868   ID_6a1be720     32    0    0      0          51
8337   ID_70059bda     32    0    0      0          51
8843   ID_77637b50     46    0    0      0          51
10965  ID_92bdbed5     40    0    0      0          51
11352  ID_97eb51e8     33    0    0      0          51
11524  ID_9aa08eb2     33    0    0      0          51
11609  ID_9bbfc1ec     40    0    0      0          51
12686  ID_

### 100 training sets, 100 (reusable) test sets, and one large "lock-box" test set for the ThresholdoutAUC experiments

- Keep 200 of the groups of size `NUM_SUBJ_PER_SPLIT` as the training and testing datasets.
- Join the remaining groups together into a large "lock-box" testing dataset.

Number of groups of each size:

In [19]:
group_counts = patient_df['group'].value_counts().value_counts()
# (the first value_counts() counts number of items per each of the 378 groups,
# the second value_counts() then counts the number of groups of each size respectively)
group_counts = group_counts.to_dict()
group_counts

{50: 232, 51: 92, 49: 54}

In [20]:
group_df = (patient_df[['group', 'group_size']]
            .drop_duplicates()
            .sort_values(by='group')
            .reset_index(drop=True))
group_df

Unnamed: 0,group,group_size
0,0,51
1,1,51
2,2,51
3,3,51
4,4,51
...,...,...
373,373,49
374,374,49
375,375,49
376,376,49


In [21]:
random.seed(SEED)
idx = random.sample(range(group_counts[NUM_SUBJ_PER_SPLIT]), 200)

train_groups = list(
    group_df['group'][group_df['group_size'] == NUM_SUBJ_PER_SPLIT]
    .values[idx[:100]]
)
test_groups = list(
    group_df['group'][group_df['group_size'] == NUM_SUBJ_PER_SPLIT]
    .values[idx[100:]]
)

# sanity check
print(train_groups[:5])
print(np.unique(train_groups).shape)
print(test_groups[:5])
print(np.unique(test_groups).shape)

[250, 323, 136, 263, 288]
(100,)
[172, 280, 99, 214, 194]
(100,)


In [22]:
group_df['dataset'] = pd.Series(['unknown' for _ in range(group_df.shape[0])])
group_df.loc[group_df['group'].isin(train_groups), 'dataset'] = "train"
group_df.loc[group_df['group'].isin(test_groups), 'dataset'] = "test"
group_df.loc[~group_df['dataset'].isin(["train", "test"]), 'dataset'] = "lockbox"

# sanity check
print(group_df['dataset'].value_counts())
print(group_df[group_df.dataset.isin(["train", "test"])]
      .group_size.value_counts())
group_df

lockbox    178
test       100
train      100
Name: dataset, dtype: int64
50    200
Name: group_size, dtype: int64


Unnamed: 0,group,group_size,dataset
0,0,51,lockbox
1,1,51,lockbox
2,2,51,lockbox
3,3,51,lockbox
4,4,51,lockbox
...,...,...,...
373,373,49,lockbox
374,374,49,lockbox
375,375,49,lockbox
376,376,49,lockbox


#### Save three CSV files, one for the training datasets, one for the test datasets, and one for the "lock box" data

In [23]:
patient_df = pd.merge(
    patient_df, group_df, how="left",
    on=["group", "group_size"]
)
patient_df.rename(columns={"any": "any(patient-level)"}, inplace=True)
patient_df

Unnamed: 0,patient_ID,count,sum,any(patient-level),group,group_size,dataset
0,ID_0002cd41,36,0,0,292,50,test
1,ID_00054f3f,31,0,0,63,51,lockbox
2,ID_0006d192,40,0,0,33,51,lockbox
3,ID_00086119,40,0,0,115,50,train
4,ID_000e5623,67,0,0,46,51,lockbox
...,...,...,...,...,...,...,...
18933,ID_ffedaf23,37,0,0,203,50,test
18934,ID_ffee3094,34,0,0,263,50,train
18935,ID_fff140ff,36,6,1,263,50,train
18936,ID_fff502d5,36,11,1,76,51,lockbox


In [24]:
# sanity check
patient_df.isnull().values.any()

False

In [25]:
image_df.reset_index(inplace=True)
print(image_df.shape)
image_df.head()

(752802, 37)


Unnamed: 0,filename,patient_ID,study_instance_ID,series_instance_ID,study_ID,bits_allocated,bits_stored,pixel_representation,window_center,window_width,...,pixel_perc_83.5,pixel_perc_89.0,pixel_perc_94.5,pixel_perc_100.0,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_ee9b0a202.png,ID_ea32a910,ID_1790cc3a1f,ID_e151adeb6a,,16,12,0,36,80,...,2.0,26.0,79.0,1323.0,0,0,0,0,0,0
1,ID_4166af295.png,ID_72a9811c,ID_4863457d29,ID_02da87fe38,,16,12,0,36,80,...,-961.0,-922.0,-800.0,340.0,0,0,0,0,0,0
2,ID_9905e72aa.png,ID_6b6db689,ID_59997b389c,ID_07e6ba26b7,,16,16,1,30,80,...,-973.0,-966.0,-922.0,397.0,0,0,0,0,0,0
3,ID_cb6cbd668.png,ID_8cccd24b,ID_65aec600cf,ID_693d27318e,,16,16,1,30,80,...,37.0,46.0,257.0,1809.0,0,0,0,0,0,0
4,ID_41eb170e8.png,ID_819de77c,ID_160028dd74,ID_906ccc12e2,,16,12,0,40,80,...,10.0,38.0,176.0,1473.0,0,0,0,0,0,0


In [26]:
image_df = pd.merge(
    image_df, patient_df, how="left",
    on="patient_ID"
)
print(image_df.dataset.value_counts())
print(image_df.drop("study_ID", axis=1).isnull().values.any())
print(list(image_df.columns))
image_df

lockbox    355878
train      198543
test       198381
Name: dataset, dtype: int64
False
['filename', 'patient_ID', 'study_instance_ID', 'series_instance_ID', 'study_ID', 'bits_allocated', 'bits_stored', 'pixel_representation', 'window_center', 'window_width', 'intercept', 'slope', 'pixel_perc_1.0', 'pixel_perc_6.5', 'pixel_perc_12.0', 'pixel_perc_17.5', 'pixel_perc_23.0', 'pixel_perc_28.5', 'pixel_perc_34.0', 'pixel_perc_39.5', 'pixel_perc_45.0', 'pixel_perc_50.5', 'pixel_perc_56.0', 'pixel_perc_61.5', 'pixel_perc_67.0', 'pixel_perc_72.5', 'pixel_perc_78.0', 'pixel_perc_83.5', 'pixel_perc_89.0', 'pixel_perc_94.5', 'pixel_perc_100.0', 'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'count', 'sum', 'any(patient-level)', 'group', 'group_size', 'dataset']


Unnamed: 0,filename,patient_ID,study_instance_ID,series_instance_ID,study_ID,bits_allocated,bits_stored,pixel_representation,window_center,window_width,...,intraparenchymal,intraventricular,subarachnoid,subdural,count,sum,any(patient-level),group,group_size,dataset
0,ID_ee9b0a202.png,ID_ea32a910,ID_1790cc3a1f,ID_e151adeb6a,,16,12,0,36,80,...,0,0,0,0,106,19,1,309,50,train
1,ID_4166af295.png,ID_72a9811c,ID_4863457d29,ID_02da87fe38,,16,12,0,36,80,...,0,0,0,0,52,0,0,163,50,test
2,ID_9905e72aa.png,ID_6b6db689,ID_59997b389c,ID_07e6ba26b7,,16,16,1,30,80,...,0,0,0,0,32,0,0,354,49,lockbox
3,ID_cb6cbd668.png,ID_8cccd24b,ID_65aec600cf,ID_693d27318e,,16,16,1,30,80,...,0,0,0,0,40,0,0,164,50,lockbox
4,ID_41eb170e8.png,ID_819de77c,ID_160028dd74,ID_906ccc12e2,,16,12,0,40,80,...,0,0,0,0,30,18,1,223,50,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752797,ID_5c685fee9.png,ID_12e40fcb,ID_d9289716fa,ID_b32d2017d3,,16,16,1,30,80,...,0,0,0,0,40,0,0,219,50,lockbox
752798,ID_bdee7e35f.png,ID_e28a3e08,ID_7bb8571fc9,ID_57fe308b71,,16,16,1,30,80,...,0,0,0,0,88,4,1,103,50,train
752799,ID_2d1ba36eb.png,ID_9084a0f3,ID_3186490652,ID_60910114a1,,16,12,0,40,80,...,0,0,0,0,32,0,0,31,51,lockbox
752800,ID_9cd9ee1b2.png,ID_0d102f99,ID_d8ac405c28,ID_6736d1cddc,,16,16,1,30,80,...,0,0,0,0,108,0,0,319,50,train


In [28]:
train_df = image_df[image_df['dataset'] == "train"]
test_df = image_df[image_df['dataset'] == "test"]
lockbox_df = image_df[image_df['dataset'] == "lockbox"]

In [29]:
from pandas.api.types import CategoricalDtype
def recode_group_idx(df):
    cats = sorted(np.unique(df.group.values))
    n_cats = len(cats)
    cat_type = CategoricalDtype(
        categories=cats,
        ordered=True
    )
    a = df.group.astype(cat_type, copy=True)
    a = a.cat.rename_categories(list(range(n_cats)))
    return a.astype("int")

train_df.loc[:, "group"] = recode_group_idx(train_df)
test_df.loc[:, "group"] = recode_group_idx(test_df)
lockbox_df.loc[:, "group"] = recode_group_idx(lockbox_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [43]:
# sanity checks 1

# check that train groups don't overlap with respect to patient_ID
for i, pid in enumerate(np.unique(train_df.patient_ID.values)):
    if i % 1000 == 0:
        print(i, end=",")
    assert np.unique(
        train_df.loc[train_df.patient_ID == pid, :].group.values
    ).shape == (1,)
    
# check that test groups don't overlap with respect to patient_ID
for i, pid in enumerate(np.unique(test_df.patient_ID.values)):
    if i % 1000 == 0:
        print(i, end=",")
    assert np.unique(
        test_df.loc[test_df.patient_ID == pid, :].group.values
    ).shape == (1,)

0,1000,2000,3000,4000,0,1000,2000,3000,4000,

In [44]:
# sanity checks 2

from collections import Counter

# all training datasets should consist of NUM_SUBJ_PER_SPLIT patients each
m = (train_df
 .groupby(train_df.patient_ID)
 .agg(['mean'])
 .loc[:, "group"]
 .sort_values(by=["mean"])
)["mean"].values
print(all([v == NUM_SUBJ_PER_SPLIT for v in Counter(m).values()]))  # each group includes exactly NUM_SUBJ_PER_SPLIT patients

# all testing datasets should consist of NUM_SUBJ_PER_SPLIT patients each
m = (test_df
 .groupby(test_df.patient_ID)
 .agg(['mean'])
 .loc[:, "group"]
 .sort_values(by=["mean"])
)["mean"].values
print(all([v == NUM_SUBJ_PER_SPLIT for v in Counter(m).values()]))

# the lockbox data contains all the remaining groups
m = (lockbox_df
 .groupby(lockbox_df.patient_ID)
 .agg(['mean'])
 .loc[:, "group"]
 .sort_values(by=["mean"]))["mean"].values
print(Counter(list(Counter(m).values())))

True
True
Counter({51: 92, 49: 54, 50: 32})


In [45]:
# save the dataframes to disk
train_splits_csv = 'train_splits_' + str(NUM_SUBJ_PER_SPLIT) + '.csv'
train_df.to_csv(DATA_BASE_PATH + PNG_DIR + train_splits_csv)

test_splits_csv = 'test_splits_' + str(NUM_SUBJ_PER_SPLIT) + '.csv'
test_df.to_csv(DATA_BASE_PATH + PNG_DIR + test_splits_csv)

lockbox_splits_csv = 'lockbox_splits_' + str(NUM_SUBJ_PER_SPLIT) + '.csv'
lockbox_df.to_csv(DATA_BASE_PATH + PNG_DIR + lockbox_splits_csv)

In [47]:
if DEBUG_MODE:
    print(pd.read_csv(DATA_BASE_PATH + PNG_DIR + train_splits_csv))
    print(pd.read_csv(DATA_BASE_PATH + PNG_DIR + test_splits_csv))
    print(pd.read_csv(DATA_BASE_PATH + PNG_DIR + lockbox_splits_csv))

        Unnamed: 0          filename   patient_ID study_instance_ID  \
0                0  ID_ee9b0a202.png  ID_ea32a910     ID_1790cc3a1f   
1                4  ID_41eb170e8.png  ID_819de77c     ID_160028dd74   
2                7  ID_e849bf0c7.png  ID_010c18ba     ID_8da1ab563c   
3                8  ID_f308762e4.png  ID_a95e54ce     ID_3a5287d677   
4               10  ID_667d76de4.png  ID_e3ac70cb     ID_9375ebec9a   
...            ...               ...          ...               ...   
198538      752782  ID_162422044.png  ID_67afc15c     ID_b13a3a2a6d   
198539      752786  ID_dd7cc3550.png  ID_837abeb8     ID_f4977f7746   
198540      752796  ID_391c0cf9d.png  ID_ce94665b     ID_80d70ffcbe   
198541      752798  ID_bdee7e35f.png  ID_e28a3e08     ID_7bb8571fc9   
198542      752800  ID_9cd9ee1b2.png  ID_0d102f99     ID_d8ac405c28   

       series_instance_ID  study_ID  bits_allocated  bits_stored  \
0           ID_e151adeb6a       NaN              16           12   
1          