# __Import & config__

In [1]:
%load_ext autoreload
%autoreload 2
import os
# Switch to the project root so the relative paths used later ('Rtum.csv', 'data/...') resolve.
# The root can be overridden through the DIGIPANCA_ROOT environment variable so the notebook
# can run on another machine; when the variable is unset the original location is used.
os.chdir(os.environ.get('DIGIPANCA_ROOT', 'C:\\Users\\Usuario\\TFG\\digipanca\\'))

In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import glob

# __Load data__

In [64]:
# Read the semicolon-separated case sheet into a DataFrame and display it.
file_path = "Rtum.csv"
df = pd.read_csv(file_path, sep=';')
df

Unnamed: 0,CaseID,Tipo,Fase,Dificultad,Notas
0,Rtum1,ADC borderline,Portal,1,
1,Rtum2,ADC,Arterial,1,
2,Rtum3,ADC borderline,Doble,3,
3,Rtum4,ADC,Portal,1,
4,Rtum5,TNE,Portal,1,
...,...,...,...,...,...
83,Rtum84,TNE,Portal,1,
84,Rtum85,ADC borderline,Doble,1,
85,Rtum86,ADC,Portal,1,
86,Rtum87,ADC,Portal,2,


# __Stats__

## Type

In [65]:
# Number of cases for each raw 'Tipo' (type) label.
tipo_counts = df['Tipo'].value_counts()
print(tipo_counts)

Tipo
ADC                  44
ADC borderline       31
TNE                   9
NMPI                  1
ampuloma              1
Gist duodenal         1
colangiocarcinoma     1
Name: count, dtype: int64


The four types that appear only once — _IPMN_ (NMPI), _ampuloma_, _duodenal GIST_ (Gist duodenal) and _cholangiocarcinoma_ (colangiocarcinoma) — are grouped into a single __Other__ category.

In [66]:
# Normalise the raw labels (trim surrounding whitespace, lowercase) before the lookup.
df['clean_type'] = df['Tipo'].str.strip().str.lower()

# The three frequent types keep a label of their own.
type_map = dict([
    ('adc', 'ADC'),
    ('adc borderline', 'ADC-b'),
    ('tne', 'TNE'),
])

# Any value without an entry in type_map is collapsed into 'Other'.
mapped_type = df['clean_type'].map(type_map)
df['Simplified_type'] = mapped_type.fillna('Other')

print(df['Simplified_type'].value_counts())

Simplified_type
ADC      44
ADC-b    31
TNE       9
Other     4
Name: count, dtype: int64


## Phase

In [67]:
# Number of cases for each raw 'Fase' (phase) label.
fase_counts = df['Fase'].value_counts()
print(fase_counts)

Fase
Doble       42
Portal      27
Arterial    19
Name: count, dtype: int64


In [68]:
# Normalise the raw labels (trim surrounding whitespace, lowercase) before the lookup.
df['clean_phase'] = df['Fase'].str.strip().str.lower()

# 'doble' is folded into 'Portal'; values missing from the mapping end up as NaN.
phase_map = dict(doble='Portal', portal='Portal', arterial='Arterial')

simplified_phase = df['clean_phase'].map(phase_map)
df['Simplified_phase'] = simplified_phase

print(df['Simplified_phase'].value_counts())

Simplified_phase
Portal      69
Arterial    19
Name: count, dtype: int64


## Segmentation difficulty

In [69]:
# Number of cases for each 'Dificultad' (difficulty) value.
dificultad_counts = df['Dificultad'].value_counts()
print(dificultad_counts)

Dificultad
1    48
2    35
3     5
Name: count, dtype: int64


# __Stratify__

1. Create a composite label to stratify on.

In [74]:
# Composite stratum: simplified type and phase label joined by an underscore.
# NOTE(review): this uses the raw 'Fase' column rather than 'Simplified_phase' — confirm this is intended.
type_part = df['Simplified_type'].astype(str)
phase_part = df['Fase'].astype(str)
df['strata_a'] = type_part + '_' + phase_part
print(df['strata_a'].value_counts())

strata_a
ADC-b_Doble       18
ADC_Doble         18
ADC_Portal        16
ADC_Arterial      10
ADC-b_Portal       9
ADC-b_Arterial     4
TNE_Arterial       4
TNE_Doble          3
Other_Doble        3
TNE_Portal         2
Other_Arterial     1
Name: count, dtype: int64


In [62]:
# Composite stratum: simplified type and difficulty value joined by an underscore.
type_part = df['Simplified_type'].astype(str)
difficulty_part = df['Dificultad'].astype(str)
df['strata_b'] = type_part + '_' + difficulty_part
print(df['strata_b'].value_counts())

strata_b
ADC_1      22
ADC_2      20
ADC-b_1    14
ADC-b_2    14
TNE_1       9
ADC-b_3     3
Other_1     3
ADC_3       2
Other_2     1
Name: count, dtype: int64


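2. Neither composite label yields a valid stratified split, because some strata contain a single case (e.g. `Other_Arterial` in `strata_a`, `Other_2` in `strata_b`) and stratification needs at least two cases per stratum. We therefore stratify on the simplified type alone.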

In [63]:
# Hold out 20% of the rows for validation, stratified on the simplified type;
# the fixed random_state makes the split repeatable.
stratify_labels = df['Simplified_type']
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=stratify_labels)

In [72]:
# How the 'Notas' (notes) values are distributed in the training subset.
train_notas_counts = train_df['Notas'].value_counts()
print(train_notas_counts)

Notas
Regular    4
Name: count, dtype: int64


In [73]:
# How the 'Notas' (notes) values are distributed in the validation subset.
val_notas_counts = val_df['Notas'].value_counts()
print(val_notas_counts)

Notas
Regular    1
Name: count, dtype: int64


In [76]:
# Compare the simplified-type proportions of the training and validation subsets.
for subset in (train_df, val_df):
    print(subset['Simplified_type'].value_counts(normalize=True).head())

Simplified_type
ADC      0.500000
ADC-b    0.357143
TNE      0.100000
Other    0.042857
Name: proportion, dtype: float64
Simplified_type
ADC      0.500000
ADC-b    0.333333
TNE      0.111111
Other    0.055556
Name: proportion, dtype: float64


# __Save datalist__

In [81]:
# Collect the case identifiers of each subset exactly as they appear in the sheet.
train_ids, val_ids = (subset['CaseID'].to_list() for subset in (train_df, val_df))
print(train_ids, val_ids, sep='\n')

def format_id(case_id):
    """Convert a sheet case id such as 'Rtum5' into the file stem 'rtum005'.

    The 'Rtum' prefix is matched case-insensitively and surrounding whitespace
    is ignored, mirroring the normalisation applied to the other sheet columns.

    Parameters
    ----------
    case_id : str
        Identifier made of the 'Rtum' prefix followed by the case number.

    Returns
    -------
    str
        'rtum' followed by the case number zero-padded to three digits.

    Raises
    ------
    ValueError
        If no integer case number can be extracted from ``case_id``.
    """
    prefix = 'rtum'
    text = str(case_id).strip()
    # Drop the prefix regardless of its capitalisation.
    if text.lower().startswith(prefix):
        text = text[len(prefix):]
    try:
        number = int(text)
    except ValueError:
        raise ValueError(f"Cannot extract a case number from CaseID {case_id!r}") from None
    return f'rtum{number:03d}'

# Rewrite every identifier with format_id and show the converted lists.
train_ids = list(map(format_id, train_ids))
val_ids = list(map(format_id, val_ids))
print(train_ids, val_ids, sep='\n')

['Rtum83', 'Rtum57', 'Rtum60', 'Rtum36', 'Rtum31', 'Rtum45', 'Rtum30', 'Rtum43', 'Rtum85', 'Rtum44', 'Rtum78', 'Rtum34', 'Rtum67', 'Rtum38', 'Rtum68', 'Rtum9', 'Rtum10', 'Rtum26', 'Rtum7', 'Rtum82', 'Rtum12', 'Rtum46', 'Rtum41', 'Rtum81', 'Rtum24', 'Rtum49', 'Rtum29', 'Rtum70', 'Rtum54', 'Rtum42', 'Rtum59', 'Rtum33', 'Rtum61', 'Rtum21', 'Rtum2', 'Rtum58', 'Rtum56', 'Rtum63', 'Rtum74', 'Rtum51', 'Rtum5', 'Rtum69', 'Rtum77', 'Rtum19', 'Rtum20', 'Rtum75', 'Rtum72', 'Rtum1', 'Rtum76', 'Rtum15', 'Rtum86', 'Rtum6', 'Rtum16', 'Rtum3', 'Rtum35', 'Rtum39', 'Rtum73', 'Rtum28', 'Rtum25', 'Rtum11', 'Rtum87', 'Rtum79', 'Rtum22', 'Rtum47', 'Rtum4', 'Rtum53', 'Rtum18', 'Rtum13', 'Rtum66', 'Rtum17']
['Rtum84', 'Rtum48', 'Rtum65', 'Rtum27', 'Rtum8', 'Rtum37', 'Rtum32', 'Rtum50', 'Rtum14', 'Rtum88', 'Rtum23', 'Rtum52', 'Rtum80', 'Rtum62', 'Rtum71', 'Rtum64', 'Rtum55', 'Rtum40']
['rtum083', 'rtum057', 'rtum060', 'rtum036', 'rtum031', 'rtum045', 'rtum030', 'rtum043', 'rtum085', 'rtum044', 'rtum078', 'rtum

In [82]:
# Root folder searched for the 'labelsTs' / 'imagesTs' sub-folders below.
data_dir = "data/prepared"
# Destination of the datalist JSON.
output = "data/splits/dataset_0.json"

In [83]:
def produce_sample_dict(line):
    """Build one datalist entry from a label path.

    The image path is obtained by replacing every occurrence of "label" in
    ``line`` with "image"; the entry holds the "label" key first, then "image".
    """
    image_path = line.replace("label", "image")
    entry = {"label": line}
    entry["image"] = image_path
    return entry

In [89]:
# Build the datalist entries of every subset. Entry paths are written with forward slashes
# instead of os.path.join so that a JSON produced on Windows does not contain backslashes
# and stays usable on other platforms (on POSIX systems the result is unchanged).
# Training and validation cases both point into 'labelsTr'.
train_list = [produce_sample_dict(f'labelsTr/{train_patient}.nii.gz') for train_patient in train_ids]
val_list = [produce_sample_dict(f'labelsTr/{val_patient}.nii.gz') for val_patient in val_ids]

# Test cases are discovered on disk: every direct child of <data_dir>/labelsTs, sorted by path.
# (The former recursive=True flag had no effect because the pattern contains no '**'.)
test_samples = sorted(glob.glob(os.path.join(data_dir, "labelsTs", "*")))
test_samples = [f'labelsTs/{os.path.basename(_item)}' for _item in test_samples]
test_list = [produce_sample_dict(sample) for sample in test_samples]

In [90]:
# Create the final datalist
# Create the final datalist
datalist = {
    "training": train_list,
    "validation": val_list,
    "test": test_list
}

# open() does not create missing folders, so make sure the destination exists first.
output_dir = os.path.dirname(output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the datalist to a JSON file
with open(output, "w") as f:
    json.dump(datalist, f, ensure_ascii=True, indent=4)

# Save split

In [96]:
# Derive the test case ids from the file names found in <data_dir>/imagesTs.
# os.listdir() returns entries in arbitrary order, so the ids are sorted to keep
# the saved split identical from one run or machine to the next.
test_ids = sorted(p.replace('.nii.gz', '') for p in os.listdir(os.path.join(data_dir, 'imagesTs')))
print(test_ids)

['rtum089', 'rtum090', 'rtum091', 'rtum092', 'rtum093', 'rtum094', 'rtum095', 'rtum096', 'rtum097', 'rtum098', 'rtum099', 'rtum100', 'rtum101', 'rtum102', 'rtum103', 'rtum104', 'rtum105', 'rtum106', 'rtum107', 'rtum108', 'rtum109', 'rtum110', 'rtum111', 'rtum112', 'rtum113', 'rtum114', 'rtum115', 'rtum116']


In [97]:
# Case ids of every subset, keyed the same way as the datalist.
split = {
    "training": train_ids,
    "validation": val_ids,
    "test": test_ids
}

split_file = 'data/splits/split_for_dataset_0.json'

# open() does not create missing folders, so make sure the destination exists first.
split_dir = os.path.dirname(split_file)
if split_dir:
    os.makedirs(split_dir, exist_ok=True)

with open(split_file, "w") as f:
    json.dump(split, f, ensure_ascii=True, indent=4)