# Data prep

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import ImageFile
from tqdm import tqdm
from PIL import Image
import shutil


In [2]:
data_folder = Path("/data")

In [3]:
CAMELYON_16_FOLDER = data_folder / "tiles_Camelyon16_with_metadata_all"
CAMELYON_17_FOLDER = data_folder / "tiles_with_metadata_all"

In [4]:
CAMELYON_16_CSV = data_folder / "index_camelyon16_with_metadata_all.csv"
CAMELYON_17_CSV = data_folder / "index_with_metadata_all.csv"

## Data check

### Camelyon 17

In [5]:
df_17 = pd.read_csv(CAMELYON_17_CSV) 
df_17["camelyon_version"] = 17
df_17['absolute_path'] = df_17.apply(lambda x: CAMELYON_17_FOLDER / x.path, axis=1)
df_17['absolute_path'] = df_17.apply(lambda x: CAMELYON_17_FOLDER / x.path, axis=1)

In [6]:
df_17

Unnamed: 0,path,patient,hospital,target,node,camelyon_version,absolute_path
0,normal_center_0_patient_2_node_2_2608.tif,2,0,0,2,17,/data/tiles_with_metadata_all/normal_center_0_...
1,normal_center_0_patient_2_node_2_2268.tif,2,0,0,2,17,/data/tiles_with_metadata_all/normal_center_0_...
2,normal_center_0_patient_2_node_2_3120.tif,2,0,0,2,17,/data/tiles_with_metadata_all/normal_center_0_...
3,normal_center_0_patient_2_node_2_0236.tif,2,0,0,2,17,/data/tiles_with_metadata_all/normal_center_0_...
4,normal_center_0_patient_2_node_2_2458.tif,2,0,0,2,17,/data/tiles_with_metadata_all/normal_center_0_...
...,...,...,...,...,...,...,...
53633,normal_center_4_patient_88_node_0_6714.tif,88,4,0,0,17,/data/tiles_with_metadata_all/normal_center_4_...
53634,normal_center_4_patient_88_node_0_1412.tif,88,4,0,0,17,/data/tiles_with_metadata_all/normal_center_4_...
53635,normal_center_4_patient_88_node_0_1469.tif,88,4,0,0,17,/data/tiles_with_metadata_all/normal_center_4_...
53636,normal_center_4_patient_88_node_0_6548.tif,88,4,0,0,17,/data/tiles_with_metadata_all/normal_center_4_...


In [7]:
camelyon_17_images_count = len(list(CAMELYON_17_FOLDER.glob('*')))
camelyon_17_images_count

53638

In [8]:
assert(len(df_17) == camelyon_17_images_count)

### Camelyon 16

In [9]:
df_16 = pd.read_csv(CAMELYON_16_CSV) 
df_16["camelyon_version"] = 16
df_16['absolute_path'] = df_16.apply(lambda x: CAMELYON_16_FOLDER / x.path, axis=1)
df_16['patient'] = df_16.apply(lambda x: int(x.slide.split('_')[1]), axis=1)

In [10]:
df_16.head(20)

Unnamed: 0,path,slide,hospital,target,camelyon_version,absolute_path,patient
0,normal_0_6_0579.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
1,normal_0_6_0193.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
2,normal_0_6_0012.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
3,normal_0_6_0037.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
4,normal_0_6_0363.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
5,normal_0_6_0949.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
6,normal_0_6_0786.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
7,normal_0_6_0932.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
8,normal_0_6_0898.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6
9,normal_0_6_0581.tif,Normal_006,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,6


In [11]:
camelyon_16_images_count = len(list(CAMELYON_16_FOLDER.glob('*')))
camelyon_16_images_count

231001

In [12]:
assert(len(df_16) == camelyon_16_images_count) # missing images

## Data split
In this section we create the dataframe for every train/test dataset for the ML and the FL challenges

*TODO*: don't put same slide/patient in train and test!

In [13]:
TRAIN_DATA_SIZE = 5000 # so that one epoch last 30 sec without Connect

### ML Challenge
- Train: hospital A of C16 dataset
- Test: Mix of C 16 A et B + C17 D et E. We might take only a subset a C16 as C17 is small compared to C16.


In [14]:
C16_patients = df_16.patient.unique()
C16_patients_train, C16_patients_test = np.array_split(C16_patients, 2)

In [15]:
train_ml_df = df_16.loc[(df_16.hospital == 0) & df_16.patient.isin(C16_patients_train),:].sample(n=TRAIN_DATA_SIZE, random_state=0) # C16 A, TODO GroupSplit

In [16]:
train_ml_df

Unnamed: 0,path,slide,hospital,target,camelyon_version,absolute_path,patient
7662,normal_0_73_1076.tif,Normal_073,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,73
30468,normal_0_33_6177.tif,Normal_033,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,33
82083,normal_0_7_1293.tif,Normal_007,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,7
10723,normal_0_8_0494.tif,Normal_008,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,8
72330,normal_0_40_6345.tif,Normal_040,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,40
...,...,...,...,...,...,...,...
32605,normal_0_22_3569.tif,Normal_022,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,22
44542,normal_0_81_1647.tif,Normal_081,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,81
56249,normal_0_10_1351.tif,Normal_010,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,10
38780,normal_0_25_4742.tif,Normal_025,0,0,16,/data/tiles_Camelyon16_with_metadata_all/norma...,25


### FL Challenge
- Train: 
    - Noeud A : dataset de train de la partie ML: C16 - hospital A
    - Noeud B: C16 - hospital B
    - Noeud C: C17 A et B
- Test: Mix of C 16 A et B + C17 D et E. We might take only a subset a C16 as C17 is small compared to C16.


In [17]:
train_fl_node_A = train_ml_df.copy(deep=True) # C16 A

In [18]:
train_fl_node_B = df_16.loc[(df_16.hospital == 1) & df_16.patient.isin(C16_patients_train),:].sample(n=TRAIN_DATA_SIZE, random_state=0) # C16 B

In [19]:
train_fl_node_C = df_17.loc[df_17.hospital <= 1,:].sample(n=TRAIN_DATA_SIZE, random_state=0) # C17 A and B

### Test


In [20]:
C16_TEST_DATA_SIZE = 3000
C17_TEST_DATA_SIZE = 3000
TEST_DATA_SIZE = C16_TEST_DATA_SIZE + C17_TEST_DATA_SIZE

In [21]:
df_16_test = df_16.loc[df_16.patient.isin(C16_patients_test),:].sample(n=C16_TEST_DATA_SIZE, random_state=0) # C16 A and B

In [22]:
df_17_test = df_17.loc[(df_17.hospital == 3) | (df_17.hospital == 4),:].sample(n=C17_TEST_DATA_SIZE, random_state=0) # C17 D and E

In [23]:
df_test = pd.concat([df_16_test,df_17_test])

## Quality check

In [24]:
assert(len(train_fl_node_A) == TRAIN_DATA_SIZE)
assert(len(train_fl_node_B) == TRAIN_DATA_SIZE)
assert(len(train_fl_node_C) == TRAIN_DATA_SIZE)

assert(len(df_test) == TEST_DATA_SIZE)

In [25]:
assert(train_fl_node_A.hospital.unique() == [0])
assert(train_fl_node_B.hospital.unique() == [1])
assert((train_fl_node_C.hospital.unique() == np.array([0,1])).all())

In [26]:
assert(len(set(train_fl_node_A.path).intersection(set(df_test.path))) == 0)
assert(len(set(train_fl_node_B.path).intersection(set(df_test.path))) == 0)
assert(len(set(train_fl_node_C.path).intersection(set(df_test.path))) == 0)

## Move data
What we will do:
- transform data into JPG so that it can be easily read by Keras
- save the data in the different folders:
    - ML
        - train
            - target 0
            - target 1
    - FL
        - train node A
            - target 0
            - target 1
        - train node B
            - target 0
            - target 1
        - train node C
            - target 0
            - target 1
    - test
        - target 0
        - target 1


### Creating target folders

In [27]:
df_list = [train_ml_df, train_fl_node_A, train_fl_node_B, train_fl_node_C, df_test]

In [28]:
data_challenge_folder = data_folder / "challenges_data"
# Cleaning up
#shutil.rmtree(str(data_challenge_folder))

In [29]:
destination_folders = [data_challenge_folder / "ML" / "train",
                       data_challenge_folder / "FL" / "train_node_A",
                       data_challenge_folder / "FL" / "train_node_B",
                       data_challenge_folder / "FL" / "train_node_C",
                       data_challenge_folder / "test",
                      ]

for folder in destination_folders:
    target_0_folder = folder / "target_0"
    target_1_folder = folder / "target_1"
    
    target_0_folder.mkdir(exist_ok=True, parents=True)
    target_1_folder.mkdir(exist_ok=True, parents=True)

In [30]:
# adding target path into each dataframe

for df, destination_folder in zip(df_list, destination_folders):
    print(destination_folder.name) 
    df["target_folder"] = ""
    
    for index, row in tqdm(df.iterrows()):
        if row.target == 1:
            df.at[index, "target_folder"] = destination_folder / "target_1"
        elif row.target == 0:
            df.at[index, "target_folder"] = destination_folder / "target_0"
        else:
             raise Exception("target is not 0 or 1")

train


5000it [00:00, 7411.98it/s]


train_node_A


5000it [00:00, 7459.59it/s]


train_node_B


5000it [00:00, 7559.50it/s]


train_node_C


5000it [00:00, 7364.88it/s]


test


6000it [00:00, 7847.15it/s]


### Copying files in JPG into folders

In [31]:
ImageFile.LOAD_TRUNCATED_IMAGES = True

def save_into_jpg(source_path, destination_folder):
    im = Image.open(source_path)
    im.thumbnail(im.size)
    
    out_file_name = source_path.stem + '.jpg'
    out_file_path = destination_folder / out_file_name

    im.save(out_file_path, "JPEG", quality=100)

In [32]:
for df in df_list:
    for index, row in tqdm(df.iterrows()):
        save_into_jpg(row.absolute_path, row.target_folder)

5000it [00:41, 119.79it/s]
5000it [00:28, 177.23it/s]
5000it [00:40, 124.64it/s]
5000it [00:43, 115.20it/s]
6000it [00:42, 142.22it/s]


### Move data for Substra

In [33]:
substra_data_folder = data_challenge_folder / "substra"
substra_data_folder.mkdir(exist_ok=True, parents=True)

In [34]:
from distutils.dir_util import copy_tree

In [35]:
# node A
target_dir = Path(substra_data_folder / "node_A" / "train" / "data_sample_0")
target_dir.mkdir(exist_ok=True, parents=True)

source_dir = data_challenge_folder / "FL" / "train_node_A" / "target_0"
result = copy_tree(str(source_dir), str(target_dir))

source_dir = data_challenge_folder / "FL" / "train_node_A" / "target_1"
result = copy_tree(str(source_dir), str(target_dir))

In [36]:
# node B
target_dir = Path(substra_data_folder / "node_B" / "train" / "data_sample_0")
target_dir.mkdir(exist_ok=True, parents=True)

source_dir = data_challenge_folder / "FL" / "train_node_B" / "target_0"
result = copy_tree(str(source_dir), str(target_dir))

source_dir = data_challenge_folder / "FL" / "train_node_B" / "target_1"
result = copy_tree(str(source_dir), str(target_dir))

In [37]:
# node C - train
target_dir = Path(substra_data_folder / "node_C" / "train" / "data_sample_0")
target_dir.mkdir(exist_ok=True, parents=True)

source_dir = data_challenge_folder / "FL" / "train_node_C" / "target_0"
result = copy_tree(str(source_dir), str(target_dir))

source_dir = data_challenge_folder / "FL" / "train_node_C" / "target_1"
result = copy_tree(str(source_dir), str(target_dir))

In [38]:
# node C - test
target_dir = Path(substra_data_folder / "node_C" / "test" / "data_sample_0")
target_dir.mkdir(exist_ok=True, parents=True)

source_dir = data_challenge_folder / "test" / "target_0"
result = copy_tree(str(source_dir), str(target_dir))

source_dir = data_challenge_folder / "test" / "target_1"
result = copy_tree(str(source_dir), str(target_dir))