In [2]:
import pandas as pd
import numpy as np
import pickle
import copy
import math
import os

In [9]:
def store_data(data, file_path):
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)

def get_frac_split(meta_df, matching_field, ind_column, num_folds=5):
    # Copy dataframe.
    df = meta_df.copy(deep=True)

    # Get unique classes.
    unique_classes = np.unique(meta_df[ind_column])
    # randomize rows
    df = df.sample(frac=1).reset_index(drop=True)

    folds          = dict()
    for i in range(num_folds):
        folds[i] = dict()
        folds[i]['train'] = list()
        folds[i]['test']  = list()

    for class_ in unique_classes:
        # Get slides for class.
        slides      = np.unique(df[df[ind_column]==class_][matching_field].values)

        # Test size.
        num_samples = len(slides)
        test_size   = math.floor(num_samples*(1/num_folds))

        # Iterate through chunks and add samples to fold.
        for i in range(num_folds):
            test_sample  = slides[i*test_size:(i+1)*test_size]
            train_sample = list(set(slides).difference(set(test_sample)))
            folds[i]['train'].extend(train_sample)
            folds[i]['test'].extend(test_sample)

    return folds

def get_folds(meta_df, matching_field, ind_column, num_folds=5, valid_set=False):

    # Get initial split for train/test.
    folds = get_frac_split(meta_df, matching_field, ind_column, num_folds=num_folds)

    for i in range(num_folds):
        whole_train_samples = folds[i]['train']
        subset_df = meta_df[meta_df[matching_field].isin(whole_train_samples)]
        train_val_folds = get_frac_split(subset_df, matching_field, ind_column, num_folds=num_folds)
        del folds[i]['train']
        folds[i]['train'] = train_val_folds[0]['train']
        folds[i]['valid'] = train_val_folds[0]['test']

    return folds

# Verify: This should all be empty.
def sanity_check_overlap(folds, num_folds):
    # For each fold, no overlap between cells.
    for i in range(num_folds):
        result = set(folds[i]['train']).intersection(set(folds[i]['valid']))
        if len(result) > 0:
            print(result)

        result = set(folds[i]['train']).intersection(set(folds[i]['test']))
        if len(result) > 0:
            print(result)

        result = set(folds[i]['valid']).intersection(set(folds[i]['test']))
        if len(result) > 0:
            print(result)

        # No overlap between test sets of all folds.
        for i in range(num_folds):
            for j in range(num_folds):
                if i==j: continue
                result = set(folds[i]['test']).intersection(set(folds[j]['test']))
                if len(result) > 0:
                    print('Fold %s-%s' % (i,j), result)

# Fit for legacy code.
def fit_format(folds):
    slides_folds = dict()
    for i, fold in enumerate(folds):
        slides_folds[i] = dict()
        slides_folds[i]['train'] = [(slide, None, None) for slide in folds[i]['train']]
        slides_folds[i]['valid'] = [(slide, None, None) for slide in folds[i]['valid']]
        slides_folds[i]['test']  = [(slide, None, None) for slide in folds[i]['test']]

    return slides_folds


In [12]:
import h5py
dataset = "acmeso"
main_path = '/mnt/cephfs/sharedscratch/users/fshahi/Projects/Histomorphological-Phenotype-Learning'
h5_complete_path   = '{}/results/BarlowTwins_3/{}/h224_w224_n3_zdim128/hdf5_{}_he_complete.h5'.format(main_path, dataset, dataset)
meta_df = pd.DataFrame()
with h5py.File(h5_complete_path, 'r') as f:
    print('Keys:', list(f.keys()))
    for key in f.keys():
        if key in ['tiles', 'samples']:
            meta_df[key] = f[key][:].astype(str)
meta_df['case_Id'] = meta_df['samples']
meta_df

Keys: ['hist_subtype', 'img_h_latent', 'img_z_latent', 'indexes', 'labels', 'original_set', 'patterns', 'samples', 'slides', 'tiles']


Unnamed: 0,samples,tiles,case_Id
0,D.17.0058471.C_A1_1,65_20.jpeg,D.17.0058471.C_A1_1
1,D.17.0072002A_A1_1,21_2.jpeg,D.17.0072002A_A1_1
2,D.19.0075850.L_A1_1,31_14.jpeg,D.19.0075850.L_A1_1
3,SP18_06734_B1_1,78_23.jpeg,SP18_06734_B1_1
4,SP-18-14225_A1_1,18_37.jpeg,SP-18-14225_A1_1
...,...,...,...
26425,PH19_14147_J3_1,5_25.jpeg,PH19_14147_J3_1
26426,D.17.0092133J_A2_1,34_32.jpeg,D.17.0092133J_A2_1
26427,D.17.0092133J_A2_1,44_33.jpeg,D.17.0092133J_A2_1
26428,D.17.0092133J_A2_1,34_27.jpeg,D.17.0092133J_A2_1


In [3]:
txt_file = '/nfs/home/users/fshahi/Projects/Histomorphological-Phenotype-Learning/files/acmeso_jpeg_names.txt'
pickle_path = '/nfs/home/users/fshahi/Projects/Histomorphological-Phenotype-Learning/files/pkl_acmeso.pkl'
def fun(x):
    x = x.replace('train_', '').replace('valid_', '').replace('test_', '').replace('.jpeg', '')
    tile_part = len('_'+ x.split('_')[-2] + '_' + x.split('_')[-1])
    # x = x.replace(tile_part, '')
    x = x[:-tile_part]
    return x

meta_df = pd.read_csv(txt_file, sep='\t', header=None, names=['tiles'])
meta_df['case_Id'] = meta_df['tiles'].apply(fun)
meta_df['samples'] = meta_df['case_Id']
meta_df
# meta_df.to_csv('/nfs/home/users/fshahi/Projects/Histomorphological-Phenotype-Learning/files/acmeso_jpeg_names.csv', index=False)

Unnamed: 0,tiles,case_Id,samples
0,test_D.19.0057971.Q_A1_1_10_10.jpeg,D.19.0057971.Q_A1_1,D.19.0057971.Q_A1_1
1,test_D.19.0057971.Q_A1_1_10_11.jpeg,D.19.0057971.Q_A1_1,D.19.0057971.Q_A1_1
2,test_D.19.0057971.Q_A1_1_10_12.jpeg,D.19.0057971.Q_A1_1,D.19.0057971.Q_A1_1
3,test_D.19.0057971.Q_A1_1_10_13.jpeg,D.19.0057971.Q_A1_1,D.19.0057971.Q_A1_1
4,test_D.19.0057971.Q_A1_1_10_14.jpeg,D.19.0057971.Q_A1_1,D.19.0057971.Q_A1_1
...,...,...,...
17057,valid_H16_37130_A3_1_9_33.jpeg,H16_37130_A3_1,H16_37130_A3_1
17058,valid_H16_37130_A3_1_9_34.jpeg,H16_37130_A3_1,H16_37130_A3_1
17059,valid_H16_37130_A3_1_9_35.jpeg,H16_37130_A3_1,H16_37130_A3_1
17060,valid_H16_37130_A3_1_9_36.jpeg,H16_37130_A3_1,H16_37130_A3_1


In [7]:
main_path = '/mnt/cephfs/sharedscratch/users/fshahi/Projects/Histomorphological-Phenotype-Learning'
meta_csv   = '%s/files/Meso_patients.csv' % main_path
pickle_path = '%s/pkl_Meso.pkl' % main_path
# Read meta data file, rename column.
meta_df  = pd.read_csv(meta_csv)

In [13]:

folds       = get_folds(meta_df, matching_field='samples', ind_column='case_Id', num_folds=5, valid_set=True)
final_folds = fit_format(folds)

# If no output, all good.
sanity_check_overlap(folds, num_folds=5)

store_data(final_folds, pickle_path)

In [17]:

# reading pkl file
def read_pkl(file_path):
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data
read_pkl(pickle_path)

{0: {'train': [('09-19001_1', None, None),
   ('15_14562_B_1', None, None),
   ('16-28442_A06_1', None, None),
   ('16_40778D_1,_D.17.0058471.C_A1_1', None, None),
   ('17-04738_A_1', None, None),
   ('17HSO2345J_D-PLEX_1', None, None),
   ('17HSOO575Y_(_)B_1,_SP18_06734_B1_1', None, None),
   ('17_19022_A_1', None, None),
   ('18_18670_1', None, None),
   ('6L_19_30694_A2_1', None, None),
   ('A1926728-1_1', None, None),
   ('B19_005351_1', None, None),
   ('D.16.0012075.H_A1_1,_D.17.001_7546W_1', None, None),
   ('D.16.0026763_M_A1_1', None, None),
   ('D.16.0028547.R_A1_1', None, None),
   ('D.17.0029487.F_A2_1', None, None),
   ('D.17.0072002A_A1_1', None, None),
   ('D.17.0090392.X_A1_1,_D.17.0029069.B_A1_1', None, None),
   ('D.17.0092133J_A2_1', None, None),
   ('D.18.0011714.K_A1_1', None, None),
   ('D.18.0023594.F_A1_1', None, None),
   ('D.18.0068083W_A1_1', None, None),
   ('D.18.0077688.L_A1_1,_SP18-15315_A2_1', None, None),
   ('D.19.0057971.Q_A1_1', None, None),
   ('D.1