In [1]:
import glob
import pandas as pd
import numpy as np
import shutil
from tqdm import tqdm
import re
from sklearn import model_selection 
from file_names_2_df_patient_id import file_names_2_df_patient

In [8]:
def split_patients(bcc_annotation):
    random_state = 20229
    bcc_codes = list(bcc_annotation['bcc code'])
    patient_id = list(set([bcc for bcc_code in bcc_codes for bcc in re.findall('^BCC\d+', bcc_code) if len(bcc)!=0]))
    
    kf = model_selection.KFold(n_splits=5)
    for train, test in kf.split(patient_id):
        print("%s %s" % (train, test))
        
    X_train, X_test = model_selection.train_test_split(pd.DataFrame(patient_id), test_size=0.1, random_state=random_state)
    return X_train, X_test


def get_file_names_patient(bcc_code):
    bcc_files = []
    for file in glob.glob('/workspace/data/bcc-projekt-digital/bcc/*/*'):
        bcc_files.append(file.split('/')[-1])
        
    bcc_set=[]
    for file in tqdm(bcc_files):
        for xt in list(bcc_code['names']):
            if xt == file.split(' ')[0]: 
                bcc_set.append(file)
    return bcc_set


def annotation_binary_patient(X_test_code, file_save_name):
    bcc_annotation = file_names_2_df_patient(path, get_file_names_patient(X_test_code))

    bcc_annotation.to_csv('patient_annotation.csv')
    display(bcc_annotation)
    
    all_bcc = glob.glob('/workspace/data/cv_methods/tmi2022/WSI/*')
    folders = []
    for bcc in all_bcc:
        if bcc.endswith('.dzi'):pass
        else: folders.append(bcc)

    wsi_labels = []
    for folder in folders:
        f = folder.replace('_files','').split('/')[-1]

        try:
            label = float(bcc_annotation[bcc_annotation['names'] == f + '.ndpi']['label'])
            wsi_labels.append((folder.split('/')[-1] + '/20.0', int(label)))
        except Exception as e:
            #print(e, folder)
            pass
    
    wsi_label_df = pd.DataFrame(wsi_labels, columns=['X', 'y'] )
     
    X = wsi_label_df['X']
    y = wsi_label_df['y'].astype(int)
   
    y_ = [1 if yt!=0 else 0 for yt in list(y)]
    
    data_set = pd.DataFrame()
    data_set['X'] = X
    data_set['y'] = y_


    graph_trans_script_path = '/workspace/data/cv_methods/tmi2022/scripts/'
    data_set.sort_values(by=['X'], inplace=True)
    data_set.to_csv(graph_trans_script_path + f'{file_save_name}', sep='\t', index=False, header=False)
    
    return data_set

In [9]:
bcc_annotation = pd.read_csv('bcc_annotations.csv', delimiter=';')

In [10]:
path = '/workspace/data/bcc-projekt-digital/bcc/*/'
X_train_code, X_test_code = split_patients(bcc_annotation)
X_train_code.columns, X_test_code.columns = ['names'],['names']
df_train = annotation_binary_patient(X_train_code, file_save_name='train_set.txt')
df_test = annotation_binary_patient (X_test_code,  file_save_name='val_set.txt')

[ 31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48
  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66
  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84
  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102
 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
 139 140 141 142 143 144 145 146 147 148 149 150] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  61  62  63  64  65
  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83
  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 13

100%|██████████| 507/507 [00:00<00:00, 34646.16it/s]

['BCC222 (1', '1b']
['BCC222 (2', '1b']
['BCC223 (1', '1b']
['BCC225 (4', '1b']
['BCC226 (3', '1b']
['BCC226 (4', '1b']
['BCC227 (2', '1b']
['BCC228 (1', '1b']
['BCC228 (2', '1b']
['BCC228 (3', '1b']
['BCC229 (1', '1b']
['BCC23 (2', '1b']
['BCC230 (1', '1b']
['BCC230 (2', '1b']
['BCC230 (6', '1b']
['BCC230 (7', '1b']
['BCC230 (8', '1b']
['BCC231 (1', '1b']
['BCC232 (3', '1b']
['BCC232 (4', '1b']
['BCC232 (5', '1b']
['BCC232 (6', '1b']
['BCC232 (7', '1b']
['BCC233 (1', '1b']
['BCC237 (1', '1b']
['BCC237 (2', '1b']
['BCC237 (7', '1b']
['BCC237 (8', '1b']
['BCC239 (10', '1b']
['BCC239 (2', '1b']
['BCC239 (6', '1b']
['BCC239 (7', '1b']
['BCC239 (9', '1b']
['BCC240 (1', '1b']
['BCC240 (3', '1b']
['BCC241 (1', '1b']
['BCC241 (2', '1b']
['BCC242 (2', '1b']
['BCC242 (3', '1b']
['BCC242 (5', '1b']
['BCC242 (6', '1b']
['BCC242 (7', '1b']
['BCC243 (3', '1b']
['BCC244 (5', '1b']
['BCC244 (6', '1b']
['BCC245 (7', '1b']
['BCC245 (8', '1b']
['BCC245 (9', '1b']
['BCC37 (2', '1b']
['BCC37 (3', '1b']
['




Unnamed: 0,bcc code,address,names,temp label,label
0,BCC18 -6,BCC18 (6)_0.ndpi,BCC18 (6)_0.ndpi,0,0
1,BCC1 -1,BCC1 (1)_0.ndpi,BCC1 (1)_0.ndpi,0,0
2,BCC1 -4,BCC1 (4)_0.ndpi,BCC1 (4)_0.ndpi,0,0
3,BCC11 -1,BCC11 (1)_0.ndpi,BCC11 (1)_0.ndpi,0,0
4,BCC11 -4,BCC11 (4)_0.ndpi,BCC11 (4)_0.ndpi,0,0
...,...,...,...,...,...
445,BCC677 -4,BCC677 (4)_3.ndpi,BCC677 (4)_3.ndpi,3,4
446,BCC677 -5,BCC677 (5)_3.ndpi,BCC677 (5)_3.ndpi,3,4
447,BCC677 -6,BCC677 (6)_3.ndpi,BCC677 (6)_3.ndpi,3,4
448,BCC677 -7,BCC677 (7)_3.ndpi,BCC677 (7)_3.ndpi,3,4


100%|██████████| 507/507 [00:00<00:00, 114218.08it/s]

['BCC235 (1', '1b']
['BCC235 (2', '1b']
['BCC236 (2', '1b']
['BCC45 (2', '1b']





Unnamed: 0,bcc code,address,names,temp label,label
0,BCC14 -1,BCC14 (1)_0.ndpi,BCC14 (1)_0.ndpi,0,0
1,BCC14 -3,BCC14 (3)_0.ndpi,BCC14 (3)_0.ndpi,0,0
2,BCC221 -1,BCC221 (1)_0.ndpi,BCC221 (1)_0.ndpi,0,0
3,BCC221 -2,BCC221 (2)_0.ndpi,BCC221 (2)_0.ndpi,0,0
4,BCC221 -4,BCC221 (4)_0.ndpi,BCC221 (4)_0.ndpi,0,0
5,BCC221 -5,BCC221 (5)_0.ndpi,BCC221 (5)_0.ndpi,0,0
6,BCC234 -1,BCC234 (1)_0.ndpi,BCC234 (1)_0.ndpi,0,0
7,BCC235 -3,BCC235 (3)_0.ndpi,BCC235 (3)_0.ndpi,0,0
8,BCC236 -1,BCC236 (1)_0.ndpi,BCC236 (1)_0.ndpi,0,0
9,BCC236 -3,BCC236 (3)_0.ndpi,BCC236 (3)_0.ndpi,0,0


In [5]:
def get_label_percentage(data):
    return data['y'].value_counts()/data.shape[0]

In [6]:
get_label_percentage(df_test)

0    0.526316
1    0.473684
Name: y, dtype: float64

In [7]:
kf = model_selection.KFold(n_splits=2)
for train, test in kf.split(patient_id):
    print("%s %s" % (train, test))

NameError: name 'patient_id' is not defined