In [76]:
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted

# Resolve the data layout relative to the notebook's working directory:
# <cwd>/../data/csv_folder/reports is searched for the report spreadsheets.
thispath = Path.cwd().resolve()

datadir = thispath.parent / "data"
reportsdir = datadir / "csv_folder" / "reports"

# Every "*.xls" file below `reportsdir` whose full path contains "Lung",
# in natural sort order.
reports_path = natsorted([p for p in reportsdir.rglob("*.xls") if "Lung" in str(p)])
if not reports_path:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No 'Lung' *.xls reports found under {reportsdir}")

# Only FILENAME (used as index) and NATOIL are loaded. Both are read as strings
# so identifiers are not coerced to numbers (FILENAME values carry leading zeros).
dfs = [
    pd.read_excel(report_file, index_col=0, usecols=["FILENAME", "NATOIL"],
                  dtype={"FILENAME": str, "NATOIL": str})
    for report_file in reports_path
]

reports = pd.concat(dfs).sort_values("NATOIL")

# After sorting, rows sharing a NATOIL value are contiguous, so a new ID starts
# wherever NATOIL differs from the previous row. IDs are zero-based and
# zero-padded to three digits ("000", "001", ...).
# NOTE(review): NATOIL is presumably the per-patient identifier - confirm.
# The column is selected by label: the previous `row[0]` / `.iloc[0][0]` relied
# on the deprecated positional fallback of Series.__getitem__.
natoil = reports["NATOIL"]
starts_new_group = natoil.ne(natoil.shift())
# .to_numpy() assigns by position, so duplicate FILENAME index values cannot
# trigger index alignment.
reports["ID"] = (starts_new_group.cumsum() - 1).astype(str).str.zfill(3).to_numpy()

reports = reports.drop(columns="NATOIL")
print(reports)
reports.to_csv(datadir / "patients_ID.csv")

                     ID
FILENAME               
000030069800299917  000
000030069800301408  000
000030069800301406  000
000033385500476542  001
000033340300479660  002
...                 ...
000032301600414125  223
000032301600414861  223
000032301600414863  223
000032302500414881  223
000032301600414865  223

[2087 rows x 1 columns]


In [142]:
import numpy as np

# Patient-level k-fold split.
# Relies on `thispath`, `pd` and `Path` from the first cell, and on a
# `create_folds` helper that is not defined in the visible cells (it must
# already be in scope when this cell runs).
datadir = Path(thispath.parent / "data")
k = 5

csv_ids = Path(datadir / "patients_ID.csv")
csv_dataset_AOEC = Path(datadir / "labels.csv")

# Both tables use their first column as index; identifier columns are read as strings.
dataset_AOEC = pd.read_csv(csv_dataset_AOEC, sep=',', index_col=0,
                           dtype={"image_num": str})
patients_id = pd.read_csv(csv_ids, sep=',', index_col=0,
                          dtype={"FILENAME": str, "ID": str})

# Keep one row per distinct patient ID; `.values` is a 2-D array of those rows.
df = patients_id.drop_duplicates(subset='ID', keep="first")
patients = df.values

folds = create_folds(patients, k)
header = ["images_train", "images_validation", "labels_train", "labels_validation"]
folds_dataset = pd.DataFrame(columns=header)

for fold_idx in range(k):
    # Training patients are all folds except the current one. Each fold is
    # flattened twice (fold -> row -> ID); the held-out fold only once (row -> ID).
    other_folds = folds[:fold_idx] + folds[fold_idx + 1:]
    train_patients = [pid for fold in other_folds for row in fold for pid in row]
    validation_patients = [pid for row in folds[fold_idx] for pid in row]

    # Map patient IDs back to the file names (index of `patients_id`) they own.
    id_column = patients_id['ID']
    train_filenames = patients_id.loc[id_column.isin(train_patients)].index
    validation_filenames = patients_id.loc[id_column.isin(validation_patients)].index

    # Keep only the labelled images that belong to each side of the split.
    train = dataset_AOEC.loc[dataset_AOEC.index.isin(train_filenames)]
    validation = dataset_AOEC.loc[dataset_AOEC.index.isin(validation_filenames)]

    images_train, labels_train = train.index.to_list(), train.values.tolist()
    images_validation, labels_validation = validation.index.to_list(), validation.values.tolist()

    folds_dataset.loc[fold_idx] = [images_train, images_validation, labels_train, labels_validation]

    # Note: the figures printed under "TEST" are column sums of the validation labels.
    print(f"Number WSI TRAIN: {len(images_train)}, Number WSI VALID: {len(images_validation)}")
    print(f"Datasplit labels TRAIN: {np.sum(labels_train, axis=0)}, "
        f"Datasplit labels TEST: {np.sum(labels_validation, axis=0)}")

# Persist one row per fold, indexed by fold number.
folds_dataset.index.name = "fold"
folds_dataset.to_csv(Path(datadir / f"{k}_fold_crossvalidation_data_split.csv"))

print(f"{k}_fold_crossvalidation_data_split.csv in {datadir}")



Number WSI TRAIN: 1069, Number WSI VALID: 247
Datasplit labels TRAIN: [187 560 194 293], Datasplit labels TEST: [ 45 128  57  46]
Number WSI TRAIN: 1038, Number WSI VALID: 278
Datasplit labels TRAIN: [184 535 198 270], Datasplit labels TEST: [ 48 153  53  69]
Number WSI TRAIN: 1018, Number WSI VALID: 298
Datasplit labels TRAIN: [171 545 184 264], Datasplit labels TEST: [ 61 143  67  75]
Number WSI TRAIN: 1060, Number WSI VALID: 256
Datasplit labels TRAIN: [167 611 182 230], Datasplit labels TEST: [ 65  77  69 109]
Number WSI TRAIN: 1079, Number WSI VALID: 237
Datasplit labels TRAIN: [219 501 246 299], Datasplit labels TEST: [ 13 187   5  40]
5_fold_crossvalidation_data_split.csv in /home/lluis/histo_lung/data


In [None]:
import statistics as stat
"""
Autov2 1296
Number WSI TRAIN: 1049, Number WSI VALID: 247
Datasplit labels TRAIN: [ 35 602 285 184], Datasplit labels TEST: [ 16 139  85  22]
Number WSI TRAIN: 1018, Number WSI VALID: 278
Datasplit labels TRAIN: [ 44 581 292 169], Datasplit labels TEST: [  7 160  78  37]
Number WSI TRAIN: 1005, Number WSI VALID: 291
Datasplit labels TRAIN: [ 33 581 268 165], Datasplit labels TEST: [ 18 160 102  41]
Number WSI TRAIN: 1040, Number WSI VALID: 256
Datasplit labels TRAIN: [ 48 647 276 126], Datasplit labels TEST: [ 3 94 94 80]
Number WSI TRAIN: 1072, Number WSI VALID: 224
Datasplit labels TRAIN: [ 44 553 359 180], Datasplit labels TEST: [  7 188  11  26]

SCLC train: 40.8 ± 6.457553716385176
LUAD train: 592.8 ± 34.94567212116545
LUSC train: 296 ± 36.36619309193636
NL train: 164.8 ± 23.03692687838376
SCLC valid: 10.2 ± 6.457553716385176
LUAD valid: 148.2 ± 34.94567212116545
LUSC valid: 74 ± 36.36619309193636
NL valid: 41.2 ± 23.03692687838376

Manual 1226
Number WSI TRAIN: 1002, Number WSI VALID: 223
Datasplit labels TRAIN: [ 37 493 268 211], Datasplit labels TEST: [ 16 108  85  26]
Number WSI TRAIN: 959, Number WSI VALID: 266
Datasplit labels TRAIN: [ 46 472 279 180], Datasplit labels TEST: [  7 129  74  57]
Number WSI TRAIN: 950, Number WSI VALID: 275
Datasplit labels TRAIN: [ 35 470 270 188], Datasplit labels TEST: [ 18 131  83  49]
Number WSI TRAIN: 976, Number WSI VALID: 249
Datasplit labels TRAIN: [ 48 529 251 167], Datasplit labels TEST: [  5  72 102  70]
Number WSI TRAIN: 1013, Number WSI VALID: 212
Datasplit labels TRAIN: [ 46 440 344 202], Datasplit labels TEST: [  7 161   9  35]

SCLC train: 42.4 ± 5.941380311005179
LUAD train: 480.8 ± 32.90440699967103
LUSC train: 282.4 ± 35.892896233098824
NL train: 189.6 ± 17.444196742756603
SCLC valid: 10.6 ± 5.941380311005179
LUAD valid: 120.2 ± 32.90440699967103
LUSC valid: 70.6 ± 35.892896233098824
NL valid: 47.4 ± 17.444196742756603
"""

# Per-fold WSI counts for each class, one list entry per fold. The numbers
# match the "Manual" run pasted in the notes string at the top of this cell.
sclc_train = [37, 46, 35, 48, 46]
luad_train = [493, 472, 470, 529, 440]
lusc_train = [268, 279, 270, 251, 344]
nl_train = [211, 180, 188, 167, 202]

sclc_valid = [16, 7, 18, 5, 7]
luad_valid = [108, 129, 131, 72, 161]
lusc_valid = [85, 74, 83, 102, 9]
nl_valid = [26, 57, 49, 70, 35]


def _mean_and_stdev(counts):
    """Return ``(mean, sample standard deviation)`` of a list of per-fold counts."""
    return stat.mean(counts), stat.stdev(counts)


def _print_summary(rows):
    """Print one ``<label>: <mean> ± <std>`` line for each ``(label, mean, std)`` row."""
    for label, mean, std in rows:
        print(f"{label}: {mean} \u00B1 {std}")


# The individual mean_*/std_* names are kept so any code that reads them still works.
mean_sclc_train, std_sclc_train = _mean_and_stdev(sclc_train)
mean_luad_train, std_luad_train = _mean_and_stdev(luad_train)
mean_lusc_train, std_lusc_train = _mean_and_stdev(lusc_train)
mean_nl_train, std_nl_train = _mean_and_stdev(nl_train)

mean_sclc_valid, std_sclc_valid = _mean_and_stdev(sclc_valid)
mean_luad_valid, std_luad_valid = _mean_and_stdev(luad_valid)
mean_lusc_valid, std_lusc_valid = _mean_and_stdev(lusc_valid)
mean_nl_valid, std_nl_valid = _mean_and_stdev(nl_valid)

# Same line order as before: the four training classes, then the four validation classes.
_print_summary([
    ("SCLC train", mean_sclc_train, std_sclc_train),
    ("LUAD train", mean_luad_train, std_luad_train),
    ("LUSC train", mean_lusc_train, std_lusc_train),
    ("NL train", mean_nl_train, std_nl_train),
    ("SCLC valid", mean_sclc_valid, std_sclc_valid),
    ("LUAD valid", mean_luad_valid, std_luad_valid),
    ("LUSC valid", mean_lusc_valid, std_lusc_valid),
    ("NL valid", mean_nl_valid, std_nl_valid),
])