In [2]:
import pandas as pd
import numpy as np
import os
import os.path as osp
from pathlib import Path
from collections import Counter, defaultdict

In [None]:
# Patch directories of the internal cohorts. Index 0 is the earliest batch, whose
# cropping list is re-labelled to ZS-style names further down; the last entry is
# also the directory the Renji cell reads its cropping list from.
internal_data_pths = [
    './datasets/LN_classify/Fudan_HN_LN_22-23_all/Fudan_HN_LN_220528_patches',
    './datasets/LN_classify/Fudan_HN_LN_22-23_all/Fudan_HN_LN_230508_patches',
    './datasets/LN_classify/Fudan_HN_LN_22-23_all/Fudan_HN_LN_20231204_patches',
]

# Patch directories of the external cohorts. The first two entries are consumed
# together by the CGMH cell, the last one by the TCGA cell.
external_data_pths = [
    './datasets/LN_classify/Fudan_HN_LN_22-23_all/CGMH/CGMH_2024_patches',
    './datasets/LN_classify/Fudan_HN_LN_22-23_all/CGMH_Oral/CGMH_Oral_patches',
    './datasets/LN_classify/Fudan_HN_LN_22-23_all/TCGA/TCGA-HNSC_selected_patches',
]

In [4]:
# Build the set of lymph-node basenames that have an ENE-positive mask on disk.
# Every `*_mask.nii.gz` file below the internal and external patch directories is
# visited; a file counts when its full path contains `_pos_ENE`, and the stored
# key is the file name with the `_pos_ENE_mask.nii.gz` suffix removed.
ene_list = set()
for data_pth in internal_data_pths + external_data_pths:
    for f in Path(data_pth).rglob('*_mask.nii.gz'):
        mask_path = str(f)
        if '_pos_ENE' not in mask_path:
            continue
        basename = osp.basename(mask_path).replace("_pos_ENE_mask.nii.gz", "")
        ene_list.add(basename)

#### Internal dev data

In [5]:
# Find the cases of the earliest cropping list that have no entry in the
# name-matching table ('pathology_report.csv'), i.e. no ZS-style file name.
# Result: `patient_id`, a sorted list of "<path>/<case id>" strings.
patient_id = set()

# load the earliest data
fudan_220528 = pd.read_csv(osp.join(internal_data_pths[0], "cropping_list.csv"))
# load name matching record
name_matching = pd.read_csv('pathology_report.csv')
# NOTE: this reports the number of rows in the whole matching table.
print("{} record with ZS file name".format(len(name_matching)))

# Exact-match lookup table, built once instead of scanning the frame for every row.
matched_ids = set(name_matching['id'])

# iter through rows and collect the cases without a matching record
for index, row in fudan_220528.iterrows():
    basename = row['basename'].split('_')[0]
    if row['path'] == 'bl3':
        # cases under the 'bl3' path are listed with a 'bl3-' prefix in the table
        basename = 'bl3-' + basename
    if basename not in matched_ids:
        patient_id.add(row['path'] + '/' + basename)

patient_id = sorted(patient_id)
# for p in patient_id:
#     print(p)

38 record with ZS file name


In [6]:
# Re-label the earliest cropping list with ZS-style basenames and keep only the
# rows that could be re-labelled (or already start with "ZS").
fudan_220528 = pd.read_csv(osp.join(internal_data_pths[0], "cropping_list.csv"))
# name matching record: column 'id' holds the old case id, 'file' the ZS file name
name_matching = pd.read_csv('pathology_report.csv')
print("{} record with ZS file name".format(len(name_matching)))

for index, row in fudan_220528.iterrows():
    parts = row['basename'].split('_')
    # only the first two '_'-separated tokens are used: case id and instance suffix
    basename, suffix = parts[0], parts[1]
    if row['path'] == 'bl3':
        basename = 'bl3-' + basename

    matches = name_matching[name_matching['id'] == basename]
    if len(matches) == 0:
        # no ZS name known for this case; the row is dropped by the filter below
        continue
    if len(matches) > 1:
        print("Warning: multiple records found for {}".format(basename))
    # the first matching record wins
    new_name = matches['file'].values[0].replace('.nii.gz', '') + '_' + suffix
    fudan_220528.at[index, 'basename'] = new_name

fudan_220528 = fudan_220528.loc[fudan_220528['basename'].str.startswith("ZS")]

38 record with ZS file name


In [7]:
# Append the 2nd and 3rd internal cropping lists to the re-labelled first one.
fudan230508 = pd.read_csv('Fudan_230508_cropping_list.csv')
fudan231204 = pd.read_csv('Fudan_231204_cropping_list.csv')
# keep only the rows of the third list whose path starts with "Int"
fudan231204 = fudan231204.loc[fudan231204['path'].str.startswith("Int")]
# DataFrame.append was removed in pandas 2.0; pd.concat with its defaults keeps
# the original row indices and column order, exactly as chained .append() did.
all_internal = pd.concat([fudan_220528, fudan230508, fudan231204])
all_internal

Unnamed: 0,basename,path,recist,bbox_sz,label
51,ZS18093376_ins1,116t,9.264485,"(19, 26, 1)",0
52,ZS18093376_ins2,116t,20.819342,"(53, 59, 12)",0
53,ZS18093376_ins3,116t,7.324219,"(24, 22, 5)",0
54,ZS18093376_ins4,116t,5.567263,"(18, 13, 2)",0
55,ZS18093376_ins5,116t,4.632242,"(16, 15, 5)",0
...,...,...,...,...,...
1309,ZS23143023_1_ins5,Int/zs2,5.879685,"(12, 10, 2)",0
1310,ZS23143023_1_ins6,Int/zs2,4.606436,"(11, 9, 1)",0
1311,ZS23143023_1_ins7,Int/zs2,5.190501,"(10, 9, 1)",0
1312,ZS23143023_1_ins8,Int/zs2,5.694289,"(11, 14, 1)",0


In [8]:
# Lymph-node (LN) level and patient-level summary of the internal cohort.
# Relies on `all_internal` and `ene_list` built by earlier cells.

# LN-level accumulators: RECIST values per class, plus one T / N / primary-site
# entry for every LN that is counted.
neg_lns, meta_lns, ene_lns = [], [], []
Tstage, Nstage = [], []
region = []

# Patient-level accumulators, all keyed by patient id.
plvl_neg, plvl_meta, plvl_ene = {}, {}, {}
plvl_Tstage, plvl_Nstage = {}, {}
plvl_region = {}

internal_metadata = pd.read_csv('AI_internal.csv')
# Loop-invariant: strip trailing spaces from the metadata IDs once, not per row.
internal_ids = internal_metadata['ID'].str.rstrip(' ')

for idx, row in all_internal.iterrows():
    pid = row['basename'].split('_')[0]  # ZSxxxx_1
    pid = pid.split('-')[0]  # ZSxxxx-1
    if pid[-1].isalpha():    # ZSxxxxa
        pid = pid[:-1]

    meta_row = internal_metadata[internal_ids == pid]
    if len(meta_row) == 0:
        # no metadata for this patient: the LN is left out of every statistic
        continue
    if len(meta_row) > 1:
        print("Warning: duplicate record found! {}".format(pid))

    # Register the patient in every per-patient dict before any filtering so
    # that all six dicts end up with identical key sets (asserted below).
    for per_patient in (plvl_neg, plvl_meta, plvl_ene):
        per_patient.setdefault(pid, [])
    for per_patient in (plvl_Tstage, plvl_Nstage, plvl_region):
        per_patient.setdefault(pid, set())

    if row['label'] == 0:
        if float(row['recist']) < 5.0:
            continue  # skip small negative LNs (they add no stage/site entry either)
        ln_sizes, patient_sizes = neg_lns, plvl_neg[pid]
    elif row['basename'] in ene_list:
        ln_sizes, patient_sizes = ene_lns, plvl_ene[pid]
    else:
        ln_sizes, patient_sizes = meta_lns, plvl_meta[pid]
    recist = np.round(float(row['recist']), 4)
    ln_sizes.append(recist)
    patient_sizes.append(recist)

    # The first matching metadata row supplies stage and primary site.
    t_stage = str(meta_row['T'].values[0])
    n_stage = str(meta_row['N'].values[0])
    primary_site = meta_row['Primary Site'].values[0]

    Tstage.append(t_stage)
    Nstage.append(n_stage)
    region.append(primary_site)

    plvl_Tstage[pid].add(t_stage)
    plvl_Nstage[pid].add(n_stage)
    plvl_region[pid].add(primary_site)

Tstage = Counter(Tstage)
Nstage = Counter(Nstage)
region = Counter(region)

for k, v in region.items():
    print("{}:{}".format(k, v))
for k, v in Tstage.items():
    print("T{} {}".format(k, v))
for k, v in Nstage.items():
    print("N{} {}".format(k, v))
print("Negative LN {}".format(len(neg_lns)))
print("Positive LN (ENE-) {}".format(len(meta_lns)))
print("Positive LN (ENE+) {}".format(len(ene_lns)))

print("RECIST stats:")
for title, sizes in (("Meta (ENE+)", ene_lns), ("Meta (ENE-)", meta_lns), ("Negative", neg_lns)):
    if sizes:
        print("{} RECIST range: {} - {}, mean: {}, median: {}".format(
            title, min(sizes), max(sizes), np.mean(sizes), np.median(sizes)))
    else:
        # min()/max() raise on an empty list; report the empty group instead
        print("{} RECIST range: n/a (no LNs)".format(title))
all_lns = neg_lns + meta_lns + ene_lns
# a value of 10 is reported as 1 cm below, so sizes are presumably in mm
all_gt_1cm = [ln >= 10 for ln in all_lns]
if all_lns:
    print("{}/{} ({}%) LNs SAD >= 1cm".format(sum(all_gt_1cm), len(all_lns), sum(all_gt_1cm) *100 / len(all_lns)))
else:
    print("0/0 (n/a) LNs SAD >= 1cm")

print("================================================")
print("Patient level statistics:")

# One entry per patient in exactly one of the three lists below.
plvl_neg_cnt = []
plvl_meta_cnt = []
plvl_ene_cnt = []
plvl_Tstage_cnt = []
plvl_Nstage_cnt = []
plvl_region_cnt = []

assert set(plvl_neg.keys()) == set(plvl_meta.keys()) == set(plvl_ene.keys())
assert set(plvl_ene.keys()) == set(plvl_Tstage.keys()) == set(plvl_Nstage.keys()) == set(plvl_region.keys())

for k in plvl_neg:  # all the dicts have the same keys, covering all patients
    if plvl_ene[k]:
        # ENE+ patients are not counted in nodal meta ENE- patients
        plvl_ene_cnt.append(k)
    elif plvl_meta[k]:
        plvl_meta_cnt.append(k)
    else:
        # no positive LN at all (also covers patients whose LNs were all filtered out)
        plvl_neg_cnt.append(k)

    # Same checks for T stage, N stage and primary site. The message is formatted
    # inside print(); formatting the return value of print() (None) raised
    # AttributeError whenever a patient had several N stages or primary sites.
    for plural, singular, per_patient, collected in (
        ("T stages", "T stage", plvl_Tstage, plvl_Tstage_cnt),
        ("N stages", "N stage", plvl_Nstage, plvl_Nstage_cnt),
        ("primary sites", "primary site", plvl_region, plvl_region_cnt),
    ):
        values = per_patient[k]
        if len(values) > 1:
            print("patient {} has multiple {}: {}".format(k, plural, values))
        if len(values) == 0:
            print("Warning: patient {} has no {} may due to the RECIST filtering".format(k, singular))
        collected.extend(values)


plvl_region_cnt = Counter(plvl_region_cnt)
plvl_Tstage_cnt = Counter(plvl_Tstage_cnt)
plvl_Nstage_cnt = Counter(plvl_Nstage_cnt)

for k, v in plvl_region_cnt.items():
    print("{}:{}".format(k, v))
for k, v in plvl_Tstage_cnt.items():
    print("T{} {}".format(k, v))
for k, v in plvl_Nstage_cnt.items():
    print("N{} {}".format(k, v))
# The three labels below say "LN" but the numbers are patient counts.
print("Negative LN {}".format(len(plvl_neg_cnt)))
print("Positive LN (ENE-) {}".format(len(plvl_meta_cnt)))
print("Positive LN (ENE+) {}".format(len(plvl_ene_cnt)))


larynx:589
hypopharynx:422
T2 306
T3 427
T4 248
T1 21
T1a 9
N1 93
N3 517
N0 184
N2b 155
N2a 24
N2c 38
Negative LN 691
Positive LN (ENE-) 226
Positive LN (ENE+) 94
RECIST stats:
Meta (ENE+) RECIST range: 4.8998 - 37.2287, mean: 21.058099999999996, median: 21.60685
Meta (ENE-) RECIST range: 3.5373 - 27.3101, mean: 11.113877876106194, median: 10.10985
Negative RECIST range: 5.0024 - 30.4913, mean: 7.37478046309696, median: 6.4288
281/1011 (27.794263105835807%) LNs SAD >= 1cm
Patient level statistics:
larynx:108
hypopharynx:63
T2 53
T3 73
T4 40
T1 3
T1a 2
N1 14
N3 75
N0 37
N2b 29
N2a 9
N2c 7
Negative LN 49
Positive LN (ENE-) 55
Positive LN (ENE+) 68


#### Renji data

In [25]:
# Lymph-node (LN) level and patient-level summary of the Renji cohort.
# The rows come from the cropping list of the last internal patch directory,
# restricted to paths containing 'Ext' and not containing 'XM'.
# Relies on `internal_data_pths` and `ene_list` from earlier cells.
renji_crop_pth = internal_data_pths[-1]

# LN-level accumulators: RECIST values per class, plus one T / N / primary-site
# entry for every LN that is counted.
neg_lns, meta_lns, ene_lns = [], [], []
Tstage, Nstage = [], []
region = []

# Patient-level accumulators, all keyed by patient id.
plvl_neg, plvl_meta, plvl_ene = {}, {}, {}
plvl_Tstage, plvl_Nstage = {}, {}
plvl_region = {}

external_metadata = pd.read_csv('AI_external.csv')
# Loop-invariant: strip trailing spaces from the metadata IDs once, not per row.
external_ids = external_metadata['ID'].str.rstrip(' ')
cropfile = osp.join(renji_crop_pth, "cropping_list.csv")
df = pd.read_csv(cropfile)
for idx, row in df.iterrows():
    if 'Ext' not in row['path']:
        continue
    if 'XM' in row['path']:
        print("Skipped XM")
        continue

    # NOTE(review): the id is taken as the text before the first space, unlike
    # the other cohorts which split on '_' or '_ins' - confirm the naming scheme.
    pid = row['basename'].split(' ')[0]

    meta_row = external_metadata[external_ids == pid]
    if len(meta_row) == 0:
        print("Warning: {} not found in external metadata".format(pid))
        continue

    # Register the patient in every per-patient dict before any filtering so
    # that all six dicts end up with identical key sets (asserted below).
    for per_patient in (plvl_neg, plvl_meta, plvl_ene):
        per_patient.setdefault(pid, [])
    for per_patient in (plvl_Tstage, plvl_Nstage, plvl_region):
        per_patient.setdefault(pid, set())

    if row['label'] == 0:
        if float(row['recist']) < 5.0:
            continue  # skip small negative LNs (they add no stage/site entry either)
        ln_sizes, patient_sizes = neg_lns, plvl_neg[pid]
    elif row['basename'] in ene_list:
        ln_sizes, patient_sizes = ene_lns, plvl_ene[pid]
    else:
        ln_sizes, patient_sizes = meta_lns, plvl_meta[pid]
    recist = np.round(float(row['recist']), 4)
    ln_sizes.append(recist)
    patient_sizes.append(recist)

    # The first matching metadata row supplies stage and primary site.
    t_stage = str(meta_row['T'].values[0])
    n_stage = str(meta_row['N'].values[0])
    primary_site = meta_row['Primary Site'].values[0]

    Tstage.append(t_stage)
    Nstage.append(n_stage)
    region.append(primary_site)

    plvl_Tstage[pid].add(t_stage)
    plvl_Nstage[pid].add(n_stage)
    plvl_region[pid].add(primary_site)

Tstage = Counter(Tstage)
Nstage = Counter(Nstage)
region = Counter(region)

# Counter returns 0 for a site that never occurred.
print("Larynx LN {}".format(region['larynx']))
print("Hypopharynx LN {}".format(region['hypopharynx']))
for k, v in Tstage.items():
    print("T{} {}".format(k, v))
for k, v in Nstage.items():
    print("N{} {}".format(k, v))
print("Negative LN {}".format(len(neg_lns)))
print("Positive LN (ENE-) {}".format(len(meta_lns)))
print("Positive LN (ENE+) {}".format(len(ene_lns)))
print("RECIST stats:")
for title, sizes in (("Meta (ENE+)", ene_lns), ("Meta (ENE-)", meta_lns), ("Negative", neg_lns)):
    if sizes:
        print("{} RECIST range: {} - {}, mean: {}, median: {}".format(
            title, min(sizes), max(sizes), np.mean(sizes), np.median(sizes)))
    else:
        # min()/max() raise on an empty list; report the empty group instead
        print("{} RECIST range: n/a (no LNs)".format(title))
all_lns = neg_lns + meta_lns + ene_lns
# a value of 10 is reported as 1 cm below, so sizes are presumably in mm
all_gt_1cm = [ln >= 10 for ln in all_lns]
if all_lns:
    print("{}/{} ({}%) LNs SAD >= 1cm".format(sum(all_gt_1cm), len(all_lns), sum(all_gt_1cm) / len(all_lns) * 100))
else:
    print("0/0 (n/a) LNs SAD >= 1cm")

print("================================================")
print("Patient level statistics:")

# One entry per patient in exactly one of the three lists below.
plvl_neg_cnt = []
plvl_meta_cnt = []
plvl_ene_cnt = []
plvl_Tstage_cnt = []
plvl_Nstage_cnt = []
plvl_region_cnt = []

assert set(plvl_neg.keys()) == set(plvl_meta.keys()) == set(plvl_ene.keys())
assert set(plvl_ene.keys()) == set(plvl_Tstage.keys()) == set(plvl_Nstage.keys()) == set(plvl_region.keys())

for k in plvl_neg:  # all the dicts have the same keys, covering all patients
    if plvl_ene[k]:
        # ENE+ patients are not counted in nodal meta ENE- patients
        plvl_ene_cnt.append(k)
    elif plvl_meta[k]:
        plvl_meta_cnt.append(k)
    else:
        # no positive LN at all (also covers patients whose LNs were all filtered out)
        plvl_neg_cnt.append(k)

    # Same checks for T stage, N stage and primary site. The message is formatted
    # inside print(); formatting the return value of print() (None) raised
    # AttributeError whenever a patient had several N stages or primary sites.
    for plural, singular, per_patient, collected in (
        ("T stages", "T stage", plvl_Tstage, plvl_Tstage_cnt),
        ("N stages", "N stage", plvl_Nstage, plvl_Nstage_cnt),
        ("primary sites", "primary site", plvl_region, plvl_region_cnt),
    ):
        values = per_patient[k]
        if len(values) > 1:
            print("patient {} has multiple {}: {}".format(k, plural, values))
        if len(values) == 0:
            print("Warning: patient {} has no {} may due to the RECIST filtering".format(k, singular))
        collected.extend(values)


plvl_region_cnt = Counter(plvl_region_cnt)
plvl_Tstage_cnt = Counter(plvl_Tstage_cnt)
plvl_Nstage_cnt = Counter(plvl_Nstage_cnt)

for k, v in plvl_region_cnt.items():
    print("{}:{}".format(k, v))
for k, v in plvl_Tstage_cnt.items():
    print("T{} {}".format(k, v))
for k, v in plvl_Nstage_cnt.items():
    print("N{} {}".format(k, v))
# The three labels below say "LN" but the numbers are patient counts.
print("Negative LN {}".format(len(plvl_neg_cnt)))
print("Positive LN (ENE-) {}".format(len(plvl_meta_cnt)))
print("Positive LN (ENE+) {}".format(len(plvl_ene_cnt)))

Larynx LN 182
Hypopharynx LN 213
T2 103
T3 198
T4 94
N3 173
N2a 31
N2c 85
N2b 106
Negative LN 284
Positive LN (ENE-) 87
Positive LN (ENE+) 24
RECIST stats:
Meta (ENE+) RECIST range: 10.5397 - 41.636, mean: 22.970791666666667, median: 21.8886
Meta (ENE-) RECIST range: 5.5227 - 22.5576, mean: 12.4078091954023, median: 11.8293
Negative RECIST range: 5.0105 - 13.6638, mean: 7.187782746478874, median: 6.56285
111/395 (28.10126582278481%) LNs SAD >= 1cm
Patient level statistics:
hypopharynx:28
larynx:21
T2 14
T3 23
T4 12
N3 22
N2a 4
N2c 9
N2b 14
Negative LN 1
Positive LN (ENE-) 27
Positive LN (ENE+) 21


#### CGMH-larynx/oral

In [11]:
# Lymph-node (LN) level and patient-level summary of the CGMH cohorts.
# Reads the cropping lists of the first two external patch directories and
# skips every row whose basename contains 'oral'.
# Relies on `external_data_pths` and `ene_list` from earlier cells.

# LN-level accumulators: RECIST values per class, plus one T / N / primary-site
# entry for every LN that is counted.
neg_lns, meta_lns, ene_lns = [], [], []
Tstage, Nstage = [], []
region = []

# Patient-level accumulators, all keyed by patient id.
plvl_neg, plvl_meta, plvl_ene = {}, {}, {}
plvl_Tstage, plvl_Nstage = {}, {}
plvl_region = {}

# Read the metadata once; it does not depend on the crop directory.
external_metadata = pd.read_csv('AI_external.csv')
# Loop-invariant: strip trailing spaces from the metadata IDs once, not per row.
external_ids = external_metadata['ID'].str.rstrip(' ')

for cgmh_crop_pth in external_data_pths[:2]:
    cropfile = osp.join(cgmh_crop_pth, "cropping_list.csv")
    df = pd.read_csv(cropfile)
    for idx, row in df.iterrows():
        pid = row['basename'].split('_ins')[0].replace('_diagct_ce', '')

        meta_row = external_metadata[external_ids == pid]
        if len(meta_row) == 0:
            print("Warning: {} not found in external metadata".format(pid))
            continue

        # Cohort toggle: oral rows are excluded here; the commented variant
        # apparently switches the cell to an oral-only report.
        if 'oral' in row['basename'].lower():
            continue
        # if 'oral' not in row['basename'].lower():
        #         continue

        # Register the patient in every per-patient dict before any filtering so
        # that all six dicts end up with identical key sets (asserted below).
        for per_patient in (plvl_neg, plvl_meta, plvl_ene):
            per_patient.setdefault(pid, [])
        for per_patient in (plvl_Tstage, plvl_Nstage, plvl_region):
            per_patient.setdefault(pid, set())

        if row['label'] == 0:
            # With the toggle above the 'oral' test is always true; it only
            # matters when the oral-only variant is enabled.
            if 'oral' not in row['basename'].lower() and float(row['recist']) < 5.0:
                continue  # skip small negative LNs
            # if float(row['recist']) < 5.0:
            #     continue # skip small negative LNs
            ln_sizes, patient_sizes = neg_lns, plvl_neg[pid]
        elif row['basename'] in ene_list:
            ln_sizes, patient_sizes = ene_lns, plvl_ene[pid]
        else:
            ln_sizes, patient_sizes = meta_lns, plvl_meta[pid]
        recist = np.round(float(row['recist']), 4)
        ln_sizes.append(recist)
        patient_sizes.append(recist)

        # The first matching metadata row supplies stage and primary site.
        t_stage = str(meta_row['T'].values[0])
        n_stage = str(meta_row['N'].values[0])
        primary_site = meta_row['Primary Site'].values[0]

        Tstage.append(t_stage)
        Nstage.append(n_stage)
        region.append(primary_site)

        plvl_Tstage[pid].add(t_stage)
        plvl_Nstage[pid].add(n_stage)
        plvl_region[pid].add(primary_site)

Tstage = Counter(Tstage)
Nstage = Counter(Nstage)
region = Counter(region)

for k, v in region.items():
    print("{}:{}".format(k, v))
for k, v in Tstage.items():
    print("T{} {}".format(k, v))
for k, v in Nstage.items():
    print("N{} {}".format(k, v))
print("Negative LN {}".format(len(neg_lns)))
print("Positive LN (ENE-) {}".format(len(meta_lns)))
print("Positive LN (ENE+) {}".format(len(ene_lns)))
# NOTE(review): the section below repeats the LN-level report just printed
# (site counts in a different wording, then the same T/N/class counts). Kept so
# the output stays unchanged; it can probably be dropped.
print("Larynx LN {}".format(region['larynx']))
print("Hypopharynx LN {}".format(region['hypopharynx']))
for k, v in Tstage.items():
    print("T{} {}".format(k, v))
for k, v in Nstage.items():
    print("N{} {}".format(k, v))
print("Negative LN {}".format(len(neg_lns)))
print("Positive LN (ENE-) {}".format(len(meta_lns)))
print("Positive LN (ENE+) {}".format(len(ene_lns)))

print("RECIST stats:")
for title, sizes in (("Meta (ENE+)", ene_lns), ("Meta (ENE-)", meta_lns), ("Negative", neg_lns)):
    if sizes:
        print("{} RECIST range: {} - {}, mean: {}, median: {}".format(
            title, min(sizes), max(sizes), np.mean(sizes), np.median(sizes)))
    else:
        # min()/max() raise on an empty list; report the empty group instead
        print("{} RECIST range: n/a (no LNs)".format(title))
all_lns = neg_lns + meta_lns + ene_lns
# a value of 10 is reported as 1 cm below, so sizes are presumably in mm
all_gt_1cm = [ln >= 10 for ln in all_lns]
if all_lns:
    print("{}/{} ({}%) LNs SAD >= 1cm".format(sum(all_gt_1cm), len(all_lns), sum(all_gt_1cm) / len(all_lns) * 100))
else:
    print("0/0 (n/a) LNs SAD >= 1cm")

print("================================================")
print("Patient level statistics:")

# One entry per patient in exactly one of the three lists below.
plvl_neg_cnt = []
plvl_meta_cnt = []
plvl_ene_cnt = []
plvl_Tstage_cnt = []
plvl_Nstage_cnt = []
plvl_region_cnt = []

assert set(plvl_neg.keys()) == set(plvl_meta.keys()) == set(plvl_ene.keys())
assert set(plvl_ene.keys()) == set(plvl_Tstage.keys()) == set(plvl_Nstage.keys()) == set(plvl_region.keys())

for k in plvl_neg:  # all the dicts have the same keys, covering all patients
    if plvl_ene[k]:
        # ENE+ patients are not counted in nodal meta ENE- patients
        plvl_ene_cnt.append(k)
    elif plvl_meta[k]:
        plvl_meta_cnt.append(k)
    else:
        # no positive LN at all (also covers patients whose LNs were all filtered out)
        plvl_neg_cnt.append(k)

    # Same checks for T stage, N stage and primary site. The message is formatted
    # inside print(); formatting the return value of print() (None) raised
    # AttributeError whenever a patient had several N stages or primary sites.
    for plural, singular, per_patient, collected in (
        ("T stages", "T stage", plvl_Tstage, plvl_Tstage_cnt),
        ("N stages", "N stage", plvl_Nstage, plvl_Nstage_cnt),
        ("primary sites", "primary site", plvl_region, plvl_region_cnt),
    ):
        values = per_patient[k]
        if len(values) > 1:
            print("patient {} has multiple {}: {}".format(k, plural, values))
        if len(values) == 0:
            print("Warning: patient {} has no {} may due to the RECIST filtering".format(k, singular))
        collected.extend(values)


plvl_region_cnt = Counter(plvl_region_cnt)
plvl_Tstage_cnt = Counter(plvl_Tstage_cnt)
plvl_Nstage_cnt = Counter(plvl_Nstage_cnt)

for k, v in plvl_region_cnt.items():
    print("{}:{}".format(k, v))
for k, v in plvl_Tstage_cnt.items():
    print("T{} {}".format(k, v))
for k, v in plvl_Nstage_cnt.items():
    print("N{} {}".format(k, v))
# The three labels below say "LN" but the numbers are patient counts.
print("Negative LN {}".format(len(plvl_neg_cnt)))
print("Positive LN (ENE-) {}".format(len(plvl_meta_cnt)))
print("Positive LN (ENE+) {}".format(len(plvl_ene_cnt)))

larynx:75
hypopharynx:119
T2 24
T4 113
T3 47
T1 10
N3 77
N1 36
N0 81
Negative LN 161
Positive LN (ENE-) 10
Positive LN (ENE+) 23
Larynx LN 75
Hypopharynx LN 119
T2 24
T4 113
T3 47
T1 10
N3 77
N1 36
N0 81
Negative LN 161
Positive LN (ENE-) 10
Positive LN (ENE+) 23
RECIST stats:
Meta (ENE+) RECIST range: 5.1053 - 38.1872, mean: 13.697878260869565, median: 11.2478
Meta (ENE-) RECIST range: 6.052 - 15.0198, mean: 8.35881, median: 7.7547999999999995
Negative RECIST range: 5.0131 - 18.8948, mean: 6.810629192546584, median: 6.1146
26/194 (13.402061855670103%) LNs SAD >= 1cm
Patient level statistics:
larynx:11
hypopharynx:13
T2 3
T4 13
T3 7
T1 1
N3 10
N1 3
N0 11
Negative LN 12
Positive LN (ENE-) 2
Positive LN (ENE+) 11


#### TCGA

In [27]:
# Lymph-node (LN) level and patient-level summary of the TCGA cohort.
# Reads the cropping list of the last external patch directory.
# Relies on `external_data_pths` and `ene_list` from earlier cells.
tcga_crop_pth = external_data_pths[-1]

# LN-level accumulators: RECIST values per class, plus one T / N / primary-site
# entry for every LN that is counted.
neg_lns, meta_lns, ene_lns = [], [], []
Tstage, Nstage = [], []
region = []

# Patient-level accumulators, all keyed by patient id.
plvl_neg, plvl_meta, plvl_ene = {}, {}, {}
plvl_Tstage, plvl_Nstage = {}, {}
plvl_region = {}

external_metadata = pd.read_csv('AI_external.csv')
# Loop-invariant: strip trailing spaces from the metadata IDs once, not per row.
external_ids = external_metadata['ID'].str.rstrip(' ')
cropfile = osp.join(tcga_crop_pth, "cropping_list.csv")
df = pd.read_csv(cropfile)
for idx, row in df.iterrows():
    # patient id is the part of the basename before the first '_ins'
    pid = row['basename'].split('_ins')[0]
    meta_row = external_metadata[external_ids == pid]
    if len(meta_row) == 0:
        print("Warning: {} not found in external metadata".format(pid))
        continue

    # Register the patient in every per-patient dict before any filtering so
    # that all six dicts end up with identical key sets (asserted below).
    for per_patient in (plvl_neg, plvl_meta, plvl_ene):
        per_patient.setdefault(pid, [])
    for per_patient in (plvl_Tstage, plvl_Nstage, plvl_region):
        per_patient.setdefault(pid, set())

    if row['label'] == 0:
        if float(row['recist']) < 5.0:
            continue  # skip small negative LNs (they add no stage/site entry either)
        ln_sizes, patient_sizes = neg_lns, plvl_neg[pid]
    elif row['basename'] in ene_list:
        ln_sizes, patient_sizes = ene_lns, plvl_ene[pid]
    else:
        ln_sizes, patient_sizes = meta_lns, plvl_meta[pid]
    recist = np.round(float(row['recist']), 4)
    ln_sizes.append(recist)
    patient_sizes.append(recist)

    # The first matching metadata row supplies stage and primary site.
    t_stage = str(meta_row['T'].values[0])
    n_stage = str(meta_row['N'].values[0])
    primary_site = meta_row['Primary Site'].values[0]

    Tstage.append(t_stage)
    Nstage.append(n_stage)
    region.append(primary_site)

    plvl_Tstage[pid].add(t_stage)
    plvl_Nstage[pid].add(n_stage)
    plvl_region[pid].add(primary_site)

Tstage = Counter(Tstage)
Nstage = Counter(Nstage)
region = Counter(region)

for k, v in region.items():
    print("{}:{}".format(k, v))
for k, v in Tstage.items():
    print("T{} {}".format(k, v))
for k, v in Nstage.items():
    print("N{} {}".format(k, v))
print("Negative LN {}".format(len(neg_lns)))
print("Positive LN (ENE-) {}".format(len(meta_lns)))
print("Positive LN (ENE+) {}".format(len(ene_lns)))

print("RECIST stats:")
for title, sizes in (("Meta (ENE+)", ene_lns), ("Meta (ENE-)", meta_lns), ("Negative", neg_lns)):
    if sizes:
        print("{} RECIST range: {} - {}, mean: {}, median: {}".format(
            title, min(sizes), max(sizes), np.mean(sizes), np.median(sizes)))
    else:
        # min()/max() raise on an empty list; report the empty group instead
        print("{} RECIST range: n/a (no LNs)".format(title))
all_lns = neg_lns + meta_lns + ene_lns
# a value of 10 is reported as 1 cm below, so sizes are presumably in mm
all_gt_1cm = [ln >= 10 for ln in all_lns]
if all_lns:
    print("{}/{} ({}%) LNs SAD >= 1cm".format(sum(all_gt_1cm), len(all_lns), sum(all_gt_1cm) / len(all_lns) * 100))
else:
    print("0/0 (n/a) LNs SAD >= 1cm")

print("================================================")
print("Patient level statistics:")

# One entry per patient in exactly one of the three lists below.
plvl_neg_cnt = []
plvl_meta_cnt = []
plvl_ene_cnt = []
plvl_Tstage_cnt = []
plvl_Nstage_cnt = []
plvl_region_cnt = []

assert set(plvl_neg.keys()) == set(plvl_meta.keys()) == set(plvl_ene.keys())
assert set(plvl_ene.keys()) == set(plvl_Tstage.keys()) == set(plvl_Nstage.keys()) == set(plvl_region.keys())

for k in plvl_neg:  # all the dicts have the same keys, covering all patients
    if plvl_ene[k]:
        # ENE+ patients are not counted in nodal meta ENE- patients
        plvl_ene_cnt.append(k)
    elif plvl_meta[k]:
        plvl_meta_cnt.append(k)
    else:
        # no positive LN at all (also covers patients whose LNs were all filtered out)
        plvl_neg_cnt.append(k)

    # Same checks for T stage, N stage and primary site. The message is formatted
    # inside print(); formatting the return value of print() (None) raised
    # AttributeError whenever a patient had several N stages or primary sites.
    for plural, singular, per_patient, collected in (
        ("T stages", "T stage", plvl_Tstage, plvl_Tstage_cnt),
        ("N stages", "N stage", plvl_Nstage, plvl_Nstage_cnt),
        ("primary sites", "primary site", plvl_region, plvl_region_cnt),
    ):
        values = per_patient[k]
        if len(values) > 1:
            print("patient {} has multiple {}: {}".format(k, plural, values))
        if len(values) == 0:
            print("Warning: patient {} has no {} may due to the RECIST filtering".format(k, singular))
        collected.extend(values)


plvl_region_cnt = Counter(plvl_region_cnt)
plvl_Tstage_cnt = Counter(plvl_Tstage_cnt)
plvl_Nstage_cnt = Counter(plvl_Nstage_cnt)

for k, v in plvl_region_cnt.items():
    print("{}:{}".format(k, v))
for k, v in plvl_Tstage_cnt.items():
    print("T{} {}".format(k, v))
for k, v in plvl_Nstage_cnt.items():
    print("N{} {}".format(k, v))
# The three labels below say "LN" but the numbers are patient counts.
print("Negative LN {}".format(len(plvl_neg_cnt)))
print("Positive LN (ENE-) {}".format(len(plvl_meta_cnt)))
print("Positive LN (ENE+) {}".format(len(plvl_ene_cnt)))

larynx:143
hypopharynx:67
T3 47
T2 15
T4 148
N1 14
N0 130
N2c 5
N2b 34
N3 12
N2a 15
Negative LN 192
Positive LN (ENE-) 17
Positive LN (ENE+) 1
RECIST stats:
Meta (ENE+) RECIST range: 9.2882 - 9.2882, mean: 9.2882, median: 9.2882
Meta (ENE-) RECIST range: 5.3891 - 26.2947, mean: 11.033323529411764, median: 9.9933
Negative RECIST range: 5.0272 - 12.5239, mean: 6.628421354166666, median: 6.2339
14/210 (6.666666666666667%) LNs SAD >= 1cm
Patient level statistics:
larynx:14
hypopharynx:5
T3 5
T2 3
T4 11
N1 2
N0 12
N2c 1
N2b 2
N3 1
N2a 1
Negative LN 12
Positive LN (ENE-) 6
Positive LN (ENE+) 1
