In [50]:
import pandas as pd
import numpy as np
def create_dataframe(clinical_characteristics_df, weak_labels_df, full_labels_df):
    
    def _calc_num_slices_columns(row, pool='sum'):
        assert pool in ['sum', 'max']
        num_slices = np.array([float(s) for s in str(row['n_slices']).split('|') if s.isnumeric()])
        if len(num_slices) == 0:
            return float('nan')
        if pool == 'max':
            return num_slices.max()
        elif pool =='sum':
            return num_slices.sum()
        
        
    weak_labels_df = weak_labels_df[["SubjectID", "Gelabeled", "n_slices"]]
    full_labels_df = full_labels_df[['SubjectID', "Gesegmenteerd?", "Data split"]]
    df = pd.merge(weak_labels_df, full_labels_df, on=["SubjectID"], how='outer')
    df = pd.merge(df, clinical_characteristics_df, on=["SubjectID"], how='outer')
    
    df['sum_slices'] = df.apply (lambda row: _calc_num_slices_columns(row, pool='sum'), axis=1)
    df['max_slices'] = df.apply (lambda row: _calc_num_slices_columns(row, pool='max'), axis=1)
    
    #df = pd.concat([clinical_characteristics_df, weak_labels_df, full_labels_df], axis=0, join='inner')
    return df

def visualize_characteristics(df, rel_cols=None, num_cols=None):
    def _visualize_subset(sub_df):
        if rel_cols != None:
            for c in rel_cols:
                assert c in sub_df.columns, f"Column {c} not in dataframe"
        print("----"*20)
        total = len([s for s in sub_df.SubjectID.unique() if not s.isspace()]) # don't count empty subjectID '' 
        print(f"Total: {total}")
        for col in sub_df.columns:
            if rel_cols != None:
                if col not in rel_cols:
                    continue
                    
            unique_vals = sub_df[col].unique()
            # approximation, assume all categorical values have less than 5 values
            categorical = len(unique_vals) < 5
            if categorical == True:
                print(f"Column: {col}")
                for v in unique_vals:
                    count = (sub_df[col] == v).sum()
                    print(f"\t {v}: {count} / ({count/total*100:.2f}%)")
            else:
                if num_cols != None:
                    if col in num_cols:
                        vals = np.array([float(str(x).replace(",", ".")) for x in sub_df[col] if not str(x).isspace()])
                        print(f"Column {col}, mean {np.nanmean(vals):.2f}/ std ({np.nanstd(vals):.2f})")
        return
    
    show_columns = []
    weakly_labeled = df[df['Gelabeled'] == 'Ja']
    weakly_labeled_train = weakly_labeled[~weakly_labeled['Data split'].isin(['validation', 'test'])]
    fully_labeled = df[df['Gesegmenteerd?'] == 'Ja']
    fully_labeled = fully_labeled[~fully_labeled['SubjectID'].isin(['DRAUMC0669', 'DRAUMC0851', 'DRAUMC1231', 'DRAUMC1236'])] # not checked by fleur
    fully_labeled = fully_labeled[~fully_labeled['SubjectID'].isin(['DRAUMC0587'])] # manual segmentation on cropped PSIR image, cropped MAG not available
    fully_labeled_train = fully_labeled[fully_labeled['Data split'].isin(['train'])]
    fully_labeled_val = fully_labeled[fully_labeled['Data split'].isin(['validation'])]
    fully_labeled_test = fully_labeled[fully_labeled['Data split'].isin(['test'])]
    #1. visualize all data
    print("----"*20)
    print("FULL DATASET")
    _visualize_subset(df)
    #2. visualize slice-level weakly-labeled data
    print("----"*20)
    print("SLICE-LABELED TRAIN DATASET")
    _visualize_subset(weakly_labeled_train)
    #3. visualize stack-level weakly labeled data
    
    #4. visualize fully labeled data // Train
    print("----"*20)
    print("FULLY-LABELED TRAIN DATASET")
    _visualize_subset(fully_labeled_train)
    #5. visualize fully labeled data // Val
    print("----"*20)
    print("FULLY-LABELED VALIDATION DATASET")
    _visualize_subset(fully_labeled_val)
    #6. Visualize fully labeled data // Test
    print("----"*20)
    print("FULLY-LABELED TEST DATASET")
    _visualize_subset(fully_labeled_test)
    
    
    return

In [51]:
char_df = pd.read_csv(r"\\amc.intra\users\R\rcklein\home\deeprisk\sheets\DeepRiskICD_Database_n=1261_2sept2021_export8nov2021.csv", sep=None)
weak_labels_df = pd.read_excel(r"\\amc.intra\users\R\rcklein\home\deeprisk\weakly_supervised\data\weak_labels_n=657.xlsx")
full_labels_df = pd.read_excel(r"\\amc.intra\users\R\rcklein\home\deeprisk\sheets\full_labels_15june2022.xlsx")

  char_df = pd.read_csv(r"\\amc.intra\users\R\rcklein\home\deeprisk\sheets\DeepRiskICD_Database_n=1261_2sept2021_export8nov2021.csv", sep=None)


In [52]:
#print(weak_labels_df.columns)
#print(full_labels_df.columns)
#print(char_df.columns)
df = create_dataframe(char_df, weak_labels_df, full_labels_df)
print(len(df))
print(len(df['SubjectID'].unique()))
print(df.columns)

1293
1262
Index(['SubjectID', 'Gelabeled', 'n_slices', 'Gesegmenteerd?', 'Data split',
       'Geslacht', 'Age', 'Ischemische_ziekte',
       'Soort_ischemische_ziekte#Stabiel_coronairlijden',
       'Soort_ischemische_ziekte#Myocard_infarct',
       ...
       'CMRontslag', 'folllow_up_MK_calc', 'day_to_ICDtherapy_1y_MK',
       'day_to_ICDtherapy_2y_MK', 'day_to_ICDtherapy_3y_MK',
       'day_to_ICDtherapy_4y_MK', 'day_to_ICDtherapy_5y_MK',
       'day_to_ICDtherapy_10y_MK', 'sum_slices', 'max_slices'],
      dtype='object', length=704)


In [53]:
# check for double ocurring SubjectIDs
for sid in df['SubjectID'].unique():
    if len(df[df['SubjectID'] == sid]) > 1:
        print("Multiple with subject id :", sid, "!")
        #print(df[df['SubjectID'] == sid])

Multiple with subject id :   !


In [54]:
# check if all segmented patients have weak labels
full = df[(df['Gesegmenteerd?'] == 'Ja')]
full[full['Gelabeled'] != 'Ja']

Unnamed: 0,SubjectID,Gelabeled,n_slices,Gesegmenteerd?,Data split,Geslacht,Age,Ischemische_ziekte,Soort_ischemische_ziekte#Stabiel_coronairlijden,Soort_ischemische_ziekte#Myocard_infarct,...,CMRontslag,folllow_up_MK_calc,day_to_ICDtherapy_1y_MK,day_to_ICDtherapy_2y_MK,day_to_ICDtherapy_3y_MK,day_to_ICDtherapy_4y_MK,day_to_ICDtherapy_5y_MK,day_to_ICDtherapy_10y_MK,sum_slices,max_slices


In [55]:
RELEVANT_COLUMNS = ['Gelabeled', 'n_slices', 'Gesegmenteerd?',
                   'Data split', 'Geslacht', 'Age',
                   'Ischemische_ziekte', 'Soort_ischemische_ziekte#Stabiel_coronairlijden', 'Soort_ischemische_ziekte#Myocard_infarct',
                   'Soort_ischemische_ziekte#PCI', 'Soort_ischemische_ziekte#CABG',
                   'ICD_LGE_aanwezig', 'sum_slices', 'max_slices']
NUMERICAL_COLUMNS = ['Age', 'sum_slices', 'max_slices']
visualize_characteristics(df, rel_cols=RELEVANT_COLUMNS, num_cols=NUMERICAL_COLUMNS)

--------------------------------------------------------------------------------
FULL DATASET
--------------------------------------------------------------------------------
Total: 1261
Column: Gelabeled
	 Ja: 483 / (38.30%)
	 Uitgevallen: 117 / (9.28%)
	 Onduidelijk: 57 / (4.52%)
	 nan: 0 / (0.00%)
Column: Gesegmenteerd?
	 Ja: 125 / (9.91%)
	 nan: 0 / (0.00%)
	 Uitgevallen: 47 / (3.73%)
	 Nog niet: 44 / (3.49%)
Column: Data split
	 train: 88 / (6.98%)
	 nan: 0 / (0.00%)
	 validation: 12 / (0.95%)
	 test: 25 / (1.98%)
Column: Geslacht
	 0: 964 / (76.45%)
	 1: 297 / (23.55%)
	  : 32 / (2.54%)
Column Age, mean 64.74/ std (12.04)
Column: Ischemische_ziekte
	 1: 797 / (63.20%)
	 0: 464 / (36.80%)
	  : 32 / (2.54%)
Column: Soort_ischemische_ziekte#Stabiel_coronairlijden
	 0: 1216 / (96.43%)
	 1: 45 / (3.57%)
	  : 32 / (2.54%)
Column: Soort_ischemische_ziekte#Myocard_infarct
	 1: 628 / (49.80%)
	 0: 633 / (50.20%)
	  : 32 / (2.54%)
Column: Soort_ischemische_ziekte#PCI
	 0: 786 / (62.33%)
	 

In [56]:
for x in df.columns:
    print(x)

SubjectID
Gelabeled
n_slices
Gesegmenteerd?
Data split
Geslacht
Age
Ischemische_ziekte
Soort_ischemische_ziekte#Stabiel_coronairlijden
Soort_ischemische_ziekte#Myocard_infarct
Soort_ischemische_ziekte#PCI
Soort_ischemische_ziekte#CABG
cardiomyopathie
Overig_CMP
Afib
Aflutter
Geleidingsstoornis
Ventriculaire_tachycardie#Nee
Ventriculaire_tachycardie#nsVT
Ventriculaire_tachycardie#sustained_VT
Ventriculaire_tachycardie#VF
OHCA
Ablatie
Pacemaker
NYHA_klasse
ICD_LVEF
ICD_LVEF_AANGEPAST_29_4
ICD_LVEF_6_categorien
ICD_LVEF_5_categorien
ICD_Datum_LVEF
ICD_LVEF_methode
ICD_LVEF_bepaald
Opmerkingen_VG
Vitamine_K
Plaatjes_aggregatie
NOAC
Betablokker
ARB
Aldosteronremmer
Diuretica
Anti_aritmica
Antiaritmica_soort#Amiodarone
Antiaritmica_soort#Digoxine
Antiaritmica_soort#Sotalol
Antiaritmica_soort#Overig
Opmerkingen_med
ICD_Date_ECG
Hartfrequentie
Ritme_ecg
Overig_ritme_ecg
QRS
Geleidingsstoornis_ecg
Hartas
Opmerking_ECG
Diabetes
COPD
CVA
Roken
Hypertensie
Hypercholesterolaemie
Familie_HVZ
Perifee

In [57]:
df['cardiomyopathie'].unique()

array(['1', '2', '3', '0', '4', ' '], dtype=object)

In [58]:
(df[df['Data split'].isin(['train', 'validation', 'test'])]['cardiomyopathie'].isin(['0', '1', '2', '3', '4'])).sum()

125

In [59]:
len(list(df['SubjectID'][(df['Gesegmenteerd?'] == 'Ja') & df['Data split'].isin(['train'])]))

86

In [48]:
len(list(df['SubjectID'][(df['Gesegmenteerd?'] == 'Ja') & df['Data split'].isin(['validation'])]))

12

In [49]:
len(list(df['SubjectID'][(df['Gesegmenteerd?'] == 'Ja') & df['Data split'].isin(['test'])]))

24