In [2]:
import tqdm
import pandas as pd 
import numpy as np 
import os
from itertools import chain
from sklearn.model_selection import train_test_split

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

In [4]:
img_labels_df = pd.read_csv('data/image_labels.csv')

img_labels_df.head()

Unnamed: 0,img_name,diagnosis,follow_up_num,patient_id,patient_age,patient_gender,view_position,original_width,original_height,original_x_spacing,original_y_spacing
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168


In [5]:
all_diagnoses = list(set(chain(*img_labels_df['diagnosis'].map(lambda x: x.split('|')).tolist())))
for diagnosis in all_diagnoses:
    img_labels_df[diagnosis] = img_labels_df['diagnosis'].map(lambda diagnoses: diagnosis in diagnoses)

diagnosis_counts = {diagnosis: img_labels_df[diagnosis].sum() for diagnosis in all_diagnoses}

print(f"Dataframe Shape: {img_labels_df.shape}")
img_labels_df.head(10)

Dataframe Shape: (112120, 26)


Unnamed: 0,img_name,diagnosis,follow_up_num,patient_id,patient_age,patient_gender,view_position,original_width,original_height,original_x_spacing,...,Pneumonia,Cardiomegaly,No Finding,Edema,Hernia,Consolidation,Effusion,Nodule,Pneumothorax,Atelectasis
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,...,False,True,False,False,False,False,False,False,False,False
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,False,True,False,False,False,False,False,False,False,False
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,False,True,False,False,False,False,True,False,False,False
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,...,False,False,True,False,False,False,False,False,False,False
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,...,False,False,False,False,True,False,False,False,False,False
5,00000003_002.png,Hernia,1,3,75,F,PA,2048,2500,0.168,...,False,False,False,False,True,False,False,False,False,False
6,00000003_003.png,Hernia|Infiltration,2,3,76,F,PA,2698,2991,0.143,...,False,False,False,False,True,False,False,False,False,False
7,00000003_004.png,Hernia,3,3,77,F,PA,2500,2048,0.168,...,False,False,False,False,True,False,False,False,False,False
8,00000003_005.png,Hernia,4,3,78,F,PA,2686,2991,0.143,...,False,False,False,False,True,False,False,False,False,False
9,00000003_006.png,Hernia,5,3,79,F,PA,2992,2991,0.143,...,False,False,False,False,True,False,False,False,False,False


In [6]:
img_labels_df['vectorized_diagnoses'] = img_labels_df.apply(lambda x: [x[all_diagnoses].values], 1).map(lambda x: [float(val) for val in x[0]])

In [15]:
split: list[pd.DataFrame] = train_test_split(img_labels_df, test_size=0.15, random_state=2003)
training_df, validation_df = split
print(f"Training Size: {len(training_df)}\nValidation Size: {len(validation_df)}")

Training Size: 95302
Validation Size: 16818


In [18]:
training_df.sample(10)

Unnamed: 0,img_name,diagnosis,follow_up_num,patient_id,patient_age,patient_gender,view_position,original_width,original_height,original_x_spacing,...,Cardiomegaly,No Finding,Edema,Hernia,Consolidation,Effusion,Nodule,Pneumothorax,Atelectasis,vectorized_diagnoses
26448,00006948_001.png,Mass,1,6948,50,F,AP,2500,2048,0.171,...,False,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
51927,00013111_095.png,Atelectasis|Infiltration,60,13111,23,M,PA,2992,2991,0.143,...,False,False,False,False,False,False,False,False,True,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
96126,00025290_026.png,Infiltration|Pleural_Thickening,26,25290,45,F,AP,3056,2544,0.139,...,False,False,False,False,False,False,False,False,False,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
88504,00021920_001.png,No Finding,1,21920,31,M,PA,2710,2485,0.143,...,False,True,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
44172,00011379_041.png,Pneumothorax,41,11379,9,M,AP,2048,2500,0.168,...,False,False,False,False,False,False,False,True,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
15813,00004176_000.png,Infiltration,0,4176,37,F,PA,2048,2500,0.171,...,False,False,False,False,False,False,False,False,False,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
29174,00007602_000.png,Pleural_Thickening,0,7602,61,M,PA,2500,2048,0.168,...,False,False,False,False,False,False,False,False,False,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
74643,00018329_001.png,Infiltration,1,18329,43,M,AP,2500,2048,0.168,...,False,False,False,False,False,False,False,False,False,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
15545,00004069_002.png,No Finding,8,4069,44,F,PA,1950,1790,0.194311,...,False,True,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
103706,00027685_003.png,Cardiomegaly|Effusion,3,27685,61,F,PA,2690,2778,0.143,...,True,False,False,False,False,True,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [17]:
validation_df.sample(10)

Unnamed: 0,img_name,diagnosis,follow_up_num,patient_id,patient_age,patient_gender,view_position,original_width,original_height,original_x_spacing,...,Cardiomegaly,No Finding,Edema,Hernia,Consolidation,Effusion,Nodule,Pneumothorax,Atelectasis,vectorized_diagnoses
40299,00010499_000.png,No Finding,0,10499,28,M,AP,2500,2048,0.168,...,False,True,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
27422,00007136_001.png,Mass,1,7136,28,F,PA,2048,2500,0.168,...,False,False,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
21302,00005682_001.png,Pleural_Thickening,1,5682,73,M,PA,2500,2048,0.171,...,False,False,False,False,False,False,False,False,False,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50618,00012834_023.png,Infiltration,23,12834,33,M,AP,2048,2500,0.168,...,False,False,False,False,False,False,False,False,False,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
42482,00010936_004.png,Infiltration,0,10936,35,F,PA,2810,2829,0.143,...,False,False,False,False,False,False,False,False,False,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
66552,00016453_000.png,Effusion,0,16453,57,M,PA,2992,2991,0.143,...,False,False,False,False,False,True,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20982,00005603_000.png,No Finding,0,5603,9,F,PA,2048,2500,0.171,...,False,True,False,False,False,False,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
44561,00011460_035.png,Infiltration|Pneumothorax,35,11460,16,M,PA,2992,2991,0.143,...,False,False,False,False,False,False,False,True,False,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
40538,00010535_008.png,Cardiomegaly|Effusion,8,10535,63,M,PA,2992,2991,0.143,...,True,False,False,False,False,True,False,False,False,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
17243,00004623_004.png,Pleural_Thickening,4,4623,54,F,AP,2500,2048,0.168,...,False,False,False,False,False,False,False,False,False,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
