## Compare distributions in two datsets

## Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

In [2]:
all_xray_df = pd.read_csv('Data_Entry_2017.csv')
all_image_paths = {os.path.basename(x): x for x in 
                   glob(os.path.join(os.getcwd(), './data/images_all', '*.png'))}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)

Scans found: 0 , Total Headers 112120


In [3]:
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))
from itertools import chain
all_labels = np.unique(list(chain(*all_xray_df['Finding Labels'].map(lambda x: x.split('|')).tolist())))
all_labels = [x for x in all_labels if len(x)>0]
# print('All Labels ({}): {}'.format(len(all_labels), all_labels))
for c_label in all_labels:
    if len(c_label)>1: # leave out empty labels
        all_xray_df[c_label] = all_xray_df['Finding Labels'].map(lambda finding: 1.0 if c_label in finding else 0)

In [4]:
all_xray_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000002_000.png,,0,2,81,M,PA,2500,2048,0.171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(lambda x: x.replace('|', ','))

In [6]:
all_xray_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,"Cardiomegaly,Emphysema",1,1,58,M,PA,2894,2729,0.143,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,"Cardiomegaly,Effusion",2,1,58,M,PA,2500,2048,0.168,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000002_000.png,,0,2,81,M,PA,2500,2048,0.171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer


classes = ['Atelectasis', 'Cardiomegaly', 'Consolidation',
           'Edema', 'Effusion', 'Emphysema', 
           'Fibrosis', "Infiltration", 'Hernia', 
           "Mass", 'Nodule', 'Pleural_Thickening', 
           'Pneumonia', 'Pneumothorax']

encoder = MultiLabelBinarizer(classes=classes)
labels = encoder.fit_transform([c.split(",") for c in list(all_xray_df["Finding Labels"])])



In [8]:
labels

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
df = pd.DataFrame()

In [10]:
df["labels"] = labels.tolist()

In [11]:
df["labels"] = df["labels"].astype(str)

In [12]:
df["Image Index"] = all_xray_df["Image Index"]

In [13]:
df.head()

Unnamed: 0,labels,Image Index
0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",00000001_000.png
1,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",00000001_001.png
2,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",00000001_002.png
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",00000002_000.png
4,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",00000003_000.png


In [14]:
def get_class_count(df):
    grp = df.groupby(["labels"])["Image Index"].nunique()
    
    return {key: grp[key] for key in list(grp.keys())}



def get_probibilities_from_both_df(dr_orig, df_new):
    class_count_orig = get_class_count(dr_orig)
    
    class_count_new = get_class_count(df_new)
    
    probabilites_orig, probabilites_new = [], [] 
    
    for c, count_o in class_count_orig.items():
        probabilites_orig.append(float(count_o)/len(dr_orig))
        
        try:
            count_new = class_count_new[c]
        except KeyError as e:
            count_new = 0.0
        
        probabilites_new.append(float(count_new)/len(df_new))
        
    return probabilites_orig, probabilites_new




## Count divergence for nomal spit with seed

In [15]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df, 
                                   test_size = 0.2, 
                                   random_state = 2137)

## Train divergence

In [16]:
P, Q = get_probibilities_from_both_df(df, train_df)
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)



tensor(-0.0035)

## Test divergence

In [17]:
P, Q = get_probibilities_from_both_df(df, valid_df)
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)

tensor(-0.0034)

## Train test divergence

In [18]:
P, Q = get_probibilities_from_both_df(train_df, valid_df)  
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)

tensor(-0.0037)

# With stratify

In [19]:
train_df, valid_df = train_test_split(df, 
                                   test_size = 0.2, 
                                   random_state = 2137,
                                   stratify=df["labels"].map(lambda x:x[:13]))

## Train divergence

In [20]:
P, Q = get_probibilities_from_both_df(df, train_df)  
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)



tensor(-0.0035)

## Test divergence

In [21]:
P, Q = get_probibilities_from_both_df(df, valid_df)  
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)

tensor(-0.0035)

## Test-Train divergence

In [22]:
P, Q = get_probibilities_from_both_df(train_df, valid_df)  
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)

tensor(-0.0038)

# With official data split

In [23]:
train_val_list = pd.read_fwf('train_val_list.txt', header=None)
train_val_list = train_val_list.squeeze()
train_df = all_xray_df.loc[all_xray_df['Image Index'].isin(train_val_list)]
test_list = pd.read_fwf('test_list.txt', header=None)
test_list = test_list.squeeze()
test_df = all_xray_df.loc[all_xray_df['Image Index'].isin(test_list)]

In [24]:
len(train_df)

86524

In [25]:
len(test_df)

25596

In [26]:
official_train_labels = encoder.fit_transform([c.split(",") for c in list(train_df["Finding Labels"])])

official_test_labels = encoder.fit_transform([c.split(",") for c in list(test_df["Finding Labels"])])



In [27]:
df_official_train = pd.DataFrame()
df_official_test = pd.DataFrame()

In [28]:
df_official_train["Image Index"] = train_df["Image Index"]
df_official_train["labels"] = official_train_labels.tolist()
df_official_train["labels"] = df_official_train["labels"].astype(str)

df_official_train.head()

Unnamed: 0,Image Index,labels
0,00000001_000.png,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,00000001_001.png,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,00000001_002.png,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,00000002_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
12,00000004_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]"


In [29]:
df_official_test["Image Index"] = test_df["Image Index"]
df_official_test["labels"] = official_test_labels.tolist()
df_official_test["labels"] = df_official_test["labels"].astype(str)

In [30]:
df_official_test.head()

Unnamed: 0,Image Index,labels
4,00000003_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
5,00000003_001.png,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
6,00000003_002.png,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
7,00000003_003.png,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]"
8,00000003_004.png,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"


## Train divergence

In [31]:
P, Q = get_probibilities_from_both_df(df, df_official_train)
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)

tensor(-0.0032)

## Test divergence

In [32]:
P, Q = get_probibilities_from_both_df(df, df_official_test)
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)

tensor(-0.0043)

## Train test divergence

In [33]:
P, Q = get_probibilities_from_both_df(df_official_train, df_official_test)
P = torch.tensor(P)
Q = torch.tensor(Q)
F.kl_div(P, Q)

tensor(-0.0055)