# Import libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf 

# Functions/Classes

In [2]:
class Info_Class:
    """Plain container for the settings shared across the notebook.

    The constructor stores its five arguments unchanged as attributes of the
    same name; no validation or conversion is performed.
    """

    def __init__(self, pathologies, class_weights, target_size, steps_per_epoch, validation_steps):
        # label names and their per-class weights
        self.pathologies, self.class_weights = pathologies, class_weights
        # image size handed to the generators
        self.target_size = target_size
        # number of batches per training epoch / per validation pass
        self.steps_per_epoch, self.validation_steps = steps_per_epoch, validation_steps

In [3]:
def cleaning_up_dataframe(data, classes):
    """Keep frontal AP studies and turn the numeric labels into string labels.

    Label structure assumed by this notebook:
        positive (exists):           1.0  -> 'pos'
        negative (doesn't exist):   -1.0  -> 'neg'
        uncertain:                   0.0  -> 'uncertain'
        no mention:                  NaN  -> treated as negative ('neg')

    NOTE(review): the published CheXpert CSVs are described as using 0 for
    negative and -1 for uncertain -- confirm that the table above matches the
    files actually being loaded.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Path', 'AP/PA', 'Frontal/Lateral' and every
        column named in `classes`.
    classes : sequence of str
        Label columns to keep (any sequence: list, tuple, ...).

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ['Path'] + classes, restricted to rows
        where 'AP/PA' == 'AP' and 'Frontal/Lateral' == 'Frontal'. The row
        index of `data` is retained and `data` itself is not modified.
    """
    classes = list(classes)

    # keep only frontal views acquired in AP projection
    is_frontal_ap = (data['AP/PA'] == 'AP') & (data['Frontal/Lateral'] == 'Frontal')
    data = data[is_frontal_ap]

    # changing all no-mention labels (NaN) to negative
    data = data.replace(np.nan, -1.0)

    # keep the image path plus the requested labels; the explicit copy makes
    # the column assignments below write to an independent frame
    data = data[['Path'] + classes].copy()

    label_names = {1: 'pos', -1: 'neg', 0: 'uncertain'}
    for column in classes:
        data[column] = data[column].replace(label_names)

    return data

def removing_uncertain_samples(data, pathologies):
    """Drop every row that is labelled 'uncertain' for any listed pathology.

    The comparison is made against the string 'uncertain', so the label
    columns are expected to already hold string labels rather than the
    numeric codes of the raw table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing one column per name in `pathologies`.
    pathologies : iterable of str
        Label columns to inspect.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` whose inspected columns are all different from
        'uncertain'; `data` itself when `pathologies` is empty.
    """
    columns = list(pathologies)
    if not columns:
        # nothing to inspect: hand the frame back untouched
        return data

    # a row survives only if none of the inspected labels is 'uncertain'
    certain_rows = data[columns].ne('uncertain').all(axis=1)
    return data.loc[certain_rows]



## Selecting the pathologies

In [4]:
# Label columns used as prediction targets, in the order they are fed to the model.
pathologies = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

## Loading the raw table

In [5]:
# Label structure (as assumed throughout this notebook):
#     positive (exist):            1.0
#     negative (doesn't exist):   -1.0
#     uncertain                    0.0
#     no mention                   NaN

# Root folder of the dataset; the image paths in the CSV files are resolved
# against it by the generators further down.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because later cells read this variable.
dir = '/groups/jjrodrig/projects/chest/dataset/chexpert'

# valid.csv is loaded into the variable named `test`
train = pd.read_csv(f'{dir}/train.csv')
test = pd.read_csv(f'{dir}/valid.csv')

print('before sample-pruning')
print('train:', train.shape)
print('test:', test.shape)

before sample-pruning
train: (223414, 19)
test: (234, 19)


## Extracting the pathologies of interest

In [6]:
# Apply the same view filtering and label renaming to both tables.
train, test = (cleaning_up_dataframe(table, pathologies) for table in (train, test))

## Selecting a few cases

In [7]:
# train = train.iloc[:1000,:]

## Separating the uncertain samples

In [8]:
# Split the training table in two: rows with no 'uncertain' label in any
# pathology column stay in `train`; every other row moves to `train_uncertain`.
# The filtering is delegated to removing_uncertain_samples so that the rule
# lives in one place instead of being repeated here.
train_certain = removing_uncertain_samples(train, pathologies)

# rows of the full table whose index label is not among the certain rows
train_uncertain = train.drop(train_certain.index)
train = train_certain

## Splitting train/validation

In [9]:
# Fraction of the certain training rows held out for validation, and the seed
# that makes the random draw repeatable: without a fixed random_state every
# run of this cell would produce a different train/validation split.
VALID_FRACTION = 0.2
SPLIT_SEED = 0

valid = train.sample(frac=VALID_FRACTION, random_state=SPLIT_SEED)
# remove the held-out rows (matched by index label) from the training table
train = train.drop(valid.index)

print('after sample-pruning')
print('train (certain):',train.shape)
print('train (uncertain):',train_uncertain.shape)
print('valid:',valid.shape)
print('test:',test.shape)


after sample-pruning
train (certain): (77894, 14)
train (uncertain): (64223, 14)
valid: (19473, 14)
test: (169, 14)


## Class weights

In [10]:
# Uniform weighting: each of the L pathologies receives the weight 1/L,
# so the weights sum to one.
L = len(pathologies)
class_weights = np.ones(L) / L

# Keras Generator

In [11]:
# Settings shared by the train / validation / test generators below.
batch_size = 30
target_size = (224, 224)   # the author's commented-out alternative was (64,64)
color_mode = 'rgb'
class_mode = 'raw'
y_col = list(pathologies)  # the author's commented-out alternative was 'disease_vector'

# Created without arguments, i.e. with the library's default settings.
generator = tf.keras.preprocessing.image.ImageDataGenerator()

## Train/Validation generator

In [12]:
# Batch size requested from the Keras generators in this cell. Exactly one
# batch is drawn from each generator, so this is also the maximum number of
# images of each split that ends up in memory.
# NOTE(review): the recorded output reports 77894 training files, so most of
# the training table is never read -- confirm this truncation is intended.
IN_MEMORY_BATCH = 10000


def _load_in_memory_split(frame):
    """Read one generator batch of `frame` and wrap it in a tf.data pipeline.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a 'Path' column and the label columns listed in `y_col`.

    Returns
    -------
    tuple
        (keras generator, image array, label array, steps, dataset) where
        `steps` is the number of full `batch_size` batches contained in the
        loaded arrays and `dataset` repeats those arrays indefinitely in
        batches of `batch_size`.
    """
    frame_generator = generator.flow_from_dataframe(
        dataframe=frame,
        x_col='Path',
        y_col=y_col,
        color_mode=color_mode,
        directory=dir,
        target_size=target_size,
        batch_size=IN_MEMORY_BATCH,
        class_mode=class_mode,
        shuffle=False,
    )

    # a single call: only the first batch of the generator is ever used
    images, labels = next(frame_generator)

    # whole batches only; a trailing partial batch is not counted
    steps = images.shape[0] // batch_size

    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    # repeat() makes the dataset endless, so the step counts above are what
    # bound an epoch / a validation pass
    dataset = dataset.repeat().batch(batch_size)

    return frame_generator, images, labels, steps, dataset


train_generator, x_train, y_train, steps_per_epoch, train_data = _load_in_memory_split(train)
valid_generator, x_valid, y_valid, validation_steps, valid_data = _load_in_memory_split(valid)

Info = Info_Class(pathologies, class_weights, target_size, steps_per_epoch, validation_steps)

Found 77894 validated image filenames.
Found 19473 validated image filenames.


## Test generator

In [13]:
# One image per batch, in table order, so predictions line up row by row
# with the `test` frame.
test_generator = generator.flow_from_dataframe(
    dataframe=test,
    x_col='Path',
    y_col=y_col,
    color_mode=color_mode,
    directory=dir,
    target_size=target_size,
    batch_size=1,
    class_mode=class_mode,
    shuffle=False,
)

# Do not overwrite an Info object that already carries real step counts from
# the train/validation cell; only create the placeholder version (empty-string
# step counts) when this cell is run without that cell having been executed.
if 'Info' not in globals():
    Info = Info_Class(pathologies, class_weights, target_size, '', '')

Found 169 validated image filenames.


import pandas as pd
from itertools import chain
import numpy as np
import tensorflow as tf 
import matplotlib.pyplot as plt