In [1]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.utils import shuffle

# Load the per-image metadata table of the sample dataset.
df = pd.read_csv('sample/sample_labels.csv')

diseases = ['Cardiomegaly','Emphysema','Effusion','Hernia','Nodule','Pneumothorax','Atelectasis','Pleural_Thickening','Mass','Edema','Consolidation','Infiltration','Fibrosis','Pneumonia']
# Add one 0/1 indicator column per disease: 1 when the disease name occurs
# as a substring of the row's 'Finding Labels' string, 0 otherwise.
for disease in diseases:
    # Bind the current name as a default argument so the lambda never depends
    # on the value the loop variable holds at call time.
    df[disease] = df['Finding Labels'].apply(lambda x, name=disease: 1 if name in x else 0)

# #test to perfect
# df = df.drop(df[df['Emphysema']==0][:-127].index.values)

# 'Patient Age' is handled as a string: everything but the last character is
# the numeric value, the last character is a unit suffix kept in 'Age Type'.
df['Age'] = df['Patient Age'].apply(lambda x: x[:-1]).astype(int)
df['Age Type'] = df['Patient Age'].apply(lambda x: x[-1:])
# Ages flagged 'M' or 'D' are converted to whole years (rounded) by dividing
# by 12 or 365 respectively; every other suffix is left untouched.
for unit, per_year in (('M', 12.), ('D', 365.)):
    in_unit = df['Age Type'] == unit
    df.loc[in_unit, 'Age'] = (
        df.loc[in_unit, 'Age']
        .apply(lambda x, divisor=per_year: round(x / divisor))
        .astype(int)
    )
# Remove outliers: drop the single row with the largest age, then scale the
# remaining ages by the new maximum so the largest value becomes 1.
df = df.drop(df['Age'].sort_values(ascending=False).head(1).index)
df['Age'] = df['Age']/df['Age'].max()

# One-hot encode gender and view position; the new columns are named after
# the category values (the selection below relies on 'F', 'M', 'AP', 'PA').
# df = df.drop(df.index[4242])
df = df.join(pd.get_dummies(df['Patient Gender']))
df = df.join(pd.get_dummies(df['View Position']))

# Shuffle the rows before the positional train/valid/test split done later.
# A fixed seed keeps the split identical across kernel restarts, so pickles
# written by different runs contain the same rows.
SHUFFLE_SEED = 42
df = shuffle(df, random_state=SHUFFLE_SEED)

# Tabular side features. The explicit float dtype keeps the array numeric
# even when get_dummies yields boolean columns (mixed float/bool columns
# would otherwise be converted to an object array).
data = np.array(df[['Age', 'F', 'M', 'AP', 'PA']], dtype=float)

# Multi-label matrix with one column per disease. `.values` replaces
# DataFrame.as_matrix(), which no longer exists in pandas >= 1.0.
labels = df[diseases].values
files_list = ('sample/images/' + df['Image Index']).tolist()

# #test to perfect
# labelB = df['Emphysema'].tolist()

# Binary label: 1 when the row has at least one of the listed diseases.
labelB = (df[diseases].sum(axis=1)>0).tolist()
labelB = np.array(labelB, dtype=int)

# RGB images

In [2]:
from keras.preprocessing import image                  
from tqdm import tqdm

def path_to_tensor(img_path, shape):
    """Load one image file as a scaled 4D array with a leading batch axis.

    img_path: path passed to keras' ``image.load_img``.
    shape: forwarded to ``load_img`` as ``target_size``.

    Returns the ``img_to_array`` result divided by 255, with a new axis of
    length 1 inserted in front (e.g. (1, 64, 64, 3) for shape (64, 64) when
    the loaded image has three channels).
    """
    pil_img = image.load_img(img_path, target_size=shape)
    # Dividing by 255 maps 8-bit pixel values into [0, 1].
    pixels = image.img_to_array(pil_img) / 255
    # Prepend the batch axis so single images can be stacked along axis 0.
    return pixels[np.newaxis, ...]

def paths_to_tensor(img_paths, shape):
    """Load every image in ``img_paths`` and stack them along axis 0.

    img_paths: iterable of image paths; it is wrapped in ``tqdm`` so a
        progress bar is displayed while loading.
    shape: passed through unchanged to ``path_to_tensor``.

    Returns the per-image arrays from ``path_to_tensor`` stacked vertically
    into one array, in the same order as ``img_paths``.
    """
    batch = []
    for one_path in tqdm(img_paths):
        batch.append(path_to_tensor(one_path, shape))
    return np.vstack(batch)

# Positional split of the shuffled rows: [0, 3400) is train, [3400, 4500) is
# validation and everything from 4500 on is test.
# Labels get a trailing axis first so each split is a column vector.
train_labels, valid_labels, test_labels = np.split(labelB[:, np.newaxis], [3400, 4500])

# The tabular features are cut at the same row positions.
train_data, valid_data, test_data = np.split(data, [3400, 4500])

# Every image is resized to this (height, width) while loading.
img_shape = (64, 64)
train_paths = files_list[:3400]
valid_paths = files_list[3400:4500]
test_paths = files_list[4500:]
train_tensors = paths_to_tensor(train_paths, shape=img_shape)
valid_tensors = paths_to_tensor(valid_paths, shape=img_shape)
test_tensors = paths_to_tensor(test_paths, shape=img_shape)

100%|██████████| 3400/3400 [00:53<00:00, 64.04it/s]
100%|██████████| 1100/1100 [00:17<00:00, 61.92it/s]
100%|██████████| 1105/1105 [00:17<00:00, 62.04it/s]


In [3]:
import pickle

# Persist each split as one pickled (labels, data, tensors) tuple.
# The files are opened in `with` blocks so every handle is flushed and closed
# as soon as its dump finishes, instead of being left open until garbage
# collection.
train_filename = "data_preprocessed/train_data_sample_rgb.p"
with open(train_filename, 'wb') as train_file:
    pickle.dump((train_labels, train_data, train_tensors), train_file)

valid_filename = "data_preprocessed/valid_data_sample_rgb.p"
with open(valid_filename, 'wb') as valid_file:
    pickle.dump((valid_labels, valid_data, valid_tensors), valid_file)

test_filename = "data_preprocessed/test_data_sample_rgb.p"
with open(test_filename, 'wb') as test_file:
    pickle.dump((test_labels, test_data, test_tensors), test_file)

# Gray images

In [4]:
from keras.preprocessing import image                  
from tqdm import tqdm

def path_to_tensor(img_path, shape):
    """Load one image file in grayscale as a scaled 4D array.

    img_path: path passed to keras' ``image.load_img``.
    shape: forwarded to ``load_img`` as ``target_size``.

    Returns the ``img_to_array`` result divided by 255, with a new axis of
    length 1 inserted in front.

    NOTE(review): this redefines the ``path_to_tensor`` from the RGB cell; the
    only difference is ``grayscale=True``. Newer Keras releases reportedly
    prefer ``color_mode='grayscale'`` - confirm against the installed version
    before changing it.
    """
    gray_img = image.load_img(img_path, grayscale=True, target_size=shape)
    # Dividing by 255 maps 8-bit pixel values into [0, 1].
    pixels = image.img_to_array(gray_img) / 255
    # Prepend the batch axis so single images can be stacked along axis 0.
    return pixels[np.newaxis, ...]

def paths_to_tensor(img_paths, shape):
    """Load every image in ``img_paths`` and stack them along axis 0.

    img_paths: iterable of image paths; it is wrapped in ``tqdm`` so a
        progress bar is displayed while loading.
    shape: passed through unchanged to ``path_to_tensor``.

    Returns the per-image arrays from ``path_to_tensor`` stacked vertically
    into one array, in the same order as ``img_paths``.
    """
    loaded = []
    for single_path in tqdm(img_paths):
        loaded.append(path_to_tensor(single_path, shape))
    return np.vstack(loaded)

# Same positional split as in the RGB cell: [0, 3400) is train, [3400, 4500)
# is validation and everything from 4500 on is test.
# Labels get a trailing axis first so each split is a column vector.
train_labels, valid_labels, test_labels = np.split(labelB[:, np.newaxis], [3400, 4500])

# The tabular features are cut at the same row positions.
train_data, valid_data, test_data = np.split(data, [3400, 4500])

# Every image is resized to this (height, width) while loading; the tensors
# are rebuilt here with the grayscale path_to_tensor defined in this cell.
img_shape = (64, 64)
train_paths = files_list[:3400]
valid_paths = files_list[3400:4500]
test_paths = files_list[4500:]
train_tensors = paths_to_tensor(train_paths, shape=img_shape)
valid_tensors = paths_to_tensor(valid_paths, shape=img_shape)
test_tensors = paths_to_tensor(test_paths, shape=img_shape)

100%|██████████| 3400/3400 [00:43<00:00, 78.34it/s]
100%|██████████| 1100/1100 [00:12<00:00, 84.98it/s]
100%|██████████| 1105/1105 [00:13<00:00, 84.73it/s]


In [5]:
import pickle

# Persist each grayscale split as one pickled (labels, data, tensors) tuple.
# The files are opened in `with` blocks so every handle is flushed and closed
# as soon as its dump finishes, instead of being left open until garbage
# collection.
train_filename = "data_preprocessed/train_data_sample_gray.p"
with open(train_filename, 'wb') as train_file:
    pickle.dump((train_labels, train_data, train_tensors), train_file)

valid_filename = "data_preprocessed/valid_data_sample_gray.p"
with open(valid_filename, 'wb') as valid_file:
    pickle.dump((valid_labels, valid_data, valid_tensors), valid_file)

test_filename = "data_preprocessed/test_data_sample_gray.p"
with open(test_filename, 'wb') as test_file:
    pickle.dump((test_labels, test_data, test_tensors), test_file)