In [10]:
import pandas as pd
import numpy as np
import os
import cv2
import PIL
from PIL import Image, ImageFile, Jpeg2KImagePlugin
import pickle


import imageio
from pydicom import dcmread
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [11]:
# Annotation table, plus the folder that holds the training DICOM files.
train_img = 'train/'
train_df = pd.read_csv('train.csv')
ids = train_df['image_id']

In [242]:
# For each unique image_id: read the DICOM file, min-max rescale its pixel array to
# 8-bit, resize it, and flatten it into a single row. The rows are collected in
# `df_data` so they can be exported and reused for modeling.
#
# NOTE(review): `apply_voi_lut` is imported but not applied here, and the DICOM
# photometric interpretation is not checked — confirm whether some images need a
# VOI LUT or inversion before rescaling.

IMG_SIZE = (256, 256)  # (width, height) handed to PIL's resize

ids = train_df.image_id.unique()

df_data = []

for id_ in ids:
    dicom_path = train_img + id_ + '.dicom'
    dicom = pydicom.dcmread(dicom_path)

    data = dicom.pixel_array
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; dividing would produce NaNs
    # that turn into garbage when cast to uint8, so leave it as all zeros instead.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    im = Image.fromarray(data)

    new_im = im.resize(IMG_SIZE)
    npdata = np.asarray(new_im)

    # One row per image; the row length follows IMG_SIZE rather than a literal.
    npdata = npdata.flatten().reshape(1, IMG_SIZE[0] * IMG_SIZE[1])
    df_data.append(npdata)


In [39]:
# Write every flattened image into a single numpy zip archive (.npz).
archive_arrays = tuple(df_data)
np.savez('arrays.npz', *archive_arrays)

In [41]:
# Re-open the saved archive.
files = np.load(file='arrays.npz')

In [42]:
# Pull each stored array out of the archive, in archive order, into a plain list.
file = [files[name] for name in files.files]

In [241]:
# Verify the round trip without dumping every array: show how many arrays came
# back and a small sample of them.

print(len(file))
file[:3]

In [15]:
# Peek at the first five annotation rows.
train_df.head(n=5)

In [16]:
# Distinct radiologist ids. Author's note: there are 17 different radiologists,
# split into groups of 3, who labeled each image.
train_df['rad_id'].unique()

In [17]:
# Print the column summary of train_df.
# All null values are accounted for: any row whose class_name equals 'No finding' does not have any bounding box
# coordinates. Keep these rows in the DataFrame, as there is ambiguity in the labeling.

train_df.info()

In [5]:
# Swap every NaN in the frame for 0 (the NaNs are expected to sit in the bounding
# box columns only — this call itself does not restrict which columns are filled).
train_df = train_df.fillna(value=0)

In [18]:
# Confirm the fill by looking at the first five rows again.
train_df.head(n=5)

In [9]:
# Every image_id in the annotation table, and the distinct ones. Author's note:
# there are over 67k image_ids but only 15000 unique ids.
ids = train_df['image_id']
unique_ids = ids.unique()

In [19]:
# Total annotations vs. distinct images: a gap between the two numbers means
# some images carry multiple labels.
for id_collection in (ids, unique_ids):
    print(len(id_collection))

In [20]:
# Annotation count per class label. Author's note: the overwhelming majority of
# images have at least 1 'No finding' measurement.
train_df['class_name'].value_counts()

In [21]:
# Horizontal bar chart of how often each class label appears, smallest count first.
label_counts = train_df['class_name'].value_counts(ascending=True)

plt.barh(label_counts.index, label_counts)
plt.title('Gross label count')
plt.xlabel('Observations')
plt.ylabel('Labeled Condition')
plt.show()

In [12]:
# For every unique image_id, collect the value counts of its class labels.
# After this cell:
#   lab_id[i] -> an image_id
#   labels[i] -> a one-element list holding that image's counts Series
#                (index = class name, values = number of annotations, most frequent first)

labels = []
lab_id = []
# A single groupby pass replaces re-filtering the whole frame once per id.
# sort=False yields the ids in first-appearance order, the same order .unique() gives.
for id_, df in train_df.groupby('image_id', sort=False):
    label = df['class_name'].value_counts()

    labels.append([label])
    lab_id.append(id_)

In [13]:
# Pair every image id with its label counts in a single list to iterate through.
# A real list (not a bare zip iterator) is used so the search cell can be re-run
# with a different label; a zip object would be exhausted after the first pass.

file = list(zip(lab_id, labels))
        

In [248]:
# Iterate through the (image_id, [label counts]) pairs and print the image ids whose
# most frequent class label is `target_label` with more than `min_count` annotations.
# Change `target_label` / `min_count` to inspect another class.

target_label = 'Pulmonary fibrosis'
min_count = 5

for image_id, counts_list in file:
    for counts in counts_list:
        # value_counts() orders by descending count, so position 0 is the dominant label.
        # Positional access goes through .index[0] / .iloc[0]: plain counts[0] on a
        # string-indexed Series depends on a deprecated integer-as-position fallback.
        if counts.index[0] == target_label and counts.iloc[0] > min_count:
            print(image_id, counts)

In [17]:
# Distinct class labels, in order of first appearance, as a plain list.
lab_list = list(train_df['class_name'].unique())

In [22]:
# display the collected class labels
lab_list

In [14]:
# For each class, the image id with the highest concentration of that label.
# Keys are used as plot titles; values are image ids.
# 'No finding' is left out: such images are limited to 3 labels (a single label from
# each of the three radiologists), so all 'No finding' images had identical class labels.

high_ims = {
    'Cardiomegaly': 'd61eb45d47ad48020286203b1f1362f8',
    'Aortic enlargement': 'e82620b01bbc77792885029d3cd0d8ae',
    'Pleural thickening': 'e31be972e181987a8600a8700c1ebe88',
    'ILD': 'd3823d24855b6ef03c188e962948b4b9',
    'Nodule/Mass': '03e6ecfa6f6fb33dfeac6ca4f9b459c9',
    'PF': 'e62c07fde352cc658af3f989fe0b546f',
    'Lung Opacity': '4068af795c7cb80fec0883dab82f4fbf',
    'Atelectasis': '1dafb16f8c69e188cf2152200e0cb2ef',
    'Other lesion': '53b1a490cd7e3a30e94014bdfd314d14',
    'Infiltration': '1aaa4b217affae30113bd3a7a384a4c7',
    'Pleural effusion': '04bb8bd7ee6f88a16623fe5c6dd4da91',
    'Calcification': 'dfd523a5991fc852654bf1235c6282c6',
    'Consolidation': '4b91d54f3170a9c8a757e6acd6c25588',
    'Pneumothorax': 'f51434ef988e30a05f8b0986814d9485',
}

In [249]:
# Massive ambiguity is present within the dataset. Even though each image is annotated by only 3 radiologists,
# there does not seem to be any consistency as to how many times each radiologist can label each image.
# As seen below, one radiologist annotated the same image 14 times.

# To resolve this discrepancy in image labeling, I'm going to take the highest value count of each image's labels and
# label that image accordingly. If that proves not to be effective, taking the top 2 classes may be the next step.

# NOTE(review): `temp` is not assigned in any cell visible here; presumably it is a per-image slice of train_df
# left in the kernel from an earlier cell — TODO define it explicitly so the notebook runs from a fresh kernel.
temp

In [23]:
# Idea to consider for modeling: collapse duplicate boxes to limit ambiguity.
# Shape of the distinct (x_min, y_min) corner pairs found in `temp`.
corner_cols = ['x_min', 'y_min']
unique_corners = temp[corner_cols].sort_values(by='x_min').drop_duplicates()
unique_corners.shape

In [24]:
# For each image in `high_ims` (the image with the highest frequency of one label),
# rebuild the 8-bit image from its DICOM pixel array and draw every bounding box
# recorded for that image on top of it.

### Important step because it visualizes the common 'look' of one condition versus
### another. Some conditions, like 'Nodule/Mass', rarely have only one to five boxes
### labeled, but upwards of 20+.

ims = []  # NOTE(review): never filled in this cell; kept so the name stays defined
for key, id_ in high_ims.items():

    dicom_path = train_img + id_ + '.dicom'
    dicom = pydicom.dcmread(dicom_path)

    data = dicom.pixel_array
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division so the
    # uint8 cast below is not fed NaNs.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    im = Image.fromarray(data)

    # Each row is read as (x_min, y_min, x_max, y_max).
    # assumes columns 4 onward are exactly those four, in that order — TODO confirm
    df = train_df[train_df.image_id == id_]
    box_values = df.iloc[:, 4:]
    bbox = [
        patches.Rectangle(
            (v[0], v[1]),
            width=v[2] - v[0],
            height=v[3] - v[1],
            edgecolor='white',
            fill=False,
        )
        for v in box_values.values
    ]

    # figsize goes to subplots(): a separate plt.figure(figsize=...) call would open
    # a second, empty figure and leave the one holding the image at default size.
    fig, ax = plt.subplots(1, figsize=(20, 20))
    for b in bbox:
        ax.add_patch(b)
    ax.set_title(str(key))
    ax.set_xticks([])
    ax.set_yticks([])
    ax.imshow(im)