# RSNA Pneumonia Detection Challenge

for more detail, please visit:
https://www.kaggle.com/c/rsna-pneumonia-detection-challenge

In [1]:
import os 
import sys

import random
import math
import numpy as np
import cv2
import matplotlib.pyplot as plt
import json
import pydicom as dcm
from imgaug import augmenters as iaa
from tqdm import tqdm
import pandas as pd 
import glob

In [2]:
# from mrcnn.config import Config
from mrcnn import utils
# import mrcnn.model as modellib
# from mrcnn import visualize
# from mrcnn.model import log

  from ._conv import register_converters as _register_converters


In [3]:
# a=os.getcwd()
# a

# Root directory of the project# Root  
ROOT_DIR = os.path.abspath('C:\\Users\\su_hang\\Kaggle')
# Directory to save logs and trained model
# MODEL_DIR = os.path.join(ROOT_DIR, 'logs')

In [4]:
train_dicom_dir  = os.path.join(ROOT_DIR, 'data\\stage_1_train_images')
test_dicom_dir = os.path.join(ROOT_DIR, 'data\\stage_1_test_images')

## Some setup functions and classes for Mask-RCNN
* dicom_fps is a list of the dicom image path and filenames
* image_annotions is a dictionary of the annotations keyed by the filenames
* parsing the dataset returns a list of the image filenames and the annotations dictionary

In [5]:
def get_dicom_fps(dicom_dir):
    dicom_fps = glob.glob(dicom_dir+'/'+'*.dcm')
    return list(set(dicom_fps))

def parse_dataset(dicom_dir, anns): 
    image_fps = get_dicom_fps(dicom_dir)
    image_annotations = {fp: [] for fp in image_fps}
    for index, row in anns.iterrows(): 
        fp = os.path.join(dicom_dir, row['patientId']+'.dcm')
        image_annotations[fp].append(row)
    return image_fps, image_annotations

In [6]:
class DetectorDataset(utils.Dataset):
    """Dataset class for training pneumonia detection on the RSNA pneumonia dataset.
    """

    def __init__(self, image_fps, image_annotations, orig_height, orig_width):
        super().__init__(self)
        
        # Add classes
        self.add_class('pneumonia', 1, 'Lung Opacity')
   
        # add images 
        for i, fp in enumerate(image_fps):
            annotations = image_annotations[fp]
            self.add_image('pneumonia', image_id=i, path=fp, 
                           annotations=annotations, orig_height=orig_height, orig_width=orig_width)
            
    def image_reference(self, image_id):
        info = self.image_info[image_id]
        return info['path']

    def load_image(self, image_id):
        info = self.image_info[image_id]
        fp = info['path']
        ds = pydicom.read_file(fp)
        image = ds.pixel_array
        # If grayscale. Convert to RGB for consistency.
        if len(image.shape) != 3 or image.shape[2] != 3:
            image = np.stack((image,) * 3, -1)
        return image

    def load_mask(self, image_id):
        info = self.image_info[image_id]
        annotations = info['annotations']
        count = len(annotations)
        if count == 0:
            mask = np.zeros((info['orig_height'], info['orig_width'], 1), dtype=np.uint8)
            class_ids = np.zeros((1,), dtype=np.int32)
        else:
            mask = np.zeros((info['orig_height'], info['orig_width'], count), dtype=np.uint8)
            class_ids = np.zeros((count,), dtype=np.int32)
            for i, a in enumerate(annotations):
                if a['Target'] == 1:
                    x = int(a['x'])
                    y = int(a['y'])
                    w = int(a['width'])
                    h = int(a['height'])
                    mask_instance = mask[:, :, i].copy()
                    cv2.rectangle(mask_instance, (x, y), (x+w, y+h), 255, -1)
                    mask[:, :, i] = mask_instance
                    class_ids[i] = 1
        return mask.astype(np.bool), class_ids.astype(np.int32)

## Examine the annotation data, parse the dataset, and view dicom fields

In [7]:
# Parsing Tabular Data
labels = pd.read_csv('data/stage_1_train_labels.csv')
details = pd.read_csv('data/stage_1_detailed_class_info.csv')
# duplicates in details just have the same class so can be safely dropped
details = details.drop_duplicates('patientId').reset_index(drop=True)
labels_w_class = labels.merge(details, how='inner', on='patientId')

# this is the one we need!!*****************************************
labels_w_class_new=labels_w_class[['patientId','Target','class']]

In [8]:
# Parsing Metadata from DICOM Object:

# get lists of all train/test dicom filepaths
train_dcm_fps = glob.glob('data/stage_1_train_images/*.dcm')
test_dcm_fps = glob.glob('data/stage_1_test_images/*.dcm')

# read each file into a list (using stop_before_pixels to avoid reading the image for speed and memory savings)
train_dcms = [dcm.read_file(x, stop_before_pixels=True) for x in train_dcm_fps]
test_dcms = [dcm.read_file(x, stop_before_pixels=True) for x in test_dcm_fps]

In [9]:
def parse_dcm_metadata(dcm):
    unpacked_data = {}
    group_elem_to_keywords = {}
    # iterating here to force conversion from lazy RawDataElement to DataElement
    for d in dcm:
        pass
    # keys are pydicom.tag.BaseTag, values are pydicom.dataelem.DataElement
    for tag, elem in dcm.items():
        tag_group = tag.group
        tag_elem = tag.elem
        keyword = elem.keyword
        group_elem_to_keywords[(tag_group, tag_elem)] = keyword
        value = elem.value
        unpacked_data[keyword] = value
    return unpacked_data, group_elem_to_keywords

train_meta_dicts, tag_to_keyword_train = zip(*[parse_dcm_metadata(x) for x in train_dcms])
test_meta_dicts, tag_to_keyword_test = zip(*[parse_dcm_metadata(x) for x in test_dcms])

In [10]:
# join all the dicts
unified_tag_to_key_train = {k:v for dict_ in tag_to_keyword_train for k,v in dict_.items()}
unified_tag_to_key_test = {k:v for dict_ in tag_to_keyword_test for k,v in dict_.items()}

# quick check to make sure there are no different keys between test/train
assert len(set(unified_tag_to_key_test.keys()).symmetric_difference(set(unified_tag_to_key_train.keys()))) == 0

tag_to_key = {**unified_tag_to_key_test, **unified_tag_to_key_train}

In [11]:
# using from_records here since some values in the dicts will be iterables and some are constants
train_df = pd.DataFrame.from_records(data=train_meta_dicts)
test_df = pd.DataFrame.from_records(data=test_meta_dicts)

In [12]:
train_df_new=train_df[['PatientID','PatientAge','PatientSex','ViewPosition']]
train_df_new['PixelSpacing_x'] = train_df['PixelSpacing'].apply(lambda x: x[0])
train_df_new.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PatientID,PatientAge,PatientSex,ViewPosition,PixelSpacing_x
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,51,F,PA,0.143
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,48,F,PA,0.194311
2,00322d4d-1c29-4943-afc9-b6754be640eb,19,M,AP,0.168


In [23]:
# merge clean metadata with our initial tabular data
labels_w_class_new = labels_w_class_new.drop_duplicates('patientId').reset_index(drop=True)
df_train = train_df_new.merge(labels_w_class_new, how='left', left_on='PatientID', right_on='patientId')
df_train['PatientAge'] = df_train['PatientAge'].astype(int)
df_train.head()

Unnamed: 0,PatientID,PatientAge,PatientSex,ViewPosition,PixelSpacing_x,patientId,Target,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,51,F,PA,0.143,0004cfab-14fd-4e49-80ba-63a80b6bddd6,0,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,48,F,PA,0.194311,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,0,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,19,M,AP,0.168,00322d4d-1c29-4943-afc9-b6754be640eb,0,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,28,M,PA,0.143,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,32,F,AP,0.139,00436515-870c-4b36-a041-de91049b9ab4,1,Lung Opacity


In [25]:
df=df_train.drop(['patientId'],axis=1)
df.head()

Unnamed: 0,PatientID,PatientAge,PatientSex,ViewPosition,PixelSpacing_x,Target,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,51,F,PA,0.143,0,No Lung Opacity / Not Normal
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,48,F,PA,0.194311,0,No Lung Opacity / Not Normal
2,00322d4d-1c29-4943-afc9-b6754be640eb,19,M,AP,0.168,0,No Lung Opacity / Not Normal
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,28,M,PA,0.143,0,Normal
4,00436515-870c-4b36-a041-de91049b9ab4,32,F,AP,0.139,1,Lung Opacity


In [35]:
df.to_csv('train data with no pixel.csv',index=False)

In [30]:
test_df_new=test_df[['PatientID','PatientAge','PatientSex','ViewPosition']]
test_df_new.head()

Unnamed: 0,PatientID,PatientAge,PatientSex,ViewPosition
0,000924cf-0f8d-42bd-9158-1af53881a557,19,F,AP
1,000db696-cf54-4385-b10b-6b16fbb3f985,25,F,AP
2,000fe35a-2649-43d4-b027-e67796d412e0,40,M,AP
3,001031d9-f904-4a23-b3e5-2c088acd19c6,57,M,PA
4,0010f549-b242-4e94-87a8-57d79de215fc,56,M,PA


In [34]:
test_df_new.to_csv('test data with no pixel.csv',index=False)