# Loading and converting DICOM images into NumPy arrays and saving them to .npy files

In [None]:
import os
import glob

import pandas as pd
import numpy as np
from pathlib import Path

from tqdm.notebook import tqdm
import pydicom # Handle MRI images

import cv2  # OpenCV - https://docs.opencv.org/master/d6/d00/tutorial_py_root.html

### The data contains a `train_labels.csv` file that lists each patient ID and its class, like this:

```
BraTS21ID   MGMT_value
00000          1
00002          1
00003          0
00005          1
00006          1
```

In [None]:
# Root folder of the competition dataset
data_dir = Path('../input/rsna-miccai-brain-tumor-radiogenomic-classification/')

# The MRI sequence types accepted by the loading helpers
mri_types = ["FLAIR", "T1w", "T2w", "T1wCE"]
# Patient IDs flagged as bad images; they are dropped from the training labels below
excluded_images = [109, 123, 709]

train_df = pd.read_csv(data_dir.joinpath("train_labels.csv"))
# test_df and sample_submission are two independent reads of the same CSV file
test_df = pd.read_csv(data_dir.joinpath("sample_submission.csv"))
sample_submission = pd.read_csv(data_dir.joinpath("sample_submission.csv"))

# Keep only the training rows whose patient ID is not in the exclusion list
train_df = train_df.loc[~train_df["BraTS21ID"].isin(excluded_images)]


### Note - .dcm files contain a lot of metadata along with the image. The image can be accessed using `.pixel_array`.

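Folder structure (file names are abbreviated here; the loading code below expects names of the form `Image-1.dcm`, `Image-2.dcm`, ...)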
```

rsna-miccai-brain-tumor-radiogenomic-classification
test
    00001
        FLAIR
            1.dcm
            2.dcm
            3.dcm
            ...
        T1w
            1.dcm
            2.dcm
            3.dcm
            ...
        T1wCE
            1.dcm
            2.dcm
            3.dcm
            ...
        T2w
            1.dcm
            2.dcm
            3.dcm
            ...
        ...
train 
    00000
        FLAIR
            1.dcm
            2.dcm
            3.dcm
            ...
        T1w
            1.dcm
            2.dcm
            3.dcm
            ...
        T1wCE
            1.dcm
            2.dcm
            3.dcm
            ...
        T2w
            1.dcm
            2.dcm
            3.dcm
            ...
        ...       
```

### Note

Here each folder such as 00000 or 00001 is a patient ID, and each patient's MRI is acquired with 4 types of techniques:
- FLAIR
- T1w
- T1wCE
- T2w

Each technique produces a 3-D image, so all the files (1.dcm, 2.dcm, etc.) of a folder, when combined together, form one 3-D image of the brain.

In [None]:
# Read the DICOM files using the pydicom library

def load_dicom(path, size = 224):
    '''
    Read one DICOM file and return its image as a resized 8-bit array.

    The pixel data is min-max normalised to the range 0..1, scaled to
    0..255, cast to uint8 and then resized to (size, size) with OpenCV.
    A constant image (maximum equal to minimum) becomes all zeros.

    Not super sure if this kind of scaling is appropriate, but everyone seems to do it.

    Parameters:
        path: location of the .dcm file to read.
        size: width and height, in pixels, of the returned image.

    Returns:
        The result of cv2.resize on the uint8 image.
    '''
    # dcmread is the supported reader; read_file was a deprecated alias of it
    # and was removed in pydicom 3.0.
    dicom = pydicom.dcmread(path)

    # The file carries metadata as well; only the pixel array is needed here.
    # Work in float64 so that subtracting the minimum cannot overflow when the
    # stored dtype is a narrow signed integer.
    data = dicom.pixel_array.astype(np.float64)

    # Shift so the smallest value is 0, then divide by the largest value to
    # land in 0..1. Skip the division for a constant image to avoid 0/0.
    data = data - np.min(data)
    peak = np.max(data)
    if peak != 0:
        data = data / peak

    # Scale 0..1 up to 0..255 and truncate to an 8-bit image
    data = (data * 255).astype(np.uint8)

    # Resize to the requested square shape
    return cv2.resize(data, (size, size))

In [None]:
def get_all_image_paths(brats21id, image_type, folder='train'):
    '''
    Collect the file paths of one MRI type for one patient, in slice order.

    The patient folder is the ID zero-padded to five digits. Files are ordered
    by the integer that follows the last "-" in the path (after dropping the
    4-character extension), e.g.
    '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/Image-1.dcm',
    '../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/00000/FLAIR/Image-2.dcm',
    and so on.

    Returns a numpy array of path strings.
    '''
    assert image_type in mri_types

    # <dataset>/<folder>/<zero-padded patient id>
    split_root = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/%s/" % folder
    patient_dir = os.path.join(split_root, str(brats21id).zfill(5))

    def slice_number(dcm_path):
        # "…/Image-12.dcm" -> "…/Image-12" -> "12" -> 12
        without_ext = dcm_path[:-4]
        return int(without_ext.split("-")[-1])

    # Every entry inside the image-type folder, sorted numerically
    found = glob.glob(os.path.join(patient_dir, image_type, "*"))
    found.sort(key=slice_number)

    # Returned as an array so callers can loop over it and load each file
    return np.array(found)

In [None]:
def get_all_images(brats21id, image_type, folder='train', size=225):
    '''
    Load every image of one MRI type for one patient.

    Paths come from get_all_image_paths and each one is read with load_dicom
    at the requested size. Returns a list of images in path order.
    '''
    # NOTE(review): the default size here is 225 while load_dicom defaults to
    # 224 — confirm whether the difference is intentional.
    slices = []
    for dcm_path in get_all_image_paths(brats21id, image_type, folder):
        slices.append(load_dicom(dcm_path, size))
    return slices

In [None]:
# Create the Train and Test data 
def get_all_data_for_train(image_type, image_size=32):
    '''
    Load the images, labels and patient IDs for every row of train_df.

    Parameters:
        image_type: MRI type to load (passed through to get_all_images).
        image_size: width and height each image is resized to.

    Returns:
        A tuple (X, y, train_ids) of numpy arrays. X holds one entry per
        patient. When every patient has the same number of images it is a
        regular stacked array; otherwise it is a 1-D object array whose
        elements are the per-patient lists of images. y holds the MGMT_value
        labels and train_ids the integer BraTS21ID values, in the same order.
    '''
    X = []
    y = []
    train_ids = []

    # Walk the dataframe by index label so each row gives both ID and label
    for i in tqdm(train_df.index):
        row = train_df.loc[i]
        patient_id = int(row['BraTS21ID'])

        # All images of this patient for the requested MRI type
        X.append(get_all_images(patient_id, image_type, 'train', image_size))
        y.append(row['MGMT_value'])
        train_ids.append(patient_id)

    # Patients can have different numbers of images. NumPy >= 1.24 refuses to
    # build an array from such a ragged list, so fall back to an explicit
    # object array with one list per patient. Reading a saved object array
    # back requires np.load(..., allow_pickle=True).
    try:
        X_arr = np.array(X)
    except ValueError:
        X_arr = np.empty(len(X), dtype=object)
        for k, images in enumerate(X):
            X_arr[k] = images

    return X_arr, np.array(y), np.array(train_ids)

def get_all_data_for_test(image_type, image_size=32):
    '''
    Load the images and patient IDs for every row of test_df.

    Parameters:
        image_type: MRI type to load (passed through to get_all_images).
        image_size: width and height each image is resized to.

    Returns:
        A tuple (X, test_ids) of numpy arrays. X holds one entry per patient.
        When every patient has the same number of images it is a regular
        stacked array; otherwise it is a 1-D object array whose elements are
        the per-patient lists of images. test_ids holds the integer BraTS21ID
        values in the same order.
    '''
    X = []
    test_ids = []

    # Walk the test dataframe by index label to read each patient ID
    for i in tqdm(test_df.index):
        row = test_df.loc[i]
        patient_id = int(row['BraTS21ID'])

        # All images of this patient for the requested MRI type
        X.append(get_all_images(patient_id, image_type, 'test', image_size))
        # Remember which patient these images belong to
        test_ids.append(patient_id)

    # Patients can have different numbers of images. NumPy >= 1.24 refuses to
    # build an array from such a ragged list, so fall back to an explicit
    # object array with one list per patient. Reading a saved object array
    # back requires np.load(..., allow_pickle=True).
    try:
        X_arr = np.array(X)
    except ValueError:
        X_arr = np.empty(len(X), dtype=object)
        for k, images in enumerate(X):
            X_arr[k] = images

    return X_arr, np.array(test_ids)

In [None]:
# Build the FLAIR arrays for the train and test splits, with every image resized to 224x224
X, y, trainidt = get_all_data_for_train(image_type='FLAIR', image_size=224)
X_test, testidt = get_all_data_for_test(image_type='FLAIR', image_size=224)

In [None]:
# Persist the FLAIR training images, labels and IDs as .npy files so they can be reused directly for training
with Path('224_FLAIR_train_X.npy').open('wb') as f:
    np.save(f, X)
with Path('224_FLAIR_train_y.npy').open('wb') as f:
    np.save(f, y)
with Path('224_FLAIR_train_id.npy').open('wb') as f:
    np.save(f, trainidt)

In [None]:
# Persist the FLAIR test images and their patient IDs as .npy files
with Path('224_FLAIR_test_X.npy').open('wb') as f:
    np.save(f, X_test)
with Path('224_FLAIR_test_id.npy').open('wb') as f:
    np.save(f, testidt)

In [None]:
# Build the T1wCE arrays for the train and test splits, with every image resized to 224x224
X, y, trainidt = get_all_data_for_train(image_type='T1wCE', image_size=224)
X_test, testidt = get_all_data_for_test(image_type='T1wCE', image_size=224)

In [None]:
# Persist the T1wCE training images, labels and IDs as .npy files
with Path('224_T1wCE_train_X.npy').open('wb') as f:
    np.save(f, X)
with Path('224_T1wCE_train_y.npy').open('wb') as f:
    np.save(f, y)
with Path('224_T1wCE_train_id.npy').open('wb') as f:
    np.save(f, trainidt)

In [None]:
# Persist the T1wCE test images and their patient IDs as .npy files
with Path('224_T1wCE_test_X.npy').open('wb') as f:
    np.save(f, X_test)
with Path('224_T1wCE_test_id.npy').open('wb') as f:
    np.save(f, testidt)