# Notebook Objective - Data Preparation
In this notebook we extract the lung area (region of interest, "roi") from each image by applying its respective mask to it. The masked images, together with the unmasked ("raw") images, are then saved in the data/preprocessed folder.

In [1]:
import os
import cv2
import numpy as np

def rewrite_roi_and_raw_images(source_dir, target_dir, target_shape=None, overwrite=False):
    '''
    Apply each mask to its image and save both the masked ("roi") and the unmasked ("raw") image.

    Every class folder directly inside source_dir must contain an 'images' and a 'masks' subfolder.
    An image and its mask are paired by identical file name. Both are read as grayscale, the image
    is resized to the size of the mask and the mask is applied with a bitwise AND.
    The masked image is written to <target_dir>/roi/<class folder>/<file name>, the unmasked one to
    <target_dir>/raw/<class folder>/<file name>.

    source_dir:   directory whose immediate subdirectories are the class folders
    target_dir:   output directory, the roi/raw subfolders are created if they do not exist
    target_shape: optional output size, handed to cv2.resize as dsize, i.e. (width, height).
                  If None, both outputs keep the size of the mask.
    overwrite:    if False, an output file that already exists is left untouched

    Files without a counterpart of the same name and files that OpenCV cannot read are
    reported with print and skipped.
    '''
    class_folders = next(os.walk(source_dir))[1]

    for folder in class_folders:

        img_dir = os.path.join(source_dir, folder, 'images')
        msk_dir = os.path.join(source_dir, folder, 'masks')

        # os.listdir returns the names in arbitrary order, so image and mask are paired
        # by name instead of by position. Sorting only keeps the processing order stable.
        img_files = sorted(os.listdir(img_dir))
        msk_files = set(os.listdir(msk_dir))

        roi_dir = os.path.join(target_dir, 'roi', folder)
        raw_dir = os.path.join(target_dir, 'raw', folder)
        os.makedirs(roi_dir, exist_ok=True)
        os.makedirs(raw_dir, exist_ok=True)

        for msk_file in sorted(msk_files.difference(img_files)):
            print(msk_file, ' has no matching image')

        for img_file in img_files:

            if img_file not in msk_files:
                print(img_file, ' has no matching mask')
                continue

            roi_path = os.path.join(roi_dir, img_file)
            raw_path = os.path.join(raw_dir, img_file)

            write_roi = overwrite or not os.path.isfile(roi_path)
            write_raw = overwrite or not os.path.isfile(raw_path)
            if not (write_roi or write_raw):
                # Skip this file if the roi and raw version already exist
                continue

            msk = cv2.imread(os.path.join(msk_dir, img_file), cv2.IMREAD_GRAYSCALE)
            img = cv2.imread(os.path.join(img_dir, img_file), cv2.IMREAD_GRAYSCALE)
            if msk is None or img is None:
                # cv2.imread does not raise on unreadable files, it returns None
                print(img_file, ' could not be read as an image')
                continue

            # numpy shapes are (rows, cols) but cv2.resize expects dsize as (width, height)
            msk_height, msk_width = msk.shape
            img = cv2.resize(img, (msk_width, msk_height))
            roi = cv2.bitwise_and(img, msk)

            if target_shape is not None:
                # Both outputs have to end up with the requested size, not only the raw image
                dsize = tuple(target_shape)
                img = cv2.resize(img, dsize)
                roi = cv2.resize(roi, dsize)

            if write_roi:
                cv2.imwrite(roi_path, roi)
            if write_raw:
                cv2.imwrite(raw_path, img)

In [73]:
# Run the preprocessing for the whole dataset (this takes some time).
# All paths are resolved relative to the parent of the current working directory.

base_dir = os.path.abspath('..')
source_dir = os.path.join(base_dir, 'data', 'raw')
target_dir = os.path.join(base_dir, 'data', 'preprocessed')

# Existing output files are overwritten, so every image is processed again.
rewrite_roi_and_raw_images(
    source_dir,
    target_dir,
    target_shape=(256, 256),
    overwrite=True,
)