# In [15]:
import pickle 
import numpy as np
import pandas as pd
import os
import multiprocessing as mp
from multiprocessing import Pool
from datetime import datetime,timedelta
from tqdm import tqdm
import pydicom
from functools import partial
import cv2
import import_ipynb
import config
import utils
def extract_metadata_from_images(file_path):
    '''
        Extracts metadata present in DICOM file
        Args: 
            file_path :  DICOM file path (str or os.PathLike)
        Returns:
                a dictionary with the keys ImageID, patientID, age, sex,
                view_position, pixel_spacing, modality, body_part_examined,
                pixel_mean, pixel_min and pixel_max
        Note:
                Every tag is read by plain attribute access with no fallback,
                so a file lacking one of them is not handled here.
    '''
    dataset = pydicom.dcmread(file_path)
    # Fetch the pixel data once and reuse it for all three statistics
    pixels = dataset.pixel_array
    # os.path.basename copes with the platform separator and with PathLike
    # inputs, unlike a manual split on '/'
    file_name = os.path.basename(file_path)
    # Key order is kept as-is: it becomes the column order of the DataFrame
    # built from these dictionaries
    d = {}
    d['ImageID'] = os.path.splitext(file_name)[0] # Excluding the format (.dcm)
    d['patientID'] = dataset.PatientID
    d['age'] = dataset.PatientAge
    d['sex'] = dataset.PatientSex
    d['view_position'] = dataset.ViewPosition
    d['pixel_spacing'] = dataset.PixelSpacing
    d["modality"] = dataset.Modality
    d["body_part_examined"] = dataset.BodyPartExamined
    d['pixel_mean'] = np.mean(pixels)
    d['pixel_min'] = np.min(pixels)
    d['pixel_max'] = np.max(pixels)
    return d
def generate_meta_data(label_path,train_files):
    '''
        Creates DataFrame containing metadata present in DICOM files 
        Args: 
            label_path :  Path of target class label file (CSV with two columns,
                          renamed here to ImageId and EncodedPixels)
            train_files :  list of train DICOM file_paths
        Returns:
                DataFrame containing important metadata for each train file,
                merged with its label row, plus a 'class_' column
        Side effects:
                Writes the raw (un-merged) metadata to 'df_meta.pkl' in the
                current working directory and prints timing / row statistics.
    '''
    class_df = pd.read_csv(label_path)
    class_df.columns = ['ImageId', 'EncodedPixels']
    start = datetime.now()
    # The context manager releases the worker processes even when a worker
    # raises; map() blocks, so all results are available before the pool exits
    with Pool(mp.cpu_count()) as pool:
        results = pool.map(extract_metadata_from_images,train_files)
    print("Total time taken {0}".format(datetime.now() - start))
    df_meta = pd.DataFrame(results)
    # Saving it for later use (no need to read it straight back: the frame
    # in memory is the one that was just written)
    df_meta.to_pickle('df_meta.pkl')

    # Merging the class label file with extracted metadata
    df_meta = df_meta.merge(class_df , left_on = 'ImageID',right_on='ImageId' ,how='left')
    nan_rows = df_meta[df_meta.ImageId.isna()]
    print('There are {0} rows that does not have Class label'.format(nan_rows.shape[0]))
    # Dropping rows for which no label row was found
    df_meta.dropna(subset = ['ImageId'],inplace= True)
    # Keep only the first label row per image
    # NOTE(review): if the label file holds several rows per image, only the
    # first one survives this step - confirm that is intended
    df_meta.drop_duplicates(subset = ['ImageId'],inplace=True)
    # Adding a column to differentiate rows with and without Pneumothorax
    df_meta['class_'] = 'Pneumothorax'
    # Compare on the stripped text so that a padded (" -1") or numeric
    # sentinel is recognised as well as the exact string "-1"
    no_mask = df_meta['EncodedPixels'].astype(str).str.strip() == "-1"
    df_meta.loc[no_mask,'class_'] = 'NotPneumothorax'
    # Casting ages as int (fails if an age value is not a plain integer)
    df_meta['age'] = df_meta['age'].astype(int)
    return df_meta

def convert_to_png(file,folder):
    '''
        Extracts pixel info from a DICOM file and writes it as a png image
        named after the DICOM file (extension replaced by .png)
        Args: 
            file :  DICOM file path
            folder : Target folder to generate images (created if missing)
        Returns:
                None
        Raises:
                IOError if the image could not be written
    '''
    dataset = pydicom.dcmread(file)
    img = dataset.pixel_array
    # basename handles the platform separator, unlike a manual split on '/'
    f_name = os.path.splitext(os.path.basename(file))[0] # Excluding the format (.dcm)
    # exist_ok makes this safe when several workers reach it at the same time
    os.makedirs(folder, exist_ok=True)
    out_path = os.path.join(folder, f_name + ".png")
    # cv2.imwrite signals failure through its return value instead of raising,
    # so an unchecked call would lose images silently
    if not cv2.imwrite(out_path, img):
        raise IOError("Could not write image {0}".format(out_path))
    
def preprocess_files(files,folder):
    '''
        Converts the given list of DICOM files to png images in parallel,
        using one worker process per CPU core
        Args: 
            files :  list of DICOM file paths
            folder : Target folder to generate images (created if missing)
        Returns:
                None
    '''
    start = datetime.now()
    # Create the target folder once, before any worker tries to write into it
    os.makedirs(folder, exist_ok=True)
    func = partial(convert_to_png,folder = folder) # convert_to_png has only one argument file (folder is fixed)
    # The context manager releases the worker processes even when a worker
    # raises; map() blocks until every file has been processed
    with Pool(mp.cpu_count()) as pool:
        pool.map(func,files)
    print("Total time taken {0}".format(datetime.now() - start))

def preprocess_masks(df,folder):
    '''
        Writes one png mask per row of the given metadata
        Args: 
            df :  DataFrame containing metadata of input data; the columns
                  ImageID and EncodedPixels are read
            folder : Target folder to generate masks (created if missing)
        Returns:
                None
        Raises:
                IOError if a mask could not be written
    '''
    os.makedirs(folder, exist_ok=True)
    # itertuples() is a generator, so tqdm needs the row count to show progress
    for row in tqdm(df.itertuples(), total=len(df)):
        image_id = str(row.ImageID)
        mask_encoded = row.EncodedPixels
        # "-1" marks an image without mask; compare on the stripped text so a
        # padded or numeric sentinel is not handed to the RLE decoder
        if str(mask_encoded).strip() != "-1":
            mask_decoded = utils.rle2mask(mask_encoded, 1024, 1024).T
        else:
            # All-zero mask; 8-bit is enough for a png
            # NOTE(review): the float array used before is presumably
            # converted to 8-bit by OpenCV on write anyway - confirm
            mask_decoded = np.zeros((1024, 1024), dtype=np.uint8)
        out_path = os.path.join(folder, image_id + ".png")
        # cv2.imwrite signals failure through its return value instead of raising
        if not cv2.imwrite(out_path, mask_decoded):
            raise IOError("Could not write mask {0}".format(out_path))

def pre_process(train_files,test_files,label_path = '../siim/train-rle.csv'): 
    '''
        Runs the whole preprocessing: extracts metadata, converts train and
        test DICOM files to png, writes the masks and stores the final
        metadata table
        Args: 
            train_files :  list of train DICOM file paths
            test_files :  list of test DICOM file paths
            label_path :  Path of target class label file
        Returns:
                None
        Side effects:
                Fills ./train_png, ./test_png and ./mask_png and writes
                'df_meta_final.pkl' in the current working directory
    '''
    # The image writer does not create missing folders, so make sure they
    # exist before anything is written
    for out_dir in ('./train_png', './test_png', './mask_png'):
        os.makedirs(out_dir, exist_ok=True)
    df_meta = generate_meta_data(label_path,train_files)
    preprocess_files(train_files,'./train_png')
    preprocess_files(test_files,'./test_png')
    preprocess_masks(df_meta,'./mask_png')
    train_files_png  = sorted(utils.get_list_of_files("./train_png/"))
    df_temp = pd.DataFrame(train_files_png, columns =['ImagePath'])
    # Image id = file name without extension; basename handles the platform separator
    df_temp['ImageID'] = df_temp['ImagePath'].apply(lambda x : os.path.splitext(os.path.basename(str(x)))[0])
    # Inner merge keeps only the images that have both metadata and a png
    df_meta = df_meta.merge(df_temp,on='ImageID',how='inner')
    # The mask lives under the same file name in the mask folder; the folder
    # name is replaced as literal text, not as a pattern
    df_meta['MaskPath'] = df_meta['ImagePath'].str.replace('train_png','mask_png',regex=False)
    df_meta.to_pickle('df_meta_final.pkl')
    