In [None]:
import os
import numpy as np
import pandas as pd
import glob

from tqdm.notebook import tqdm

import PIL.Image as Image

# File-name suffix whitelists consumed by check_extension() / load_file_path().
# Matching is a case-sensitive str.endswith, hence the lower- and upper-case
# spellings. FILE_EXTENSION is the default whitelist; note that it contains no
# '.jpeg', '.nii' or '.npy' entry.
FILE_EXTENSION = ['.png', '.PNG', '.jpg', '.JPG', '.dcm', '.DCM', '.raw', '.RAW', '.svs', '.SVS']
IMG_EXTENSION = ['.png', '.PNG', '.jpg', '.JPG', '.jpeg', '.JPEG']
DCM_EXTENSION = ['.dcm', '.DCM']
RAW_EXTENSION = ['.raw', '.RAW']
NIFTI_EXTENSION = ['.nii']
NP_EXTENSION = ['.npy']

# NOTE(review): no visible cell reads `mask_common_dir`; the later cells use a
# variable called `common_dir` instead -- confirm which root is intended.
mask_common_dir = '/home/ncp/workspace/202002n050/050.신경계 질환 관련 임상 및 진료 데이터'


def check_extension(filename, extension_ls=FILE_EXTENSION):
    """Return True if ``filename`` ends with any suffix in ``extension_ls``.

    The comparison is case-sensitive; an empty ``extension_ls`` yields False.
    """
    for suffix in extension_ls:
        if filename.endswith(suffix):
            return True
    return False


def load_file_path(folder_path, extension_ls=FILE_EXTENSION, all_sub_folders=False):
    """Collect the paths of files under ``folder_path`` that have a listed extension.

    Parameters:
        folder_path (str) -- folder directory
        extension_ls (list) -- list of extensions (matched by ``check_extension``)
        all_sub_folders (bool) -- if True, descend into every sub-folder;
            if False (default), only files directly inside ``folder_path`` are considered

    Return:
        file_paths (list) -- matching paths; folders are visited in sorted order
            and file names are sorted within each folder

    Raises:
        AssertionError -- if ``folder_path`` is not an existing directory
    """
    assert os.path.isdir(folder_path), f'{folder_path} is not a valid directory'

    walker = os.walk(folder_path)
    if all_sub_folders:
        # Sorting by folder path needs the whole tree; only pay for that when
        # the whole tree is actually requested.
        walker = sorted(walker)

    file_paths = []
    for root, _, fnames in walker:
        # os.walk reports file names in arbitrary order; sort for reproducible output.
        file_paths.extend(
            os.path.join(root, fname)
            for fname in sorted(fnames)
            if check_extension(fname, extension_ls)
        )
        if not all_sub_folders:
            # The first entry yielded by os.walk is ``folder_path`` itself.
            break

    return file_paths


def gen_new_dir(new_dir):
    """Create ``new_dir`` (including missing parents) if it does not exist yet.

    Best-effort: an ``OSError`` is reported on stdout and not re-raised.
    Returns None.
    """
    try:
        # exist_ok=True removes the window between a separate existence check
        # and the creation in which another process could create the folder.
        os.makedirs(new_dir, exist_ok=True)
    except OSError:
        print("Error: Failed to create the directory.")

In [None]:
## data preprocessing ##

In [None]:
def find_aihub_img_dir(common_dir, fname, folder='train'):
    """Build the path of the 'init/image' folder of case ``fname``.

    ``folder`` selects the 'train' or 'val' source-data tree below
    ``common_dir``; any other value yields None. The path is only assembled,
    it is not checked against the file system.
    """
    split_roots = (
        ('train', '01.데이터/1.Training/원천데이터'),
        ('val', '01.데이터/2.Validation/원천데이터'),
    )
    for split_name, split_root in split_roots:
        if folder == split_name:
            return os.path.join(common_dir, split_root, fname, 'init/image')
    return None

In [None]:
# Value of the DICOM 'SequenceName' attribute expected for each MR sequence
# label. check_aihub_mr_scans() compares these strings with `==`, so the
# leading '*' is a literal character, not a wildcard.
MR_SEQUENCE = {'T2' : '*ep_b0', 
               'DWI' : '*ep_b1000t', 
               'ADC' : '*ep_b0_1000'}


def find_aihub_dicom_dir(common_dir, fname, folder='train'):
    """Build the path of the 'init' folder of case ``fname``.

    ``folder`` selects the 'train' or 'val' source-data tree below
    ``common_dir``; any other value yields None. The path is only assembled,
    it is not checked against the file system.
    """
    split_roots = (
        ('train', '01.데이터/1.Training/원천데이터'),
        ('val', '01.데이터/2.Validation/원천데이터'),
    )
    for split_name, split_root in split_roots:
        if folder == split_name:
            return os.path.join(common_dir, split_root, fname, 'init')
    return None


def get_aihub_case_name(common_dir, folder='train'):
    """List the case (sub-directory) names of one source-data split.

    Parameters:
        common_dir (str) -- dataset root directory
        folder (str) -- 'train' or 'val'

    Return:
        list of str -- names of the directories directly inside the split's
            source-data folder, in sorted order (plain files are ignored)

    Raises:
        ValueError -- if ``folder`` is neither 'train' nor 'val'
    """
    if folder == 'train':
        data_dir = os.path.join(common_dir, '01.데이터/1.Training/원천데이터')
    elif folder == 'val':
        data_dir = os.path.join(common_dir, '01.데이터/2.Validation/원천데이터')
    else:
        # Without this branch an unknown split failed later with an
        # UnboundLocalError on `data_dir`.
        raise ValueError(f"folder must be 'train' or 'val', got {folder!r}")

    # os.listdir order is arbitrary; sort so repeated runs see the same order.
    return sorted(p for p in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, p)))


def check_aihub_mr_scans(patient_folder_path, norm=True, sequence='DWI'):
    """Return the DICOM files of one MR sequence found directly inside a case folder.

    Parameters:
        patient_folder_path (str) -- folder holding the case's .dcm files
        norm (bool) -- unused; kept so existing calls stay valid
        sequence (str) -- key of ``MR_SEQUENCE`` ('T2', 'DWI' or 'ADC')

    Return:
        list of str -- sorted paths of the files whose DICOM 'SequenceName'
            equals ``MR_SEQUENCE[sequence]``

    Raises:
        KeyError -- if ``sequence`` is not a key of ``MR_SEQUENCE``
        AssertionError -- from ``load_file_path`` if the folder does not exist
    """
    # NOTE(review): `pydicom` is not imported by any visible cell -- confirm it
    # is imported before this function runs on a fresh kernel.
    target_name = MR_SEQUENCE[sequence]

    sequence_dcm_paths = []
    for dcm_path in sorted(load_file_path(patient_folder_path, DCM_EXTENSION)):
        # Only one header attribute is needed, so stop before the pixel data
        # and do not keep the dataset around after the comparison.
        header = pydicom.dcmread(dcm_path, stop_before_pixels=True, force=True)
        if target_name == header.get('SequenceName'):
            sequence_dcm_paths.append(dcm_path)

    return sequence_dcm_paths

In [None]:
# pre_good_mrs, age, ini_nih, END, hx_str
# good outcome -> bad outcome

In [None]:
# Merged per-case table; the following cells read its 'name' column and the
# outcome / severity columns.
aihub_df = pd.read_csv('/home/ncp/workspace/AIHUB_dataset/df_csv_merged_v2.1.csv')

In [None]:
def find_aihub_img_mask_fname(common_dir, folder='train'):
    """List the case (sub-directory) names of one source-data split.

    Same listing logic as ``get_aihub_case_name``.

    Parameters:
        common_dir (str) -- dataset root directory
        folder (str) -- 'train' or 'val'

    Return:
        list of str -- names of the directories directly inside the split's
            source-data folder, in sorted order (plain files are ignored)

    Raises:
        ValueError -- if ``folder`` is neither 'train' nor 'val'
    """
    if folder == 'train':
        data_dir = os.path.join(common_dir, '01.데이터/1.Training/원천데이터')
    elif folder == 'val':
        data_dir = os.path.join(common_dir, '01.데이터/2.Validation/원천데이터')
    else:
        # Without this branch an unknown split failed later with an
        # UnboundLocalError on `data_dir`.
        raise ValueError(f"folder must be 'train' or 'val', got {folder!r}")

    # os.listdir order is arbitrary; sort so repeated runs see the same order.
    return sorted(p for p in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, p)))

In [None]:
# Case names found in the train / validation source-data trees. Both lists are
# read as globals by check_folder_dir().
# NOTE(review): `common_dir` is not assigned in any earlier visible cell (only
# `mask_common_dir` is) -- confirm where it comes from on a fresh kernel.
train_fname = find_aihub_img_mask_fname(common_dir, folder='train')
val_fname = find_aihub_img_mask_fname(common_dir, folder='val')
def check_folder_dir(fname):
    """Tell which source-data tree a case name belongs to.

    Looks ``fname`` up in the module-level ``train_fname`` and ``val_fname``
    collections (train is tested first) and returns 'train', 'val' or None.
    """
    if fname in train_fname:
        return 'train'
    if fname in val_fname:
        return 'val'
    return None
    
# NOTE(review): this function is defined again, with an identical body, in a
# later cell; `test_fname` is not assigned until the train/val/test split cell.
def split_train_val_test(fname):
    """Return 'train', 'val' or 'test' depending on which of the module-level
    ``train_fname`` / ``val_fname`` / ``test_fname`` collections holds ``fname``;
    None if it is in none of them."""
    if fname in train_fname:
        return 'train'
    elif fname in val_fname:
        return 'val'
    elif fname in test_fname:
        return 'test'
    else:
        return None

In [None]:
# Keep only the identifier and outcome columns used downstream. The explicit
# .copy() makes this an independent frame, so adding columns to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
pred_aihub_df = aihub_df[['name', 'good_outcome_3m', 'mrs_3m', 'mrs3mo', 'END', 'excel_outcome_3m', 'ini_nih']].copy()

In [None]:
# Tag every case with the source tree ('train' / 'val' / None) it was found in.
pred_aihub_df['folder'] = pred_aihub_df['name'].map(check_folder_dir)

In [None]:
# Build the cleaned table in one pass from `pred_aihub_df`, so re-running the
# cell always starts from the same input. (The former bare `isna().sum()` line
# was dropped: it sat mid-cell, so its result was never displayed.)
pred_aihub_df_clear = (
    pred_aihub_df
    # Fall back to 'mrs3mo' wherever 'mrs_3m' is missing.
    .assign(mrs_3m=pred_aihub_df['mrs_3m'].fillna(value=pred_aihub_df['mrs3mo']))
    .loc[:, ['name', 'good_outcome_3m', 'mrs_3m', 'END', 'excel_outcome_3m', 'ini_nih', 'folder']]
    # Rows with any remaining missing value (including a missing 'folder') are discarded.
    .dropna(axis=0)
    .astype({'good_outcome_3m': int,
             'mrs_3m': int,
             'excel_outcome_3m': int})
)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 80/10/10 split on 'good_outcome_3m': first 80/20, then the 20 %
# remainder is halved into validation and test. Both calls use random_state=77.
# NOTE(review): this rebinds `train_fname` / `val_fname`, the globals that
# check_folder_dir() reads; calling that function after this cell answers
# "which split" rather than "which source tree".
tot_fname_label = pred_aihub_df_clear[['name', 'good_outcome_3m']].values
tot_fname = tot_fname_label[:,0]
tot_label = tot_fname_label[:,1]
train_fname, valtest_fname, train_label, valtest_label = train_test_split(tot_fname, 
                                                                    tot_label, 
                                                                    test_size=0.2, 
                                                                    random_state=77, 
                                                                    stratify=tot_label)
val_fname, test_fname, val_label, test_label = train_test_split(valtest_fname, 
                                                                valtest_label, 
                                                                test_size=0.5, 
                                                                random_state=77, 
                                                                stratify=valtest_label) 

In [None]:
def split_train_val_test(fname):
    """Tell which split a case name was assigned to.

    Tests the module-level ``train_fname``, ``val_fname`` and ``test_fname``
    collections in that order and returns 'train', 'val', 'test' or None.
    """
    if fname in train_fname:
        return 'train'
    if fname in val_fname:
        return 'val'
    if fname in test_fname:
        return 'test'
    return None

In [None]:
# Record the 80/10/10 split membership of every case.
pred_aihub_df_clear['split_811'] = pred_aihub_df_clear['name'].map(split_train_val_test)

In [None]:
# Distribution of 'mrs_3m' inside the training split.
pred_aihub_df_clear.loc[pred_aihub_df_clear['split_811'] == 'train', 'mrs_3m'].value_counts()

In [None]:
# Remove the rows whose 'mrs_3m' equals 9.
# NOTE(review): 9 is presumably a "not assessed" code -- confirm. These rows
# are removed after the stratified split above was computed, so the split
# proportions were derived with them still present.
idx_mrs_3m_9 = pred_aihub_df_clear[pred_aihub_df_clear.mrs_3m == 9].index
pred_aihub_df_clear = pred_aihub_df_clear.drop(idx_mrs_3m_9)

In [None]:
# Persist the cleaned table (with 'folder' and 'split_811'); it is read back
# into `aihub_df` further down.
pred_aihub_df_clear.to_csv('/home/ncp/workspace/blocks1/aihub_df.csv', index=False)

In [None]:
from tqdm import tqdm

In [None]:
# Reload the cleaned table. From here on `aihub_df` refers to this cleaned
# frame, no longer to the raw merged CSV loaded earlier.
aihub_df = pd.read_csv('/home/ncp/workspace/blocks1/aihub_df.csv')

In [None]:
# NOTE(review): these three statements are repeated verbatim at the top of the
# next cell, which also runs the scan; this cell appears to be redundant.
dcm_exist_list_0 = []
folder = 'val'
case_name_ls = get_aihub_case_name(common_dir, folder=folder)

In [None]:
# Validation tree: keep the cases whose DWI and ADC series -- identified through
# the DICOM 'SequenceName' header -- each hold exactly one file per PNG image.
dcm_exist_list_0 = []
folder = 'val'
case_name_ls = get_aihub_case_name(common_dir, folder=folder)
for sample_case_name in tqdm(case_name_ls):
    sample_case_png_dir = find_aihub_img_dir(common_dir, sample_case_name, folder=folder)
    sample_case_dicom_dir = find_aihub_dicom_dir(common_dir, sample_case_name, folder=folder)

    # The path helpers return None for an unknown split; cases without a PNG
    # folder cannot be compared and are skipped.
    if not sample_case_dicom_dir or not os.path.isdir(sample_case_png_dir):
        continue

    png_len = len(load_file_path(sample_case_png_dir, IMG_EXTENSION))
    sample_seq_dwi_list = check_aihub_mr_scans(sample_case_dicom_dir, sequence='DWI')
    sample_seq_adc_list = check_aihub_mr_scans(sample_case_dicom_dir, sequence='ADC')
    if png_len == len(sample_seq_dwi_list) and png_len == len(sample_seq_adc_list):
        dcm_exist_list_0.append(sample_case_name)

In [None]:
# Training tree: keep the cases whose DWI and ADC series -- identified through
# the DICOM 'SequenceName' header -- each hold exactly one file per PNG image.
dcm_exist_list = []
folder = 'train'
case_name_ls = get_aihub_case_name(common_dir, folder=folder)
for sample_case_name in tqdm(case_name_ls):
    sample_case_png_dir = find_aihub_img_dir(common_dir, sample_case_name, folder=folder)
    sample_case_dicom_dir = find_aihub_dicom_dir(common_dir, sample_case_name, folder=folder)

    # The path helpers return None for an unknown split; cases without a PNG
    # folder cannot be compared and are skipped.
    if not sample_case_dicom_dir or not os.path.isdir(sample_case_png_dir):
        continue

    png_len = len(load_file_path(sample_case_png_dir, IMG_EXTENSION))
    sample_seq_dwi_list = check_aihub_mr_scans(sample_case_dicom_dir, sequence='DWI')
    sample_seq_adc_list = check_aihub_mr_scans(sample_case_dicom_dir, sequence='ADC')
    if png_len == len(sample_seq_dwi_list) and png_len == len(sample_seq_adc_list):
        dcm_exist_list.append(sample_case_name)

In [None]:
def check_dicom_header_sequence(fname):
    """Return 1 if the case passed the DICOM-header sequence check, else 0.

    A case passes when it is listed in ``dcm_exist_list`` (training tree) or in
    ``dcm_exist_list_0`` (validation tree). The validation list used to be
    computed but never consulted, which flagged every validation case as 0.
    """
    passed = fname in dcm_exist_list or fname in dcm_exist_list_0
    return 1 if passed else 0

In [None]:
# Flag (1/0) whether each case passed the DICOM-header sequence check.
aihub_df['dcm_header_seq'] = aihub_df['name'].map(check_dicom_header_sequence)

In [None]:
# Snapshot of the table with the 'dcm_header_seq' flag added.
aihub_df.to_csv('/home/ncp/workspace/blocks1/3D_CNN_for_PRED/aihub_df_check_dwi.csv', index=False)

In [None]:
# (name, folder) pairs of the cases that did not pass the DICOM-header check;
# the file-name based fallback scans below iterate over these.
name_folder = aihub_df.loc[aihub_df['dcm_header_seq'] == 0, ['name', 'folder']].values

In [None]:
def split_dwi_adc_in_fname(path_ls):
    """Split file paths into DWI and ADC groups based on the file name.

    The base name without extension is upper-cased; a path goes into the DWI
    group if that name contains 'DW' and into the ADC group if it contains
    'AD' (a name containing both lands in both groups). Returns the two groups
    as sorted lists ``(dwi_paths, adc_paths)``.
    """
    dwi_path_ls, adc_path_ls = [], []
    for path in path_ls:
        stem = os.path.splitext(os.path.basename(path))[0].upper()
        if 'DW' in stem:
            dwi_path_ls.append(path)
        if 'AD' in stem:
            adc_path_ls.append(path)
    dwi_path_ls.sort()
    adc_path_ls.sort()
    return dwi_path_ls, adc_path_ls

In [None]:
# Fallback 1 for cases that failed the header check: the sorted DICOM listing
# starts with an 'AD*' file name and the number of 'DW*' files equals the
# number of PNG images.
dicom_exist_list_2 = []

for name, folder in tqdm(name_folder):
    case_png_dir = find_aihub_img_dir(common_dir, name, folder=folder)
    case_dicom_dir = find_aihub_dicom_dir(common_dir, name, folder=folder)
    if not os.path.isdir(case_dicom_dir):
        continue

    case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
    if not case_dicom_paths:
        # A folder without any .dcm file used to raise IndexError on [0].
        continue

    case_first_dicom_fname = os.path.splitext(os.path.basename(case_dicom_paths[0]))[0].upper()
    if 'AD' not in case_first_dicom_fname or not os.path.isdir(case_png_dir):
        continue

    png_len = len(load_file_path(case_png_dir, IMG_EXTENSION))
    dwi_path_ls, adc_path_ls = split_dwi_adc_in_fname(case_dicom_paths)
    if png_len == len(dwi_path_ls):
        dicom_exist_list_2.append(name)

In [None]:
def check_fname_dwi_adc(fname):
    """Return 1 if ``fname`` is listed in the module-level ``dicom_exist_list_2``, else 0."""
    return 1 if fname in dicom_exist_list_2 else 0

In [None]:
# Fix the column set and order. The explicit .copy() makes the result an
# independent frame, so the flag columns added in the following cells do not
# trigger pandas' SettingWithCopyWarning.
# NOTE(review): no visible cell creates 'split_311'; unless the CSV on disk
# already carries it, this selection raises KeyError -- confirm its origin.
aihub_df = aihub_df[['name', 'good_outcome_3m', 'mrs_3m', 'END', 'excel_outcome_3m', 'ini_nih', 'folder', 'split_811', 'split_311', 'dcm_header_seq']].copy()

In [None]:
# Flag (1/0) the cases accepted by the first file-name fallback.
aihub_df['fname_dwi_adc'] = aihub_df['name'].map(check_fname_dwi_adc)

In [None]:
# Fallback 2: like fallback 1 (listing starts with an 'AD*' file name), but the
# number of PNG images must equal half the number of 'DW*' files (rounded down).
dicom_exist_list_3 = []

for name, folder in tqdm(name_folder):
    case_png_dir = find_aihub_img_dir(common_dir, name, folder=folder)
    case_dicom_dir = find_aihub_dicom_dir(common_dir, name, folder=folder)
    if not os.path.isdir(case_dicom_dir):
        continue

    case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
    if not case_dicom_paths:
        # A folder without any .dcm file used to raise IndexError on [0].
        continue

    case_first_dicom_fname = os.path.splitext(os.path.basename(case_dicom_paths[0]))[0].upper()
    if 'AD' not in case_first_dicom_fname or not os.path.isdir(case_png_dir):
        continue

    png_len = len(load_file_path(case_png_dir, IMG_EXTENSION))
    dwi_path_ls, adc_path_ls = split_dwi_adc_in_fname(case_dicom_paths)
    if png_len == len(dwi_path_ls) // 2:
        dicom_exist_list_3.append(name)

In [None]:
def check_fname_dwi_adc_half(fname):
    """Return 1 if ``fname`` is listed in the module-level ``dicom_exist_list_3``, else 0."""
    return 1 if fname in dicom_exist_list_3 else 0

In [None]:
# Flag (1/0) the cases accepted by the second file-name fallback.
aihub_df['fname_dwi_adc_half'] = aihub_df['name'].map(check_fname_dwi_adc_half)

In [None]:
# Number of non-null entries per column for each combination of the three
# flags computed so far.
aihub_df.groupby(['dcm_header_seq', 'fname_dwi_adc', 'fname_dwi_adc_half']).count()

In [None]:
# Intermediate snapshot; the same path is written again further down, once the
# 'only_dwi' / 'only_dwi_half' flags exist.
aihub_df.to_csv('/home/ncp/workspace/blocks1/aihub_df_define_dcm.csv', index=False)

In [None]:
# Display-only check of the file just written; the result is not assigned.
pd.read_csv('/home/ncp/workspace/blocks1/aihub_df_define_dcm.csv')

In [None]:
# Cases rejected by the header check and by both 'AD*'-first file-name fallbacks.
aihub_df_dcm_failed = aihub_df[
    aihub_df['dcm_header_seq'].eq(0)
    & aihub_df['fname_dwi_adc'].eq(0)
    & aihub_df['fname_dwi_adc_half'].eq(0)
]

In [None]:
# (name, folder) pairs of the cases that are still unmatched.
# NOTE(review): the following scans iterate over `name_folder`, not over this
# array -- confirm whether they were meant to use `name_folder_dcm_fl`.
name_folder_dcm_fl = aihub_df_dcm_failed[['name', 'folder']].values

In [None]:
# Fallback 3: the sorted DICOM listing starts with a 'DW*' file name and the
# number of PNG images equals half the number of 'DW*' files (rounded down).
dicom_exist_list_4 = []

for name, folder in tqdm(name_folder):
    case_png_dir = find_aihub_img_dir(common_dir, name, folder=folder)
    case_dicom_dir = find_aihub_dicom_dir(common_dir, name, folder=folder)
    if not os.path.isdir(case_dicom_dir):
        continue

    case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
    if not case_dicom_paths:
        # A folder without any .dcm file used to raise IndexError on [0].
        continue

    case_first_dicom_fname = os.path.splitext(os.path.basename(case_dicom_paths[0]))[0].upper()
    if 'DW' not in case_first_dicom_fname or not os.path.isdir(case_png_dir):
        continue

    png_len = len(load_file_path(case_png_dir, IMG_EXTENSION))
    dwi_path_ls, adc_path_ls = split_dwi_adc_in_fname(case_dicom_paths)
    if png_len == len(dwi_path_ls) // 2:
        dicom_exist_list_4.append(name)

In [None]:
def check_only_dwi_half(fname):
    """Return 1 if ``fname`` is listed in the module-level ``dicom_exist_list_4``, else 0."""
    return 1 if fname in dicom_exist_list_4 else 0

In [None]:
# Flag (1/0) the cases accepted by the third file-name fallback.
aihub_df['only_dwi_half'] = aihub_df['name'].map(check_only_dwi_half)

In [None]:
# Fallback 4: the sorted DICOM listing starts with a 'DW*' file name and the
# number of PNG images equals the number of 'DW*' files.
dicom_exist_list_5 = []

for name, folder in tqdm(name_folder):
    case_png_dir = find_aihub_img_dir(common_dir, name, folder=folder)
    case_dicom_dir = find_aihub_dicom_dir(common_dir, name, folder=folder)
    if not os.path.isdir(case_dicom_dir):
        continue

    case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
    if not case_dicom_paths:
        # A folder without any .dcm file used to raise IndexError on [0].
        continue

    case_first_dicom_fname = os.path.splitext(os.path.basename(case_dicom_paths[0]))[0].upper()
    if 'DW' not in case_first_dicom_fname or not os.path.isdir(case_png_dir):
        continue

    png_len = len(load_file_path(case_png_dir, IMG_EXTENSION))
    dwi_path_ls, adc_path_ls = split_dwi_adc_in_fname(case_dicom_paths)
    if png_len == len(dwi_path_ls):
        dicom_exist_list_5.append(name)

In [None]:
def check_only_dwi(fname):
    """Return 1 if ``fname`` is listed in the module-level ``dicom_exist_list_5``, else 0."""
    return 1 if fname in dicom_exist_list_5 else 0

In [None]:
# Flag (1/0) the cases accepted by the fourth file-name fallback.
aihub_df['only_dwi'] = aihub_df['name'].map(check_only_dwi)

In [None]:
# NOTE(review): identical to the definition of the same name in an earlier
# cell; this copy shadows it without changing behavior.
def check_fname_dwi_adc_half(fname):
    """Return 1 if ``fname`` is listed in the module-level ``dicom_exist_list_3``, else 0."""
    if fname in dicom_exist_list_3:
        return 1
    else:
        return 0

In [None]:
# Recompute the second file-name fallback flag (same assignment as in the earlier cell).
aihub_df['fname_dwi_adc_half'] = aihub_df['name'].map(check_fname_dwi_adc_half)

In [None]:
# NOTE(review): `aihub_df_dcm_failed` is only recomputed (with the two
# 'only_dwi*' flags) in the next cell, so in top-to-bottom order this line
# still reads the earlier three-flag version of that frame.
name_folder_dcm_fl = aihub_df_dcm_failed[['name', 'folder']].values

In [None]:
# Cases rejected by the header check and by all four file-name fallbacks.
aihub_df_dcm_failed = aihub_df[
    aihub_df['dcm_header_seq'].eq(0)
    & aihub_df['fname_dwi_adc'].eq(0)
    & aihub_df['fname_dwi_adc_half'].eq(0)
    & aihub_df['only_dwi'].eq(0)
    & aihub_df['only_dwi_half'].eq(0)
]

In [None]:
# Index labels of the cases rejected by every DICOM check.
dcm_failed_idx = aihub_df[
    aihub_df['dcm_header_seq'].eq(0)
    & aihub_df['fname_dwi_adc'].eq(0)
    & aihub_df['fname_dwi_adc_half'].eq(0)
    & aihub_df['only_dwi'].eq(0)
    & aihub_df['only_dwi_half'].eq(0)
].index

In [None]:
# Table without the cases that failed every DICOM check; `aihub_df` itself is
# left unchanged.
aihub_df_wo_failed = aihub_df.drop(dcm_failed_idx)

In [None]:
# Persist the table restricted to cases with a usable DICOM series.
aihub_df_wo_failed.to_csv('/home/ncp/workspace/blocks1/aihub_df_define_dcm_clear.csv', index=False)

In [None]:
# Overwrites the snapshot written earlier to the same path, now including the
# 'only_dwi' and 'only_dwi_half' flags.
aihub_df.to_csv('/home/ncp/workspace/blocks1/aihub_df_define_dcm.csv', index=False)

In [None]:
# (name, folder) pair of every case in the table, as a 2-D array.
case_name_folder = aihub_df.loc[:, ['name', 'folder']].to_numpy()

In [None]:
# Map every usable case name to [dwi_paths, adc_paths]. The flags are tested in
# a fixed priority order (only_dwi, only_dwi_half, fname_dwi_adc,
# fname_dwi_adc_half, dcm_header_seq); cases with no flag set are left out.
# Rows are iterated directly, so each case's flags are read from its own row
# without re-filtering the whole frame once per case and branch.
fname_dicom_dict = {}

for row in tqdm(aihub_df.itertuples(index=False), total=len(aihub_df)):
    name, folder = row.name, row.folder
    case_dicom_dir = find_aihub_dicom_dir(common_dir, name, folder=folder)

    if row.only_dwi == 1:
        case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
        dwi_path_ls, _ = split_dwi_adc_in_fname(case_dicom_paths)
        adc_path_ls = None
    elif row.only_dwi_half == 1:
        case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
        dwi_path_ls, _ = split_dwi_adc_in_fname(case_dicom_paths)
        # Keep the first half of the sorted 'DW*' files.
        dwi_path_ls = dwi_path_ls[:len(dwi_path_ls) // 2]
        adc_path_ls = None
    elif row.fname_dwi_adc == 1:
        case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
        dwi_path_ls, adc_path_ls = split_dwi_adc_in_fname(case_dicom_paths)
    elif row.fname_dwi_adc_half == 1:
        case_dicom_paths = sorted(load_file_path(case_dicom_dir, DCM_EXTENSION))
        dwi_path_ls, adc_path_ls = split_dwi_adc_in_fname(case_dicom_paths)
        # NOTE(review): only the DWI list is halved here; the ADC list keeps
        # its full length -- confirm that this asymmetry is intended.
        dwi_path_ls = dwi_path_ls[:len(dwi_path_ls) // 2]
    elif row.dcm_header_seq == 1:
        dwi_path_ls = check_aihub_mr_scans(case_dicom_dir, sequence='DWI')
        adc_path_ls = check_aihub_mr_scans(case_dicom_dir, sequence='ADC')
    else:
        continue

    fname_dicom_dict[name] = [dwi_path_ls, adc_path_ls]

In [None]:
import pickle

In [None]:
# The dump below is commented out, so this cell does not save
# `fname_dicom_dict`; the load relies on 'fname_dicom.pickle' already existing
# in the working directory.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# a pickle that was produced locally.
# with open('fname_dicom.pickle', 'wb') as fw:
#     pickle.dump(fname_dicom_dict, fw)
    
with open('fname_dicom.pickle', 'rb') as fr:
    fname_dicom_dict_load = pickle.load(fr)

In [None]:
# NOTE(review): repeats the load performed in the previous cell.
with open('fname_dicom.pickle', 'rb') as fr:
    fname_dicom_dict_load = pickle.load(fr)

In [15]:
## get 2d lesion area ##

In [None]:
# Number of entries (files and folders alike) in the converted-PNG root.
len(os.listdir('/home/ncp/workspace/blocks1/dicom_to_png_2d/'))

In [None]:
def read_2d_mask_3dim(mask_path_ls):
    """Stack 2-D mask images into one boolean volume.

    The images are read in sorted path order and stacked along a new first
    axis; an element is True where the pixel value is greater than 0.

    Raises:
        ValueError -- from ``np.stack`` if ``mask_path_ls`` is empty or the
            images differ in shape
    """
    mask_slices = []
    for mask_path in sorted(mask_path_ls):
        # The context manager closes the file handle as soon as the pixels
        # have been copied into an array, so handles do not pile up.
        with Image.open(mask_path) as mask_img:
            mask_slices.append(np.array(mask_img))
    return np.stack(mask_slices, axis=0) > 0

In [None]:
# Per-case lesion score: fraction of positive mask pixels, scaled by 10.
# NOTE(review): this rebinds `common_dir`, which the earlier cells pass as the
# dataset root to the find_aihub_* helpers.
common_dir = '/home/ncp/workspace/blocks1/dicom_to_png_2d/'
predicted_lesion_area_ls = []
for fname in tqdm(os.listdir(common_dir)):
    mask_dir = os.path.join(common_dir, fname, 'pred_masks')
    if not os.path.isdir(mask_dir):
        # load_file_path() asserts on a missing folder; an entry without
        # predicted masks is skipped like one whose mask folder is empty.
        continue
    mask_path_ls = load_file_path(mask_dir, IMG_EXTENSION)
    if len(mask_path_ls) > 0:
        mask_3d = read_2d_mask_3dim(mask_path_ls)

        # NOTE(review): the denominator hard-codes 256x256 pixels per slice --
        # confirm that every mask has that size.
        predicted_lesion_area_ls.append([fname, np.sum(mask_3d)/(mask_3d.shape[0]*256*256)*10])

In [None]:
# One row per case that had at least one predicted mask image.
pred_lesion_area_df = pd.DataFrame(predicted_lesion_area_ls, columns=['name', 'pred_lesion_area'])

pred_lesion_area_df.to_csv('/home/ncp/workspace/blocks2/pred_lesion_area_df_og.csv', index=False)

In [None]:
# Largest lesion score; taken on the raw array, so a NaN entry would propagate.
pred_lesion_area_df.pred_lesion_area.values.max()

In [None]:
# Tabular table looked up by case name when the test features are collected
# below (file version v2.1.1, not the v2.1 file loaded at the top).
tabular_df = pd.read_csv('/home/ncp/workspace/AIHUB_dataset/df_csv_merged_v2.1.1.csv')

In [None]:
# NOTE(review): no visible cell writes 'aihub_df_v.1.2.csv' -- confirm how this
# file is produced.
data_df = pd.read_csv('/home/ncp/workspace/blocks1/aihub_df_v.1.2.csv')

In [None]:
# Display the loaded table.
data_df

In [None]:
# Collect, for every test sample, the five tabular features and the label.
# NOTE(review): `test_dataset` is not defined in any visible cell -- confirm
# where it comes from on a fresh kernel.
tabular_info_arr = []
label_arr = []
for f_path, _, _, label in test_dataset.dataset:
    # The case name is the file name without its extension.
    fname = os.path.splitext(os.path.basename(f_path))[0]
    # `.values` on a row filter is 2-D: one row per matching table entry, so a
    # name with zero or several matches yields a differently shaped block.
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END', 'hx_str']].values
    tabular_info_arr.append(tabular_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
label_arr = np.array(label_arr)