In [None]:
# Run the nvidia-smi shell command and show its output in the cell.
!nvidia-smi

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import nibabel as nib
import PIL.Image as Image
import matplotlib.pyplot as plt


# Suffix lists used by the file-discovery helpers. Matching is case-sensitive,
# so the lists that need it carry each suffix in lower case followed by its
# upper-case twin.
FILE_EXTENSION = [variant
                  for ext in ('.png', '.jpg', '.dcm', '.raw', '.svs')
                  for variant in (ext, ext.upper())]
IMG_EXTENSION = [variant
                 for ext in ('.png', '.jpg', '.jpeg')
                 for variant in (ext, ext.upper())]
DCM_EXTENSION = [variant for ext in ('.dcm',) for variant in (ext, ext.upper())]
RAW_EXTENSION = [variant for ext in ('.raw',) for variant in (ext, ext.upper())]
# Lower-case only: upper-case variants of these two are not matched.
NIFTI_EXTENSION = ['.nii']
NP_EXTENSION = ['.npy']

# Dataset root directory (not referenced by the cells shown in this notebook section).
common_dir = '/home/ncp/workspace/202002n050/050.신경계 질환 관련 임상 및 진료 데이터'


def check_extension(filename, extension_ls=FILE_EXTENSION):
    """Tell whether `filename` ends with one of the given suffixes.

    Parameters:
        filename (str) -- file name or path to test
        extension_ls (list) -- suffixes to accept (compared case-sensitively)

    Return:
        (bool) -- True on the first matching suffix, False if none matches
    """
    for extension in extension_ls:
        if filename.endswith(extension):
            return True
    return False


def load_file_path(folder_path, extension_ls=FILE_EXTENSION, all_sub_folders=False):
    """Collect the paths of files whose names end with one of `extension_ls`.

    Parameters:
        folder_path (str) -- directory to search
        extension_ls (list) -- accepted suffixes (defaults to FILE_EXTENSION)
        all_sub_folders (bool) -- when False (default) only files directly inside
            `folder_path` are returned; when True every sub-directory is searched too

    Return:
        file_paths (list) -- matching paths, each joined onto the directory it was found in

    Raises:
        AssertionError -- if `folder_path` is not an existing directory
    """
    assert os.path.isdir(folder_path), f'{folder_path} is not a valid directory'

    walker = os.walk(folder_path)
    if all_sub_folders:
        # Visit directories ordered by their root path, as before.
        visited = sorted(walker)
    else:
        # os.walk yields the top directory first, so take just that entry
        # instead of walking (and sorting) the whole tree only to discard it.
        top_level = next(walker, None)
        visited = [] if top_level is None else [top_level]

    return [os.path.join(root, fname)
            for root, _, fnames in visited
            for fname in fnames
            if check_extension(fname, extension_ls)]


def gen_new_dir(new_dir):
    """Create `new_dir` (including missing parents) if it does not exist yet.

    Parameters:
        new_dir (str) -- directory to create

    Failures are reported with a printed message instead of being raised.
    """
    try:
        # exist_ok=True removes the gap between "check" and "create": a directory
        # created concurrently by another process no longer triggers a spurious
        # error. A non-directory already occupying the path is still reported.
        os.makedirs(new_dir, exist_ok=True)
    except OSError:
        print("Error: Failed to create the directory.")
        
def get_data_fname_label_in_split(data_df, mode='train'):
    """Return the (name, bad_outcome_3m) pairs of the rows belonging to one split.

    Parameters:
        data_df (DataFrame) -- table with 'split_811', 'name' and 'bad_outcome_3m' columns
        mode (str) -- value of 'split_811' to select

    Return:
        (ndarray) -- one row per selected case: [name, bad_outcome_3m]
    """
    in_split = data_df['split_811'].eq(mode)
    return data_df.loc[in_split, ['name', 'bad_outcome_3m']].values


def get_dataset(data_df, data_dir, mask_dir, mode='train'):
    """List the cases of one split for which a DWI, an ADC and a mask file all exist.

    Parameters:
        data_df (DataFrame) -- table passed on to get_data_fname_label_in_split
        data_dir (str) -- directory holding the 'dwi' and 'adc' sub-folders of .npy files
        mask_dir (str) -- directory holding the mask .npy files
        mode (str) -- split to select

    Return:
        (list) -- one [dwi_path, adc_path, mask_path, label] entry per case; cases
                  missing any of the three files are left out
    """
    def index_by_stem(folder):
        # Map "file name without extension" -> full path for the .npy files in `folder`.
        return {os.path.splitext(os.path.basename(path))[0]: path
                for path in sorted(load_file_path(folder, NP_EXTENSION))}

    name_label_pairs = get_data_fname_label_in_split(data_df, mode=mode)
    dwi_by_stem = index_by_stem(os.path.join(data_dir, 'dwi'))
    adc_by_stem = index_by_stem(os.path.join(data_dir, 'adc'))
    mask_by_stem = index_by_stem(mask_dir)

    samples = []
    for fname, label in name_label_pairs:
        dwi_path = dwi_by_stem.get(fname)
        adc_path = adc_by_stem.get(fname)
        mask_path = mask_by_stem.get(fname)
        if mask_path and adc_path and dwi_path:
            samples.append([dwi_path, adc_path, mask_path, label])
    return samples

In [None]:
data_dir = '/home/ncp/workspace/blocks1/dicom_to_np_2dnorm_resample'
mask_dir = '/home/ncp/workspace/blocks1/refined_mask_resample'
data_df = pd.read_csv('/home/ncp/workspace/blocks1/aihub_df_new.csv')

# One [dwi_path, adc_path, mask_path, label] row per usable case, per split.
train_dataset_path = np.array(get_dataset(data_df, data_dir, mask_dir, mode='train'))
val_dataset_path = np.array(get_dataset(data_df, data_dir, mask_dir, mode='val'))
test_dataset_path = np.array(get_dataset(data_df, data_dir, mask_dir, mode='test'))

# Take the DWI path (first entry) of every row. Iterating row by row also works
# for a split with no usable case, where the array is empty and one-dimensional
# and `[:, 0]` would raise IndexError.
all_dataset_path = np.array([row[0]
                             for split_rows in (train_dataset_path, val_dataset_path, test_dataset_path)
                             for row in split_rows])
dataset_fname_list = [os.path.splitext(os.path.basename(p))[0] for p in all_dataset_path]

In [None]:
# Flag the cases whose name appears in dataset_fname_list. Series.isin does the
# membership test in one vectorised pass instead of scanning the list per row.
data_df['dwi_adc'] = data_df['name'].isin(dataset_fname_list)

In [None]:
#data_df.to_csv('/home/ncp/workspace/blocks1/aihub_df_v.1.1.csv', index=False)

In [None]:
# Label distribution of the flagged (dwi_adc) cases in the original 'train' split.
data_df.loc[data_df.dwi_adc.eq(True) & data_df.split_811.eq('train'), 'bad_outcome_3m'].value_counts()

In [None]:
# Label distribution of the flagged (dwi_adc) cases in the original 'val' split.
data_df.loc[data_df.dwi_adc.eq(True) & data_df.split_811.eq('val'), 'bad_outcome_3m'].value_counts()

In [None]:
# Label distribution of the flagged (dwi_adc) cases in the original 'test' split.
data_df.loc[data_df.dwi_adc.eq(True) & data_df.split_811.eq('test'), 'bad_outcome_3m'].value_counts()

In [None]:
# Flagged (dwi_adc) cases that are not in the 'train' split, i.e. the pool that gets re-split.
data_df.loc[data_df.dwi_adc.eq(True) & data_df.split_811.ne('train')]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Pool every flagged non-train case and split the pool 50/50 into new
# validation and test sets, stratified on the outcome label.
val_test_info = data_df.loc[
    data_df.dwi_adc.eq(True) & data_df.split_811.ne('train'),
    ['name', 'bad_outcome_3m'],
].values
tot_fname, tot_label = val_test_info[:, 0], val_test_info[:, 1]

val_fname, test_fname, val_label, test_label = train_test_split(
    tot_fname,
    tot_label,
    test_size=0.5,
    random_state=17,  # fixed seed keeps the split reproducible
    stratify=tot_label,
)

In [None]:
# Spot check: was this case assigned to the new test split?
'brain_mri_2013-3663' in test_fname

In [None]:
# Display the (name, label) rows that were re-split into val/test.
val_test_info

In [None]:
# Sizes of the new validation and test sets.
len(val_fname), len(test_fname)

In [None]:
def split_val_test(fname, tmp, val_fname, test_fname):
    """Return the split a case belongs to after the val/test re-split.

    Parameters:
        fname -- case name to look up
        tmp -- value returned when the case is in neither collection
        val_fname -- collection of names assigned to 'val' (checked first)
        test_fname -- collection of names assigned to 'test'

    Return:
        'val', 'test', or `tmp`
    """
    for split_name, members in (('val', val_fname), ('test', test_fname)):
        if fname in members:
            return split_name
    return tmp

In [None]:
# `fname` is not assigned by any cell above, so on a fresh kernel this cell
# raised NameError. Bind it explicitly to the row that the spot check further
# down inspects, then display it.
fname = data_df.name.values[20]
fname

In [None]:
# Sanity-check the helper on a concrete row instead of relying on a leftover
# `fname` global, which no earlier cell defines on a fresh kernel.
split_val_test(data_df.name.values[20], 'train', val_fname, test_fname)

In [None]:
# Spot check: is the name at row position 20 part of the new test split?
data_df.name.values[20] in test_fname

In [None]:
# New split column: 'val'/'test' for the re-split cases, the original split otherwise.
data_df['split_811_new'] = [
    split_val_test(case_name, old_split, val_fname, test_fname)
    for case_name, old_split in zip(data_df['name'], data_df['split_811'])
]

In [None]:
# Display data_df, which now carries the split_811_new column.
data_df

In [None]:
# Names of the cases that moved from 'val' to 'test' under the new split.
data_df[(data_df.split_811 == 'val') & (data_df.split_811_new == 'test')].name.values

In [None]:
# Case names singled out for manual inspection.
check_case = [
    f'brain_mri_2013-{case_id}'
    for case_id in ('0184', '0515', '0079', '3391', '0041', '1318', '0999', '3481')
]

In [None]:
# Look up the row(s) of a single case by name.
data_df[data_df.name == 'brain_mri_2013-3669']

In [None]:
def sample_stack(stack, rows=6, cols=6, start_with=0, show_every=1):
    """Show a rows x cols grid of slices taken from `stack`.

    Parameters:
        stack -- indexable sequence of 2-D images (must support len())
        rows (int) -- number of grid rows
        cols (int) -- number of grid columns
        start_with (int) -- index of the slice shown in the first panel
        show_every (int) -- index step between consecutive panels

    Panel i shows slice `start_with + i * show_every`. Panels whose index falls
    outside the stack stay blank, so a short stack still renders the slices it
    has. Rendering stays best-effort: a failure is printed rather than raised.
    """
    try:
        # squeeze=False keeps `ax` two-dimensional even for a single row or column.
        _, ax = plt.subplots(rows, cols, figsize=[18, 20], squeeze=False)
        n_slices = len(stack)
        for i in range(rows * cols):
            panel = ax[i // cols, i % cols]
            panel.axis('off')
            ind = start_with + i * show_every
            # Accept the same index range plain indexing would (negatives wrap).
            if not -n_slices <= ind < n_slices:
                continue
            panel.set_title(f'slice {ind}')
            panel.imshow(stack[ind], cmap='gray', vmin=0, vmax=255)
        plt.show()
    except Exception as err:
        # Report instead of silently swallowing; KeyboardInterrupt and SystemExit
        # are no longer caught.
        print(f'sample_stack: could not render the stack ({err!r})')

In [None]:
# List the entries of the image data directory (the same path assigned to data_dir).
os.listdir('/home/ncp/workspace/blocks1/dicom_to_np_2dnorm_resample')

In [None]:
#data_df.to_csv('/home/ncp/workspace/blocks1/aihub_df_v.1.2.csv', index=False)

In [None]:
# Load the tabular table that the following cells filter by `name` to obtain
# the five feature columns used for the model.
tabular_df = pd.read_csv('/home/ncp/workspace/AIHUB_dataset/df_csv_merged_v2.1.1.csv')

In [None]:
# (name, label) rows of the flagged cases, one array per new split.
train_info, val_info, test_info = (
    data_df.loc[
        data_df.dwi_adc.eq(True) & data_df.split_811_new.eq(split_name),
        ['name', 'bad_outcome_3m'],
    ].values
    for split_name in ('train', 'val', 'test')
)

In [None]:
# Gather, for every training case, its five tabular feature values and its label.
tabular_info_arr = []
label_arr = []
# NOTE(review): `f_path` holds values of the 'name' column (see how train_info is
# built), not file paths; basename/splitext only alters a name containing '/' or '.'.
for f_path, label in train_info:
    fname = os.path.splitext(os.path.basename(f_path))[0]
    # All tabular_df rows whose name matches, restricted to the feature columns;
    # the result has one row per match.
    # NOTE(review): the later np.squeeze presumably assumes exactly one match per case -- confirm.
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END', 'hx_str']].values
    tabular_info_arr.append(tabular_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
label_arr = np.array(label_arr)

In [None]:
# Put the training features and labels side by side (label is the last column)
# and keep only the rows without missing values.
X_train = np.squeeze(tabular_info_arr)
Y_train = label_arr
XY_train_df = pd.DataFrame(np.column_stack([X_train, Y_train])).dropna(axis=0, how='any')

In [None]:
# Gather, for every validation case, its five tabular feature values and its label.
# This reuses (and overwrites) tabular_info_arr and label_arr.
tabular_info_arr = []
label_arr = []
# NOTE(review): `f_path` holds values of the 'name' column (see how val_info is
# built), not file paths; basename/splitext only alters a name containing '/' or '.'.
for f_path, label in val_info:
    fname = os.path.splitext(os.path.basename(f_path))[0]
    # All tabular_df rows whose name matches, restricted to the feature columns;
    # the result has one row per match.
    # NOTE(review): the later np.squeeze presumably assumes exactly one match per case -- confirm.
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END', 'hx_str']].values
    tabular_info_arr.append(tabular_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
label_arr = np.array(label_arr)

In [None]:
# Put the validation features and labels side by side (label is the last column)
# and keep only the rows without missing values.
X_val = np.squeeze(tabular_info_arr)
Y_val = label_arr
XY_val_df = pd.DataFrame(np.column_stack([X_val, Y_val])).dropna(axis=0, how='any')

In [None]:
# Scratch check of the name-stripping expression used in the lookup loops:
# for a bare file name it returns the part before the last dot.
os.path.splitext(os.path.basename('ksssd.ns'))[0]

In [None]:
# Gather, for every test case, its five tabular feature values and its label.
# This reuses (and overwrites) tabular_info_arr and label_arr.
tabular_info_arr = []
label_arr = []
# NOTE(review): `f_path` holds values of the 'name' column (see how test_info is
# built), not file paths; basename/splitext only alters a name containing '/' or '.'.
for f_path, label in test_info:
    fname = os.path.splitext(os.path.basename(f_path))[0]
    # All tabular_df rows whose name matches, restricted to the feature columns;
    # the result has one row per match.
    # NOTE(review): the later np.squeeze presumably assumes exactly one match per case -- confirm.
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END', 'hx_str']].values
    tabular_info_arr.append(tabular_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
label_arr = np.array(label_arr)

In [None]:
# Put the test features and labels side by side (label is the last column)
# and keep only the rows without missing values.
X_test = np.squeeze(tabular_info_arr)
Y_test = label_arr
XY_test_df = pd.DataFrame(np.column_stack([X_test, Y_test])).dropna(axis=0, how='any')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest on the five tabular features (columns 0-4); column 5 is the label.
# A fixed random_state makes the fitted forest, and every metric computed from
# it, reproducible across kernel restarts.
model_rf_only_ci = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=17)
model_rf_only_ci.fit(XY_train_df.iloc[:, :5], XY_train_df.iloc[:, 5])

In [None]:
# Class probabilities for the rows of XY_test_df (columns 0-4 are the features).
out_proba = model_rf_only_ci.predict_proba(XY_test_df.iloc[:,:5])

In [None]:
from sklearn import metrics 

In [None]:
# ROC analysis on the test split (out_proba must hold the test predictions here).
# Ground truth is read from XY_test_df, not Y_test: dropna() may have removed
# rows from XY_test_df, and out_proba was predicted from that frame, so Y_test
# can be longer than out_proba and make roc_curve fail on mismatched lengths.
fpr, tpr, thresholds = metrics.roc_curve(XY_test_df.iloc[:, 5].astype(float) == 1, out_proba[:, 1])
J = tpr - fpr  # Youden's J statistic
idx = np.argmax(J)
# NOTE: the threshold is chosen on the same data the sensitivity/specificity
# below are reported for, so those two figures are optimistic.
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1 - fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
# Class probabilities for the rows of XY_val_df (columns 0-4 are the features).
# This overwrites the test-split probabilities previously held in out_proba.
out_proba = model_rf_only_ci.predict_proba(XY_val_df.iloc[:,:5])

In [None]:
# ROC analysis on the validation split (out_proba must hold the validation predictions here).
# Ground truth is read from XY_val_df, not Y_val: dropna() may have removed
# rows from XY_val_df, and out_proba was predicted from that frame, so Y_val
# can be longer than out_proba and make roc_curve fail on mismatched lengths.
fpr, tpr, thresholds = metrics.roc_curve(XY_val_df.iloc[:, 5].astype(float) == 1, out_proba[:, 1])
J = tpr - fpr  # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1 - fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")