In [None]:
from tqdm import tqdm
import os
import copy

import numpy as np
import pandas as pd
import seaborn as sns
import PIL.Image as Image

import nibabel as nib

import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split

import timm
import torch
import segmentation_models_pytorch as smp

from util.util import *
from util.visualize import *
from data.dataset_2d import *

common_dir = '/home/ncp/workspace/202002n050/050.신경계 질환 관련 임상 및 진료 데이터'

In [None]:
!nvidia-smi

In [None]:
import os
import numpy as np
import pandas as pd
import glob

FILE_EXTENSION = ['.png', '.PNG', '.jpg', '.JPG', '.dcm', '.DCM', '.raw', '.RAW', '.svs', '.SVS']
IMG_EXTENSION = ['.png', '.PNG', '.jpg', '.JPG', '.jpeg', '.JPEG']
DCM_EXTENSION = ['.dcm', '.DCM']
RAW_EXTENSION = ['.raw', '.RAW']
NIFTI_EXTENSION = ['.nii']
NP_EXTENSION = ['.npy']

mask_common_dir = '/home/ncp/workspace/202002n050/050.신경계 질환 관련 임상 및 진료 데이터'


def check_extension(filename, extension_ls=FILE_EXTENSION):
    """Return True when `filename` ends with at least one suffix from `extension_ls`."""
    for suffix in extension_ls:
        if filename.endswith(suffix):
            return True
    return False


def load_file_path(folder_path, extension_ls=FILE_EXTENSION, all_sub_folders=False):
    """Collect the paths of files in a folder whose names end with an accepted extension.

    Parameters:
        folder_path (str) -- folder directory
        extension_ls (list) -- accepted filename suffixes (defaults to FILE_EXTENSION)
        all_sub_folders (bool) -- if True, search every sub-folder as well;
                                  if False, only files directly inside `folder_path`

    Return:
        file_paths (list) -- paths of the matching files. Folders are visited in sorted
                             order when recursing; files inside one folder keep the
                             order reported by os.walk.

    Raises:
        AssertionError -- if `folder_path` is not an existing directory
    """

    assert os.path.isdir(folder_path), f'{folder_path} is not a valid directory'

    # os.walk is lazy and yields `folder_path` itself first, so the non-recursive
    # case can stop after one item instead of walking (and sorting) the whole tree.
    walked = os.walk(folder_path)
    if all_sub_folders:
        walked = sorted(walked)

    file_paths = []
    for root, _, fnames in walked:
        for fname in fnames:
            if check_extension(fname, extension_ls):
                file_paths.append(os.path.join(root, fname))
        if not all_sub_folders:
            break

    return file_paths


def gen_new_dir(new_dir):
    """Create directory `new_dir`, including missing parents, if it does not exist yet.

    Failures are reported with a printed message instead of an exception, so the
    caller keeps running (best-effort behaviour of the original helper).
    """
    try:
        # exist_ok=True removes the check-then-create race of a separate
        # os.path.exists() test: another process creating the directory in
        # between no longer turns into a spurious failure.
        os.makedirs(new_dir, exist_ok=True)
    except OSError:
        print("Error: Failed to create the directory.")


def find_dwi_adc_mask_dir(img_folder_dir, fname):
    """Locate the 'dwi', 'adc' and 'pred_masks' sub-folders of case `fname`.

    Returns the three paths as a tuple (dwi, adc, pred_masks) when both the dwi and
    the adc folder exist, otherwise None. The pred_masks folder is not checked.
    """
    case_dir = os.path.join(img_folder_dir, fname)
    dwi_dir, adc_dir, mask_dir = (
        os.path.join(case_dir, sub_folder) for sub_folder in ('dwi', 'adc', 'pred_masks')
    )
    if not os.path.isdir(dwi_dir):
        return None
    if not os.path.isdir(adc_dir):
        return None
    return dwi_dir, adc_dir, mask_dir


def get_data_fname_label_in_split(data_df, mode='train'):
    """Return the ('name', 'good_outcome_3m') rows whose 'split' column equals `mode`, as an array."""
    in_split = data_df['split'] == mode
    return data_df.loc[in_split, ['name', 'good_outcome_3m']].values


def get_dataset(data_df, data_dir, mode='train'):
    """Look up the (name, good_outcome_3m) rows of the `mode` split.

    NOTE(review): this looks unfinished -- the rows are computed but never returned
    or used, so the function returns None, and `data_dir` is not read at all.
    """
    split_rows = get_data_fname_label_in_split(data_df, mode=mode)


def pair_aihub_dwi_adc_mask_img_label_path(img_folder_dir, data_df):
    """Map every case that has image folders to its slice paths and outcome label.

    Cases are visited in sorted order of `data_df.name`. A case is kept only when
    find_dwi_adc_mask_dir finds its folders. The value stored per case is
    [list of (dwi_path, adc_path, mask_path) tuples, bad_outcome_3m label]; the
    three sorted path lists are zipped, so the shortest list decides the slice count.
    """
    paired = {}
    for case_name in sorted(data_df.name.values):
        located = find_dwi_adc_mask_dir(img_folder_dir, case_name)
        # The label is read for every case, including ones skipped below.
        case_label = data_df.loc[data_df.name == case_name, 'bad_outcome_3m'].values[0]
        if not located:
            continue
        # `located` is ordered (dwi, adc, mask); each folder is listed and sorted on its own.
        per_modality = [sorted(load_file_path(folder, IMG_EXTENSION)) for folder in located]
        paired[case_name] = [list(zip(*per_modality)), case_label]
    return paired


def select_train_val_test(img_mask_path_dict, fname_list):
    """Keep the entries of `img_mask_path_dict` named in `fname_list`, in `fname_list` order.

    Names that are missing from the dict, or whose stored value is falsy, are skipped.
    """
    return {
        fname: img_mask_path_dict[fname]
        for fname in fname_list
        if img_mask_path_dict.get(fname)
    }


def find_aihub_img_label_paths(img_folder_dir, data_df, fname_list):
    """Flatten the slice paths of the selected cases into per-slice arrays.

    Parameters:
        img_folder_dir (str) -- root folder holding one sub-folder per case
        data_df (DataFrame) -- case table passed to pair_aihub_dwi_adc_mask_img_label_path
        fname_list (iterable) -- names of the cases to keep, in the desired order

    Return:
        img_path_arr (ndarray) -- one row per slice: (dwi_path, adc_path, mask_path)
        label_arr (ndarray) -- one entry per slice: the label of the slice's case

    Raises:
        ValueError -- if none of the selected cases has any slice
    """
    img_label_path_dict = pair_aihub_dwi_adc_mask_img_label_path(img_folder_dir, data_df)
    img_label_path_dict_sel = select_train_val_test(img_label_path_dict, fname_list)

    # A case without slices contributes nothing; dropping it here also keeps
    # np.concatenate from failing on an empty, 1-D path array.
    case_entries = [
        (img_path_ls, label)
        for img_path_ls, label in img_label_path_dict_sel.values()
        if len(img_path_ls) > 0
    ]
    if not case_entries:
        raise ValueError('no image slices found for the requested cases')

    img_path_arr = np.concatenate([np.array(img_path_ls) for img_path_ls, _ in case_entries])

    # Repeat each case label once per slice. Building a nested list and calling
    # np.array on it is ragged whenever cases differ in slice count, which raises
    # ValueError on NumPy >= 1.24.
    case_labels = [label for _, label in case_entries]
    slice_counts = [len(img_path_ls) for img_path_ls, _ in case_entries]
    label_arr = np.repeat(case_labels, slice_counts)

    return img_path_arr, label_arr

In [None]:
def dwi_adc_mask_loader(dwi_adc_mask_path):
    """Open one (dwi, adc, mask) path triple and stack the three images on a new last axis."""
    dwi_path, adc_path, mask_path = dwi_adc_mask_path
    channels = [np.array(Image.open(path)) for path in (dwi_path, adc_path, mask_path)]
    return np.stack(channels, axis=-1)

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
import os

import numpy as np
import pandas as pd
import albumentations as A
from albumentations.pytorch import ToTensorV2


def get_training_augmentation(params=None):
    """Build the training augmentation pipeline.

    Every candidate transform is currently disabled, so an empty A.Compose is
    returned. `params` is accepted but not used.
    """
    # Candidate transforms, kept for reference (all disabled):
    #   A.HorizontalFlip(p=.5)
    #   A.VerticalFlip(p=.5)
    #   A.ShiftScaleRotate(scale_limit=0.1, rotate_limit=5, shift_limit=0.2, border_mode=0, p=.5)
    #   A.ShiftScaleRotate(scale_limit=0.01, rotate_limit=5, shift_limit=0., border_mode=0, p=.5)
    return A.Compose([])


def get_preprocessing(params=None,resize=(256,256),convert=True):
    """Build the preprocessing pipeline: resize, then optionally normalize and convert to tensor.

    Parameters:
        params -- accepted but not used
        resize (tuple) -- arguments unpacked into A.Resize
        convert (bool) -- if True, append A.Normalize and ToTensorV2; if False the
                          pipeline only resizes
    """
    steps = [A.Resize(*resize)]
    if convert:
        # Three-channel statistics (they look like the usual ImageNet values -- TODO confirm
        # they are intended for DWI/ADC/mask channels). Two-channel alternative tried
        # earlier: mean=(0.5,0.5), std=(0.5,0.5).
        steps += [
            A.Normalize(mean=(0.485, 0.456, 0.406),  std=(0.229, 0.224, 0.225)),
            ToTensorV2(transpose_mask=True),
        ]
    return A.Compose(steps)


class AIHUB_DWI_ADC_OutcomePredDataset(torch.utils.data.Dataset):
    """Slice-level dataset: each item is one stacked (dwi, adc, mask) image and its case label.

    The case table is read from `data_df_path`. When `mode` is truthy only the cases
    whose 'fold_3' column equals `mode` are used, otherwise all cases are used.
    """

    def __init__(self,
                 img_folder_dir,
                 data_df_path,
                 img_loader=dwi_adc_mask_loader,
                 augmentation=None,
                 preprocessing=None,
                 mode='train'
    ):
        """Read the case table and resolve the slice paths and labels of the chosen split.

        Parameters:
            img_folder_dir (str) -- root folder of the per-case image folders
            data_df_path (str) -- CSV file with the case table
            img_loader (callable) -- turns one row of `img_path_arr` into an image array
            augmentation -- optional transform called as augmentation(image=...);
                            discarded unless mode == 'train'
            preprocessing -- optional transform called as preprocessing(image=...)
            mode -- value matched against the 'fold_3' column; falsy selects every case
        """
        self.data_df = pd.read_csv(data_df_path)
        self.img_loader = img_loader
        self.preprocessing = preprocessing
        self.mode = mode
        # Augmentation is only ever applied to the training split.
        self.augmentation = augmentation if mode == 'train' else None

        case_names = self.data_df.name
        if mode:
            case_names = case_names[self.data_df["fold_3"] == mode]
        self.fname_list = case_names.values

        self.img_path_arr, self.label_arr = find_aihub_img_label_paths(
            img_folder_dir, self.data_df, self.fname_list
        )

    def __getitem__(self, index):
        """Load slice `index`, apply augmentation then preprocessing, and return (image, label)."""
        image = self.img_loader(self.img_path_arr[index])
        label = self.label_arr[index]
        for transform in (self.augmentation, self.preprocessing):
            if transform:
                image = transform(image=image)['image']

        return image, label

    def __len__(self):
        """Number of slices in the selected split."""
        return len(self.img_path_arr)


In [None]:
train_dataset = AIHUB_DWI_ADC_OutcomePredDataset(
    img_folder_dir = '/home/ncp/workspace/blocks1/dicom_to_png_2d/', 
    data_df_path = '/home/ncp/workspace/blocks1/aihub_df_v.KF.csv', 
    augmentation=None, 
    preprocessing=get_preprocessing(resize=(256,256)),
    mode='train'
    )

In [None]:
aug_dataset = AIHUB_DWI_ADC_OutcomePredDataset(
    img_folder_dir = '/home/ncp/workspace/blocks1/dicom_to_png_2d/', 
    data_df_path = '/home/ncp/workspace/blocks1/aihub_df_v.KF.csv', 
    augmentation=None, 
    preprocessing=get_preprocessing(resize=(256,256), convert=False),
    mode='val',
    )

In [None]:
def visualize(**images):
    """Plot the given images side by side in one row; each keyword name becomes a title."""
    n_images = len(images)
    plt.figure(figsize=(16, 5))
    for position, name in enumerate(images, start=1):
        plt.subplot(1, n_images, position)
        plt.axis("off")
        # 'pred_mask' -> 'Pred Mask'
        plt.title(name.replace('_', ' ').title())
        plt.imshow(images[name])
    plt.show()

In [None]:
import numpy as np


def normalize(arr):
    """Min-max rescale `arr` to the range 0..255 and return it as uint8.

    A constant array has no intensity range to stretch; it is returned as all
    zeros instead of dividing by zero and casting NaN to uint8.
    """
    arr = np.asarray(arr)
    lowest, highest = arr.min(), arr.max()
    if highest == lowest:
        return np.zeros(arr.shape, dtype=np.uint8)
    tmp = (arr - lowest)/(highest - lowest)*255
    return tmp.astype(np.uint8)


def visualize_grayscale(arr):
    """Rescale `arr` with normalize() and copy it into three identical channels on a new last axis."""
    gray = normalize(arr)
    return np.repeat(gray[..., np.newaxis], 3, axis=-1)

In [None]:
# # check augmentation 
# for i in range(0,12):
#     image, mask = aug_dataset[i] 
#     visualize(image=visualize_grayscale(np.squeeze(image[:,:,1])))

In [None]:
import time
import torch

In [None]:
class EarlyStopping:
    """Signals when the validation loss has stopped improving for `patience` calls in a row.

    Call the instance once per epoch with the validation loss, then read
    `early_stop`. `__call__` does not save checkpoints; `save_checkpoint` is
    available for callers that want to do so explicitly.
    """
    def __init__(self, 
                 patience=7, 
                 verbose=False, 
                 delta=0, 
                 path='checkpoint.pt', 
                 trace_func=print):
        """
        Parameters:
            patience (int) -- consecutive non-improving calls tolerated before stopping
            verbose (bool) -- if True, save_checkpoint reports the loss change
            delta (float) -- the score must exceed the best score by at least this
                             much to count as an improvement
            path (str) -- file written by save_checkpoint
            trace_func (callable) -- receives the progress messages
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
    def __call__(self, val_loss, model):
        """Record one validation loss; sets `early_stop` once patience is exhausted.

        `model` is accepted for interface compatibility and not used here.
        """
        # Higher score is better, so a lower loss is an improvement.
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0
    def save_checkpoint(self, val_loss, model):
        """Write `model.state_dict()` to `self.path` and remember `val_loss` as the minimum."""
        if self.verbose:
            self.trace_func(f'Validation loss decreased {self.val_loss_min:.6f} --> {val_loss:.6f}. Saving Model...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
class AverageMeter(object):
    """Keeps the most recent value together with the weighted running sum, count and mean."""
    def __init__(self):
        self.reset()
    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0
    def update(self, val, n=1):
        """Record `val` as the latest value, weighted as `n` observations."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def train_epoch(model, loader, optimizer, epoch, n_epochs, print_freq=100):
    """Run one optimisation pass over `loader`.

    Parameters:
        model -- network to train; moved to the GPU when CUDA is available
        loader -- iterable of (input, target) batches
        optimizer -- optimizer stepping the model parameters
        epoch (int) -- zero-based epoch index, used for logging only
        n_epochs (int) -- total number of epochs, used for logging only
        print_freq (int) -- a progress line is printed every `print_freq` batches

    Return:
        (mean batch time, mean loss, mean error rate), where the loss and the
        error rate are weighted by batch size
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    error = AverageMeter()

    # Move the model under the same condition as the batches; an unconditional
    # .cuda() fails on CPU-only machines although the inputs were already guarded.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    model.train()

    end = time.time()
    for batch_idx, (inputs, target) in enumerate(loader):
        if use_cuda:
            inputs = inputs.cuda()
            target = target.cuda()

        output = model(inputs)
        # cross_entropy applies log-softmax itself, so it expects raw logits from `model`.
        loss = torch.nn.functional.cross_entropy(output, target)

        batch_size = target.size(0)
        _, pred = output.detach().cpu().topk(1, dim=1)
        n_wrong = torch.ne(pred.squeeze(1), target.cpu()).float().sum().item()
        error.update(n_wrong / batch_size, batch_size)
        losses.update(loss.item(), batch_size)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        if batch_idx % print_freq == 0:
            res = '\t'.join([
                'Epoch: [%d/%d]' % (epoch+1, n_epochs),
                'Iter: [%d/%d]' % (batch_idx+1, len(loader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                'Error %.4f (%.4f)' % (error.val, error.avg),
            ])
            print(res)

    return batch_time.avg, losses.avg, error.avg

In [None]:
def test_epoch(model, loader, print_freq=5, is_test=True):
    """Evaluate `model` over `loader` without gradient tracking.

    Parameters:
        model -- network to evaluate; moved to the GPU when CUDA is available
        loader -- iterable of (input, target) batches
        print_freq (int) -- a progress line is printed every `print_freq` batches
        is_test (bool) -- only selects the 'Test:' / 'Valid' prefix of the log line

    Return:
        (mean batch time, mean loss, mean error rate), where the loss and the
        error rate are weighted by batch size
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    error = AverageMeter()

    # Move the model under the same condition as the batches; an unconditional
    # .cuda() fails on CPU-only machines although the inputs were already guarded.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    model.eval()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (inputs, target) in enumerate(loader):
            if use_cuda:
                inputs = inputs.cuda()
                target = target.cuda()

            output = model(inputs)
            # cross_entropy applies log-softmax itself, so it expects raw logits from `model`.
            loss = torch.nn.functional.cross_entropy(output, target)

            batch_size = target.size(0)
            _, pred = output.detach().cpu().topk(1, dim=1)
            n_wrong = torch.ne(pred.squeeze(1), target.cpu()).float().sum().item()
            error.update(n_wrong / batch_size, batch_size)
            losses.update(loss.item(), batch_size)

            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % print_freq == 0:
                res = '\t'.join([
                    'Test:' if is_test else 'Valid',
                    'Iter: [%d/%d]' % (batch_idx+1, len(loader)),
                    'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                    'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                    'Error %.4f (%.4f)' % (error.val, error.avg),
                ])
                print(res)

    return batch_time.avg, losses.avg, error.avg

In [None]:
def train(model, train_set, valid_set, test_set, save, n_epochs=300,
         batch_size=64, lr=0.0001, patience=10, save_epoch=10, seed=None):
    """Train `model` with Adam, log every epoch to results.csv and evaluate on the test set.

    Parameters:
        model -- network to train; moved to the GPU when CUDA is available
        train_set -- training dataset (shuffled, last incomplete batch dropped)
        valid_set -- validation dataset, or None; without a usable validation
                     loader the test set drives the per-epoch evaluation and early stopping
        test_set -- test dataset, evaluated once after training
        save (str) -- existing directory that receives results.csv,
                      model_best.dat (lowest validation error) and model_final.dat
        n_epochs (int) -- maximum number of epochs
        batch_size (int) -- batch size of all loaders
        lr (float) -- initial Adam learning rate; multiplied by 0.1 at 50% and 75% of `n_epochs`
        patience (int) -- epochs without validation-loss improvement before stopping early
        save_epoch (int) -- currently unused, kept for interface compatibility
        seed (int or None) -- passed to torch.manual_seed when given

    Note: the final test evaluation uses the weights of the last epoch
    (model_final.dat), not those stored in model_best.dat.
    """
    if seed is not None:
        torch.manual_seed(seed)

    use_cuda = torch.cuda.is_available()
    train_loader = torch.utils.data.DataLoader(train_set,
                                              batch_size=batch_size, drop_last=True, shuffle=True,
                                              pin_memory=use_cuda, num_workers=0)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=batch_size, shuffle=False,
                                              pin_memory=use_cuda, num_workers=0)
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    if valid_set is None:
        valid_loader = None
    else:
        valid_loader = torch.utils.data.DataLoader(valid_set,
                                                  batch_size=batch_size, shuffle=False,
                                                  pin_memory=use_cuda, num_workers=0)
    if use_cuda:
        model = model.cuda()

    model_wrapper = model
    if use_cuda and torch.cuda.device_count() > 1:
        model_wrapper = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model_wrapper.parameters(), lr=lr)
    # Milestones must be whole epochs: a fractional value such as 0.5 * 301 never
    # equals an epoch counter, so the learning rate would silently never decay.
    # max(1, ...) keeps very short runs from decaying before the first epoch.
    milestones = [max(1, int(fraction * n_epochs)) for fraction in (0.5, 0.75)]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones,
                                                    gamma = 0.1)

    results_path = os.path.join(save, 'results.csv')
    with open(results_path, 'w') as f:
        f.write('epoch,train_loss,train_error,valid_loss,valid_error,test_error\n')

    best_error = 1
    for epoch in range(n_epochs):
        _, train_loss, train_error = train_epoch(
            model=model_wrapper, 
            loader=train_loader, 
            optimizer=optimizer, 
            epoch=epoch, 
            n_epochs=n_epochs,
        )
        scheduler.step()
        _, valid_loss, valid_error = test_epoch(
            model=model_wrapper, 
            loader=valid_loader if valid_loader else test_loader, 
            is_test=(not valid_loader)
        )

        # Keep the weights with the lowest validation error seen so far.
        if valid_loader and valid_error < best_error:
            best_error = valid_error
            print('New best error: %.4f' % best_error)
            torch.save(model.state_dict(), os.path.join(save, 'model_best.dat'))

        # The trailing comma leaves the test_error column empty on per-epoch rows.
        with open(results_path, 'a') as f:
            f.write('%04d,%0.6f,%0.6f,%0.5f,%0.5f,\n' % (
                (epoch+1), 
                train_loss, 
                train_error, 
                valid_loss, 
                valid_error
            ))

        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print('Early stopping')
            break

    torch.save(model.state_dict(), os.path.join(save, 'model_final.dat'))

    model.load_state_dict(torch.load(os.path.join(save, 'model_final.dat')))
    if use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model).cuda()
    test_results = test_epoch(
        model=model, 
        loader=test_loader, 
        is_test=True
    )

    _, _, test_error = test_results
    with open(results_path, 'a') as f:
        f.write(',,,,,%0.5f\n' % (test_error))
    print('Final test error: %.4f' % test_error)

In [None]:
def demo(save, 
         model, 
         n_epochs=300, 
         batch_size=64, 
         lr=0.0001, 
         patience=10, 
         seed=None):
    """Build the train/val/test datasets, make sure `save` is a directory, and run train().

    Parameters:
        save (str) -- output directory, created when missing
        model -- network handed to train()
        n_epochs, batch_size, lr, patience, seed -- forwarded unchanged to train()

    Raises:
        Exception -- if `save` exists but is not a directory
    """
    def build_split(mode):
        # All three splits share the image folder, case table and preprocessing.
        return AIHUB_DWI_ADC_OutcomePredDataset(
            img_folder_dir = '/home/ncp/workspace/blocks1/dicom_to_png_2d/', 
            data_df_path = '/home/ncp/workspace/blocks1/aihub_df_v.KF.csv', 
            augmentation=None, 
            preprocessing=get_preprocessing(resize=(256,256)),
            mode=mode
            )

    train_dataset, val_dataset, test_dataset = (
        build_split(mode) for mode in ('train', 'val', 'test')
    )

    print('Total parameters: ', sum(p.numel() for p in model.parameters()))

    if not os.path.exists(save):
        os.makedirs(save)
    if not os.path.isdir(save):
        raise Exception('%s is not a dir' % save)

    train(model=model, train_set=train_dataset, valid_set=val_dataset, test_set=test_dataset, save=save,
          n_epochs=n_epochs, batch_size=batch_size, lr=lr, patience=patience, seed=seed)
    print('Done!')

In [None]:
import timm

In [None]:
# Output directory and training hyper-parameters for the 2D DenseNet-169 slice classifier.
save_path = './2D/DWIADCMASKPred/2DDensenet169'
gen_new_dir(save_path)
N_EPOCHS = 10000  # upper bound only; early stopping (PATIENCE) is expected to end training sooner
BATCH_SIZE = 8
LR = 0.0001
PATIENCE = 10

# DenseNet-169 from timm with its classifier replaced by a 2-class head.
# NOTE(review): the head ends in Softmax, but train_epoch/test_epoch pass the model
# output to torch.nn.functional.cross_entropy, which applies log-softmax itself and
# expects raw logits. The loss is therefore computed on already-normalised
# probabilities. Removing the Softmax here would also require applying softmax at
# inference, because the later aggregation cells read column 1 of the output as a
# probability.
model = timm.create_model('densenet169',pretrained=True)
model.classifier = torch.nn.Sequential(
    torch.nn.Linear(in_features=1664, out_features=2, bias=True),
    torch.nn.Softmax(dim=1)
)

In [None]:
demo(save=save_path, 
     model=model, 
     n_epochs=N_EPOCHS, 
     batch_size=BATCH_SIZE, 
     lr=LR, 
     patience=PATIENCE, 
     seed=None)

In [None]:
save_path = './2D/DWIADCMASKPred/2DDensenet169'
# Rebuild the training architecture. The ImageNet weights are not requested because
# load_state_dict (strict by default) replaces every parameter and buffer right below.
test_model = timm.create_model('densenet169', pretrained=False)
test_model.classifier = torch.nn.Sequential(
    torch.nn.Linear(in_features=1664, out_features=2, bias=True),
    torch.nn.Softmax(dim=1)
)
# map_location='cpu' lets the checkpoint load regardless of the device it was saved
# from; the model is moved to the GPU later by the inference cells.
test_model.load_state_dict(torch.load(os.path.join(save_path, 'model_best.dat'), map_location='cpu'))

In [None]:
from tqdm import tqdm
import sklearn.metrics as metrics

In [None]:
train_dataset = AIHUB_DWI_ADC_OutcomePredDataset(
        img_folder_dir = '/home/ncp/workspace/blocks1/dicom_to_png_2d/', 
        data_df_path = '/home/ncp/workspace/blocks1/aihub_df_v.KF.csv', 
        augmentation=None, 
        preprocessing=get_preprocessing(resize=(256,256)),
        mode='train'
        )
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=False)

In [None]:
train_out_proba = []
test_model.cuda()
test_model.eval()

# Inference only: no_grad stops autograd from recording a graph for every batch,
# which otherwise costs GPU memory for nothing. The labels are not needed here.
with torch.no_grad():
    for images, labels in tqdm(train_loader):
        images = images.cuda()
        pr_mask = test_model(images)  # per-slice class scores (the name is historical)
        train_out_proba.append(pr_mask.cpu().numpy())

In [None]:
data_info_df = pd.read_csv('/home/ncp/workspace/blocks1/aihub_df_v.KF.csv')

In [None]:
dataset_df = pd.read_csv('/home/ncp/workspace/blocks1/2d_slice_encoded_value_df.csv')

In [None]:
train_out_proba = np.vstack(train_out_proba)

In [None]:
# Collapse slice-level outputs to one score per case: the mean class-1 output over
# the case's slices. Slices are located by a running offset `i`, which relies on
# train_out_proba being ordered case by case exactly like name_label_arr.
i = 0
train_pred = []
train_name_and_label_ls = []
name_label_arr = data_info_df[data_info_df.fold_3 == 'train'][['name','bad_outcome_3m']].values
for name, label in name_label_arr:
    # Number of slices of this case according to the slice table.
    selected_slice_num = int((dataset_df.name == name).sum())
    if selected_slice_num == 0:
        continue
    sol = np.mean(train_out_proba[i:i+selected_slice_num][:,1])
    i += selected_slice_num
    train_name_and_label_ls.append([name, label])
    train_pred.append(sol)

# If the slice table and the image dataset disagree on the slice counts, every
# case after the first mismatch is averaged over the wrong rows -- fail loudly.
assert i == len(train_out_proba), (
    f'slice table accounts for {i} slices but the model produced {len(train_out_proba)} outputs'
)

In [None]:
train_pred = np.array(train_pred)

In [None]:
tabular_df = pd.read_csv('/home/ncp/workspace/AIHUB_dataset/df_csv_merged_v2.1.1.csv')

In [None]:
pred_lesion_area_df = pd.read_csv('/home/ncp/workspace/blocks2/pred_lesion_area_df_og.csv')

In [None]:
tabular_info_arr = []
lesion_info_arr = []
label_arr = []
for fname, label in train_name_and_label_ls:
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END']].values
    lesion_area_info = pred_lesion_area_df[pred_lesion_area_df.name == fname].pred_lesion_area.values[0] #*100000
    tabular_info_arr.append(tabular_info)
    lesion_info_arr.append(lesion_area_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
lesion_info_arr = np.array(lesion_info_arr)
label_arr = np.array(label_arr)

In [None]:
X_train = np.hstack([np.squeeze(tabular_info_arr), train_pred[:,np.newaxis], lesion_info_arr[:,np.newaxis]])

In [None]:
Y_train = label_arr

In [None]:
XY_train_df = pd.DataFrame(np.hstack([X_train, Y_train[:,np.newaxis]]))

In [None]:
XY_train_df.dropna(axis=0, inplace=True, how='any')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=17)
model.fit(XY_train_df.iloc[:,:6], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
import pickle

In [None]:
with open(os.path.join(save_path, 'ensemble3.pkl'), 'wb') as f:
    pickle.dump(model, f)

In [None]:
val_dataset = AIHUB_DWI_ADC_OutcomePredDataset(
        img_folder_dir = '/home/ncp/workspace/blocks1/dicom_to_png_2d/', 
        data_df_path = '/home/ncp/workspace/blocks1/aihub_df_v.KF.csv', 
        augmentation=None, 
        preprocessing=get_preprocessing(resize=(256,256)),
        mode='val'
        )
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=4, shuffle=False)

In [None]:
val_out_proba = []
test_model.cuda()
test_model.eval()

# Inference only: no_grad stops autograd from recording a graph for every batch,
# which otherwise costs GPU memory for nothing. The labels are not needed here.
with torch.no_grad():
    for images, labels in tqdm(val_loader):
        images = images.cuda()
        pr_mask = test_model(images)  # per-slice class scores (the name is historical)
        val_out_proba.append(pr_mask.cpu().numpy())

In [None]:
val_out_proba = np.vstack(val_out_proba)

In [None]:
# Collapse slice-level outputs to one score per case: the mean class-1 output over
# the case's slices. Slices are located by a running offset `i`, which relies on
# val_out_proba being ordered case by case exactly like name_label_arr.
i = 0
val_pred = []
val_name_and_label_ls = []
name_label_arr = data_info_df[data_info_df.fold_3 == 'val'][['name','bad_outcome_3m']].values
for name, label in name_label_arr:
    # Number of slices of this case according to the slice table.
    selected_slice_num = int((dataset_df.name == name).sum())
    if selected_slice_num == 0:
        continue
    sol = np.mean(val_out_proba[i:i+selected_slice_num][:,1])
    i += selected_slice_num
    val_name_and_label_ls.append([name, label])
    val_pred.append(sol)

# If the slice table and the image dataset disagree on the slice counts, every
# case after the first mismatch is averaged over the wrong rows -- fail loudly.
assert i == len(val_out_proba), (
    f'slice table accounts for {i} slices but the model produced {len(val_out_proba)} outputs'
)
In [None]:
val_pred = np.array(val_pred)

In [None]:
tabular_info_arr = []
lesion_info_arr = []
label_arr = []
for fname, label in val_name_and_label_ls:
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END']].values
    lesion_area_info = pred_lesion_area_df[pred_lesion_area_df.name == fname].pred_lesion_area.values[0] #*100000
    tabular_info_arr.append(tabular_info)
    lesion_info_arr.append(lesion_area_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
lesion_info_arr = np.array(lesion_info_arr)
label_arr = np.array(label_arr)

In [None]:
X_val = np.hstack([np.squeeze(tabular_info_arr), val_pred[:,np.newaxis], lesion_info_arr[:,np.newaxis]])

In [None]:
Y_val = label_arr

In [None]:
XY_val_df = pd.DataFrame(np.hstack([X_val, Y_val[:,np.newaxis]]))

In [None]:
XY_val_df.dropna(axis=0, inplace=True, how='any')

In [None]:
cldlvol_val_out_proba = model.predict_proba(XY_val_df.iloc[:,:6])

In [None]:
Y_label = XY_val_df.iloc[:,6].values

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(Y_label==1, cldlvol_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
# ROC of the image-only score on the validation split.
# val_pred holds one score per case *before* the NaN rows were dropped from
# XY_val_df, so it has to be paired with the unfiltered labels Y_val (as the test
# cell does with Y_test). Y_label comes from the filtered frame and has a
# different length as soon as one row was dropped.
fpr, tpr, thresholds = metrics.roc_curve(Y_val==1, val_pred)
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
test_dataset = AIHUB_DWI_ADC_OutcomePredDataset(
        img_folder_dir = '/home/ncp/workspace/blocks1/dicom_to_png_2d/', 
        data_df_path = '/home/ncp/workspace/blocks1/aihub_df_v.KF.csv', 
        augmentation=None, 
        preprocessing=get_preprocessing(resize=(256,256)),
        mode='test'
        )
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False)

In [None]:
test_out_proba = []
test_model.cuda()
test_model.eval()

# Inference only: no_grad stops autograd from recording a graph for every batch,
# which otherwise costs GPU memory for nothing. The labels are not needed here.
with torch.no_grad():
    for images, labels in tqdm(test_loader):
        images = images.cuda()
        pr_mask = test_model(images)  # per-slice class scores (the name is historical)
        test_out_proba.append(pr_mask.cpu().numpy())

In [None]:
data_info_df = pd.read_csv('/home/ncp/workspace/blocks1/aihub_df_v.KF.csv')

In [None]:
dataset_df = pd.read_csv('/home/ncp/workspace/blocks1/2d_slice_encoded_value_df.csv')

In [None]:
test_out_proba = np.vstack(test_out_proba)

In [None]:
# Collapse slice-level outputs to one score per case: the mean class-1 output over
# the case's slices. Slices are located by a running offset `i`, which relies on
# test_out_proba being ordered case by case exactly like name_label_arr.
i = 0
test_pred = []
test_name_and_label_ls = []
name_label_arr = data_info_df[data_info_df.fold_3 == 'test'][['name','bad_outcome_3m']].values
for name, label in name_label_arr:
    # Number of slices of this case according to the slice table.
    selected_slice_num = int((dataset_df.name == name).sum())
    if selected_slice_num == 0:
        continue
    sol = np.mean(test_out_proba[i:i+selected_slice_num][:,1])
    i += selected_slice_num
    test_name_and_label_ls.append([name, label])
    test_pred.append(sol)

# If the slice table and the image dataset disagree on the slice counts, every
# case after the first mismatch is averaged over the wrong rows -- fail loudly.
assert i == len(test_out_proba), (
    f'slice table accounts for {i} slices but the model produced {len(test_out_proba)} outputs'
)
In [None]:
test_pred = np.array(test_pred)

In [None]:
tabular_df = pd.read_csv('/home/ncp/workspace/AIHUB_dataset/df_csv_merged_v2.1.1.csv')

In [None]:
pred_lesion_area_df = pd.read_csv('/home/ncp/workspace/blocks2/pred_lesion_area_df_og.csv')

In [None]:
tabular_info_arr = []
lesion_info_arr = []
label_arr = []
for fname, label in test_name_and_label_ls:
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END']].values
    lesion_area_info = pred_lesion_area_df[pred_lesion_area_df.name == fname].pred_lesion_area.values[0] #*100000
    tabular_info_arr.append(tabular_info)
    lesion_info_arr.append(lesion_area_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
lesion_info_arr = np.array(lesion_info_arr)
label_arr = np.array(label_arr)

In [None]:
X_test = np.hstack([np.squeeze(tabular_info_arr), test_pred[:,np.newaxis], lesion_info_arr[:,np.newaxis]])

In [None]:
Y_test = label_arr

In [None]:
XY_test_df = pd.DataFrame(np.hstack([X_test, Y_test[:,np.newaxis]]))

In [None]:
XY_test_df.dropna(axis=0, inplace=True, how='any')

In [None]:
cldlvol_test_out_proba = model.predict_proba(XY_test_df.iloc[:,:6])

In [None]:
Y_label = XY_test_df.iloc[:,6].values

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(Y_label==1, cldlvol_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(Y_test==1, test_pred)
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
model_cd = RandomForestClassifier(n_estimators=500,max_depth=5, random_state=17)
model_cd.fit(XY_train_df.iloc[:,:4], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
cl_val_out_proba = model_cd.predict_proba(XY_val_df.iloc[:,:4])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_val_df.iloc[:,6].values==1, cl_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
cl_test_out_proba = model_cd.predict_proba(XY_test_df.iloc[:,:4])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_test_df.iloc[:,6].values==1, cl_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
with open(os.path.join(save_path, 'clinical_data.pkl'), 'wb') as f:
    pickle.dump(model_cd, f)

In [None]:
model_cd = RandomForestClassifier(n_estimators=500,max_depth=5, random_state=17)
model_cd.fit(XY_train_df.iloc[:,:5], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
cldl_val_out_proba = model_cd.predict_proba(XY_val_df.iloc[:,:5])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_val_df.iloc[:,6].values==1, cldl_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
cldl_test_out_proba = model_cd.predict_proba(XY_test_df.iloc[:,:5])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_test_df.iloc[:,6].values==1, cldl_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
with open(os.path.join(save_path, 'ensemble1.pkl'), 'wb') as f:
    pickle.dump(model_cd, f)

In [None]:
model_cd = RandomForestClassifier(n_estimators=500,max_depth=5, random_state=17)
model_cd.fit(XY_train_df[[0,1,2,3,5]], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
cl_vol_val_out_proba = model_cd.predict_proba(XY_val_df[[0,1,2,3,5]])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_val_df.iloc[:,6].values==1, cl_vol_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
cl_vol_test_out_proba = model_cd.predict_proba(XY_test_df[[0,1,2,3,5]])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_test_df.iloc[:,6].values==1, cl_vol_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
with open(os.path.join(save_path, 'ensemble2.pkl'), 'wb') as f:
    pickle.dump(model_cd, f)

In [None]:
val_f_path = np.array(val_name_and_label_ls)[:,0]

In [None]:
# The random-forest outputs were computed on XY_val_df after dropna, while
# val_f_path and val_pred still hold every case. The surviving index labels of
# XY_val_df are the original row positions, so they select the matching names and
# image scores; without this np.stack fails as soon as one row was dropped.
val_kept_rows = XY_val_df.index.to_numpy()
val_prob = np.stack([val_f_path[val_kept_rows], 
                     cl_val_out_proba[:,1], 
                     val_pred[val_kept_rows], 
                     cldl_val_out_proba[:,1], 
                     cl_vol_val_out_proba[:,1], 
                     cldlvol_val_out_proba[:,1], 
                     XY_val_df.iloc[:,6].values], axis=-1)

In [None]:
val_prob_df = pd.DataFrame(val_prob, 
                           columns=['name', 
                                    'clinical_data', 
                                    'image_DL', 
                                    'ensemble1', 
                                    'ensemble2', 
                                    'ensemble3', 
                                    'bad_outcome_3m'])
val_prob_df.to_csv(os.path.join(save_path, 'val_prob.csv'))

In [None]:
test_f_path = np.array(test_name_and_label_ls)[:,0]

In [None]:
# The random-forest outputs were computed on XY_test_df after dropna, while
# test_f_path and test_pred still hold every case. The surviving index labels of
# XY_test_df are the original row positions, so they select the matching names and
# image scores; without this np.stack fails as soon as one row was dropped.
test_kept_rows = XY_test_df.index.to_numpy()
test_prob = np.stack([test_f_path[test_kept_rows], 
                     cl_test_out_proba[:,1], 
                     test_pred[test_kept_rows], 
                     cldl_test_out_proba[:,1], 
                     cl_vol_test_out_proba[:,1], 
                     cldlvol_test_out_proba[:,1], 
                     XY_test_df.iloc[:,6].values], axis=-1)

In [None]:
test_prob_df = pd.DataFrame(test_prob, 
                           columns=['name', 
                                    'clinical_data', 
                                    'image_DL', 
                                    'ensemble1', 
                                    'ensemble2', 
                                    'ensemble3', 
                                    'bad_outcome_3m'])
test_prob_df.to_csv(os.path.join(save_path, 'test_prob.csv'))