In [None]:
!nvidia-smi

In [None]:
import os
import glob
import numpy as np
import pandas as pd
import nibabel as nib
import PIL.Image as Image
import matplotlib.pyplot as plt


# Filename-suffix groups consumed by check_extension()/load_file_path().
# Matching is a plain str.endswith, so lower- and upper-case spellings are listed separately.
FILE_EXTENSION = ['.png', '.PNG', '.jpg', '.JPG', '.dcm', '.DCM', '.raw', '.RAW', '.svs', '.SVS']
IMG_EXTENSION = ['.png', '.PNG', '.jpg', '.JPG', '.jpeg', '.JPEG']
DCM_EXTENSION = ['.dcm', '.DCM']
RAW_EXTENSION = ['.raw', '.RAW']
# NOTE(review): a name ending in '.nii.gz' does not end with '.nii', so compressed NIfTI files are not matched.
NIFTI_EXTENSION = ['.nii']
NP_EXTENSION = ['.npy']

# Absolute, machine-specific dataset root; not referenced by any other visible cell.
common_dir = '/home/ncp/workspace/202002n050/050.신경계 질환 관련 임상 및 진료 데이터'


def check_extension(filename, extension_ls=FILE_EXTENSION):
    """Return True when `filename` ends with at least one suffix from `extension_ls`."""
    for suffix in extension_ls:
        if filename.endswith(suffix):
            return True
    return False


def load_file_path(folder_path, extension_ls=FILE_EXTENSION, all_sub_folders=False):
    """Collect the paths of files under `folder_path` whose names end with a suffix in `extension_ls`.

    Parameters:
        folder_path (str) -- directory to search
        extension_ls (list) -- accepted filename suffixes (defaults to FILE_EXTENSION)
        all_sub_folders (bool) -- True: search every sub-directory too;
                                  False (default): only files directly inside `folder_path`

    Return:
        file_paths (list) -- matching paths, each joined with the directory it was found in

    Raises:
        AssertionError -- if `folder_path` is not an existing directory
    """
    assert os.path.isdir(folder_path), f'{folder_path} is not a valid directory'

    if all_sub_folders:
        # Visit directories in sorted order of their path.
        walk_entries = sorted(os.walk(folder_path))
    else:
        # Only the top level is wanted: take the first os.walk entry instead of
        # walking and sorting the whole tree just to discard everything after it.
        top_entry = next(os.walk(folder_path), None)
        walk_entries = [] if top_entry is None else [top_entry]

    file_paths = []
    for root, _, fnames in walk_entries:
        for fname in fnames:
            if check_extension(fname, extension_ls):
                file_paths.append(os.path.join(root, fname))

    return file_paths


def gen_new_dir(new_dir):
    """Create `new_dir` (and any missing parents) if it does not exist yet.

    Best-effort: a failure is reported on stdout and not raised. Returns None.
    """
    try:
        # exist_ok=True removes the check-then-create race and still raises
        # (and is therefore reported) when the path exists but is not a directory.
        os.makedirs(new_dir, exist_ok=True)
    except OSError as err:
        print(f"Error: Failed to create the directory. ({new_dir}: {err})")
        
def get_data_fname_label_in_split(data_df, fold=0, mode='train'):
    """Return the [name, bad_outcome_3m] rows whose `fold_<fold>` column equals `mode`.

    The result is a 2-D numpy array with one row per matching sample.
    """
    fold_column = f'fold_{fold}'
    in_split = data_df[fold_column] == mode
    return data_df.loc[in_split, ['name', 'bad_outcome_3m']].values

def get_dataset(data_df, data_dir, mask_dir, fold=0, mode='train'):
    """Build the sample list for one split of one fold.

    Looks up, for every name selected by get_data_fname_label_in_split, the
    matching .npy file (same name without extension) in `<data_dir>/dwi`,
    `<data_dir>/adc` and `mask_dir`. Names missing from any of the three
    folders are dropped.

    Return:
        list of [dwi_path, adc_path, mask_path, label]
    """
    def index_by_stem(folder):
        # file name without extension -> full path, for the .npy files directly in `folder`
        return {os.path.splitext(os.path.basename(path))[0]: path
                for path in sorted(load_file_path(folder, NP_EXTENSION))}

    name_label_rows = get_data_fname_label_in_split(data_df, fold, mode=mode)
    dwi_by_name = index_by_stem(os.path.join(data_dir, 'dwi'))
    adc_by_name = index_by_stem(os.path.join(data_dir, 'adc'))
    mask_by_name = index_by_stem(mask_dir)

    samples = []
    for fname, label in name_label_rows:
        dwi_path = dwi_by_name.get(fname)
        adc_path = adc_by_name.get(fname)
        mask_path = mask_by_name.get(fname)
        if mask_path and adc_path and dwi_path:
            samples.append([dwi_path, adc_path, mask_path, label])
    return samples

In [None]:
def normalize(img_3d):
    """Min-max scale an array to [0, 1] and return it as float32.

    The minimum is subtracted first; the division by the maximum is skipped
    when the shifted maximum is 0 (constant input), which yields all zeros.
    """
    shifted = img_3d - img_3d.min()
    peak = shifted.max()
    if peak != 0:
        shifted = shifted / peak
    return shifted.astype(np.float32)

def z_normalize(img_3d):
    """Subtract 0.5 and divide by 0.5, mapping values in [0, 1] onto [-1, 1]."""
    center = 0.5
    half_range = .5
    return (img_3d - center) / half_range

def img_loader(dwipath, adcpath, maskpath):
    """Load the DWI and ADC .npy volumes and return them as one 2-channel array.

    Each volume is scaled with normalize(), the two are stacked along a new
    leading axis (DWI first, ADC second), and the stack goes through z_normalize().
    `maskpath` is accepted but not read.
    """
    channels = [normalize(np.load(path)) for path in (dwipath, adcpath)]
    return z_normalize(np.stack(channels, axis=0))

In [None]:
# Load the cohort table and keep the rows whose `dwi_adc` flag is True
# (presumably: both DWI and ADC series are available — verify against the CSV).
data_df = pd.read_csv('/home/ncp/workspace/blocks1/aihub_df_v.1.2.csv')
# .copy() makes this an independent frame: the fold columns assigned to it later
# would otherwise be written to a slice of data_df (SettingWithCopyWarning).
data_df_da = data_df[data_df.dwi_adc == True].copy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples as the test set, stratified on the label so both
# parts keep the same class ratio; random_state fixes the partition.
tot_fname_label = data_df_da[['name', 'bad_outcome_3m']].values
tot_fname = tot_fname_label[:,0]
tot_label = tot_fname_label[:,1]
train_fname, test_fname, train_label, test_label = train_test_split(tot_fname, 
                                                                    tot_label, 
                                                                    test_size=0.2, 
                                                                    random_state=17, 
                                                                    stratify=tot_label)

In [None]:
len(train_fname)

In [None]:
from sklearn.model_selection import KFold

In [None]:
X_tot = train_fname
Y_tot = train_label

In [None]:
# 8-fold cross-validation splitter, shuffled with a fixed seed.
# NOTE(review): plain KFold ignores the labels, whereas the train_test_split hold-out is stratified;
# consider StratifiedKFold if per-fold class balance matters.
kf = KFold(n_splits=8, random_state=76, shuffle=True)

In [None]:
kf.get_n_splits(X_tot)

In [None]:
# Materialise every fold as a [train_names, val_names] pair.
# Only the file names are split here; Y_tot is not used by this loop.
fold_ = []
for train_idx, val_idx in kf.split(X_tot):
    X_train, X_val = X_tot[train_idx], X_tot[val_idx]
    fold_.append([X_train, X_val])

In [None]:
def split_train_val_test(fname, train_fname, val_fname, test_fname):
    """Name the split that contains `fname`.

    The containers are checked in the order train, val, test and the first
    hit wins. Returns 'train', 'val', 'test', or None when `fname` is in none.
    """
    candidates = (
        ('train', train_fname),
        ('val', val_fname),
        ('test', test_fname),
    )
    for split_name, members in candidates:
        if fname in members:
            return split_name
    return None

In [None]:
# Add one `fold_<k>` column per CV fold, holding 'train' / 'val' / 'test' (or None) for each sample.
# Membership is tested against sets: `name in ndarray` scans the whole array for
# every row, which makes the mapping quadratic in the number of samples.
test_names = set(test_fname)
for idx, X_tv in enumerate(fold_):
    X_t, X_v = X_tv
    train_names, val_names = set(X_t), set(X_v)
    fold_n = 'fold_' + str(idx)
    data_df_da[fold_n] = data_df_da['name'].map(
        lambda x: split_train_val_test(x, train_names, val_names, test_names))

In [None]:
data_df_da.to_csv('/home/ncp/workspace/blocks1/aihub_df_v.KF.csv',index=False)

In [None]:
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2

class AIHUB_GoodOutcomePredDataset(torch.utils.data.Dataset):
    """Dataset of (image tensor, label) pairs for one split of one CV fold.

    Parameters:
        dataset_dir (str) -- folder containing the `dwi` and `adc` sub-folders of .npy volumes
        mask_dir (str) -- folder of .npy masks (paths are collected; see img_loader for their use)
        dataset_df (str) -- path of the CSV holding `name`, `bad_outcome_3m` and `fold_<k>` columns
        img_loader (callable) -- called as img_loader(dwi_path, adc_path, mask_path), returns an array
        fold (int) -- which `fold_<k>` column selects the split
        mode (str) -- value looked up in that column ('train', 'val' or 'test')
    """
    def __init__(self, 
                 dataset_dir,
                 mask_dir,
                 dataset_df, 
                 img_loader=img_loader, 
                 fold = 0,
                 mode='train'
    ):
        self.dataset_dir = dataset_dir
        self.mask_dir = mask_dir
        self.dataset_df = pd.read_csv(dataset_df)
        self.img_loader = img_loader
        self.fold = fold
        self.mode = mode
        # list of [dwi_path, adc_path, mask_path, label]
        self.dataset = get_dataset(self.dataset_df, self.dataset_dir, self.mask_dir, self.fold, self.mode)
        
    def __getitem__(self, index):
        """Load sample `index` and return (torch.Tensor image, label)."""
        dwi_path, adc_path, mask_path, label = self.dataset[index]
        
        # Use the loader passed to the constructor; the module-level img_loader
        # was called here before, silently ignoring a custom `img_loader` argument.
        image = self.img_loader(dwi_path, adc_path, mask_path)

        return torch.Tensor(image), label
    
    def __len__(self):
        """Number of samples in this split."""
        return len(self.dataset)

In [None]:
import time
import torch

In [None]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss. `early_stop`
    becomes True after `patience` consecutive calls without improvement.

    Parameters:
        patience (int) -- number of non-improving calls tolerated before stopping
        verbose (bool) -- if True, save_checkpoint() logs a message
        delta (float) -- minimum increase of the score (-val_loss) that counts as improvement
        path (str) -- file written by save_checkpoint()
        trace_func (callable) -- logger used for messages (default: print)
    """
    def __init__(self, 
                 patience=7, 
                 verbose=False, 
                 delta=0, 
                 path='checkpoint.pt', 
                 trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf: the `np.Inf` alias was removed in NumPy 2.0
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Update the patience counter with this epoch's `val_loss`.

        `model` is accepted but not used here: this method does not call
        save_checkpoint(), so no file is written on improvement.
        """
        score = -val_loss  # higher is better
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Write `model.state_dict()` to `self.path` and remember `val_loss` as the minimum."""
        if self.verbose:
            self.trace_func(f'Validation loss decreased {self.val_loss_min:.6f} --> {val_loss:.6f}. Saving Model...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
class AverageMeter(object):
    """Keeps the most recent value of a metric and its running weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, the average, the weighted sum and the sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed over `n` samples and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
class FocalLoss(torch.nn.Module):
    """Focal loss on raw logits: -(1 - p_t)^gamma * alpha_t * log(p_t).

    Parameters:
        gamma (float) -- focusing exponent; 0 with alpha=None reduces to cross-entropy
        alpha (None | float | int | list) -- per-class weights. A number `a` becomes
            [a, 1-a] (class 0, class 1); a list is used as given; None disables weighting
        size_average (bool) -- True: return the mean over samples, False: the sum
    """

    def __init__(self, gamma=0, alpha=None, size_average=True):
        super().__init__()
        self.gamma = gamma
        if isinstance(alpha, (float, int)):
            alpha = torch.Tensor([alpha, 1-alpha])
        elif isinstance(alpha, list):
            alpha = torch.Tensor(alpha)
        self.alpha = alpha
        self.size_average = size_average

    def forward(self, input, target):
        """Return the reduced focal loss for logits `input` and class indices `target`."""
        if input.dim() > 2:
            # (N, C, ...) -> (N * ..., C): one row of class logits per element
            n_batch, n_class = input.size(0), input.size(1)
            input = input.view(n_batch, n_class, -1).transpose(1, 2).contiguous().view(-1, n_class)
        target = target.view(-1, 1)

        # log-probability of the true class of every row
        log_prob = F.log_softmax(input, -1).gather(1, target).view(-1)
        # detached on purpose: the modulating factor below carries no gradient
        prob = log_prob.detach().exp()

        if self.alpha is not None:
            # keep the weight tensor's type in step with the logits
            if self.alpha.type() != input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            class_weight = self.alpha.gather(0, target.data.view(-1))
            log_prob = log_prob * class_weight

        loss = -1 * (1 - prob) ** self.gamma * log_prob
        return loss.mean() if self.size_average else loss.sum()

In [None]:
# Focal-loss instance. NOTE: train_epoch/test_epoch compute cross_entropy directly and only
# mention `criterion` in a commented-out line, so this object is not used by the training loop.
criterion = FocalLoss(alpha=0.25, gamma=2)

In [None]:
def train_epoch(model, loader, optimizer, epoch, n_epochs, print_freq=100):
    """Run one optimisation pass over `loader` with a cross-entropy loss.

    Parameters:
        model -- network returning per-class logits of shape (batch, classes)
        loader -- iterable of (input, target) batches
        optimizer -- optimizer stepped once per batch
        epoch (int), n_epochs (int) -- only used in the progress line
        print_freq (int) -- print a progress line every `print_freq` batches

    Return:
        (average batch time, average loss, average error rate) over the epoch
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    error = AverageMeter()
    
    # Move the model only when CUDA exists, matching the guarded move of the data
    # below; an unconditional model.cuda() fails on a CPU-only host.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    model.train()
    
    end = time.time()
    for batch_idx, (input, target) in enumerate(loader):
        if use_cuda:
            input = input.cuda()
            target = target.cuda()
        
        output = model(input)
        loss = torch.nn.functional.cross_entropy(output, target)
        #loss = criterion(output, target)
        batch_size = target.size(0)
        # top-1 prediction, shape (batch, 1); squeeze(1) keeps a batch axis even for batch size 1
        _, pred = output.data.cpu().topk(1, dim=1)
        error.update(torch.ne(pred.squeeze(1), target.cpu()).float().sum().item() / batch_size, batch_size)
        losses.update(loss.item(), batch_size)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        batch_time.update(time.time() - end)
        end = time.time()
        
        if batch_idx % print_freq == 0:
            res = '\t'.join([
                'Epoch: [%d/%d]' % (epoch+1, n_epochs),
                'Iter: [%d/%d]' % (batch_idx+1, len(loader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                'Error %.4f (%.4f)' % (error.val, error.avg),
            ])
            print(res)
        
    return batch_time.avg, losses.avg, error.avg

In [None]:
def test_epoch(model, loader, print_freq=5, is_test=True):
    """Evaluate `model` on `loader` without gradient tracking.

    Parameters:
        model -- network returning per-class logits of shape (batch, classes)
        loader -- iterable of (input, target) batches
        print_freq (int) -- print a progress line every `print_freq` batches
        is_test (bool) -- only selects the 'Test:' / 'Valid' prefix of the progress line

    Return:
        (average batch time, average cross-entropy loss, average error rate)
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    error = AverageMeter()
    
    # Move the model only when CUDA exists, matching the guarded move of the data
    # below; an unconditional model.cuda() fails on a CPU-only host.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    model.eval()
    
    end = time.time()
    with torch.no_grad():
        for batch_idx, (input, target) in enumerate(loader):
            if use_cuda:
                input = input.cuda()
                target = target.cuda()
        
            output = model(input)
            loss = torch.nn.functional.cross_entropy(output, target)
            #loss = criterion(output, target)
        
            batch_size = target.size(0)
            # top-1 prediction, shape (batch, 1); squeeze(1) keeps a batch axis even for batch size 1
            _, pred = output.data.cpu().topk(1, dim=1)
            error.update(torch.ne(pred.squeeze(1), target.cpu()).float().sum().item() / batch_size, batch_size)
            losses.update(loss.item(), batch_size)
        
            batch_time.update(time.time() - end)
            end = time.time()
        
            if batch_idx % print_freq == 0:
                res = '\t'.join([
                    'Test:' if is_test else 'Valid',
                    'Iter: [%d/%d]' % (batch_idx+1, len(loader)),
                    'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                    'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                    'Error %.4f (%.4f)' % (error.val, error.avg),
                ])
                print(res)
        
        return batch_time.avg, losses.avg, error.avg

In [None]:
def train(model, train_set, valid_set, test_set, save, n_epochs=300,
         batch_size=64, lr=0.0001, patience=10, save_epoch=10, seed=None):
    """Train `model` with Adam, early stopping and a step LR schedule, then test it.

    Parameters:
        model -- network to train (wrapped in DataParallel when several GPUs are visible)
        train_set, valid_set, test_set -- datasets; `valid_set` may be None, in which case
            the test loader is used for the per-epoch evaluation and no best model is saved
        save (str) -- existing directory receiving results.csv and the .dat checkpoints
        n_epochs (int) -- maximum number of epochs
        batch_size (int), lr (float) -- loader batch size and Adam learning rate
        patience (int) -- EarlyStopping patience, measured on the per-epoch evaluation loss
        save_epoch (int) -- period of the (currently disabled) periodic checkpoint
        seed (int | None) -- if given, passed to torch.manual_seed

    Side effects:
        writes `results.csv`, `model_best.dat`, `model_epochNNN.dat` and `model_final.dat` under `save`.
    """
    cnt=0
    if seed is not None:
        torch.manual_seed(seed)
        
    train_loader = torch.utils.data.DataLoader(train_set,
                                              batch_size=batch_size, drop_last=True, shuffle=True,
                                              pin_memory=(torch.cuda.is_available()), num_workers=0)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=batch_size, shuffle=False,
                                              pin_memory=(torch.cuda.is_available()), num_workers=0)
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    if valid_set is None:
        valid_loader = None
    else:
        valid_loader = torch.utils.data.DataLoader(valid_set,
                                                  batch_size=batch_size, shuffle=False,
                                                  pin_memory=(torch.cuda.is_available()), num_workers=0)
    if torch.cuda.is_available():
        model = model.cuda()
    
    model_wrapper = model
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        model_wrapper = torch.nn.DataParallel(model).cuda()
    
    optimizer = torch.optim.Adam(model_wrapper.parameters(), lr=lr)
    # Milestones must be integer epoch indices: float values such as 0.5*n_epochs never
    # match the scheduler's integer epoch counter when they are not whole numbers.
    # Zero is dropped so a tiny n_epochs cannot decay the LR before training starts.
    milestones = sorted({m for m in (int(0.5*n_epochs), int(0.75*n_epochs)) if m > 0})
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones,
                                                    gamma = 0.1)
    
    with open(os.path.join(save, 'results.csv'), 'w') as f:
        f.write('epoch,train_loss,train_error,valid_loss,valid_error,test_error\n')
    
    best_error = 1
    for epoch in range(n_epochs):
        _, train_loss, train_error = train_epoch(
            model=model_wrapper, 
            loader=train_loader, 
            optimizer=optimizer, 
            epoch=epoch, 
            n_epochs=n_epochs,
        )
        scheduler.step()
        # Falls back to the test loader when there is no validation loader.
        _, valid_loss, valid_error = test_epoch(
            model=model_wrapper, 
            loader=valid_loader if valid_loader else test_loader, 
            is_test=(not valid_loader)
        )
        
        if valid_loader:
            if valid_error < best_error:
                best_error = valid_error
                print('New best error: %.4f' % best_error)
                torch.save(model.state_dict(), os.path.join(save, 'model_best.dat'))
                torch.save(model.state_dict(), os.path.join(save, 'model_epoch'+str(cnt).zfill(3)+'.dat'))
        else:
            if (cnt%save_epoch==0):
                #torch.save(model.state_dict(), os.path.join(save, 'model_epoch'+str(cnt).zfill(3)+'dat'))
                pass
        
        # The test_error column stays empty for per-epoch rows; it is filled by the final row.
        with open(os.path.join(save, 'results.csv'), 'a') as f:
            f.write('%04d,%0.6f,%0.6f,%0.5f,%0.5f,\n' % (
                (epoch+1), 
                train_loss, 
                train_error, 
                valid_loss, 
                valid_error
            ))
        cnt += 1
        
        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print('Early stopping')
            break
    
    torch.save(model.state_dict(), os.path.join(save, 'model_final.dat'))
    
    # NOTE(review): the final test below uses the last-epoch weights (model_final.dat),
    # not model_best.dat — confirm this is the intended model to report.
    model.load_state_dict(torch.load(os.path.join(save, 'model_final.dat')))
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model).cuda()
    test_results = test_epoch(
        model=model, 
        loader=test_loader, 
        is_test=True
    )
    
    _, _, test_error = test_results
    with open(os.path.join(save, 'results.csv'), 'a') as f:
        f.write(',,,,,%0.5f\n' % (test_error))
    print('Final test error: %.4f' % test_error)

In [None]:
def demo(save, 
         model, 
         n_epochs=300, 
         batch_size=64, 
         lr=0.0001, 
         patience=10, 
         fold=0,
         seed=None):
    """Build the train/val/test datasets of one fold and run train() on `model`.

    `save` is created if missing; an Exception is raised when it exists but is
    not a directory. The remaining arguments are forwarded to train().
    """
    # All three datasets share the same locations and fold; only `mode` differs.
    shared_kwargs = dict(
        dataset_dir='/home/ncp/workspace/blocks1/dicom_to_np_2dnorm_resample',
        mask_dir='/home/ncp/workspace/blocks1/refined_mask_resample',
        dataset_df='/home/ncp/workspace/blocks1/aihub_df_v.KF.csv',
        fold=fold,
    )
    train_dataset, val_dataset, test_dataset = (
        AIHUB_GoodOutcomePredDataset(mode=split, **shared_kwargs)
        for split in ('train', 'val', 'test')
    )

    print('Total parameters: ', sum(p.numel() for p in model.parameters()))

    if not os.path.exists(save):
        os.makedirs(save)
    if not os.path.isdir(save):
        raise Exception('%s is not a dir' % save)

    train(model=model,
          train_set=train_dataset,
          valid_set=val_dataset,
          test_set=test_dataset,
          save=save,
          n_epochs=n_epochs,
          batch_size=batch_size,
          lr=lr,
          patience=patience,
          seed=seed)
    print('Done!')

In [None]:
# Global device read by test_acc() when it moves input batches.
# NOTE(review): train_epoch/test_epoch/train place tensors with .cuda() and never read `device`,
# so GPU_NUM does not select the GPU used for training.
GPU_NUM = 1
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

In [None]:
from models import resnet, wide_resnet, resnext, densenet

In [None]:
# Output directory and hyper-parameters for the fold-3 DenseNet-169 run.
save_path = './DWIADC/KFold/Fold3/3DDenseNet169d_new'
gen_new_dir(save_path)
N_EPOCHS = 10000  # upper bound; train() also stops through EarlyStopping(patience=PATIENCE)
BATCH_SIZE = 4
LR = 0.00001
PATIENCE = 10

model = densenet.densenet169(
                num_classes=2,
                spatial_size=256,
                sample_duration=20)

# Replace the first convolution so the network takes 2 input channels
# (img_loader stacks the DWI and ADC volumes along the first axis).
model.features.conv0 = torch.nn.Conv3d(2, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False)

In [None]:
demo(save=save_path, 
     model=model, 
     n_epochs=N_EPOCHS, 
     batch_size=BATCH_SIZE, 
     lr=LR, 
     patience=PATIENCE, 
     fold = 3,
     seed=None)

In [None]:
from tqdm import tqdm

In [None]:
def test_acc(testloader, model, threshold=0.5, device=None):
    """Run `model` over `testloader` and collect accuracy, probabilities, labels and predictions.

    Parameters:
        testloader -- iterable of (images, labels) batches
        model -- network returning per-class logits of shape (batch, classes)
        threshold (float) -- accepted for backward compatibility; not used (argmax decides the class)
        device (torch.device | None) -- where model and batches are placed;
            None selects CUDA when available, otherwise the CPU

    Return:
        acc (float) -- fraction of correct argmax predictions
        output_arr (ndarray, float64) -- softmax probabilities, one row per sample
        label_arr (ndarray, float64) -- ground-truth labels
        pred_arr (ndarray, float64) -- argmax predictions

    Raises:
        ZeroDivisionError -- if `testloader` yields no samples
    """
    # Model and data must live on the same device. The previous version sent the model
    # to .cuda() and the batches to a global `device`, which could disagree, and it
    # failed outright on a CPU-only host.
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    correct = 0
    total = 0
    # Per-batch results are gathered in lists and concatenated once at the end.
    output_batches, label_batches, pred_batches = [], [], []

    with torch.no_grad():
        for images, labels in tqdm(testloader):
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            output_batches.append(outputs.softmax(1).cpu().numpy())
            label_batches.append(labels.cpu().numpy())
            pred_batches.append(predicted.cpu().numpy())

    acc = correct / total
    # float64 matches the dtype the arrays had when they were grown from float seeds
    output_arr = np.concatenate(output_batches, axis=0).astype(np.float64)
    label_arr = np.concatenate(label_batches, axis=0).astype(np.float64)
    pred_arr = np.concatenate(pred_batches, axis=0).astype(np.float64)
    print('Accuracy on the test images: ', (100*correct/total))
    return acc, output_arr, label_arr, pred_arr

In [None]:
# Rebuild the network used for evaluation and load the best checkpoint of this fold.
fold = 3
save_path = './DWIADC/KFold/Fold3/3DDenseNet169d_new'
# NOTE(review): sample_duration=18 here, but the training cell builds the model with
# sample_duration=20 — confirm the difference is harmless for this densenet implementation.
test_model = densenet.densenet169(
                num_classes=2,
                spatial_size=256,
                sample_duration=18)

# Same 2-channel first convolution as the trained model, so the state dict keys and shapes line up.
test_model.features.conv0 = torch.nn.Conv3d(2, 64, kernel_size=(7, 7, 7), stride=(1, 2, 2), padding=(3, 3, 3), bias=False)
#model.features.norm5 = torch.nn.BatchNorm3d(1920, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
#test_model.classifier = torch.nn.Linear(in_features=7680, out_features=2, bias=True)
# model_best.dat is written by train() whenever the validation error improves.
test_model.load_state_dict(torch.load(os.path.join(save_path, 'model_best.dat')))

In [None]:
# NOTE: despite the names, `test_dataset`/`test_loader` hold the TRAIN split here (mode='train').
# The DL outputs computed from them become features for the random-forest ensembles.
# Both names are rebound to the real test split in a later cell.
test_dataset = AIHUB_GoodOutcomePredDataset(
        dataset_dir='/home/ncp/workspace/blocks1/dicom_to_np_2dnorm_resample',
        mask_dir='/home/ncp/workspace/blocks1/refined_mask_resample',
        dataset_df='/home/ncp/workspace/blocks1/aihub_df_v.KF.csv',
        fold = fold, 
        mode='train')

# shuffle=False keeps the loader order equal to test_dataset.dataset order
test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=8, shuffle=False,
                                              pin_memory=(torch.cuda.is_available()), num_workers=0)

In [None]:
GPU_NUM = 0
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

In [None]:
train_acc, train_output_arr, train_label_arr, train_pred_arr = test_acc(test_loader, test_model)

In [None]:
import sklearn.metrics as metrics

In [None]:
# ROC analysis of the DL model's class-1 probability on the train split.
fpr, tpr, thresholds = metrics.roc_curve(train_label_arr==1, train_output_arr[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)  # operating point maximising sensitivity + specificity - 1
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = metrics.roc_curve(train_label_arr==i, train_output_arr[:,i])
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])
roc_auc[0]

In [None]:
plt.figure(figsize=(8,8))
plt.plot(fpr[0], tpr[0])

In [None]:
tabular_df = pd.read_csv('/home/ncp/workspace/AIHUB_dataset/df_csv_merged_v2.1.1.csv')

In [None]:
pred_lesion_area_df = pd.read_csv('/home/ncp/workspace/blocks2/pred_lesion_area_df_og.csv')

In [None]:
# Gather, in dataset order, the clinical variables and the predicted lesion area of every sample.
tabular_info_arr = []
lesion_info_arr = []
label_arr = []
for f_path, _, _, label in test_dataset.dataset:
    # sample name = DWI file name without extension
    fname = os.path.splitext(os.path.basename(f_path))[0]
    # assumes exactly one row per name in tabular_df (gives a (1, 4) array) — TODO confirm
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END']].values
    # .values[0] raises IndexError when the name is missing from pred_lesion_area_df
    lesion_area_info = pred_lesion_area_df[pred_lesion_area_df.name == fname].pred_lesion_area.values[0] #*100000
    tabular_info_arr.append(tabular_info)
    lesion_info_arr.append(lesion_area_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
lesion_info_arr = np.array(lesion_info_arr)
label_arr = np.array(label_arr)

In [None]:
X_train = np.hstack([np.squeeze(tabular_info_arr), train_output_arr[:,1][:,np.newaxis], lesion_info_arr[:,np.newaxis]])

In [None]:
Y_train = train_label_arr

In [None]:
XY_train_df = pd.DataFrame(np.hstack([X_train, Y_train[:,np.newaxis]]))

In [None]:
XY_train_df.dropna(axis=0, inplace=True, how='any')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# "ensemble3": random forest on all six feature columns of XY_train_df.
# Column layout (from the hstack that builds X_train): 0-3 = pre_good_mrs, age_cate, ini_nih, END;
# 4 = DL class-1 probability; 5 = predicted lesion area; 6 = label.
# NOTE: this rebinds `model`, which held the DenseNet in earlier cells.
model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=17)
model.fit(XY_train_df.iloc[:,:6], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
import pickle

In [None]:
with open(os.path.join(save_path, 'ensemble3.pkl'), 'wb') as f:
    pickle.dump(model, f)

In [None]:
# model = LogisticRegression()
# model.fit(XY_train_df.iloc[:,:7], XY_train_df.iloc[:,7])

In [None]:
############################################################################

In [None]:
val_dataset = AIHUB_GoodOutcomePredDataset(
        dataset_dir='/home/ncp/workspace/blocks1/dicom_to_np_2dnorm_resample',
        mask_dir='/home/ncp/workspace/blocks1/refined_mask_resample',
        dataset_df='/home/ncp/workspace/blocks1/aihub_df_v.KF.csv',
        fold = fold, 
        mode='val')

val_loader = torch.utils.data.DataLoader(val_dataset,
                                              batch_size=8, shuffle=False,
                                              pin_memory=(torch.cuda.is_available()), num_workers=0)

In [None]:
GPU_NUM = 0
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

In [None]:
val_acc, val_output_arr, val_label_arr, val_pred_arr = test_acc(val_loader, test_model)

In [None]:
tabular_info_arr = []
lesion_info_arr = []
label_arr = []
for f_path, _, _, label in val_dataset.dataset:
    fname = os.path.splitext(os.path.basename(f_path))[0]
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END']].values
    lesion_area_info = pred_lesion_area_df[pred_lesion_area_df.name == fname].pred_lesion_area.values[0] #*100000
    tabular_info_arr.append(tabular_info)
    lesion_info_arr.append(lesion_area_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
lesion_info_arr = np.array(lesion_info_arr)
label_arr = np.array(label_arr)

In [None]:
X_val = np.hstack([np.squeeze(tabular_info_arr), val_output_arr[:,1][:,np.newaxis], lesion_info_arr[:,np.newaxis]])

In [None]:
Y_val = val_label_arr

In [None]:
XY_val_df = pd.DataFrame(np.hstack([X_val, Y_val[:,np.newaxis]]))

In [None]:
XY_val_df.dropna(axis=0, inplace=True, how='any')

In [None]:
cldlvol_val_out_proba = model.predict_proba(XY_val_df.iloc[:,:6])

In [None]:
Y_label = XY_val_df.iloc[:,6].values

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(Y_label==1, cldlvol_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(val_label_arr==1, val_output_arr[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
###############################

In [None]:
test_dataset = AIHUB_GoodOutcomePredDataset(
        dataset_dir='/home/ncp/workspace/blocks1/dicom_to_np_2dnorm_resample',
        mask_dir='/home/ncp/workspace/blocks1/refined_mask_resample',
        dataset_df='/home/ncp/workspace/blocks1/aihub_df_v.KF.csv',
        fold = fold, 
        mode='test')

test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=8, shuffle=False,
                                              pin_memory=(torch.cuda.is_available()), num_workers=0)

In [None]:
GPU_NUM = 0
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

In [None]:
tes_acc, test_output_arr, test_label_arr, test_pred_arr = test_acc(test_loader, test_model)

In [None]:
tabular_info_arr = []
lesion_info_arr = []
label_arr = []
for f_path, _, _, label in test_dataset.dataset:
    fname = os.path.splitext(os.path.basename(f_path))[0]
    tabular_info = tabular_df[tabular_df.name == fname][['pre_good_mrs', 'age_cate', 'ini_nih', 'END']].values
    lesion_area_info = pred_lesion_area_df[pred_lesion_area_df.name == fname].pred_lesion_area.values[0] #*10000
    tabular_info_arr.append(tabular_info)
    lesion_info_arr.append(lesion_area_info)
    label_arr.append(label)
tabular_info_arr = np.array(tabular_info_arr)
lesion_info_arr = np.array(lesion_info_arr)
label_arr = np.array(label_arr)

In [None]:
X_test = np.hstack([np.squeeze(tabular_info_arr), test_output_arr[:,1][:,np.newaxis], lesion_info_arr[:,np.newaxis]])

In [None]:
Y_test = test_label_arr

In [None]:
XY_test_df = pd.DataFrame(np.hstack([X_test, Y_test[:,np.newaxis]]))

In [None]:
XY_test_df.dropna(axis=0, inplace=True, how='any')

In [None]:
cldlvol_test_out_proba = model.predict_proba(XY_test_df.iloc[:,:6])

In [None]:
Y_label = XY_test_df.iloc[:,6].values

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(Y_label==1, cldlvol_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(test_label_arr==1, test_output_arr[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
model_cd = RandomForestClassifier(n_estimators=500,max_depth=5, random_state=17)
model_cd.fit(XY_train_df.iloc[:,:4], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
cl_val_out_proba = model_cd.predict_proba(XY_val_df.iloc[:,:4])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_val_df.iloc[:,6].values==1, cl_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
cl_test_out_proba = model_cd.predict_proba(XY_test_df.iloc[:,:4])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_test_df.iloc[:,6].values==1, cl_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
with open(os.path.join(save_path, 'clinical_data.pkl'), 'wb') as f:
    pickle.dump(model_cd, f)

In [None]:
model_cd = RandomForestClassifier(n_estimators=500,max_depth=5, random_state=17)
model_cd.fit(XY_train_df.iloc[:,:5], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
cldl_val_out_proba = model_cd.predict_proba(XY_val_df.iloc[:,:5])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_val_df.iloc[:,6].values==1, cldl_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
cldl_test_out_proba = model_cd.predict_proba(XY_test_df.iloc[:,:5])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_test_df.iloc[:,6].values==1, cldl_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
with open(os.path.join(save_path, 'ensemble1.pkl'), 'wb') as f:
    pickle.dump(model_cd, f)

In [None]:
model_cd = RandomForestClassifier(n_estimators=500,max_depth=5, random_state=17)
model_cd.fit(XY_train_df[[0,1,2,3,5]], XY_train_df.iloc[:,6])#XY_train_df.iloc[:,:7]

In [None]:
cl_vol_val_out_proba = model_cd.predict_proba(XY_val_df[[0,1,2,3,5]])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_val_df.iloc[:,6].values==1, cl_vol_val_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
cl_vol_test_out_proba = model_cd.predict_proba(XY_test_df[[0,1,2,3,5]])

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(XY_test_df.iloc[:,6].values==1, cl_vol_test_out_proba[:,1])
J = tpr - fpr # Youden's J statistic
idx = np.argmax(J)
best_thresh = thresholds[idx]
roc_auc = metrics.auc(fpr, tpr)
sens, spec = tpr[idx], 1-fpr[idx]
print(f"ROCAUC:\t\t\t\t\t{roc_auc}")
print(f"Best threshold(Youden's J statistic):\t{best_thresh}")
print(f"Sensitivity:\t\t\t\t{sens}")
print(f"Specificity:\t\t\t\t{spec}")

In [None]:
with open(os.path.join(save_path, 'ensemble2.pkl'), 'wb') as f:
    pickle.dump(model_cd, f)

In [None]:
val_f_path = [os.path.splitext(os.path.basename(f_path))[0] for f_path, _, _, _ in val_dataset.dataset]

In [None]:
# The random-forest outputs only cover the rows that survived XY_val_df.dropna(),
# while val_f_path and val_output_arr cover every validation sample. Select the
# surviving rows (dropna keeps the original positional index labels) so all
# columns have the same length and refer to the same samples.
val_kept = XY_val_df.index.to_numpy()
val_prob = np.stack([np.asarray(val_f_path)[val_kept], 
                     cl_val_out_proba[:,1], 
                     val_output_arr[val_kept,1], 
                     cldl_val_out_proba[:,1], 
                     cl_vol_val_out_proba[:,1], 
                     cldlvol_val_out_proba[:,1], 
                     XY_val_df.iloc[:,6].values], axis=-1)

In [None]:
val_prob_df = pd.DataFrame(val_prob, 
                           columns=['name', 
                                    'clinical_data', 
                                    'image_DL', 
                                    'ensemble1', 
                                    'ensemble2', 
                                    'ensemble3', 
                                    'bad_outcome_3m'])
val_prob_df.to_csv(os.path.join(save_path, 'val_prob.csv'))

In [None]:
test_f_path = [os.path.splitext(os.path.basename(f_path))[0] for f_path, _, _, _ in test_dataset.dataset]

In [None]:
# The random-forest outputs only cover the rows that survived XY_test_df.dropna(),
# while test_f_path and test_output_arr cover every test sample. Select the
# surviving rows (dropna keeps the original positional index labels) so all
# columns have the same length and refer to the same samples.
test_kept = XY_test_df.index.to_numpy()
test_prob = np.stack([np.asarray(test_f_path)[test_kept], 
                     cl_test_out_proba[:,1], 
                     test_output_arr[test_kept,1], 
                     cldl_test_out_proba[:,1], 
                     cl_vol_test_out_proba[:,1], 
                     cldlvol_test_out_proba[:,1], 
                     XY_test_df.iloc[:,6].values], axis=-1)

In [None]:
test_prob_df = pd.DataFrame(test_prob, 
                           columns=['name', 
                                    'clinical_data', 
                                    'image_DL', 
                                    'ensemble1', 
                                    'ensemble2', 
                                    'ensemble3', 
                                    'bad_outcome_3m'])
test_prob_df.to_csv(os.path.join(save_path, 'test_prob.csv'))

In [None]:
import scipy.stats as st

In [None]:
def auc(X, Y):
    """Average kernel(x, y) over every pair formed from X and Y.

    Parameters:
        X (sequence) -- scores of the first group
        Y (sequence) -- scores of the second group

    Return:
        (float) -- sum of kernel(x, y) over all pairs, scaled by 1/(len(X)*len(Y))
    """
    # Scale is evaluated first, so empty input fails on the division.
    scale = 1/(len(X)*len(Y))
    pair_scores = (kernel(x, y) for x in X for y in Y)
    return scale*sum(pair_scores)

def kernel(X, Y):
    """Score a single pair of values.

    Parameters:
        X -- first value
        Y -- second value

    Return:
        0.5 when the two values are equal, 1 when Y is smaller than X,
        0 otherwise
    """
    if Y == X:
        return .5
    return int(Y < X)

def structural_components(X, Y):
    """Compute per-element mean kernel scores for both groups.

    Parameters:
        X (sequence) -- scores of the first group
        Y (sequence) -- scores of the second group

    Return:
        V10 (list) -- for each x in X, the mean of kernel(x, y) over all y in Y
        V01 (list) -- for each y in Y, the mean of kernel(x, y) over all x in X
    """
    V10 = []
    for x in X:
        V10.append(1/len(Y) * sum(kernel(x, y) for y in Y))

    V01 = []
    for y in Y:
        V01.append(1/len(X) * sum(kernel(x, y) for x in X))

    return V10, V01
    
def get_S_entry(V_A, V_B, auc_A, auc_B):
    """Sample covariance between two lists of components around their AUCs.

    Parameters:
        V_A (list) -- components of model A
        V_B (list) -- components of model B, paired element-wise with V_A
        auc_A (float) -- value subtracted from every element of V_A
        auc_B (float) -- value subtracted from every element of V_B

    Return:
        (float) -- sum of (a - auc_A) * (b - auc_B) over the pairs, scaled by
                   1/(len(V_A) - 1)
    """
    scale = 1/(len(V_A)-1)
    cross_products = ((a-auc_A)*(b-auc_B) for a, b in zip(V_A, V_B))
    return scale * sum(cross_products)

def z_score(var_A, var_B, covar_AB, auc_A, auc_B):
    """z statistic for the difference between two AUCs.

    Parameters:
        var_A (float) -- variance of auc_A
        var_B (float) -- variance of auc_B
        covar_AB (float) -- covariance between auc_A and auc_B
        auc_A (float) -- AUC of model A
        auc_B (float) -- AUC of model B

    Return:
        (auc_A - auc_B) divided by the square root of
        var_A + var_B - 2*covar_AB
    """
    difference_variance = var_A + var_B - 2*covar_AB
    standard_error = difference_variance**(.5)
    return (auc_A - auc_B) / standard_error

def group_preds_by_label(preds, actual):
    """Split predictions into two lists by the truthiness of their label.

    Parameters:
        preds (iterable) -- predicted scores
        actual (iterable) -- labels paired element-wise with preds

    Return:
        X (list) -- predictions whose label is truthy
        Y (list) -- predictions whose label is falsy
    """
    X, Y = [], []
    # One pass over the pairs: zipping twice would leave the second group
    # empty when preds/actual are one-shot iterators (generators, iter(...)).
    for p, a in zip(preds, actual):
        (X if a else Y).append(p)
    return X, Y

In [None]:
# [test set] AUC comparison -- A: clinical data / B: ensemble3

In [None]:
# Hand-written ten-sample example: model A outputs a constant 0.5, model B
# outputs varied scores, `actual` holds the 0/1 labels.
# NOTE(review): the next cell reassigns preds_A, preds_B and actual, so these
# values are overwritten when the notebook is run top to bottom.
preds_A = np.array([.5,.5,.5,.5,.5,.5,.5,.5,.5,.5])
preds_B = np.array([.2,.5,.1,.4,.9,.8,.7,.5,.9,.8])
actual = np.array([0,0,0,0,1,0,1,1,1,1])

In [None]:
# Inputs for the comparison on the test split:
#   A      = column 1 of cl_test_out_proba
#   B      = column 1 of en3_test_out_proba
#   actual = column 6 of XY_test_df
# NOTE(review): en3_test_out_proba is not defined in this part of the
# notebook; the CSV export labels cldlvol_test_out_proba as 'ensemble3' --
# confirm both names refer to the same predictions.
preds_A = cl_test_out_proba[:,1]
preds_B = en3_test_out_proba[:,1]
actual = XY_test_df.iloc[:,6].values

In [None]:
# Split each model's scores into the truthy-label group (X) and the
# falsy-label group (Y).
X_A, Y_A = group_preds_by_label(preds_A, actual)
X_B, Y_B = group_preds_by_label(preds_B, actual)

# Per-sample structural components of each model.
V_A10, V_A01 = structural_components(X_A, Y_A)
V_B10, V_B01 = structural_components(X_B, Y_B)

auc_A = auc(X_A, Y_A)
auc_B = auc(X_B, Y_B)

# Variance of each AUC: covariance of the components with themselves, each
# term divided by the size of its group.
var_A = (get_S_entry(V_A10, V_A10, auc_A, auc_A) / len(V_A10)
         + get_S_entry(V_A01, V_A01, auc_A, auc_A) / len(V_A01))

var_B = (get_S_entry(V_B10, V_B10, auc_B, auc_B) / len(V_B10)
         + get_S_entry(V_B01, V_B01, auc_B, auc_B) / len(V_B01))

# Covariance between the two AUCs, built from the paired components.
covar_AB = (get_S_entry(V_A10, V_B10, auc_A, auc_B) / len(V_A10)
            + get_S_entry(V_A01, V_B01, auc_A, auc_B) / len(V_A01))

z = z_score(var_A, var_B, covar_AB, auc_A, auc_B)
# Two-sided p-value from the standard normal survival function.
p = 2 * st.norm.sf(abs(z))

In [None]:
# Display the z statistic computed in the previous cell.
z

In [None]:
# Display the p-value computed alongside z.
p

In [None]:
# [test set] AUC comparison -- A: DL / B: ensemble3

In [None]:
# Inputs for the comparison on the test split:
#   A      = column 1 of test_output_arr
#   B      = column 1 of en3_test_out_proba
#   actual = column 6 of XY_test_df
# NOTE(review): en3_test_out_proba is not defined in this part of the
# notebook; the CSV export labels cldlvol_test_out_proba as 'ensemble3' --
# confirm both names refer to the same predictions.
preds_A = test_output_arr[:,1]
preds_B = en3_test_out_proba[:,1]
actual = XY_test_df.iloc[:,6].values

In [None]:
# Split each model's scores into the truthy-label group (X) and the
# falsy-label group (Y).
X_A, Y_A = group_preds_by_label(preds_A, actual)
X_B, Y_B = group_preds_by_label(preds_B, actual)

# Per-sample structural components of each model.
V_A10, V_A01 = structural_components(X_A, Y_A)
V_B10, V_B01 = structural_components(X_B, Y_B)

auc_A = auc(X_A, Y_A)
auc_B = auc(X_B, Y_B)

# Variance of each AUC: covariance of the components with themselves, each
# term divided by the size of its group.
var_A = (get_S_entry(V_A10, V_A10, auc_A, auc_A)*1/len(V_A10)
        + get_S_entry(V_A01, V_A01, auc_A, auc_A)*1/len(V_A01))

var_B = (get_S_entry(V_B10, V_B10, auc_B, auc_B)*1/len(V_B10)
        + get_S_entry(V_B01, V_B01, auc_B, auc_B)*1/len(V_B01))

# Covariance between the two AUCs, built from the paired components.
covar_AB = (get_S_entry(V_A10, V_B10, auc_A, auc_B) * 1/len(V_A10)
           + get_S_entry(V_A01, V_B01, auc_A, auc_B) * 1/len(V_A01))

z = z_score(var_A, var_B, covar_AB, auc_A, auc_B)
# Two-sided p-value, matching the clinical-data vs. ensemble3 comparison;
# without the factor 2 this cell reported a one-sided p-value that is not
# comparable with the other one.
p = st.norm.sf(abs(z))*2

In [None]:
# Display the z statistic computed in the previous cell.
z

In [None]:
# Display the p-value computed alongside z.
p

In [None]:
# FIXME(review): this cell held an unfinished statement, `from sklearn.lin import`,
# which is a SyntaxError and stops Restart & Run All at this point.
# It is disabled here; restore the intended import (full module path and the
# imported names) in the notebook's import cell if a later cell needs it.

In [None]:
# Display the feature importances of `model`.
# NOTE(review): `model` is not defined in this part of the notebook --
# confirm which fitted estimator it refers to.
model.feature_importances_

In [None]:
# Pair the first seven column names of XY_train_df with the model's feature
# importances and collect them into two parallel lists.
# NOTE(review): zip stops at the shorter input, so a length mismatch between
# the column slice and feature_importances_ is silently truncated. Column 6
# of the XY_* frames is used as the label elsewhere in this notebook --
# confirm that `:7` matches the features the model was trained on.
list_column=[]
list_fi=[]
for i,j in zip(XY_train_df.iloc[:,:7].columns,model.feature_importances_):
    list_column.append(i)
    list_fi.append(j)

In [None]:
# Start the importance table with the collected column names as its only column.
df_importance=pd.DataFrame(list_column,columns=['list_column'])

In [None]:
# Display the table as it stands at this point in the notebook.
df_importance

In [None]:
# Add the importance values as a second column (modifies df_importance in place).
df_importance['list_fi']=list_fi

In [None]:
# Display the table again after the 'list_fi' column was added.
df_importance

In [None]:
# Display the rows ordered from the largest to the smallest 'list_fi' value;
# the sorted frame is only shown, not assigned back to df_importance.
df_importance.sort_values('list_fi',ascending=False)