In [1]:
import sys
import re
import gc
sys.path.append("/nfs/nas-7.1/ckwu/mtl-icda-ht")

import json
import pickle
from pathlib import Path
from argparse import Namespace

import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from scipy.stats import ttest_rel
from transformers import AutoTokenizer

from utilities.data import MedicalNERIOBDataset, MedicalDxNERIOBDataset, convert_icds_to_indices, split_by_div
from utilities.model import BertNERModel, BertDxNERModel, encoder_names_mapping
from utilities.utils import move_bert_input_to_device, set_seeds
from utilities.evaluation import predict_whole_set_ner, ids_to_iobs, calc_seqeval_metrics, get_top_k_accuracies, get_evaluations

import seqeval.metrics
from seqeval.scheme import IOB2

  from .autonotebook import tqdm as notebook_tqdm


## k-Fold Cross Validation

### Prerequisite

In [None]:
"""
    Configuration
"""
# Settings are read from a JSON file and exposed as attributes on `args`.
config = json.loads(Path("./dx_ner_config.json").read_bytes())
args = Namespace(**config)
set_seeds(args.seed)

"""
    Data
"""
# NOTE: pickle can execute arbitrary code while loading; only point these
# paths at files from a trusted source.
emrs, icds, ner_spans_l = (
    pickle.loads(Path(pickle_path).read_bytes())
    for pickle_path in (args.emr_path, args.dx_path, args.ner_spans_l_path)
)
icd_ids = convert_icds_to_indices(icds, full_code=args.fc)
# Order matters: the evaluation loop unpacks this list as (emrs, dxs, ners).
data_l = [emrs, icd_ids, ner_spans_l]

"""
    Others
"""
tokenizer = AutoTokenizer.from_pretrained(encoder_names_mapping[args.tokenizer])
# Passed to BertDxNERModel as `dx_label_size` when the checkpoints are rebuilt.
NUM_DX_LABELS = 97

### Eval Loop

In [None]:
# Per-fold seqeval metrics; each list receives exactly one value per fold.
seqeval_d = {
    "token_acc": list(),
    "precision": list(),
    "recall": list(),
    "f1-score": list()
}

for k in range(args.fold):
    print(f"Start evaluating fold = {k}:\n")
    # Config: point the checkpoint path at the current fold. `\d+` replaces the
    # whole fold index, so a multi-digit index already present in the path is
    # substituted cleanly instead of leaving trailing digits behind.
    args.ckpt_path = re.sub(pattern=r"remainder-\d+", repl=f"remainder-{k}", string=args.ckpt_path)

    # Data: take the validation split of every source for this fold.
    valid_emrs, valid_dxs, valid_ners = [split_by_div(data, args.fold, remainder=k, mode="valid") for data in data_l]
    valid_set = MedicalDxNERIOBDataset(valid_emrs, valid_dxs, valid_ners, tokenizer)
    valid_loader = DataLoader(valid_set, args.bs, shuffle=False, pin_memory=True, collate_fn=valid_set.collate_fn)

    # Model: rebuild the architecture, then restore this fold's weights.
    model = BertDxNERModel(
        encoder=encoder_names_mapping[args.encoder],
        dx_label_size=NUM_DX_LABELS,
        ner_label_size=valid_set.num_ner_labels,
        loss_weights=args.lw
    )
    model.load_state_dict(torch.load(args.ckpt_path, map_location=args.device))

    # Evaluation: predict the whole split, convert label ids to IOB tags, score.
    y_pred_raw, y_true_raw = predict_whole_set_ner(model, valid_loader, args.device)
    y_pred, y_true = ids_to_iobs(y_pred_raw, y_true_raw, valid_set)
    token_acc, p, r, f1 = calc_seqeval_metrics(y_true, y_pred)
    # Append by explicit key rather than relying on dict iteration order.
    seqeval_d["token_acc"].append(token_acc)
    seqeval_d["precision"].append(p)
    seqeval_d["recall"].append(r)
    seqeval_d["f1-score"].append(f1)

    # Drop every per-fold object (including valid_dxs) before the next fold is
    # loaded, so only one fold's data and model are alive at a time.
    del valid_emrs, valid_dxs, valid_ners, valid_set, valid_loader, model, y_pred_raw, y_true_raw, y_pred, y_true, token_acc, p, r, f1
    gc.collect()

### Combine Eval Results

In [None]:
# One row per fold, one column per metric.
seqeval_df = pd.DataFrame(seqeval_d)

# Column-wise statistics across folds, each turned into a single labelled row.
# `std` is pandas' default sample standard deviation.
seqeval_mean_df = seqeval_df.mean(axis=0).to_frame(name="mean").T
seqeval_std_df = seqeval_df.std(axis=0).to_frame(name="std").T

# Stack the two rows; as the cell's last expression this table is displayed.
seqeval_mean_std_df = pd.concat(objs=[seqeval_mean_df, seqeval_std_df])
seqeval_mean_std_df

### Save Evaluation Results

In [None]:
# Normalise the configured directory to a Path and create it if necessary.
args.eval_save_dir = Path(args.eval_save_dir)
args.eval_save_dir.mkdir(parents=True, exist_ok=True)

# The file name is prefixed with the encoder name so runs with different
# encoders do not overwrite each other.
summary_csv_path = args.eval_save_dir / f"{args.encoder}_seqeval_mean_std_df.csv"
seqeval_mean_std_df.to_csv(summary_csv_path, index_label="index")

### Load Evaluation Results

In [None]:
# Previously saved mean/std summary to inspect (hard-coded absolute path).
to_load = Path("/nfs/nas-7.1/ckwu/mtl-icda-ht/comparisons/eval_results/dx_ner/BioBERT_seqeval_mean_std_df.csv")

# Restore the "index" column written by the save step as the row index; the
# resulting frame is the cell's displayed output.
pd.read_csv(to_load, index_col="index")

## Compare Lexical Matching with Neural NER

### Load Data

In [2]:
# Load the EMRs and the two span annotation lists that will be compared.
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
emrs, ner_spans_l, ahocora_spans_l = (
    pickle.loads(Path(pickle_path).read_bytes())
    for pickle_path in (
        "/nfs/nas-7.1/ckwu/datasets/emr/6000/emrs_with_annots.pickle",
        "/nfs/nas-7.1/ckwu/datasets/emr/6000/ner_spans_l.pickle",
        "/nfs/nas-7.1/ckwu/datasets/emr/6000/ahocora_spans_l_minlen-3.pickle",
    )
)

# NOTE: this rebinds `emrs`, `ner_spans_l`, `data_l` and `tokenizer` from the
# k-fold section above; here `data_l` is ordered (emrs, ner spans, aho spans).
data_l = [emrs, ner_spans_l, ahocora_spans_l]

tokenizer = AutoTokenizer.from_pretrained(encoder_names_mapping["BERT"])

In [3]:
from tqdm import tqdm

def extract_ner_labels(data_loader):
    """Collect the label part of every batch as plain Python lists.

    Args:
        data_loader: iterable yielding ``(inputs, labels)`` pairs; the inputs
            are ignored and ``labels`` must support ``detach().cpu().tolist()``.

    Returns:
        A list with one entry per batch, each entry being that batch's labels
        converted with ``tolist()``.
    """
    # A progress bar wraps the iteration; only the second item of each pair is used.
    return [labels.detach().cpu().tolist() for _, labels in tqdm(data_loader)]

In [4]:
folds = 10
# Per-fold seqeval metrics; each list receives exactly one value per fold.
seqeval_d = {
    "token_acc": list(),
    "precision": list(),
    "recall": list(),
    "f1-score": list()
}

for k in range(folds):
    print(f"Start evaluating fold = {k}:\n")
    # Data: the same EMRs are paired once with the NER spans (used as the
    # reference) and once with the Aho-Corasick spans (used as the prediction).
    valid_emrs, valid_ners, valid_ahos = [split_by_div(data, folds, remainder=k, mode="valid") for data in data_l]
    ner_set = MedicalNERIOBDataset(valid_emrs, valid_ners, tokenizer)
    aho_set = MedicalNERIOBDataset(valid_emrs, valid_ahos, tokenizer)
    ner_loader = DataLoader(ner_set, batch_size=16, shuffle=False, pin_memory=True, collate_fn=ner_set.collate_fn)
    # Each loader collates with its own dataset's collate_fn (the original
    # passed ner_set.collate_fn here, a copy-paste slip).
    aho_loader = DataLoader(aho_set, batch_size=16, shuffle=False, pin_memory=True, collate_fn=aho_set.collate_fn)

    # Evaluation: no model is involved, the label ids of both datasets are
    # read straight from the loaders and scored against each other.
    y_pred_raw, y_true_raw = extract_ner_labels(aho_loader), extract_ner_labels(ner_loader)
    y_pred, y_true = ids_to_iobs(y_pred_raw, y_true_raw, ner_set)
    token_acc, p, r, f1 = calc_seqeval_metrics(y_true, y_pred)
    # Append by explicit key rather than relying on dict iteration order.
    seqeval_d["token_acc"].append(token_acc)
    seqeval_d["precision"].append(p)
    seqeval_d["recall"].append(r)
    seqeval_d["f1-score"].append(f1)

    # Release every per-fold object before the next fold is built.
    del valid_emrs, valid_ners, valid_ahos, ner_set, aho_set, ner_loader, aho_loader, y_pred_raw, y_true_raw, y_pred, y_true, token_acc, p, r, f1
    gc.collect()

Start evaluating fold = 0:



100%|██████████| 37/37 [00:02<00:00, 18.09it/s]
100%|██████████| 37/37 [00:00<00:00, 90.99it/s]


Start evaluating fold = 1:



100%|██████████| 37/37 [00:00<00:00, 94.94it/s]
100%|██████████| 37/37 [00:00<00:00, 93.92it/s]


Start evaluating fold = 2:



100%|██████████| 37/37 [00:00<00:00, 94.75it/s]
100%|██████████| 37/37 [00:00<00:00, 93.90it/s]


Start evaluating fold = 3:



100%|██████████| 37/37 [00:00<00:00, 94.68it/s]
100%|██████████| 37/37 [00:00<00:00, 93.57it/s]


Start evaluating fold = 4:



100%|██████████| 37/37 [00:00<00:00, 94.03it/s]
100%|██████████| 37/37 [00:00<00:00, 93.15it/s]


Start evaluating fold = 5:



100%|██████████| 37/37 [00:00<00:00, 92.45it/s]
100%|██████████| 37/37 [00:00<00:00, 93.39it/s]


Start evaluating fold = 6:



100%|██████████| 37/37 [00:00<00:00, 93.33it/s]
100%|██████████| 37/37 [00:00<00:00, 94.44it/s]


Start evaluating fold = 7:



100%|██████████| 37/37 [00:00<00:00, 90.19it/s]
100%|██████████| 37/37 [00:00<00:00, 93.56it/s]


Start evaluating fold = 8:



100%|██████████| 37/37 [00:00<00:00, 90.63it/s]
100%|██████████| 37/37 [00:00<00:00, 91.29it/s]


Start evaluating fold = 9:



100%|██████████| 37/37 [00:00<00:00, 91.21it/s]
100%|██████████| 37/37 [00:00<00:00, 90.96it/s]


In [5]:
# Tabulate the per-fold scores: rows are folds, columns are metrics.
seqeval_df = pd.DataFrame(seqeval_d)

# Reduce over folds and keep each statistic as a one-row frame labelled by its
# name (`std` uses pandas' default sample standard deviation).
seqeval_mean_df = seqeval_df.mean(axis=0).to_frame(name="mean").T
seqeval_std_df = seqeval_df.std(axis=0).to_frame(name="std").T

# Summary table with a "mean" row and a "std" row; shown as the cell output.
seqeval_mean_std_df = pd.concat(objs=[seqeval_mean_df, seqeval_std_df])
seqeval_mean_std_df

Unnamed: 0,token_acc,precision,recall,f1-score
mean,0.892088,0.689962,0.649427,0.669059
std,0.001456,0.004565,0.00729,0.004636
