In [1]:
import json
import numpy as np
import math
from collections import defaultdict
from enum import Enum
import csv
import pandas as pd

In [2]:
class Styles:
    """ANSI escape sequences used to colour and style printed text."""
    # Foreground colours.
    VIOLET = "\033[95m"
    BLUE = "\033[94m"
    CYAN = "\033[96m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"
    # Reset sequence; `highlight` appends it to end the styled region.
    END = "\033[0m"
    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
    

def highlight(s, color=Styles.CYAN):
    """Return `s` rendered in bold with the given colour code.

    `s` is converted with str() first. The text is cut at every reset code it
    already contains and the colour + bold prefix is put in front of each
    piece, so a string that was highlighted before is re-styled as a whole.
    The result always ends with a reset code.
    """
    prefix = f"{color}{Styles.BOLD}"
    pieces = str(s).split(Styles.END)
    restyled = Styles.END.join(prefix + piece for piece in pieces)
    return restyled + Styles.END


def highlight_words(s, indexes, color=Styles.CYAN):
    """Highlight the words of `s` found at the given positions.

    `s` is split on single spaces, each position listed in `indexes` is passed
    through `highlight` with `color`, and the words are re-joined with single
    spaces. A position outside the word list raises IndexError.
    """
    words = s.split(" ")
    for position in indexes:
        word = words[position]
        words[position] = highlight(word, color=color)
    return " ".join(words)

def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one list (one level deep)."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [3]:
# Load the MIMIC-CXR metadata CSV; the next cell keeps only the subject/study ids
# and the study date/time columns from it.
mimic_cxr_jpg_df = pd.read_csv('../mimic-cxr-2.0.0-metadata.csv')

In [4]:
# Reduce the metadata to the distinct (subject, study, date, time) rows,
# ordered chronologically within each subject.
mimic_cxr_jpg_df = (
    mimic_cxr_jpg_df[['subject_id', 'study_id', 'StudyDate', 'StudyTime']]
    .drop_duplicates()
    .sort_values(['subject_id', 'StudyDate', 'StudyTime'], ascending=True)
)

# Lookup table: (subject id, study id) as digit strings -> (StudyDate, StudyTime).
# Ids go through int() before str() so that they carry no decimal part.
# If a key occurs on several rows the last one wins, i.e. the latest date/time
# because of the sort above.
mimic_cxr_jpg = {
    (str(int(subject_id)), str(int(study_id))): (study_date, study_time)
    for subject_id, study_id, study_date, study_time in mimic_cxr_jpg_df.values.tolist()
}

In [5]:
class EvaluationType(Enum):
    """Selects which list of classes `print_metrics` reports on."""
    NER = 0        # reported over RADGRAPH_CLASSES
    RELATIONS = 1  # reported over RADGRAPH_RELATION_CLASSES
    TASK_1 = 2     # reported over TASK_1_CLASSES
    TASK_2 = 3     # reported over TASK_2_CLASSES

In [6]:
# Input files. PREDICTIONS_FILE is read below as JSON Lines (one JSON document
# per line). LABELS_FILE is not read by this cell.
PREDICTIONS_FILE = '../../models/dygie-radgraph-base/predictions_test.jsonl'
LABELS_FILE = '../../data/dygie_test.json'

# Entity classes scored by the NER evaluation.
RADGRAPH_CLASSES = ['ANAT-DP', 'OBS-DP', 'OBS-DA', 'OBS-U', 'CHAN-IMP', 'CHAN-WOR', 'CHAN-AP', 'CHAN-DISA', 'CHAN-DISP', 'CHAN-NC']
# Relation classes scored by the relation evaluation.
RADGRAPH_RELATION_CLASSES = ['modify', 'located_at', 'suggestive_of']
# Entity classes that count as "CHANGE" in task 1 ('CHAN-NC' is scored as "NO CHANGE").
CHANGE_CLASSES = ['CHAN-IMP', 'CHAN-WOR', 'CHAN-AP', 'CHAN-DISA', 'CHAN-DISP']
TASK_1_CLASSES = ['NO CHANGE', 'CHANGE']
TASK_2_CLASSES = CHANGE_CLASSES + ['CHAN-NC']

# Read with an explicit encoding so the result does not depend on the platform default.
with open(PREDICTIONS_FILE, 'r', encoding='utf-8') as f:
    lines = f.readlines()
# Blank lines (e.g. a trailing empty line) are skipped: json.loads raises on empty input.
data = [json.loads(line) for line in lines if line.strip()]

In [7]:
def _spans_of_class(tokens, entities, radgraph_class):
    """Return the set of (span text, start, end) for entities tagged `radgraph_class`.

    Each entity is read as (start token index, inclusive end token index,
    class name, ...); the span text is the tokens in that range joined by spaces.
    """
    return {
        (" ".join(tokens[entity[0]:entity[1] + 1]), entity[0], entity[1])
        for entity in entities
        if entity[2] == radgraph_class
    }


# Group the documents by patient, recording for each study the gold and the
# predicted spans of every TASK_2 class.
patient_records = defaultdict(lambda: {'change_mention': False, 'studies': []})
for data_sample in data:
    # doc_key has three '/'-separated parts; the first one is not used.
    _, patient_part, study_part = data_sample['doc_key'].split('/')
    patient_id = patient_part[1:]  # drop the one-character prefix
    study_id = study_part[1:-4]    # drop the prefix and the last four characters (presumably a file extension - confirm)
    if not patient_id:
        continue

    # Raises KeyError when the study is missing from the metadata lookup.
    study_time = mimic_cxr_jpg[(patient_id, study_id)]

    # Only the first element of each field is used (presumably one sentence per document - confirm).
    gold_entities = data_sample['ner'][0]
    predicted_entities = [item[:3] for item in data_sample['predicted_ner'][0]]
    text = " ".join(data_sample['sentences'][0])
    tokens = text.split()

    labels_by_class = {}
    predictions_by_class = {}
    highlight_idxs = set()
    for radgraph_class in TASK_2_CLASSES:
        gold_spans = _spans_of_class(tokens, gold_entities, radgraph_class)
        predicted_spans = _spans_of_class(tokens, predicted_entities, radgraph_class)
        # Every token covered by a gold or predicted span gets highlighted.
        highlight_idxs.update(
            index
            for _, start, end in gold_spans | predicted_spans
            for index in range(start, end + 1)
        )
        labels_by_class[radgraph_class] = gold_spans
        predictions_by_class[radgraph_class] = predicted_spans

    # A study mentions a change when any class has a gold or a predicted span.
    change_mention = any(labels_by_class.values()) or any(predictions_by_class.values())

    patient_record = patient_records[patient_id]
    patient_record['change_mention'] = patient_record['change_mention'] or change_mention
    patient_record['studies'].append({
        "text": text,
        "study_id": study_id,
        "labels": labels_by_class,
        "predictions": predictions_by_class,
        "change_mention": change_mention,
        "time": study_time,
        "highlight_idxs": highlight_idxs,
    })

# Keep patients with at least one change mention and more than one study;
# their studies are ordered by the (date, time) tuple.
patient_studies_dict = {
    patient_id: sorted(record['studies'], key=lambda study: study['time'])
    for patient_id, record in patient_records.items()
    if record['change_mention'] and len(record['studies']) > 1
}

## Patient timelines

In [8]:
# Print each retained patient's studies in stored (chronological) order, with
# the tokens of gold/predicted spans highlighted.
for patient_id, patient_studies in patient_studies_dict.items():
    banner = "###############################################################"
    print(banner)
    print(f"################ PATIENT {patient_id} #########################")
    print(banner)
    for study in patient_studies:
        # Leading/trailing "\n" reproduce the blank lines around each study.
        print(f"\n================== Study {study['study_id']} ==================")
        print(f"Time: {study['time']}")
        print(highlight_words(study['text'], study['highlight_idxs']))
        print(f"Labels: {study['labels']}")
        print(f"Predictions: {study['predictions']}\n")

###############################################################
################ PATIENT 10001884 #########################
###############################################################

Time: (21261103.0, 220052.734)
FINAL REPORT HISTORY : ___ - year - old female with chest tightness . STUDY : PA and lateral chest radiograph . COMPARISON : ___ . FINDINGS : The cardiomediastinal and hilar contours are normal . The lungs are clear . There is no pleural effusion or pneumothorax . IMPRESSION : No acute cardiopulmonary process .
Labels: {'CHAN-IMP': set(), 'CHAN-WOR': set(), 'CHAN-AP': set(), 'CHAN-DISA': set(), 'CHAN-DISP': set(), 'CHAN-NC': set()}
Predictions: {'CHAN-IMP': set(), 'CHAN-WOR': set(), 'CHAN-AP': set(), 'CHAN-DISA': set(), 'CHAN-DISP': set(), 'CHAN-NC': set()}


Time: (21270724.0, 161830.718)
FINAL REPORT INDICATION : Intermittent chest pain , evaluate for pneumonia . COMPARISON : Chest radiograph from ___ and CT chest from ___ . CHEST , PA AND LATERAL : Mild pleural paren

In [9]:
def compute_metrics(relations):
    """Count exact-match hits and misses per class over the global `data`.

    Args:
        relations: truthy to score relations ('relations' vs
            'predicted_relations', first 5 fields of each prediction, classes
            from RADGRAPH_RELATION_CLASSES); falsy to score entities ('ner' vs
            'predicted_ner', first 3 fields, classes from RADGRAPH_CLASSES).

    Returns:
        defaultdict mapping class name to a dict with the counters 'tps',
        'fps', 'fns', 'total_actual' and 'total_predicted'.

    Only the first element of each field is scored, and an item counts as a
    true positive only when the whole tuple equals a gold tuple.
    """
    if relations:
        gold_key, predicted_key, width = 'relations', 'predicted_relations', 5
        radgraph_classes = RADGRAPH_RELATION_CLASSES
    else:
        gold_key, predicted_key, width = 'ner', 'predicted_ner', 3
        radgraph_classes = RADGRAPH_CLASSES
    class_position = width - 1  # the class name is the last compared field

    metrics_dict = defaultdict(lambda: {'tps': 0, 'fps': 0, 'fns': 0, 'total_actual': 0, 'total_predicted': 0})
    for data_sample in data:
        gold_items = data_sample[gold_key][0]
        # Predictions carry extra trailing fields; keep only the compared ones.
        predicted_items = [item[:width] for item in data_sample[predicted_key][0]]

        for radgraph_class in radgraph_classes:
            gold = {tuple(item) for item in gold_items if item[class_position] == radgraph_class}
            predicted = {tuple(item) for item in predicted_items if item[class_position] == radgraph_class}
            counts = metrics_dict[radgraph_class]
            counts['tps'] += len(gold & predicted)
            counts['fps'] += len(predicted - gold)
            counts['fns'] += len(gold - predicted)
            counts['total_actual'] += len(gold)
            counts['total_predicted'] += len(predicted)

    return metrics_dict

def compute_metrics_task_1():
    """Count document-level hits and misses for 'NO CHANGE' and 'CHANGE' over the global `data`.

    A document is a gold (resp. predicted) 'NO CHANGE' when one of its gold
    (resp. predicted) entities is tagged 'CHAN-NC', and a 'CHANGE' when one is
    tagged with any class of CHANGE_CLASSES. The two are scored independently.

    Returns:
        defaultdict mapping class name to a dict with the counters 'tps',
        'fps', 'fns', 'total_actual' and 'total_predicted'.
    """
    metrics_dict = defaultdict(lambda: {'tps': 0, 'fps': 0, 'fns': 0, 'total_actual': 0, 'total_predicted': 0})

    def record(class_name, actual, predicted):
        # Add one document's binary outcome to the counters of `class_name`.
        counts = metrics_dict[class_name]
        counts['tps'] += int(actual and predicted)
        counts['fps'] += int(predicted and not actual)
        counts['fns'] += int(actual and not predicted)
        counts['total_actual'] += int(actual)
        counts['total_predicted'] += int(predicted)

    for data_sample in data:
        # Only the class names of the first element of each field are needed.
        gold_tags = [item[2] for item in data_sample['ner'][0]]
        predicted_tags = [item[2] for item in data_sample['predicted_ner'][0]]

        record('NO CHANGE', 'CHAN-NC' in gold_tags, 'CHAN-NC' in predicted_tags)
        record(
            'CHANGE',
            any(change_class in gold_tags for change_class in CHANGE_CLASSES),
            any(change_class in predicted_tags for change_class in CHANGE_CLASSES),
        )

    return metrics_dict

def compute_metrics_task_2():
    """Count document-level hits and misses per TASK_2 class over the global `data`.

    For every class of TASK_2_CLASSES a document is a gold (resp. predicted)
    positive when at least one of its gold (resp. predicted) entities carries
    that class; span positions are ignored.

    Returns:
        defaultdict mapping class name to a dict with the counters 'tps',
        'fps', 'fns', 'total_actual' and 'total_predicted'.
    """
    metrics_dict = defaultdict(lambda: {'tps': 0, 'fps': 0, 'fns': 0, 'total_actual': 0, 'total_predicted': 0})
    for data_sample in data:
        # Only the class names of the first element of each field are needed.
        gold_tags = [label[2] for label in data_sample['ner'][0]]
        predicted_tags = [prediction[2] for prediction in data_sample['predicted_ner'][0]]

        for radgraph_class in TASK_2_CLASSES:
            actual = radgraph_class in gold_tags
            predicted = radgraph_class in predicted_tags
            counts = metrics_dict[radgraph_class]
            counts['tps'] += int(actual and predicted)
            counts['fps'] += int(predicted and not actual)
            counts['fns'] += int(actual and not predicted)
            counts['total_actual'] += int(actual)
            counts['total_predicted'] += int(predicted)

    return metrics_dict

In [13]:
def print_metrics(metrics_dict, evaluation_type):
    """Print per-class and aggregate precision, recall and F1.

    Args:
        metrics_dict: mapping from class name to a dict with the counters
            'tps', 'fps', 'fns', 'total_actual' and 'total_predicted'. It is
            indexed with every class of the selected list.
        evaluation_type: EvaluationType member choosing the class list
            (RADGRAPH_CLASSES, RADGRAPH_RELATION_CLASSES, TASK_1_CLASSES or
            TASK_2_CLASSES).

    Raises:
        ValueError: if `evaluation_type` is not one of the four members.

    A ratio whose denominator is zero is printed as nan; in the macro
    averages such a value contributes 0. Micro scores are computed from the
    summed counts. Nothing is returned.
    """
    if evaluation_type is EvaluationType.NER:
        radgraph_classes = RADGRAPH_CLASSES
    elif evaluation_type is EvaluationType.RELATIONS:
        radgraph_classes = RADGRAPH_RELATION_CLASSES
    elif evaluation_type is EvaluationType.TASK_1:
        radgraph_classes = TASK_1_CLASSES
    elif evaluation_type is EvaluationType.TASK_2:
        radgraph_classes = TASK_2_CLASSES
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"Unsupported evaluation type: {evaluation_type!r}")

    total_tps = 0
    total_fps = 0
    total_fns = 0
    macro_precision = 0
    macro_recall = 0
    macro_f1 = 0
    # 0/0 is expected for classes without predictions or gold items: keep the
    # nan result but silence numpy's RuntimeWarning about it.
    with np.errstate(divide='ignore', invalid='ignore'):
        for radgraph_class in radgraph_classes:
            counts = metrics_dict[radgraph_class]
            # np.float64 makes a zero denominator yield nan instead of raising.
            tps = np.float64(counts['tps'])
            fps = np.float64(counts['fps'])
            fns = np.float64(counts['fns'])
            total_tps += tps
            total_fps += fps
            total_fns += fns

            precision = tps / (tps + fps)
            recall = tps / (tps + fns)
            f1 = 2 * precision * recall / (precision + recall)
            macro_precision += np.nan_to_num(precision, nan=0)
            macro_recall += np.nan_to_num(recall, nan=0)
            macro_f1 += np.nan_to_num(f1, nan=0)

            print(f"* Class {radgraph_class}")
            print(f"  - Precision: {precision}")
            print(f"  - Recall: {recall}")
            print(f"  - F1: {f1}")
            print(f"  - Total actual: {counts['total_actual']}")
            print(f"  - Total predicted: {counts['total_predicted']}")

        micro_precision = total_tps / (total_tps + total_fps)
        micro_recall = total_tps / (total_tps + total_fns)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)

    macro_precision /= len(radgraph_classes)
    macro_recall /= len(radgraph_classes)
    macro_f1 /= len(radgraph_classes)
    print()
    print(f"* Micro precision: {micro_precision}")
    print(f"* Micro recall: {micro_recall}")
    print(f"* Micro F1: {micro_f1}")
    print(f"* Macro precision: {macro_precision}")
    print(f"* Macro recall: {macro_recall}")
    print(f"* Macro F1: {macro_f1}")

## Task 3

In [14]:
# Exact-match entity scores over RADGRAPH_CLASSES.
metrics_dict = compute_metrics(False)
print_metrics(metrics_dict, EvaluationType.NER)

* Class ANAT-DP
  - Precision: 0.9538855678906917
  - Recall: 0.9547008547008548
  - F1: 0.9542930371636054
  - Total actual: 1170
  - Total predicted: 1171
* Class OBS-DP
  - Precision: 0.8302073050345509
  - Recall: 0.8503538928210314
  - F1: 0.8401598401598402
  - Total actual: 989
  - Total predicted: 1013
* Class OBS-DA
  - Precision: 0.9253012048192771
  - Recall: 0.9434889434889435
  - F1: 0.9343065693430658
  - Total actual: 407
  - Total predicted: 415
* Class OBS-U
  - Precision: 0.616822429906542
  - Recall: 0.75
  - F1: 0.676923076923077
  - Total actual: 88
  - Total predicted: 107
* Class CHAN-IMP
  - Precision: 0.9
  - Recall: 1.0
  - F1: 0.9473684210526316
  - Total actual: 9
  - Total predicted: 10
* Class CHAN-WOR
  - Precision: 0.6551724137931034
  - Recall: 0.8636363636363636
  - F1: 0.7450980392156864
  - Total actual: 22
  - Total predicted: 29
* Class CHAN-AP
  - Precision: 0.4166666666666667
  - Recall: 1.0
  - F1: 0.5882352941176471
  - Total actual: 5
  - Tota



In [15]:
# Exact-match relation scores over RADGRAPH_RELATION_CLASSES.
metrics_dict = compute_metrics(True)
print_metrics(metrics_dict, EvaluationType.RELATIONS)

* Class modify
  - Precision: 0.7561950439648282
  - Recall: 0.7293754818812644
  - F1: 0.7425431711145998
  - Total actual: 1297
  - Total predicted: 1251
* Class located_at
  - Precision: 0.8448023426061494
  - Recall: 0.8219373219373219
  - F1: 0.8332129963898917
  - Total actual: 702
  - Total predicted: 683
* Class suggestive_of
  - Precision: 0.6575342465753424
  - Recall: 0.5161290322580645
  - F1: 0.5783132530120482
  - Total actual: 93
  - Total predicted: 73

* Micro precision: 0.7827603388141505
* Micro recall: 0.7509560229445507
* Micro F1: 0.7665284215662356
* Macro precision: 0.7528438777154399
* Macro recall: 0.689147278692217
* Macro F1: 0.7180231401721798


## Task 1

In [16]:
# Document-level 'NO CHANGE' / 'CHANGE' scores.
metrics_dict = compute_metrics_task_1()
print_metrics(metrics_dict, EvaluationType.TASK_1)

* Class NO CHANGE
  - Precision: 0.9074074074074074
  - Recall: 0.9245283018867925
  - F1: 0.9158878504672898
  - Total actual: 53
  - Total predicted: 54
* Class CHANGE
  - Precision: 0.7941176470588235
  - Recall: 0.9
  - F1: 0.84375
  - Total actual: 30
  - Total predicted: 34

* Micro precision: 0.8636363636363636
* Micro recall: 0.9156626506024096
* Micro F1: 0.8888888888888888
* Macro precision: 0.8507625272331154
* Macro recall: 0.9122641509433962
* Macro F1: 0.8798189252336449


## Task 2

In [17]:
# Document-level presence scores for each class of TASK_2_CLASSES.
metrics_dict = compute_metrics_task_2()
print_metrics(metrics_dict, EvaluationType.TASK_2)

* Class CHAN-IMP
  - Precision: 1.0
  - Recall: 1.0
  - F1: 1.0
  - Total actual: 7
  - Total predicted: 7
* Class CHAN-WOR
  - Precision: 0.6
  - Recall: 0.9230769230769231
  - F1: 0.7272727272727274
  - Total actual: 13
  - Total predicted: 20
* Class CHAN-AP
  - Precision: 0.4444444444444444
  - Recall: 1.0
  - F1: 0.6153846153846153
  - Total actual: 4
  - Total predicted: 9
* Class CHAN-DISA
  - Precision: 0.7142857142857143
  - Recall: 0.8333333333333334
  - F1: 0.7692307692307692
  - Total actual: 6
  - Total predicted: 7
* Class CHAN-DISP
  - Precision: 0.6666666666666666
  - Recall: 0.6666666666666666
  - F1: 0.6666666666666666
  - Total actual: 3
  - Total predicted: 3
* Class CHAN-NC
  - Precision: 0.9074074074074074
  - Recall: 0.9245283018867925
  - F1: 0.9158878504672898
  - Total actual: 53
  - Total predicted: 54

* Micro precision: 0.79
* Micro recall: 0.9186046511627907
* Micro F1: 0.8494623655913979
* Macro precision: 0.7221340388007054
* Macro recall: 0.891267537493