In [1]:
"""
NOTES:
    - this notebook has two parts:
        (1) clean the `reference`, or the raw, unannotated documents
        (2) map the annotations back onto this reference
    - accuracy is more important than speed
"""

'\nNOTES:\n    - this notebook has two parts:\n        (1) clean the `reference`, or the raw, unannotated documents\n        (2) map the annotations back onto this reference\n    - accuracy is more important than speed\n'

In [2]:
import sys
import os
import json
import pandas as pd
import hashlib
import numpy as np
import spacy
import re
import random
from importlib import reload
from datetime import datetime
from collections import defaultdict
from pprint import pprint
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import difflib

#local
import utility_funcs

In [3]:
%matplotlib inline

In [4]:
nlp = spacy.load('en_core_web_lg')

## Parse Unannotated Documents

In [5]:
reload(utility_funcs)

def cosemetic_clean(document):
    """A function to perform minor text processing
    in hopes of making sentence parsing more robust
    
    Args:
        - document (str): a document
        
    Returns:
        clean_document (str): a document with minor adjustments    
    """
    decoded_doc = utility_funcs.force_encoding(document)
    clean_document = re.sub(' +', ' ', decoded_doc).replace("_", "")
    return clean_document
    
    
def parse_documents(d_map):
    """A function to add parsed documents to dmap
    
    NOTE: modifies `d_map` in place, does not make a copy
    
    Args:
        - d_map (dict): a document map created in the preprocessing
            step
            
    Returns:
        - d_map (dict): a document map created in the preprocessing
            step
    """
    for ICD_id, content in d_map.items():
        pre_parse = cosemetic_clean(content['raw_content'])
        parsed = nlp(pre_parse)
        sentences = list(parsed.sents)
        
        content['parsed'] = parsed
        content['sentences'] = sentences
        
    return d_map

In [6]:
%time
d_map_path = "processed_annotations/DOCUMENT_MAP_02-14-2020.json"
d_map = json.load(open(d_map_path))

d_map = parse_documents(d_map)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


In [7]:
d_map['10932687'].keys()

dict_keys(['raw_content', 'from_file', 'parsed', 'sentences'])

In [8]:
"""
I'll build a reference data frame with all sentences
"""
new_rows = []

for ICD_id, content in d_map.items():
    for sent in content['sentences']:
        
        row = {
            'ICD_doc_id':str(ICD_id),
            'sentence': sent.text,
            'start_pos':sent.start_char,
            'num_words':len(sent),
            'num_chars':len(sent.text),
            'sentence_obj': sent,
        }
        new_rows.append(row)
        
ref = pd.DataFrame(new_rows)
ref.head()

Unnamed: 0,ICD_doc_id,sentence,start_pos,num_words,num_chars,sentence_obj
0,95581557,Ct) MERCY HEALTH \n,0,5,18,"(Ct, ), MERCY, HEALTH, \n)"
1,95581557,"GRAND RAPIDS,",18,3,13,"(GRAND, RAPIDS, ,)"
2,95581557,Ml \nCONSENT,32,3,11,"(Ml, \n, CONSENT)"
3,95581557,TO RECEIVE \n,44,3,12,"(TO, RECEIVE, \n)"
4,95581557,Patient Label \n\nNON-EMERGENCY TRANSFUSION OF...,56,9,50,"(Patient, Label, \n\n, NON, -, EMERGENCY, TRAN..."


In [9]:
ref['sentence'].head()

0                                  Ct) MERCY HEALTH \n
1                                        GRAND RAPIDS,
2                                         Ml \nCONSENT
3                                        TO RECEIVE \n
4    Patient Label \n\nNON-EMERGENCY TRANSFUSION OF...
Name: sentence, dtype: object

In [10]:
"""
save the reference
"""
ref.to_csv("processed_annotations/reference.csv", index=False)

## Load (and save) Annotations

In [11]:
"""
Load the preprocessed annotations
"""

df = pd.read_csv("processed_annotations/ANNOTATIONS_02-14-2020.csv")
df.head()

Unnamed: 0,ICD_doc_id,json_filename,annotator,annotation_id,A,B,C,start_char,end_char,text,sentence_count
0,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,22668539,1,0,0,2613,2795,"By signing this form, I am requesting and givi...",1
1,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,66236403,0,0,1,366,558,I understand that blood or blood products will...,1
2,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,66236403,0,0,1,366,558,My doctor will determine the amount of blood o...,2
3,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,97685314,1,0,0,323,516,I authorize the release of any and all medical...,1
4,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,39472471,1,0,0,104,321,I voluntarily consent to medical care of a rou...,1


In [12]:
# """
# Save all A or B annotations by annotator
# """

# for annotator in set(df.annotator):
#     annotator_df = df[(df.annotator == f"{annotator}") & ((df.A == 1) | (df.B == 1))]
#     annotator_df.to_csv(f"processed_annotations/{annotator}_AB_only.csv", index=False)

In [13]:
AB_only = df[((df.A == 1) | (df.B == 1))]

AB_only = AB_only.drop(['json_filename', 
                        'annotator',
                        'A',
                        'B',
                        'C', 
                        'start_char', 
                        'end_char'], axis=1)

print(f"BEFORE dropping duplicates {len(AB_only)}")
AB_only = AB_only.drop_duplicates('text')
print(f"AFTER dropping duplicates {len(AB_only)}")
AB_only.head()


# AB_only.to_csv("processed_annotations/ALL_UNIQUE_ANNOTATIONS.csv")

BEFORE dropping duplicates 1782
AFTER dropping duplicates 824


Unnamed: 0,ICD_doc_id,annotation_id,text,sentence_count
0,95581557,22668539,"By signing this form, I am requesting and givi...",1
3,69408590,97685314,I authorize the release of any and all medical...,1
4,69408590,39472471,I voluntarily consent to medical care of a rou...,1
5,36073164,83827607,Your signature below indicates that you unders...,1
9,36073164,57070687,As the patient/patient's authorized representa...,1


# Normalize Annotations

In [14]:
"""
Add rows for matching - prevents having to resolve things dynalically
"""

for ann in set(df['annotator']):
    for label in ['A', 'B']:
        AB_only[f"{ann}_{label}"] = 0
    AB_only[f"{ann}_text"] = ""
    
[x for x in AB_only.columns]

['ICD_doc_id',
 'annotation_id',
 'text',
 'sentence_count',
 'KATHLEEN_A',
 'KATHLEEN_B',
 'KATHLEEN_text',
 'LIZ_A',
 'LIZ_B',
 'LIZ_text',
 'KAYCEE_A',
 'KAYCEE_B',
 'KAYCEE_text']

In [15]:
df.head()

Unnamed: 0,ICD_doc_id,json_filename,annotator,annotation_id,A,B,C,start_char,end_char,text,sentence_count
0,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,22668539,1,0,0,2613,2795,"By signing this form, I am requesting and givi...",1
1,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,66236403,0,0,1,366,558,I understand that blood or blood products will...,1
2,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,66236403,0,0,1,366,558,My doctor will determine the amount of blood o...,2
3,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,97685314,1,0,0,323,516,I authorize the release of any and all medical...,1
4,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,KATHLEEN,39472471,1,0,0,104,321,I voluntarily consent to medical care of a rou...,1


In [17]:
"""
GOAL: reduce multi-row annotations based on unique text spans

(1) for each uniquely annotated sentence
(2) find all annotations of that sentence
(2) set new columns based on annotations
"""


for idx, row in AB_only.iterrows():
    did, aid = row['ICD_doc_id'], row['annotation_id']
    
    matches = df[(df['ICD_doc_id'] == did) & (df['annotation_id'] == aid)]
    
    for midx, match_row in matches.iterrows():
        if row['sentence_count'] == match_row['sentence_count']:
            annotator = match_row['annotator']
            A_val = match_row['A']
            B_val = match_row['B']
            matched_text = match_row['text']

            ## normalize each row of the data frame

            AB_only.at[idx,'KAYCEE_A'] = A_val if annotator == 'KAYCEE' else 0
            AB_only.at[idx,'KAYCEE_B'] = B_val if annotator == 'KAYCEE' else 0
            AB_only.at[idx,'KAYCEE_text'] = matched_text if annotator == 'KAYCEE' else ""
            AB_only.at[idx,'LIZ_A'] = A_val if annotator == 'LIZ' else 0
            AB_only.at[idx,'LIZ_B'] = B_val if annotator == 'LIZ' else 0
            AB_only.at[idx,'LIZ_text'] = matched_text if annotator == 'LIZ' else ""
            AB_only.at[idx,'KATHLEEN_A'] = A_val if annotator == 'KATHLEEN' else 0
            AB_only.at[idx,'KATHLEEN_B'] = B_val if annotator == 'KATHLEEN' else 0
            AB_only.at[idx,'KATHLEEN_text'] = matched_text if annotator == 'KATHLEEN' else ""
            AB_only.at[idx,'KATHLEEN_text'] = matched_text if annotator == 'KATHLEEN' else ""

    
AB_only.to_csv("processed_annotations/NORMALIZED_AB_ONLY.csv")

print(f"AB_only length {len(AB_only)}") # expect 824

AB_only length 824


In [18]:
AB_only.head()

Unnamed: 0,ICD_doc_id,annotation_id,text,sentence_count,KATHLEEN_A,KATHLEEN_B,KATHLEEN_text,LIZ_A,LIZ_B,LIZ_text,KAYCEE_A,KAYCEE_B,KAYCEE_text
0,95581557,22668539,"By signing this form, I am requesting and givi...",1,0,0,,0,0,,1,0,"By signing this form, I am requesting and givi..."
3,69408590,97685314,I authorize the release of any and all medical...,1,0,0,,0,0,,1,0,I authorize the release of any and all medical...
4,69408590,39472471,I voluntarily consent to medical care of a rou...,1,0,0,,0,0,,1,0,I voluntarily consent to medical care of a rou...
5,36073164,83827607,Your signature below indicates that you unders...,1,0,0,,0,0,,1,0,Your signature below indicates that you unders...
9,36073164,57070687,As the patient/patient's authorized representa...,1,0,0,,1,0,As the patient/patient's authorized representa...,0,0,


In [19]:
"""
Inclusion/exclusion of annotations
"""

def incl_excl(text):
    """Return 1 for inclusion, 0 for exclusion"""
    sent_len = len(text.split(" "))

    if sent_len < 5:
        return 0
    else:
        return 1
    
    print(len(text.split(" ")))
    
    
AB_only['inclusion'] = AB_only['text'].apply(lambda row: incl_excl(row))
AB_only.head()

Unnamed: 0,ICD_doc_id,annotation_id,text,sentence_count,KATHLEEN_A,KATHLEEN_B,KATHLEEN_text,LIZ_A,LIZ_B,LIZ_text,KAYCEE_A,KAYCEE_B,KAYCEE_text,inclusion
0,95581557,22668539,"By signing this form, I am requesting and givi...",1,0,0,,0,0,,1,0,"By signing this form, I am requesting and givi...",1
3,69408590,97685314,I authorize the release of any and all medical...,1,0,0,,0,0,,1,0,I authorize the release of any and all medical...,1
4,69408590,39472471,I voluntarily consent to medical care of a rou...,1,0,0,,0,0,,1,0,I voluntarily consent to medical care of a rou...,1
5,36073164,83827607,Your signature below indicates that you unders...,1,0,0,,0,0,,1,0,Your signature below indicates that you unders...,1
9,36073164,57070687,As the patient/patient's authorized representa...,1,0,0,,1,0,As the patient/patient's authorized representa...,0,0,,1


In [20]:
print(f"BEFORE removing short annotations: {len(AB_only)}")
AB_only = AB_only[AB_only['inclusion'] == 1]
print(f"AFTER removing short annotations: {len(AB_only)}")

BEFORE removing short annotations: 824
AFTER removing short annotations: 774


# Alignment

In [21]:
"""
clean up the sentences we're trying to align
"""


def clean_sentence(row):
    """Apply function to create a human readable sentence
    """
    return re.sub("\s\s+" , " ", row.rstrip('\r\n'))
    
ref['sentence_string'] = ref['sentence'].apply(lambda row: clean_sentence(row))

ref.head()

Unnamed: 0,ICD_doc_id,sentence,start_pos,num_words,num_chars,sentence_obj,sentence_string
0,95581557,Ct) MERCY HEALTH \n,0,5,18,"(Ct, ), MERCY, HEALTH, \n)",Ct) MERCY HEALTH
1,95581557,"GRAND RAPIDS,",18,3,13,"(GRAND, RAPIDS, ,)","GRAND RAPIDS,"
2,95581557,Ml \nCONSENT,32,3,11,"(Ml, \n, CONSENT)",Ml CONSENT
3,95581557,TO RECEIVE \n,44,3,12,"(TO, RECEIVE, \n)",TO RECEIVE
4,95581557,Patient Label \n\nNON-EMERGENCY TRANSFUSION OF...,56,9,50,"(Patient, Label, \n\n, NON, -, EMERGENCY, TRAN...",Patient Label NON-EMERGENCY TRANSFUSION OF BLOOD


In [22]:
"""
Add rows to the reference frame for matching
"""

for ann in set(df['annotator']):
    for label in ['A', 'B']:
        ref[f"{ann}_{label}"] = 0
    ref[f"{ann}_text"] = ""
    ref['annotation_id'] = ""
    
[x for x in ref.columns]

['ICD_doc_id',
 'sentence',
 'start_pos',
 'num_words',
 'num_chars',
 'sentence_obj',
 'sentence_string',
 'KATHLEEN_A',
 'KATHLEEN_B',
 'KATHLEEN_text',
 'annotation_id',
 'LIZ_A',
 'LIZ_B',
 'LIZ_text',
 'KAYCEE_A',
 'KAYCEE_B',
 'KAYCEE_text']

In [23]:
ref.head()

Unnamed: 0,ICD_doc_id,sentence,start_pos,num_words,num_chars,sentence_obj,sentence_string,KATHLEEN_A,KATHLEEN_B,KATHLEEN_text,annotation_id,LIZ_A,LIZ_B,LIZ_text,KAYCEE_A,KAYCEE_B,KAYCEE_text
0,95581557,Ct) MERCY HEALTH \n,0,5,18,"(Ct, ), MERCY, HEALTH, \n)",Ct) MERCY HEALTH,0,0,,,0,0,,0,0,
1,95581557,"GRAND RAPIDS,",18,3,13,"(GRAND, RAPIDS, ,)","GRAND RAPIDS,",0,0,,,0,0,,0,0,
2,95581557,Ml \nCONSENT,32,3,11,"(Ml, \n, CONSENT)",Ml CONSENT,0,0,,,0,0,,0,0,
3,95581557,TO RECEIVE \n,44,3,12,"(TO, RECEIVE, \n)",TO RECEIVE,0,0,,,0,0,,0,0,
4,95581557,Patient Label \n\nNON-EMERGENCY TRANSFUSION OF...,56,9,50,"(Patient, Label, \n\n, NON, -, EMERGENCY, TRAN...",Patient Label NON-EMERGENCY TRANSFUSION OF BLOOD,0,0,,,0,0,,0,0,


In [24]:
def include_exclude(row):
    """return 1 or 0 based on inclusion exclusion criteria """
    
    if len(row['sentence_obj']) < 9:
        return 0
    else:
        return 1

ref['inclusion'] = ref.apply(lambda row: include_exclude(row), axis=1)
    

In [25]:
def slow_match(annotation, reference_sents):
    """A function to handle hard matches
    reference_sents should be a spacy obj
    """
    max_score = 0
    best_sent = ""
    
    for sent in reference_sents:  
        sem_similarity = sent.similarity(nlp(annotation))
        
        ann_split = [x.lower() for x in annotation.split(" ")]
        ref_split = [x.lower() for x in sent.text.split(" ")]
        n_shared_words = len(list(set(ann_split).intersection(ref_split)))
        
        # these are intersting, but ineffective
#         length_diff = np.sqrt((len(annotation) - len(sent.text))**2)
#         w_score = sem_similarity / length_diff
        score = n_shared_words / len(annotation)

        if score > max_score:
            max_score = score
            best_sent = sent
    print(f"BEST MATCH FOUND: {best_sent}")
    
    return [best_sent]


for idx, row in AB_only.iterrows():
    annotated_text = row['text'] 
    did = row['ICD_doc_id']
    
    doc_matches = ref[(ref['ICD_doc_id'] == str(did)) & (ref['inclusion'] == 1)]
    
    matches = difflib.get_close_matches(annotated_text, 
                                        doc_matches['sentence_string'].tolist(),
                                        cutoff=.5)
    """
    handle non-perfect matching
    """
    
    if len(matches) < 1:
        print("--------------------------------------------")
        print(f"ERROR - no easy match for: {annotated_text} in {did} '\n'")
        
        matches = slow_match(annotated_text, doc_matches['sentence_obj'])
        
        
        if len(matches) < 1:
            raise ValueError(f"CANNOT MATCH: {annotated_text} in {did}")
    
    ## get index of best match
    ref_idx = ref[(ref['sentence_string'] == matches[0])].index

    ref.at[ref_idx,'KAYCEE_A'] = row['KAYCEE_A']
    ref.at[ref_idx,'KAYCEE_B'] = row['KAYCEE_B']
    ref.at[ref_idx,'KAYCEE_text'] = row['KAYCEE_text']
    ref.at[ref_idx,'LIZ_A'] = row['LIZ_A']
    ref.at[ref_idx,'LIZ_B'] = row['LIZ_B']
    ref.at[ref_idx,'LIZ_text'] = row['LIZ_text']
    ref.at[ref_idx,'KATHLEEN_A'] = row['KATHLEEN_A']
    ref.at[ref_idx,'KATHLEEN_B'] = row['KATHLEEN_B']
    ref.at[ref_idx,'KATHLEEN_text'] = row['KATHLEEN_text']
    ref.at[ref_idx, 'annotation_id'] = row['annotation_id']


ref.head()    

--------------------------------------------
ERROR - no easy match for: I/We, desire to be therapeutically inseminated for the purpose of conceiving a child to be treated in all respects as the natural child of myself/ourselves, freely and knowingly agreeing to the terms of this consent and understanding that I/we are bound to it. in 37497740 '
'
BEST MATCH FOUND: We,  desire to be therapeutically inseminated for the purpose of conceiving a child to be treated in all respects as the natural child of myself/ourselves, freely and knowingly agreeing to the terms of this consent and understanding that I/
--------------------------------------------
ERROR - no easy match for: Transfusion is not applicable to my operation in 27509790 '
'
BEST MATCH FOUND: Transfusion is not applicable to my operation Exceptions (TO BE COMPLETED BY PROVIDER ONLY): 

I HAVE READ AND UNDERSTAND THE INFORMATION ON THIS FORM AND ON THE PREVIOUS PAGES BEFORE I SIGNED IT. 

-----------------------------------------

Unnamed: 0,ICD_doc_id,sentence,start_pos,num_words,num_chars,sentence_obj,sentence_string,KATHLEEN_A,KATHLEEN_B,KATHLEEN_text,annotation_id,LIZ_A,LIZ_B,LIZ_text,KAYCEE_A,KAYCEE_B,KAYCEE_text,inclusion
0,95581557,Ct) MERCY HEALTH \n,0,5,18,"(Ct, ), MERCY, HEALTH, \n)",Ct) MERCY HEALTH,0,0,,,0,0,,0,0,,0
1,95581557,"GRAND RAPIDS,",18,3,13,"(GRAND, RAPIDS, ,)","GRAND RAPIDS,",0,0,,,0,0,,0,0,,0
2,95581557,Ml \nCONSENT,32,3,11,"(Ml, \n, CONSENT)",Ml CONSENT,0,0,,,0,0,,0,0,,0
3,95581557,TO RECEIVE \n,44,3,12,"(TO, RECEIVE, \n)",TO RECEIVE,0,0,,,0,0,,0,0,,0
4,95581557,Patient Label \n\nNON-EMERGENCY TRANSFUSION OF...,56,9,50,"(Patient, Label, \n\n, NON, -, EMERGENCY, TRAN...",Patient Label NON-EMERGENCY TRANSFUSION OF BLOOD,0,0,,,0,0,,0,0,,1


In [27]:
ref.to_csv("processed_annotations/ALIGNED_ANNOTATIONS.csv")

In [35]:
cannon = ref.copy()

# drop spaCy artifacts
cannon = cannon.drop(['sentence_obj', 
                      'start_pos'], axis=1)

cannon['Sum_Annotations'] = cannon['LIZ_A'] + cannon['KAYCEE_A'] + cannon['KATHLEEN_A'] +\
                            cannon['LIZ_B'] + cannon['KAYCEE_B'] + cannon['KATHLEEN_B']


def any_A(row):
    if np.sum([row['LIZ_A'],
               row['KAYCEE_A'],
               row['KATHLEEN_A']]) > 0:
        return 1
    else:
        return 0

cannon['ANY_A'] = cannon.apply(lambda row: any_A(row), axis=1)


def any_B(row):
    if np.sum([row['LIZ_B'],
               row['KAYCEE_B'],
               row['KATHLEEN_B']]) > 0:
        return 1
    else:
        return 0

cannon['ANY_B'] = cannon.apply(lambda row: any_B(row), axis=1)


def eitherAB(row):
    if np.sum([row['ANY_A'],
               row['ANY_B']]) > 0:
        return 1
    else:
        return 0

cannon['EITHER_AB'] = cannon.apply(lambda row: eitherAB(row), axis=1)

cannon.to_csv("processed_annotations/CANNONICAL_LIST.csv")
cannon.head()

Unnamed: 0,ICD_doc_id,sentence,num_words,num_chars,sentence_string,KATHLEEN_A,KATHLEEN_B,KATHLEEN_text,annotation_id,LIZ_A,LIZ_B,LIZ_text,KAYCEE_A,KAYCEE_B,KAYCEE_text,inclusion,Sum_Annotations,ANY_A,ANY_B,EITHER_AB
0,95581557,Ct) MERCY HEALTH \n,5,18,Ct) MERCY HEALTH,0,0,,,0,0,,0,0,,0,0,0,0,0
1,95581557,"GRAND RAPIDS,",3,13,"GRAND RAPIDS,",0,0,,,0,0,,0,0,,0,0,0,0,0
2,95581557,Ml \nCONSENT,3,11,Ml CONSENT,0,0,,,0,0,,0,0,,0,0,0,0,0
3,95581557,TO RECEIVE \n,3,12,TO RECEIVE,0,0,,,0,0,,0,0,,0,0,0,0,0
4,95581557,Patient Label \n\nNON-EMERGENCY TRANSFUSION OF...,9,50,Patient Label NON-EMERGENCY TRANSFUSION OF BLOOD,0,0,,,0,0,,0,0,,1,0,0,0,0


## Align these puppies

In [None]:
ref.dtypes

In [None]:
random_doc = random.sample(d_map.keys(), 1)[0]
    
for idx, ref_row in ref[ref.ICD_doc_id == random_doc].iterrows():
    
    reference_sent = ref_row['sentece']     
    inter_doc_annotations = df[df['ICD_doc_id'] == int(ref_row['ICD_doc_id'])]
    
    print(len(inter_doc_annotations))

In [None]:
# threshold = 0.99
# new_rows = []

# def score_pair(reference__sentence, annotated_sentence):
#     """A function to score a pair of sentences for similarity
    
#     Args:
#         - reference__sentence (str): the reference sentence
#         - annotated_sentence (str): the canndidate sentence (annotation)
        
#     Returns:
#         - score (float): a likelihood that the sentences are the same
#     """
    
    
#     ## TODO: need to handle empty vectors
#     sim = nlp(reference__sentence).similarity(nlp(annotated_sentence))
#     len_dff_sq = (len(reference__sentence) - len(annotated_sentence))**2
#     if len_dff_sq == 0:
#         len_dff_sq = 1
    
#     len_weighted_sim = (1/len_dff_sq)*sim
    
#     return len_weighted_sim
    

# random_doc = random.sample(d_map.keys(), 1)[0]
    
# for idx, ref_row in ref[ref.ICD_doc_id == random_doc].iterrows():
    
#     reference_sent = ref_row['sentece']     
#     inter_doc_annotations = df[df['ICD_doc_id'] == int(ref_row['ICD_doc_id'])]
    
# #     print(f"Number of annotations: {len(inter_doc_annotations)}")
    
#     matches = 0
#     for aidx, annotation_row in inter_doc_annotations.iterrows():
#         annotated_text = annotation_row['text']
#         annotator = annotation_row['annotator']
#         A = annotation_row['A']
#         B = annotation_row['B']
#         C = annotation_row['C']
#         score = score_pair(reference_sent, annotated_text)
        
#         if score > threshold:
#             matches += 1
#             print("---------------------------------------------------------------------------")
#             print("---------------------------------------------------------------------------")
#             print("---------------------------------------------------------------------------")

#             print(f"score: {score:.3f}")
#             print(f"REFERNCE: {reference_sent.strip()}")
#             print(f"ANNOTATION: {annotated_text.strip()}")
#             print(f"ANNOTATOR: `{annotator}` ----> LABELS: (A={A}, B={B}, C={C})")
#             print()
            
# #     print(f"Number of Matches = {matches}")
    

# #         scores.append(score_pair(reference_sent, annotated_text))
    
# #     print(f"REFERENCE: {reference_sent.strip()}: MAX SCORE: {np.max(scores)}")
# #     print(f"MAX SCORE SENT: {str(inter_doc_annotations.iloc[[np.argmax(scores)]]['text'])}")
    
# #     print("---------------------------------------------------------------------------")
# #     print("---------------------------------------------------------------------------")
# #     print("---------------------------------------------------------------------------")

    
#     """
#     if max score > THRESHOLD:
#         add annotation data and other data to new_rows
#     else:
#         move on
#     """
    
