In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import cohen_kappa_score

In [2]:
# Read the annotation table; the summary cell below reports len(df) as the
# number of detected sentences, i.e. one row per sentence.
df = pd.read_csv(filepath_or_buffer="processed_annotations/CANNONICAL_LIST.csv")
# Preview the first five rows (the default number shown by head()).
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ICD_doc_id,sentence,num_words,num_chars,sentence_string,KATHLEEN_A,KATHLEEN_B,KATHLEEN_text,annotation_id,...,KAYCEE_B,KAYCEE_text,LIZ_A,LIZ_B,LIZ_text,inclusion,Sum_Annotations,ANY_A,ANY_B,EITHER_AB
0,0,95581557,"Ct) MERCY HEALTH \nGRAND RAPIDS, Ml \nCONSENT ...",23,106,"Ct) MERCY HEALTH GRAND RAPIDS, Ml CONSENT TO R...",0,0,,,...,0,,0,0,,1,0,0,0,0
1,1,95581557,AND/OR BLOOD PRODUCTS \n,6,23,AND/OR BLOOD PRODUCTS,0,0,,,...,0,,0,0,,0,0,0,0,0
2,2,95581557,I understand that I need or may need blood and...,27,130,I understand that I need or may need blood and...,0,0,,71891838.0,...,0,I understand that I need or may need blood and...,0,0,,1,1,1,0,1
3,3,95581557,This hospital admission D Outpatient: Series o...,12,69,This hospital admission D Outpatient: Series o...,0,0,,71891838.0,...,0,This hospital admission D Outpatient: Series o...,0,0,,1,1,1,0,1
4,4,95581557,DESCRIPTION OF PROCEDURE,3,24,DESCRIPTION OF PROCEDURE,0,0,,,...,0,,0,0,,0,0,0,0,0


In [3]:
def _ratio(count, total):
    """Return count / total, or NaN when total is 0 (empty table or subset)."""
    return count / total if total else float("nan")


# Row subsets: rows whose flag column equals 1.
included = df[df['inclusion'] == 1]
annotated_AB = df[df['EITHER_AB'] == 1]
annotated_A = df[df['ANY_A'] == 1]
annotated_B = df[df['ANY_B'] == 1]

n_total = len(df)
n_included = len(included)
n_AB = len(annotated_AB)
n_A = len(annotated_A)
n_B = len(annotated_B)

# NOTE: the numbers printed after "% total" / "% included" are fractions
# (three decimals), not percentages. The label text -- including the missing
# space after ';' on the "Annotated B" line -- is kept unchanged so that
# previously recorded output stays comparable.
print(f"Total Sentences Detected: {n_total}")
print(f"Total Included Sentences Detected: {n_included}; (% total: {_ratio(n_included, n_total):.3f})")
print(f"Annotated Either A or B: {n_AB}; (% total: {_ratio(n_AB, n_total):.3f}); (% included: {_ratio(n_AB, n_included):.3f})")
print(f"Annotated A: {n_A}; (% total: {_ratio(n_A, n_total):.3f}); (% included: {_ratio(n_A, n_included):.3f})")
print(f"Annotated B: {n_B};(% total: {_ratio(n_B, n_total):.3f}); (% included: {_ratio(n_B, n_included):.3f})")

print()
# Number of distinct document ids; denominator for the per-document rates.
N_docs = len(set(df['ICD_doc_id']))

# Per-annotator column totals over ALL rows of df (not only the included ones).
KATHLEEN_A = df['KATHLEEN_A'].sum()
KATHLEEN_B = df['KATHLEEN_B'].sum()

LIZ_A = df['LIZ_A'].sum()
LIZ_B = df['LIZ_B'].sum()

KAYCEE_A = df['KAYCEE_A'].sum()
KAYCEE_B = df['KAYCEE_B'].sum()

print(f"KATHLEEN A: {KATHLEEN_A} ({_ratio(KATHLEEN_A, N_docs):.3f} per doc), B:{KATHLEEN_B} ({_ratio(KATHLEEN_B, N_docs):.3f} per doc)")
print(f"LIZ A: {LIZ_A}  ({_ratio(LIZ_A, N_docs):.3f} per doc), B:{LIZ_B}  ({_ratio(LIZ_B, N_docs):.3f} per doc)")
print(f"KAYCEE A: {KAYCEE_A}  ({_ratio(KAYCEE_A, N_docs):.3f} per doc), B:{KAYCEE_B}  ({_ratio(KAYCEE_B, N_docs):.3f} per doc)")

Total Sentences Detected: 9951
Total Included Sentences Detected: 6399; (% total: 0.643)
Annotated Either A or B: 739; (% total: 0.074); (% included: 0.115)
Annotated A: 636; (% total: 0.064); (% included: 0.099)
Annotated B: 139;(% total: 0.014); (% included: 0.022)

KATHLEEN A: 413 (3.082 per doc), B:56 (0.418 per doc)
LIZ A: 333  (2.485 per doc), B:59  (0.440 per doc)
KAYCEE A: 452  (3.373 per doc), B:29  (0.216 per doc)


In [4]:
def _kappa(rater_1, rater_2, suffix):
    """Cohen's kappa between two annotators, computed over the `included` rows.

    rater_1, rater_2: annotator column prefixes, e.g. 'KATHLEEN'.
    suffix: annotation column suffix, 'A' or 'B'.
    """
    return cohen_kappa_score(included[f"{rater_1}_{suffix}"], included[f"{rater_2}_{suffix}"])


# Every annotator pair, first for the *_A columns, then for the *_B columns.
KATHLEENvLIZ_A = _kappa('KATHLEEN', 'LIZ', 'A')
KATHLEENvKAYCEE_A = _kappa('KATHLEEN', 'KAYCEE', 'A')
LIZvKAYCEE_A = _kappa('LIZ', 'KAYCEE', 'A')

KATHLEENvLIZ_B = _kappa('KATHLEEN', 'LIZ', 'B')
KATHLEENvKAYCEE_B = _kappa('KATHLEEN', 'KAYCEE', 'B')
# Argument order follows the variable name, like the other five pairs
# (kappa is symmetric in its two arguments, so the value does not depend on it).
LIZvKAYCEE_B = _kappa('LIZ', 'KAYCEE', 'B')

for name, kappa in (
    ("KATHLEENvLIZ_A", KATHLEENvLIZ_A),
    ("KATHLEENvKAYCEE_A", KATHLEENvKAYCEE_A),
    ("LIZvKAYCEE_A", LIZvKAYCEE_A),
    ("KATHLEENvLIZ_B", KATHLEENvLIZ_B),
    ("KATHLEENvKAYCEE_B", KATHLEENvKAYCEE_B),
    ("LIZvKAYCEE_B", LIZvKAYCEE_B),
):
    print(f"{name}: {kappa:.4f}")

KATHLEENvLIZ_A: 0.6131
KATHLEENvKAYCEE_A: 0.5921
LIZvKAYCEE_A: 0.6626
KATHLEENvLIZ_B: 0.0436
KATHLEENvKAYCEE_B: 0.0177
LIZvKAYCEE_B: 0.0168


In [5]:
def _kappa_shifted(rater_1, rater_2, suffix, shift=1):
    """Cohen's kappa between two annotators over `included`, with `shift` added to every label.

    rater_1, rater_2: annotator column prefixes, e.g. 'KATHLEEN'.
    suffix: annotation column suffix, 'A' or 'B'.
    shift: constant added to both label columns before scoring.
    """
    return cohen_kappa_score(
        included[f"{rater_1}_{suffix}"] + shift,
        included[f"{rater_2}_{suffix}"] + shift,
    )


# Same six annotator pairs as the unshifted computation, with every label
# incremented by 1. This rebinds the same variable names.
# NOTE(review): presumably a sanity check that kappa does not depend on how the
# labels are coded -- the recorded output matches the unshifted values; confirm.
KATHLEENvLIZ_A = _kappa_shifted('KATHLEEN', 'LIZ', 'A')
KATHLEENvKAYCEE_A = _kappa_shifted('KATHLEEN', 'KAYCEE', 'A')
LIZvKAYCEE_A = _kappa_shifted('LIZ', 'KAYCEE', 'A')

KATHLEENvLIZ_B = _kappa_shifted('KATHLEEN', 'LIZ', 'B')
KATHLEENvKAYCEE_B = _kappa_shifted('KATHLEEN', 'KAYCEE', 'B')
# Argument order follows the variable name, like the other five pairs
# (kappa is symmetric in its two arguments, so the value does not depend on it).
LIZvKAYCEE_B = _kappa_shifted('LIZ', 'KAYCEE', 'B')

for name, kappa in (
    ("KATHLEENvLIZ_A", KATHLEENvLIZ_A),
    ("KATHLEENvKAYCEE_A", KATHLEENvKAYCEE_A),
    ("LIZvKAYCEE_A", LIZvKAYCEE_A),
    ("KATHLEENvLIZ_B", KATHLEENvLIZ_B),
    ("KATHLEENvKAYCEE_B", KATHLEENvKAYCEE_B),
    ("LIZvKAYCEE_B", LIZvKAYCEE_B),
):
    print(f"{name}: {kappa:.4f}")

KATHLEENvLIZ_A: 0.6131
KATHLEENvKAYCEE_A: 0.5921
LIZvKAYCEE_A: 0.6626
KATHLEENvLIZ_B: 0.0436
KATHLEENvKAYCEE_B: 0.0177
LIZvKAYCEE_B: 0.0168
