In [20]:
from sklearn.metrics import cohen_kappa_score
import json
import spacy
from tqdm import tqdm
import os

### Inter-annotator agreement among human annotators

In [26]:
def annotations_to_bio(text, annotations):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    
    # Initialize BIO tags
    bio_tags = ['O'] * len(doc)
    
    for entity in annotations:
        label = entity['label']
        start = entity['start_offset']
        end = entity['end_offset']
        
        # Find tokens that overlap with the entity
        for token in doc:
            if start <= token.idx < end:
                if token.idx == start:
                    bio_tags[token.i] = 'B-' + label
                else:
                    bio_tags[token.i] = 'I-' + label
    
    # Combine tokens with BIO tags
    bio_format = []
    for token, tag in zip(doc, bio_tags):
        bio_format.append((tag, token.text))
    
    return bio_format

In [27]:
lalit_annotations = []
with open('/project_resources/private_workspace/naimish/de-id-annotations/inter-annotator-aggrement/lalit.jsonl', 'r') as file:
    for line in file:
        data = json.loads(line)
        bio_formatted = annotations_to_bio(data['text'], data['entities'])
        lalit_annotations.append([tuple[0] for tuple in bio_formatted])

In [28]:
vandana_annotations = []
with open('/project_resources/private_workspace/naimish/de-id-annotations/inter-annotator-aggrement/vandana.jsonl', 'r') as file:
    for line in file:
        data = json.loads(line)
        bio_formatted = annotations_to_bio(data['text'], data['entities'])
        vandana_annotations.append([tuple[0] for tuple in bio_formatted])

In [29]:
lalit_annotations_flattened = [label for doc in lalit_annotations for label in doc]
vandana_annotations_flattened = [label for doc in vandana_annotations for label in doc]

# Calculate Cohen's Kappa
kappa = round(cohen_kappa_score(lalit_annotations_flattened , vandana_annotations_flattened),3)

print(f"Cohen's Kappa coefficient: {kappa}")

Cohen's Kappa coefficient: 0.921


### Inter-annotator agreement among physician (Discharge summaries classified into Real/Synthetic)

In [21]:
labels_user1 = []
text_user1 = []
with open('/project_resources/private_workspace/naimish/user1.jsonl', 'r') as file:
    for line in file:
        data = json.loads(line)
        text_user1.append(data['text'])
        labels_user1.append(data['label'][0])
        
path_to_folders = "/project_resources/private_workspace/naimish/60_Sampled_discharge_summaries/Sampled_summaries/"
predicted_labels_user1 = {}
files = os.listdir(path_to_folders)
for file in files:
    file_name = os.path.join(path_to_folders, file)
    with open(file_name , 'r') as file:
        file_contents = file.read()
        predicted_labels_user1[file_name.split('/')[-1]] = labels_user1[text_user1.index(file_contents)]
        
predicted_labels_user1 = dict(sorted(predicted_labels_user1.items()))

In [22]:
labels_user2 = []
text_user2 = []
with open('/project_resources/private_workspace/naimish/user2.jsonl', 'r') as file:
    for line in file:
        data = json.loads(line)
        text_user2.append(data['text'])
        labels_user2.append(data['label'][0])
        
path_to_folders = "/project_resources/private_workspace/naimish/60_Sampled_discharge_summaries/Sampled_summaries/"
predicted_labels_user2 = {}
files = os.listdir(path_to_folders)
for file in files:
    file_name = os.path.join(path_to_folders, file)
    with open(file_name , 'r') as file:
        file_contents = file.read()
        predicted_labels_user2[file_name.split('/')[-1]] = labels_user2[text_user2.index(file_contents)]
        
predicted_labels_user2 = dict(sorted(predicted_labels_user2.items()))

In [24]:
kappa = round(cohen_kappa_score(list(predicted_labels_user1.values()),list(predicted_labels_user2.values())),3)
print(f"Cohen's kappa coefficient: {kappa}")

Cohen's kappa coefficient: 0.29
