In [None]:
import json
from pathlib import Path
import pandas as pd

In [None]:
# Directory holding the corpus: one JSON document per annotated item.
path = Path('/home//.../final_corpus')


def collapse_secondary_labels(
    record,
    annotator_keys=(
        'student_annotator_1',
        'student_annotator_2',
        'expert_annotator_1',
        'expert_annotator_2',
    ),
):
    """Collapse every annotator label starting with ``'2_'`` to ``'2_nebenthema'``.

    Args:
        record: Parsed JSON document (a dict); it is modified in place.
        annotator_keys: Keys of ``record`` that may hold an annotation label.

    Returns:
        True if at least one label was actually changed, otherwise False.
    """
    changed = False
    for annotator in annotator_keys:
        label = record.get(annotator)
        # Missing, empty and non-text values are left untouched instead of
        # crashing on ``.startswith``.
        if not isinstance(label, str) or not label.startswith('2_'):
            continue
        if label != '2_nebenthema':
            record[annotator] = '2_nebenthema'
            changed = True
    return changed


for json_file in path.glob('*.json'):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Only rewrite files whose labels changed, so re-running this cell does not
    # touch files that are already normalised.
    if collapse_secondary_labels(data):
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Collect, per corpus file, the labels given by each annotator and turn the
# result into a DataFrame (rows: file stems, columns: annotators).
annotation_dataframe = {}

for json_file in path.glob('*.json'):
    with json_file.open('r', encoding='utf-8') as f:
        data = json.load(f)

    # Keep only the annotators that have a value in this file; the key order
    # below determines the column order of the DataFrame.
    annotators = {}
    for annotator_key in (
        'student_annotator_1',
        'student_annotator_2',
        'expert_annotator_1',
        'expert_annotator_2',
    ):
        annotation = data.get(annotator_key)
        if annotation is not None:
            annotators[annotator_key] = annotation

    annotation_dataframe[json_file.stem] = annotators

df = pd.DataFrame.from_dict(annotation_dataframe, orient='index')


In [None]:
import pandas as pd
import numpy as np
from dawid_skene_model import list2array
from dawid_skene_model import DawidSkeneModel

# Map the textual annotation labels to integer class ids (and back).
label_mapping = {
    '1_hauptthema': 0,
    '2_nebenthema': 1,
    '3_kein_thema': 2
}
reverse_mapping = {v: k for k, v in label_mapping.items()}
class_num = len(label_mapping)

# Build the nested-list input for list2array: one entry per item, holding one
# list per annotator column of `df`; an empty list means "no annotation".
dataset_list = []
items = []

for item in df.index:
    worker_annotations = [[] for _ in range(len(df.columns))]
    has_annotations = False

    for worker_idx, worker in enumerate(df.columns):
        label = df.loc[item, worker]
        if pd.isna(label):
            continue
        try:
            class_idx = label_mapping[label]
        except KeyError:
            # Same exception type as a plain lookup, but say where the bad label is.
            raise KeyError(
                f"Unknown label {label!r} for item {item!r}, annotator {worker!r}"
            ) from None
        worker_annotations[worker_idx].append(class_idx)
        has_annotations = True

    # Items without any annotation are left out of the model input.
    if has_annotations:
        dataset_list.append(worker_annotations)
        items.append(item)

# Run Dawid-Skene model
dataset_tensor = list2array(class_num, dataset_list)
# NOTE: 10e-100 equals 1e-99; presumably chosen so that the run is bounded by
# max_iter rather than by the tolerance — TODO confirm.
model = DawidSkeneModel(class_num, max_iter=45, tolerance=10e-100)
marginal_predict, error_rates, worker_reliability, predict_label = model.run(dataset_tensor)

result_df = df.copy()
# Create the label column with object dtype from the start. Filling a float64
# NaN column with strings relies on silent upcasting, which pandas deprecated
# (FutureWarning since 2.1). NaN stays the marker for "no prediction".
result_df['dawid_skene_label'] = pd.Series(np.nan, index=result_df.index, dtype=object)

# Add Dawid-Skene predictions to the DataFrame
for i, item in enumerate(items):
    # Pick the class with the highest value in this item's row of predict_label.
    label_index = int(np.argmax(predict_label[i]))
    result_df.loc[item, 'dawid_skene_label'] = reverse_mapping[label_index]

    # One `prob_<label>` column per class; rows of items that were not fed to
    # the model keep NaN there.
    for class_idx in range(class_num):
        col_name = f'prob_{reverse_mapping[class_idx]}'
        result_df.loc[item, col_name] = predict_label[i, class_idx]

print(result_df)

In [None]:
# Let's see who's most reliable...
# `worker_reliability` is the third value returned by `model.run(dataset_tensor)`.
# Presumably it holds one entry per annotator, in the column order of `df` — TODO confirm.
print(worker_reliability)

In [None]:
# Compare the Dawid-Skene label of every item with the expert annotations.
# An item counts as "agreeing" when the Dawid-Skene label equals the label of
# at least one expert; items without a Dawid-Skene label or without any expert
# annotation stay <NA>.
expert_cols = ['expert_annotator_1', 'expert_annotator_2']

agreement_values = []
for idx in result_df.index:
    ds_label = result_df.loc[idx, 'dawid_skene_label']
    expert_annotations = [
        expert_label
        for expert_label in (result_df.loc[idx, col] for col in expert_cols)
        if pd.notna(expert_label)
    ]

    if pd.isna(ds_label) or not expert_annotations:
        agreement_values.append(pd.NA)
    else:
        agreement_values.append(ds_label in expert_annotations)

# Object dtype keeps the True / False / <NA> values exactly as collected.
result_df['expert_agreement'] = pd.Series(agreement_values, index=result_df.index, dtype=object)

# Element-wise comparison on purpose: <NA> entries must not count as disagreement.
is_disagreement = result_df['expert_agreement'] == False
disagreement_count = is_disagreement.sum()
print(f"Number of disagreements between Dawid-Skene and expert annotators: {disagreement_count}")

disagreements = result_df[is_disagreement]
print("\nItems where experts disagree with Dawid-Skene:")
print(disagreements[expert_cols + ['dawid_skene_label']])

lacks_expert = result_df['expert_agreement'].isna() & result_df['dawid_skene_label'].notna()
no_expert_count = lacks_expert.sum()
print(f"\nNumber of items with Dawid-Skene label but no expert annotations: {no_expert_count}")

In [None]:
# Safety switch: this cell overwrites the corpus files in place. It aborts
# unless the flag below is deliberately set to True.
WRITE_GOLD_LABELS = False
if not WRITE_GOLD_LABELS:
    raise ValueError("Are you sure you want to write into the files?")

# Store the Dawid-Skene label of every item as `gold_label` in its JSON file.
for json_file in path.glob('*.json'):
    filename = json_file.stem
    if filename not in result_df.index:
        continue

    gold_label = result_df.loc[filename, 'dawid_skene_label']
    # Items without a prediction carry a missing value; json.dump would write
    # it as the bare token NaN, which is not valid JSON, so skip those files.
    if pd.isna(gold_label):
        continue

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    data['gold_label'] = gold_label
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)