In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
denis = pd.read_json("denis.json")
slava = pd.read_json("slava.json")
gena = pd.read_json("gena.json")

In [3]:
def preproc_json(df: pd.DataFrame, slava_texts=False) -> pd.DataFrame:

    dfs = []
    for id_ in df.id:
        if slava_texts:
            id_ -= 800
        try:
            temp_df = pd.json_normalize(
                              pd.json_normalize(
                                  pd.json_normalize(
                                      pd.json_normalize(df.annotations)[0]
                                  ).result
                              ).T[id_-1])[["value.text", "value.labels"]].dropna()
            temp_df["value.labels"] = temp_df["value.labels"].astype(str)
            temp_df["value.text"] = temp_df["value.text"].str.lower().str.strip()
            temp_df["id"] = id_
            dfs.append(temp_df)
        except:
            pass
    return pd.concat(dfs, ignore_index=True)

In [4]:
preproc_denis = preproc_json(denis)
preproc_slava = preproc_json(slava, slava_texts=True)
preproc_gena = preproc_json(gena)

In [5]:
def get_text_annotators(preproc_denis: pd.DataFrame, 
                        preproc_gena: pd.DataFrame,
                        preproc_slava: pd.DataFrame,
                        id_: int) -> pd.DataFrame:

    t1 = preproc_denis[preproc_denis["id"] == id_].copy()
    t2 = preproc_gena[preproc_gena["id"] == id_].copy()
    t3 = preproc_slava[preproc_slava["id"] == id_].copy()

    text_df = t1.merge(t2.merge(t3, how="cross").drop_duplicates(), how="cross").drop_duplicates()
    text_df = text_df[(text_df["value.text"] == text_df["value.text_x"])
                      | (text_df["value.text"] == text_df["value.text_y"])
                      | (text_df["value.text_x"] == text_df["value.text_y"])]

    text_df["word"] = text_df[['value.text', 'value.text_x', 'value.text_y']].mode(axis=1)
    text_df["word_count"] = text_df[['value.text', 'value.text_x', 'value.text_y']]\
                            .apply(pd.Series.value_counts, axis=1).max(axis=1)
    text_df = text_df.sort_values(by=["word", "word_count"], ascending=False).drop_duplicates(subset="word")

    text_df["value.labels"] = np.where((text_df["value.text"] != text_df["value.text_x"])
                                    & (text_df["value.text"] != text_df["value.text_y"]), None, text_df["value.labels"])

    text_df["value.labels_x"] = np.where((text_df["value.text_x"] != text_df["value.text"])
                                   & (text_df["value.text_x"] != text_df["value.text_y"]), None, text_df["value.labels_x"])

    text_df["value.labels_y"] = np.where((text_df["value.text_y"] != text_df["value.text"])
                                    & (text_df["value.text_y"] != text_df["value.text_x"]), None, text_df["value.labels_y"])

    text_df = text_df.sort_values(by=["word", "word_count"], ascending=False).drop_duplicates(subset="word")

    text_df = text_df[["id", "word", "value.labels", "value.labels_x", "value.labels_y"]]\
              .rename(columns = {"value.labels": "denis_annotation", 
                                 "value.labels_x": "gennady_annotation",
                                 "value.labels_y": "vyacheslav_annotation"})
    return text_df

def final_annotators(preproc_denis: pd.DataFrame, 
                     preproc_gena: pd.DataFrame,
                     preproc_slava: pd.DataFrame)-> pd.DataFrame:
    dfs = []
    for id_ in tqdm(preproc_denis.id.unique()):
        try:
            dfs.append(get_text_annotators(preproc_denis, preproc_gena, preproc_slava, id_))
        except:
            pass
    return pd.concat(dfs, ignore_index=True)

In [6]:
annotators_df = final_annotators(preproc_denis, preproc_gena, preproc_slava)

  0%|          | 0/29 [00:00<?, ?it/s]

In [7]:
annotators_df

Unnamed: 0,id,word,denis_annotation,gennady_annotation,vyacheslav_annotation
0,1,нож,['PRODUCT'],['PRODUCT'],
1,1,на мели,['LOC'],['LOC'],['LOC']
2,2,свеча,['PRODUCT'],['PRODUCT'],
3,2,мире,['LOC'],['LOC'],
4,2,между строчек,,['LOC'],['LOC']
...,...,...,...,...,...
115,28,в песчаных белых снегах,['LOC'],,['LOC']
116,29,на окнах,['LOC'],['LOC'],['LOC']
117,29,в слезах,,['LOC'],['LOC']
118,29,в прошлом,,['LOC'],['LOC']


In [8]:
from sklearn.preprocessing import OrdinalEncoder

categories = list(set(annotators_df.denis_annotation.drop_duplicates().tolist()+\
             annotators_df.gennady_annotation.drop_duplicates().tolist()+\
             annotators_df.vyacheslav_annotation.drop_duplicates().tolist()))

encoder = OrdinalEncoder()
encoder.fit(np.array(categories).reshape(-1, 1))

OrdinalEncoder()

In [9]:
annotators_df = annotators_df.copy()
annotators_df["denis_annotation_labeled"] = encoder.transform(annotators_df.denis_annotation.values.reshape(-1, 1))
annotators_df["gennady_annotation_labeled"] = encoder.transform(annotators_df.gennady_annotation.values.reshape(-1, 1))
annotators_df["vyacheslav_annotation_labeled"] = encoder.transform(annotators_df.vyacheslav_annotation.values.reshape(-1, 1))

In [10]:
import krippendorff

krippendorff.alpha(reliability_data=annotators_df[["denis_annotation_labeled", 
                                                   "gennady_annotation_labeled", 
                                                   "vyacheslav_annotation_labeled"]], level_of_measurement='nominal')

0.05105170656944269

In [11]:
annotators_df = annotators_df.dropna()

In [12]:
matrix = annotators_df[["denis_annotation_labeled", 
                        "gennady_annotation_labeled", 
                        "vyacheslav_annotation_labeled"]].corr()

agreement = matrix.values[np.triu_indices_from(matrix.values, k=1)].mean()

# результат
print("Среднее значение согласия аннотаторов: ", agreement)

Среднее значение согласия аннотаторов:  0.8698213824410473


In [13]:
from sklearn.metrics import cohen_kappa_score


kappa_1 = cohen_kappa_score(annotators_df.denis_annotation_labeled, 
                          annotators_df.gennady_annotation_labeled, weights=None)

kappa_2 = cohen_kappa_score(annotators_df.denis_annotation_labeled, 
                            annotators_df.vyacheslav_annotation_labeled, weights=None)

kappa_3 = cohen_kappa_score(annotators_df.gennady_annotation_labeled, 
                            annotators_df.vyacheslav_annotation_labeled, weights=None)

kappa = (kappa_1+kappa_2+kappa_3)/3

print("Cohen's kappa coefficient:", kappa)

Cohen's kappa coefficient: 0.788650375925894
