### Prepping data:

In [1]:
import re
from typing import List, Dict, Optional

HEADER_RE = re.compile(
    r"^\s*\[(?P<span>\d+-\d+)\]\s*(?P<label>[^,]+),\s*File:\s*(?P<filename>[^,]+)",
    flags=re.IGNORECASE
)

def extract_ann_text(path: str, encoding: str = "utf-8"):
    """
    Read `path` and return a list of dicts, each dict:
      {
        "span": "836-1053",
        "filename": "paper_16.txt",
        "label": "Coherence",
        "body": "the text after the header up to the next header (multi-line string)"
      }
    """
    with open(path, "r", encoding=encoding) as fh:
        lines = fh.readlines()

    # Find indices of header lines and capture groups
    headers = []  # list of tuples (index, matchobj)
    for i, line in enumerate(lines):
        m = HEADER_RE.match(line)
        if m:
            headers.append((i, m))

    results: List[Dict[str, Optional[str]]] = []
    if not headers:
        return results

    # For each found header, slice the following lines until next header (or EOF)
    for idx, (line_idx, match) in enumerate(headers):
        next_idx = headers[idx + 1][0] if (idx + 1) < len(headers) else len(lines)

        # Extract body: lines after the header line up to next header index
        body_lines = [ln.rstrip("\n") for ln in lines[line_idx + 1 : next_idx]]
        body = "\n".join(body_lines).strip() or None

        span = match.group("text").strip() if match.group("text") else None
        start_span = int(span.split("-")[0])
        end_span = int(span.split("-")[1])
        filename = match.group("file").strip() if match.group("file") else None
        label = match.group("label").strip() if match.group("label") else None

        results.append({
            "filename": filename,
            "start": start_span,
            "end": end_span,
            "label": label,
            "text": body, 
        })
        print(results)
    return results

In [None]:
import os 
import json

annotation_folder = "../../annotated_data/citation_annotations/annotations"
total_anns = {}
papers = [11, 13, 14, 15, 16, 17, 18, 19, 20, 100] # read the annotated overlapping paper IDs 

for ann in os.listdir(annotation_folder):
    total_anns[ann] = []
    for file in os.listdir(os.path.join(annotation_folder, ann)):
        if file.endswith(".json") and int(file.split("_")[1].split(".")[0]) in papers:
            file_path = os.path.join(annotation_folder, ann, file)
            with open(file_path, "r", encoding="utf-8") as f:
                annotations = json.load(f)
            total_anns[ann].extend(annotations)

    print(f"Extracted {len(total_anns[ann])} annotations from {ann}")

Extracted 36 annotations from Ed
Extracted 46 annotations from Ekaterina
Extracted 43 annotations from Iman
Extracted 41 annotations from Iraa


In [3]:
total_anns

{'Ed': [{'file': 'paper_100.txt',
   'start': 594,
   'end': 966,
   'label': 'Coherence',
   'user': 'Ed',
   'text': 'Zhang et al. (2019) improves an LSTM-\nbased encoder-decoder model with online vocabulary adaptation. For abbreviated pinyin, CoCAT (Huang et al., 2015) uses machine translation technology to reduce the number of the typing letters. Huang and Zhao (2018) propose an LSTM-based encoder-decoder approach with the concatenation of context words and abbreviated pinyin as input'},
  {'file': 'paper_100.txt',
   'start': 1110,
   'end': 1646,
   'label': 'Coherence',
   'user': 'Ed',
   'text': 'In addition, there are some works handling\npinyin with typing errors. Chen and Lee (2000) investigate a typing model which handles spelling correction in sentence-based pinyin input method. CHIME (Zheng et al., 2011) is a error-tolerant Chinese pinyin input method. It finds similar pinyin which will be further ranked with Chinese specific features. Jia and Zhao (2014) propose a joint

Grouping based on category:

In [4]:
from collections import defaultdict
from typing import Dict, List, Any

def group_by_paper_and_category(
    annotator_data: Dict[str, List[Dict[str, Any]]]
):
    """
    annotator_data:
      {
        "Ed": [ {file, start, end, label, ...}, ... ],
        "Iman": [...],
        ...
      }

    Returns:
      grouped[paper][category][annotator] -> list of annotation dicts
    """
    grouped = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list))
    )

    for annotator, anns in annotator_data.items():
        for ann in anns:
            paper = ann["file"]
            category = ann["label"]
            grouped[paper][category][annotator].append(ann)

    return grouped

grouped = group_by_paper_and_category(total_anns)

In [None]:
path = '../../annotated_data/overlapping_papers'

for paper, categories in grouped.items():
    for category, ann_by_annotator in categories.items():
        with open(os.path.join(path, f"{paper.split('.txt')[0]}_{category}.json"), "w", encoding="utf-8") as f:
            json.dump(ann_by_annotator, f, indent=2)

### Pairwise F1 token overlap:

In [22]:
from typing import Dict, List, Tuple, Any
from itertools import permutations

Span = Tuple[int, int]  # (start, end), end-exclusive


def span_iou(a: Span, b: Span) -> float:
    inter_start = max(a[0], b[0])
    inter_end = min(a[1], b[1])
    inter = max(0, inter_end - inter_start)
    if inter == 0:
        return 0.0
    union = (a[1] - a[0]) + (b[1] - b[0]) - inter
    return inter / union


def match_spans_overlap(
    gold: List[Span],
    pred: List[Span],
    iou_threshold: float = 0.5,
) -> Tuple[int, int, int]:
    """
    One-directional matching:
      gold = ground truth
      pred = predictions

    Returns (tp, fp, fn)
    """
    used_gold = set()
    tp = 0

    for p in pred:
        best_iou = 0.0
        best_idx = None
        for i, g in enumerate(gold):
            if i in used_gold:
                continue
            iou = span_iou(g, p)
            if iou >= iou_threshold and iou > best_iou:
                best_iou = iou
                best_idx = i

        if best_idx is not None:
            tp += 1
            used_gold.add(best_idx)

    fp = len(pred) - tp
    fn = len(gold) - tp
    return tp, fp, fn

def pairwise_micro_f1_across_annotators(
    annotator_spans: Dict[str, Dict[str, List[Span]]],
    *,
    iou_threshold: float = 0.5,
) -> Dict[str, Any]:
    """
    annotator_spans:
      annotator -> { doc_id -> [spans] }

    Returns:
      micro-averaged precision/recall/F1 across all ordered annotator pairs
    """
    total_tp = total_fp = total_fn = 0
    pair_details = {}

    annotators = list(annotator_spans.keys())

    for gold_ann, pred_ann in permutations(annotators, 2):
        tp = fp = fn = 0

        gold_docs = annotator_spans[gold_ann]
        pred_docs = annotator_spans[pred_ann]
        doc_ids = set(gold_docs.keys()) | set(pred_docs.keys())

        for d in doc_ids:
            gold_spans = gold_docs.get(d, [])
            pred_spans = pred_docs.get(d, [])
            t, f_p, f_n = match_spans_overlap(
                gold_spans,
                pred_spans,
                iou_threshold=iou_threshold,
            )
            tp += t
            fp += f_p
            fn += f_n

        total_tp += tp
        total_fp += fp
        total_fn += fn

        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0

        pair_details[(gold_ann, pred_ann)] = {
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "precision": precision,
            "recall": recall,
            "f1": f1,
        }

    micro_precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) else 0.0
    micro_recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) else 0.0
    micro_f1 = (
        2 * micro_precision * micro_recall / (micro_precision + micro_recall)
        if (micro_precision + micro_recall)
        else 0.0
    )

    return {
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
        "total_tp": total_tp,
        "total_fp": total_fp,
        "total_fn": total_fn,
        "pairwise_details": pair_details,
        "num_ordered_pairs": len(pair_details),
    }


In [26]:
categories = ['Unsupported claim', 'Lacks synthesis', 'Format', 'Coherence']
pairwise_f1_total = {'Unsupported claim': 0, 'Lacks synthesis': 0, 'Format': 0, 'Coherence': 0}

def process_ann_files(ann_data, label):
    spans_by_file = {}
    for d in ann_data:
        if d["label"] == label:
            fn = d["file"]
            s = int(d["start"])
            e = int(d["end"])
            
            if fn not in spans_by_file:
                spans_by_file[fn] = []
            spans_by_file[fn].append((s, e))

    return spans_by_file

for category in categories:
    ann1_spans = process_ann_files(total_anns['Iman'], category)
    ann2_spans = process_ann_files(total_anns['Ekaterina'], category)
    ann3_spans = process_ann_files(total_anns['Iraa'], category)
    ann4_spans = process_ann_files(total_anns['Ed'], category)

    ann_spans = {
        "ann1": ann1_spans,
        "ann2": ann2_spans,
        "ann3": ann3_spans,
        "ann4": ann4_spans,
    }

    pres = pairwise_micro_f1_across_annotators(
    ann_spans,
    iou_threshold=0.5,
)
    pairwise_f1_total[category] = pres

for category in pairwise_f1_total:
    print(f"Category: {category}")
    pairwise_results = pairwise_f1_total[category]
    print(round(pairwise_results["micro_f1"],4))
    # for (a, b), stats in pairwise_results.items():
    #     print(a, b, round(stats["f1"], 4))
    # print('\n')

Category: Unsupported claim
0.3333
Category: Lacks synthesis
0.4872
Category: Format
0.6897
Category: Coherence
0.3678


### Krippendorf's alpha

In [None]:
import re
_WORD_RE = re.compile(r"\S+")

def tokenize_with_offsets(text):
    return [(m.start(), m.end()) for m in _WORD_RE.finditer(text)]

def spans_to_token_labels(text, spans, inclusive_end=False):
    """
    Returns a list of 0/1 labels per token.
    1 = token overlaps any span
    """
    tokens = tokenize_with_offsets(text)
    labels = [0] * len(tokens)

    for s, e in spans:
        if inclusive_end:
            e += 1
        for i, (ts, te) in enumerate(tokens):
            if not (te <= s or ts >= e):
                labels[i] = 1
    return labels

def build_annotation_matrix(texts, annotator_spans, text_key="text", file_key="filename", inclusive_end=False):
    """
    texts: list of dicts with at least {file_key: filename, text_key: text}
    annotator_spans: dict[ann_id][filename] -> list of (start,end) spans
    """
    annotators = list(annotator_spans.keys())
    matrix = []

    for d in texts:
        fn = d[file_key]
        text = d[text_key]          # <-- string
        tokens = tokenize_with_offsets(text)

        # precompute per-annotator token labels (or None if missing)
        per_ann_labels = {}
        for ann in annotators:
            spans = annotator_spans[ann].get(fn)  # filename key
            if spans is None:
                per_ann_labels[ann] = None
            else:
                per_ann_labels[ann] = spans_to_token_labels(text, spans, inclusive_end=inclusive_end)

        # one row per token
        for t in range(len(tokens)):
            row = []
            for ann in annotators:
                lbls = per_ann_labels[ann]
                row.append(None if lbls is None else lbls[t])
            matrix.append(row)

    return matrix

from collections import Counter

def krippendorffs_alpha_nominal(annotation_matrix):
    """
    annotation_matrix: list of lists
    Each row = unit, each column = annotator
    Labels are categorical (0/1), None allowed
    """

    # Remove units with fewer than 2 annotations
    units = [
        [v for v in row if v is not None]
        for row in annotation_matrix
        if sum(v is not None for v in row) >= 2
    ]

    if not units:
        return float("nan")

    # Observed disagreement
    Do = 0.0
    n_pairs = 0

    for u in units:
        for i in range(len(u)):
            for j in range(i + 1, len(u)):
                n_pairs += 1
                Do += 0 if u[i] == u[j] else 1

    Do /= n_pairs

    # Expected disagreement
    all_labels = [v for u in units for v in u]
    label_counts = Counter(all_labels)
    N = sum(label_counts.values())

    De = 0.0
    for l1, c1 in label_counts.items():
        for l2, c2 in label_counts.items():
            if l1 != l2:
                De += c1 * c2

    De /= (N * (N - 1))

    if De == 0:
        return 1.0

    return 1 - (Do / De)


In [None]:
import os 

texts = []

path = "../../to_annotate"
for text in os.listdir(path):
    if 'paper' in text and text.endswith(".txt"):
        with open(os.path.join(path, text), "r", encoding="utf-8") as f:
            texts.append({"filename": text, 'text': (f.read()).split("References")[0]})

In [9]:
def process_ann_files(ann_data, label):
    spans_by_file = {}
    for d in ann_data:
        if d["label"] == label:
            fn = d["file"]
            s = int(d["start"])
            e = int(d["end"])
            
            if fn not in spans_by_file:
                spans_by_file[fn] = []
            spans_by_file[fn].append((s, e))

    return spans_by_file

In [10]:
categories = ['Unsupported claim', 'Lacks synthesis', 'Format', 'Coherence']
kripp_total = {'Unsupported claim': 0, 'Lacks synthesis': 0, 'Format': 0, 'Coherence': 0}

for category in categories:
    ann_spans_unsupp = {
        "ann1": process_ann_files(total_anns['Iman'], category),
        "ann2": process_ann_files(total_anns['Ekaterina'], category),
        "ann3": process_ann_files(total_anns['Iraa'], category),
        "ann4": process_ann_files(total_anns['Ed'], category),
    }

    matrix = build_annotation_matrix(texts, ann_spans_unsupp)
    alpha = krippendorffs_alpha_nominal(matrix)
    kripp_total[category] = alpha

print("Krippendorff's α for each category:")
for category, alpha in kripp_total.items():
    print(f"{category}: {round(alpha, 4)}")

Krippendorff's α for each category:
Unsupported claim: 0.3551
Lacks synthesis: 0.4375
Format: 0.8347
Coherence: 0.305


In [11]:
import re
from typing import Dict, List, Tuple, Any, Optional

Span = Tuple[int, int]  # (start, end) end-exclusive by default
_WORD_RE = re.compile(r"\S+")


def tokenize_with_offsets(text: str) -> List[Tuple[int, int]]:
    """Whitespace tokens with (start,end_exclusive) offsets."""
    return [(m.start(), m.end()) for m in _WORD_RE.finditer(text)]


def spans_to_token_labels(
    text: str,
    spans: List[Span],
    inclusive_end: bool = False,
) -> List[int]:
    """
    Convert spans -> token-level binary labels.
    Label = 1 if token overlaps any span, else 0.
    """
    toks = tokenize_with_offsets(text)
    labels = [0] * len(toks)

    for s, e in spans:
        if inclusive_end:
            e = e + 1
        for i, (ts, te) in enumerate(toks):
            if not (te <= s or ts >= e):  # overlap
                labels[i] = 1
    return labels


def fleiss_kappa(counts: List[List[int]]) -> float:
    """
    Fleiss' kappa from a matrix of counts per unit:
      counts[u] = [n_cat0, n_cat1, ..., n_catK-1]
    Requires constant number of raters per unit.

    Returns kappa in [-1, 1].
    """
    if not counts:
        return float("nan")

    N = len(counts)          # units
    k = len(counts[0])       # categories
    n = sum(counts[0])       # raters per unit

    # Validate constant n and shape
    for row in counts:
        if len(row) != k:
            raise ValueError("All rows must have same number of categories")
        if sum(row) != n:
            raise ValueError("Fleiss requires same number of ratings per unit")

    # p_j = proportion of all assignments to category j
    total = N * n
    p = [0.0] * k
    for row in counts:
        for j in range(k):
            p[j] += row[j]
    p = [pj / total for pj in p]

    # P_i = extent of agreement for unit i
    P = []
    for row in counts:
        s = sum(c * (c - 1) for c in row)
        P.append(s / (n * (n - 1)))

    P_bar = sum(P) / N
    P_e = sum(pj * pj for pj in p)

    if P_e == 1.0:
        return 1.0

    return (P_bar - P_e) / (1.0 - P_e)


def build_fleiss_counts_from_spans(
    texts: List[Dict[str, Any]],
    annotator_spans: Dict[str, Dict[str, List[Span]]],
    *,
    file_key: str = "file",
    text_key: str = "text",
    inclusive_end: bool = False,
    require_all: bool = True,
) -> List[List[int]]:
    """
    Build Fleiss count matrix from span annotations.

    texts: list of dicts with {file_key, text_key}
    annotator_spans: ann_id -> { filename -> [(s,e), ...] }

    Returns counts per token-unit: [n_out, n_in] for each token.

    If require_all=True: only include docs where ALL annotators have spans (possibly empty list).
      Missing doc for any annotator => doc skipped.
    If require_all=False: tokens with missing labels are skipped (not strict Fleiss).
    """
    annotators = list(annotator_spans.keys())
    counts = []

    for d in texts:
        fn = d[file_key]
        text = d[text_key]

        # Check doc availability
        if require_all:
            if any(fn not in annotator_spans[ann] for ann in annotators):
                continue

        # Build token labels per annotator (or None if missing)
        per_ann_labels: Dict[str, Optional[List[int]]] = {}
        for ann in annotators:
            spans = annotator_spans[ann].get(fn)
            if spans is None:
                per_ann_labels[ann] = None
            else:
                per_ann_labels[ann] = spans_to_token_labels(
                    text, spans, inclusive_end=inclusive_end
                )

        # number of tokens (use any available annotator or tokenize directly)
        tok_offsets = tokenize_with_offsets(text)
        T = len(tok_offsets)

        for t in range(T):
            labels_t = []
            for ann in annotators:
                lbls = per_ann_labels[ann]
                if lbls is None:
                    labels_t.append(None)
                else:
                    labels_t.append(lbls[t])

            if require_all:
                # all 4 are present by construction
                n_in = sum(labels_t)  # labels are 0/1
                n_out = len(labels_t) - n_in
                counts.append([n_out, n_in])
            else:
                # drop this token if any missing label
                if any(v is None for v in labels_t):
                    continue
                n_in = sum(labels_t)
                n_out = len(labels_t) - n_in
                counts.append([n_out, n_in])

    return counts


def fleiss_kappa_from_spans(
    texts: List[Dict[str, Any]],
    annotator_spans: Dict[str, Dict[str, List[Span]]],
    *,
    file_key: str = "filename",
    text_key: str = "text",
    inclusive_end: bool = False,
    require_all: bool = True,
) -> Dict[str, Any]:
    """
    Convenience wrapper: builds counts and computes Fleiss' kappa.
    """
    counts = build_fleiss_counts_from_spans(
        texts,
        annotator_spans,
        file_key=file_key,
        text_key=text_key,
        inclusive_end=inclusive_end,
        require_all=require_all,
    )
    kappa = fleiss_kappa(counts)
    return {
        "fleiss_kappa": kappa,
        "num_token_units": len(counts),
        "require_all": require_all,
        "num_annotators": len(annotator_spans),
    }


In [51]:
total_fleiss_kappa = {}

for category in categories:
    ann_spans_unsupp = {
        "ann1": process_ann_files(total_anns['Iman'], category),
        "ann2": process_ann_files(total_anns['Ekaterina'], category),
        "ann3": process_ann_files(total_anns['Iraa'], category),
        "ann4": process_ann_files(total_anns['Ed'], category),
    }
    res = fleiss_kappa_from_spans(
        texts,
        ann_spans_unsupp,
        require_all=True,      # strict Fleiss
        inclusive_end=True,   # set True if your end indices are inclusive
    )

    total_fleiss_kappa[category] = res

for category in total_fleiss_kappa:
    print(f"Category: {category}: ")
    print(f"Fleiss' kappa: {round(total_fleiss_kappa[category]['fleiss_kappa'], 4)}")
    print(f"Number of token units: {total_fleiss_kappa[category]['num_token_units']}")
    print('\n')

Category: Unsupported claim: 
Fleiss' kappa: 0.4091
Number of token units: 3492


Category: Lacks synthesis: 
Fleiss' kappa: 0.2222
Number of token units: 998


Category: Format: 
Fleiss' kappa: 0.8561
Number of token units: 1432


Category: Coherence: 
Fleiss' kappa: 0.3154
Number of token units: 1385




### Cohens Kappa:

In [18]:
import re
from itertools import combinations
from collections import defaultdict
from typing import Dict, List, Tuple, Any
from sklearn.metrics import cohen_kappa_score

Span = Tuple[int, int]
_WORD_RE = re.compile(r"\S+")

def tokenize_with_offsets(text: str):
    return [(m.start(), m.end()) for m in _WORD_RE.finditer(text)]

def spans_to_token_labels(text: str, spans: List[Span], inclusive_end: bool = False) -> List[int]:
    tokens = tokenize_with_offsets(text)
    labels = [0] * len(tokens)
    for s, e in spans:
        if inclusive_end:
            e += 1
        for i, (ts, te) in enumerate(tokens):
            if not (te <= s or ts >= e):
                labels[i] = 1
    return labels

def group_spans_by_file_for_label(
    ann_list: List[Dict[str, Any]],
    label: str,
) -> Dict[str, List[Span]]:
    out = defaultdict(list)
    for d in ann_list:
        if d.get("label") != label:
            continue
        s, e = int(d["start"]), int(d["end"])
        if s >= 0 and e > s:
            out[d["file"]].append((s, e))
    return dict(out)

def mean_pairwise_cohens_kappa_by_category(
    texts: List[Dict[str, Any]],                 # list of {"filename","text"}
    total_anns: Dict[str, List[Dict[str, Any]]], # {"Ed":[...], ...}
    categories: List[str],
    *,
    inclusive_end: bool = False,
    filename_key: str = "filename",
    text_key: str = "text",
) -> Dict[str, Any]:
    """
    Returns:
      {
        category: {
          "pairwise_kappa": {(annA,annB): kappa, ...},
          "mean_kappa": float,
          "token_counts": {(annA,annB): N_tokens, ...},
          "num_pairs": int
        },
        ...
      }
    """
    annotators = list(total_anns.keys())
    results = {}

    for category in categories:
        # spans_by_ann[ann][filename] -> spans for this category
        spans_by_ann = {
            ann: group_spans_by_file_for_label(anns, category)
            for ann, anns in total_anns.items()
        }

        pairwise_kappa = {}
        token_counts = {}

        for a, b in combinations(annotators, 2):
            y_a, y_b = [], []

            for rec in texts:
                fn = rec[filename_key]
                text = rec[text_key]

                if fn not in spans_by_ann[a] or fn not in spans_by_ann[b]:
                    continue  # only docs both annotated (for this category)

                labels_a = spans_to_token_labels(text, spans_by_ann[a][fn], inclusive_end=inclusive_end)
                labels_b = spans_to_token_labels(text, spans_by_ann[b][fn], inclusive_end=inclusive_end)

                y_a.extend(labels_a)
                y_b.extend(labels_b)

            kappa = float("nan") if len(y_a) == 0 else cohen_kappa_score(y_a, y_b)
            pairwise_kappa[(a, b)] = kappa
            token_counts[(a, b)] = len(y_a)

        valid = [k for k in pairwise_kappa.values() if k == k]  # drop NaNs
        mean_kappa = sum(valid) / len(valid) if valid else float("nan")

        results[category] = {
            "pairwise_kappa": pairwise_kappa,
            "mean_kappa": mean_kappa,
            "token_counts": token_counts,
            "num_pairs": len(pairwise_kappa),
        }

    return results


In [20]:
categories = ["Unsupported claim", "Format", "Coherence", "Lacks synthesis"]

res_by_cat = mean_pairwise_cohens_kappa_by_category(
    texts,
    total_anns,
    categories=categories,
    inclusive_end=False,   # set True if your end offsets are inclusive
)

for cat in categories:
    print(cat, "mean κ =", round(res_by_cat[cat]["mean_kappa"], 4))
    for pair, k in res_by_cat[cat]["pairwise_kappa"].items():
        print(" ", pair, round(k, 4), "tokens:", res_by_cat[cat]["token_counts"][pair])


Unsupported claim mean κ = 0.4042
  ('Ed', 'Ekaterina') 0.3129 tokens: 3668
  ('Ed', 'Iman') 0.3825 tokens: 4292
  ('Ed', 'Iraa') 0.4832 tokens: 3879
  ('Ekaterina', 'Iman') 0.3951 tokens: 3995
  ('Ekaterina', 'Iraa') 0.3281 tokens: 3492
  ('Iman', 'Iraa') 0.5235 tokens: 3492
Format mean κ = 0.8345
  ('Ed', 'Ekaterina') 0.8552 tokens: 2044
  ('Ed', 'Iman') 0.8875 tokens: 1432
  ('Ed', 'Iraa') 0.7971 tokens: 2044
  ('Ekaterina', 'Iman') 0.7973 tokens: 1432
  ('Ekaterina', 'Iraa') 0.8206 tokens: 2044
  ('Iman', 'Iraa') 0.8496 tokens: 1759
Coherence mean κ = 0.3254
  ('Ed', 'Ekaterina') -0.0391 tokens: 1712
  ('Ed', 'Iman') 0.6962 tokens: 1385
  ('Ed', 'Iraa') 0.441 tokens: 1712
  ('Ekaterina', 'Iman') 0.1045 tokens: 1561
  ('Ekaterina', 'Iraa') 0.183 tokens: 1888
  ('Iman', 'Iraa') 0.5669 tokens: 1561
Lacks synthesis mean κ = 0.4305
  ('Ed', 'Ekaterina') 0.271 tokens: 998
  ('Ed', 'Iman') 0.488 tokens: 1979
  ('Ed', 'Iraa') 0.5421 tokens: 1979
  ('Ekaterina', 'Iman') 0.6693 tokens: 1610
