In [38]:
# Location of the dataset file; later cells read it as JSON lines (one JSON object per non-blank line).
path = "datasets/reverb45k_change/reverb45k_valid"

In [42]:
import json

# Parse the file at `path` as JSON lines, dropping any line that is empty once stripped.
with open(path, "r", encoding="utf-8") as f:
    data = list(map(json.loads, filter(str.strip, f)))


In [71]:
from typing import List, Tuple

def load_reverb45k(path: str = "reverb45k.json") -> Tuple[List[str], List[int], List[str]]:
    """Load ReVerb45k triples and label each one by its linked object entity.

    Every non-blank line of ``path`` is parsed as one JSON object. An entry is
    kept only when its ``"triple"`` items join (with single spaces) into a
    non-empty sentence and its ``"entity_linking"["object"]`` value is
    non-empty. Each distinct object entity receives an integer id in order of
    first appearance; entries whose id occurs only once are then discarded.

    Args:
        path: Path to the JSON-lines dataset file.

    Returns:
        A tuple ``(docs, labels, docs)``: the kept sentences, their integer
        labels (parallel to ``docs``), and the *same* ``docs`` list object
        again. Ids are assigned before the singleton filter runs, so the
        surviving label values are not necessarily contiguous.
    """
    # Local imports keep the function self-contained: it must not rely on
    # `Counter` having been imported by some other notebook cell.
    import json
    from collections import Counter

    print("Loading ReVerb45k dataset...")
    with open(path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    documents: List[str] = []
    labels: List[int] = []
    label_mapping = {}

    # Step 1: build sentences and assign a label id per object entity.
    for entry in data:
        # `or []` / `or {}` also cover keys that are present but null.
        sentence = " ".join(entry.get("triple") or []).strip()
        if not sentence:
            continue

        wiki_object = (entry.get("entity_linking") or {}).get("object", "")
        if not wiki_object:
            continue

        # The default is evaluated before insertion, so a new entity gets the
        # next unused id (0, 1, 2, ...).
        label = label_mapping.setdefault(wiki_object, len(label_mapping))
        documents.append(sentence)
        labels.append(label)

    # Step 2: drop singleton clusters, filtering docs and labels together so
    # the two lists stay aligned.
    label_counts = Counter(labels)
    kept = [(doc, lbl) for doc, lbl in zip(documents, labels) if label_counts[lbl] > 1]
    filtered_docs = [doc for doc, _ in kept]
    filtered_labels = [lbl for _, lbl in kept]

    unique_labels = list(set(filtered_labels))
    preview = unique_labels[:10] + ['...'] if len(unique_labels) > 10 else unique_labels

    print(f"Loaded {len(filtered_docs)} triples from ReVerb45k after filtering out singleton clusters.")
    print(f"Found {len(unique_labels)} unique object entities (clusters): {preview}")

    # NOTE(review): the first and third elements are the same list; no separate
    # raw data is returned. Kept as-is for caller compatibility — confirm intent.
    return filtered_docs, filtered_labels, filtered_docs


In [72]:
# load_reverb45k returns the filtered sentence list as both its first and third element,
# so `raw_data` and `documents` are bound to the same list object.
raw_data, labels_list, documents = load_reverb45k(path)

Loading ReVerb45k dataset...
Loaded 4149 triples from ReVerb45k after filtering out singleton clusters.
Found 845 unique object entities (clusters): [2049, 2, 4, 6, 2059, 12, 13, 2060, 16, 17, '...']


In [61]:
from collections import Counter

labels = labels_list
label_counts = Counter(labels)

# One boolean per label: True when that label occurs more than once.
keep_mask = [label_counts[lbl] > 1 for lbl in labels]

# Apply the same mask to the documents and to the labels.
filtered_docs = [doc for doc, keep in zip(documents, keep_mask) if keep]
filtered_labels = [lbl for lbl, keep in zip(labels, keep_mask) if keep]


In [64]:
# Number of distinct cluster labels left after the singleton filter.
len(set(filtered_labels))

845

In [73]:
import pandas as pd

# Load the clustering metrics table from CSV.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the CSV
# was written together with its index — consider index_col=0; confirm before changing.
df = pd.read_csv("clustering_metrics_results.csv")

In [79]:
# Show only the rows whose Dataset column equals "bank77".
is_bank77 = df["Dataset"] == "bank77"
df[is_bank77]

Unnamed: 0,Dataset,Method,Status,Accuracy,Precision,Recall,Macro_F1,Micro_F1,NMI,ARI,pairwise_precision,pairwise_recall,pairwise_f1
11,bank77,LLM Correction,Success,0.628571,0.637119,0.628571,0.60668,0.628571,0.810173,0.529171,,,
12,bank77,LLM Correction,Success,0.628571,0.637007,0.628571,0.606594,0.628571,0.810213,0.529106,,,
42,bank77,Naive KMeans,Success,0.627597,0.635778,0.627597,0.605919,0.627597,0.809713,0.528287,,,
43,bank77,LLM Correction,Success,0.628896,0.637371,0.628896,0.607003,0.628896,0.810268,0.529282,,,
44,bank77,Keyphrase Expansion - concatenated,Success,0.656818,0.65645,0.656818,0.632364,0.656818,0.834722,0.566615,,,
45,bank77,Keyphrase Expansion - average,Success,0.613961,0.597374,0.613961,0.583212,0.613961,0.819189,0.531569,,,
46,bank77,Keyphrase Expansion - weighted_0.1,Success,0.638961,0.65281,0.638961,0.615033,0.638961,0.822455,0.545906,,,
47,bank77,Keyphrase Expansion - weighted_0.2,Success,0.625325,0.612491,0.625325,0.592283,0.625325,0.824682,0.550801,,,
48,bank77,Keyphrase Expansion - weighted_0.3,Success,0.653571,0.648261,0.653571,0.631797,0.653571,0.829342,0.560577,,,
49,bank77,Keyphrase Expansion - weighted_0.4,Success,0.620455,0.622782,0.620455,0.592704,0.620455,0.825809,0.533389,,,


In [None]:
# Display the whole metrics table.
df