In [57]:
import json
import argparse
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
import numpy as np

In [2]:
def read_json(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON is exchanged as UTF-8; pin the encoding so the result does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [4]:
def find_ids_with_elements(dendrogram, gt_solution):
    """Locate the dendrogram leaf that holds each ground-truth element.

    A node is a dict with a ``"children"`` list. When every child is a dict
    the node is internal; when every child is a string the node is a leaf and
    the strings are its elements. A node whose ``"children"`` is not a list,
    or is a mix of dicts, strings or other types, contributes nothing.

    Args:
        dendrogram: Root node of the tree.
        gt_solution: Container of elements to look for (tested with ``in``).

    Returns:
        dict mapping every element of ``gt_solution`` found in a leaf to that
        leaf's path: the list of child indices followed from the root (the
        root itself is ``[]``). If an element occurs in several leaves, the
        last one in depth-first, left-to-right order wins. Elements of the
        same leaf share one path list, so treat the paths as read-only.

    Raises:
        KeyError: If a visited node has no ``"children"`` key.
    """
    # Hash-based lookup instead of a linear scan per leaf element. Only done
    # for list/tuple inputs so that other containers keep their own ``in``
    # semantics; unhashable members fall back to the original container.
    wanted = gt_solution
    if isinstance(gt_solution, (list, tuple)):
        try:
            wanted = frozenset(gt_solution)
        except TypeError:
            wanted = gt_solution

    id_map = {}

    # Explicit stack rather than recursion: dendrograms can be far deeper
    # than Python's default recursion limit.
    pending = [(dendrogram, [])]
    while pending:
        node, node_id = pending.pop()
        children = node["children"]
        if not isinstance(children, list):
            continue

        if all(isinstance(child, dict) for child in children):
            # Push in reverse so children are popped left-to-right, which
            # keeps the pre-order visiting order (and thus last-wins results).
            for i in range(len(children) - 1, -1, -1):
                pending.append((children[i], node_id + [i]))
        elif all(isinstance(child, str) for child in children):  # Leaf node
            for element in children:
                if element in wanted:
                    id_map[element] = node_id

    return id_map

In [6]:
def analyze_solution_ids(gt_solution_ids):
    """Count how many elements were assigned to each dendrogram path.

    Args:
        gt_solution_ids: Mapping of element -> path (a sequence of child
            indices), as returned by ``find_ids_with_elements``.

    Returns:
        A plain dict mapping each distinct path, converted to a tuple, to the
        number of elements that carry it. Keys appear in the order in which
        each path is first seen in ``gt_solution_ids``.
    """
    # Paths arrive as lists; tuples are needed to use them as dict keys.
    occurrences = Counter(tuple(path) for path in gt_solution_ids.values())
    # Hand back a regular dict so the type and repr match a hand-built tally.
    return dict(occurrences)

In [13]:
# Input file paths. Both are empty strings in this saved copy and must be set
# to real JSON files before the loading cell runs: open("") fails.
# gt_filename: JSON file loaded into gt_data.
gt_filename = ""
# dendrogram_index: JSON file loaded into dendrogram_data.
dendrogram_index = ""

In [14]:
# Parse both input files; each call returns the decoded JSON document.
gt_data = read_json(gt_filename)
dendrogram_data = read_json(dendrogram_index)

In [15]:
# Extract the ground-truth elements. A missing "gt_solution" key silently
# yields an empty list, in which case every later result is empty as well.
gt_solution = gt_data.get("gt_solution", [])

In [16]:
# Map each ground-truth element found in a dendrogram leaf to that leaf's
# path (the list of child indices followed from the root).
gt_solution_ids = find_ids_with_elements(dendrogram_data, gt_solution)

In [19]:
# Tally ground-truth elements per leaf path, then rank the leaves by that
# tally, largest first (leaves with equal counts keep their original order).
analysis = analyze_solution_ids(gt_solution_ids)
sorted_analysis = sorted(analysis.items(), key=lambda pair: pair[1], reverse=True)

In [64]:
# Display the per-leaf counts: keys are leaf paths as tuples, values are the
# number of ground-truth elements found in that leaf.
analysis

{(0, 0, 1, 1, 1, 0, 1, 0, 0): 1,
 (0, 0, 1, 1, 1, 1, 0, 0): 5,
 (1, 0, 1, 1, 1, 0, 0, 0): 11,
 (1, 0, 1, 1, 1, 0, 0, 1): 5,
 (1, 0, 1, 1, 1, 0, 1, 0): 29,
 (1, 0, 1, 1, 1, 0, 1, 1): 1,
 (1, 0, 1, 1, 1, 1, 0, 0, 0, 0): 15,
 (1, 0, 1, 1, 1, 1, 0, 0, 0, 1): 18,
 (1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0): 1,
 (1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0): 2,
 (1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0): 28,
 (1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0): 10,
 (1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1): 13,
 (1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1): 1,
 (1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0): 1,
 (1, 1, 0, 1, 0, 1, 0, 0, 0): 3,
 (1, 1, 0, 1, 0, 1, 0, 0, 1): 6,
 (1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0): 1,
 (1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1): 9,
 (1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0): 1,
 (1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0): 1,
 (1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0): 16,
 (1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1): 6,
 (1, 1, 0, 1, 0, 1, 0, 1, 1, 0): 5,
 (1, 1, 0, 1, 0, 1, 

In [20]:
# Display the (leaf path, count) pairs ordered from largest to smallest count.
sorted_analysis

[((1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0), 38),
 ((1, 0, 1, 1, 1, 0, 1, 0), 29),
 ((1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0), 28),
 ((1, 0, 1, 1, 1, 1, 0, 0, 0, 1), 18),
 ((1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0), 16),
 ((1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0), 16),
 ((1, 0, 1, 1, 1, 1, 0, 0, 0, 0), 15),
 ((1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1), 13),
 ((1, 0, 1, 1, 1, 0, 0, 0), 11),
 ((1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0), 10),
 ((1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1), 9),
 ((1, 1, 0, 1, 0, 1, 0, 0, 1), 6),
 ((1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1), 6),
 ((0, 0, 1, 1, 1, 1, 0, 0), 5),
 ((1, 0, 1, 1, 1, 0, 0, 1), 5),
 ((1, 1, 0, 1, 0, 1, 0, 1, 1, 0), 5),
 ((1, 1, 0, 1, 0, 1, 0, 0, 0), 3),
 ((1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0), 3),
 ((1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0), 2),
 ((0, 0, 1, 1, 1, 0, 1, 0, 0), 1),
 ((1, 0, 1, 1, 1, 0, 1, 1), 1),
 ((1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0), 1),
 ((1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1), 1),
 ((1, 0, 1, 1, 1, 1, 1, 1, 1,