In [None]:
import csv
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
from scipy.spatial.distance import pdist, squareform
from openai import OpenAI
import json
import time
from collections import Counter
import random
import math
import re
import pandas as pd
from scipy.optimize import linear_sum_assignment
import ast
import networkx as nx
import os
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Optional
import builtins
from lshashpy3 import LSHash
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

In [66]:
# Route outgoing HTTP(S) requests through a proxy listening on localhost:7890.
os.environ["http_proxy"] = "http://localhost:7890"
os.environ["https_proxy"] = "http://localhost:7890"

# Read the key from the OPENAI_API_KEY environment variable so the secret never has to be
# written into the notebook; the original placeholder string is kept as the fallback.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "your api key ")
)

# Instruction prefix prepended to every record list sent to the model. It asks for a bare
# two-dimensional list of record IDs, which the reply parsers extract with regular expressions.
pre_prompt = ("Please classify the following records into a two-dimensional list. Each element of the array "
              "should be a group, containing the record IDs of that group (e.g., 1, 2, 3, etc.). Ensure that "
              "each record ID is classified exactly once and appear once in the 2D array, without any "
              "duplication or omission.The output should be a two-dimensional list with no additional information!\n")

In [None]:


def normalize_and_clean(text):
    """Return a lower-cased, punctuation-free, single-spaced version of ``text``.

    Anything that is not a string, or is a string made only of whitespace,
    yields the empty string. Otherwise every character other than ``a-z``,
    ``0-9`` and whitespace is dropped (after lower-casing), runs of whitespace
    collapse to one space, and the result is stripped.
    """
    is_usable = isinstance(text, str) and bool(text.strip())
    if not is_usable:
        return ''
    alnum_only = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return re.sub(r'\s+', ' ', alnum_only).strip()

def read_csv_and_clean_with_ids(file_path):
    """Load a CSV and return ``[(record_id, [normalized field, ...]), ...]``.

    The first column is taken as the record ID (converted with ``int``); every
    other column is a field. Each field is stringified and passed through
    ``normalize_and_clean``; fields that normalize to the empty string are
    dropped from the record.

    Raises:
        ValueError: if the CSV has no rows or fewer than two columns.
    """
    df = pd.read_csv(file_path)
    if df.empty or df.shape[1] < 2:
        raise ValueError("CSV is none ")
    ids = df.iloc[:, 0]

    # Build a new frame instead of mutating a slice of ``df`` in place: same values,
    # but no in-place write on a derived object.
    data = df.iloc[:, 1:].replace("", np.nan).fillna("")

    cleaned_data = []
    for id_, row in zip(ids, data.values):
        # Normalize each field once, then keep the non-empty results.
        normalized_fields = (normalize_and_clean(str(item)) for item in row)
        cleaned_row = [field for field in normalized_fields if field]
        cleaned_data.append((int(id_), cleaned_row))

    return cleaned_data

def build_inverted_index(cleaned_data):
    """Map every non-empty field value to the set of record IDs containing it.

    ``cleaned_data`` is an iterable of ``(record_id, fields)`` pairs; the
    result is a ``defaultdict(set)`` keyed by field value.
    """
    postings = defaultdict(set)
    for rec_id, values in cleaned_data:
        # filter(None, ...) keeps only truthy (non-empty) field values.
        for value in filter(None, values):
            postings[value].add(rec_id)
    return postings

def calculate_similarity_matrix(cleaned_data, inverted_index):
    """Compute the pairwise Jaccard similarity between records' field sets.

    Args:
        cleaned_data: iterable of ``(record_id, fields)`` pairs.
        inverted_index: accepted for interface compatibility; not used here.

    Returns:
        ``(similarity_matrix, record_ids)`` where ``similarity_matrix`` is an
        ``n x n`` float array whose rows/columns follow the order of
        ``record_ids``. The diagonal is 1.0; a pair whose union is empty
        scores 0.0.
    """
    record_ids = [record_id for record_id, _ in cleaned_data]
    record_sets = {record_id: set(fields) for record_id, fields in cleaned_data}
    # Resolve the per-row set once instead of on every (i, j) visit.
    field_sets = [record_sets[record_id] for record_id in record_ids]

    n = len(record_ids)
    similarity_matrix = np.zeros((n, n))
    for i in range(n):
        similarity_matrix[i, i] = 1.0
        set_i = field_sets[i]
        # Jaccard is symmetric, so compute each unordered pair once and mirror it.
        for j in range(i + 1, n):
            set_j = field_sets[j]
            intersection = len(set_i & set_j)
            union = len(set_i) + len(set_j) - intersection
            if union > 0:
                score = intersection / union
                similarity_matrix[i, j] = score
                similarity_matrix[j, i] = score

    return similarity_matrix, record_ids

def canopy_clustering(similarity_matrix, T1, T2):
    """Split the row indices of a square similarity matrix into canopy-style groups.

    Args:
        similarity_matrix: square 2-D array supporting ``.shape`` and
            ``[i, j]`` indexing; entries are compared against the thresholds.
        T1: tight similarity threshold; must be strictly greater than ``T2``.
        T2: loose similarity threshold.

    Returns:
        ``(canopies, block_T1)``: two lists with one entry per chosen center.
        ``block_T1[k]`` holds the indices whose similarity to the k-th center
        is ``>= T1``; these are removed from the unassigned pool.
        ``canopies[k]`` holds the remaining candidates of that center, i.e.
        those with similarity ``>= T2`` but ``< T1``; they stay unassigned and
        can show up again for a later center.

    Raises:
        ValueError: if ``T1 <= T2`` or the matrix is not square.
    """
    
    if T1 <= T2:
        raise ValueError("T1 must larger than T2")
    if similarity_matrix.shape[0] != similarity_matrix.shape[1]:
        raise ValueError("wrong!")


    n = similarity_matrix.shape[0]
    unassigned_points = set(range(n))  # indices not yet placed in any T1 block
    canopies = []  # per center: candidates between T2 and T1
    block_T1 = []  # per center: indices at or above T1

    while unassigned_points:
        # set.pop() removes an arbitrary element, so the choice of center (and hence
        # the output order) follows the set's internal ordering.
        center = unassigned_points.pop()
        candidate_points = [
            idx for idx in unassigned_points if similarity_matrix[center, idx] >= T2
        ]
        candidate_points.append(center)  # the center is always a candidate of its own canopy
        current_canopy = set(candidate_points)
        # The center itself lands here only when similarity_matrix[center, center] >= T1.
        strictly_removed = [
            idx for idx in candidate_points if similarity_matrix[center, idx] >= T1
        ]
        tmp = []
        for idx in strictly_removed:
            unassigned_points.discard(idx)  # tight members never become centers later
            tmp.append(idx)
            current_canopy.discard(idx)  # keep only the loose-but-not-tight members
            
        block_T1.append(tmp)
        canopies.append(list(current_canopy))

    return canopies, block_T1



In [40]:

def get_ground_truth(file_path):
    """Build ground-truth clusters from a CSV of matching record-ID pairs.

    The file is read as MacRoman-encoded CSV; the first row is treated as a
    header and skipped. Every following row must hold exactly two IDs that
    belong to the same entity. Pairs are merged transitively with a
    union-find structure, the number of resulting groups is printed, and the
    groups are returned with their IDs converted to ``int``.

    Groups, and the IDs inside each group, are ordered by first appearance in
    the file, so the result is the same on every run. Blank rows are skipped
    and a file without data rows yields ``[]``.

    Raises:
        ValueError: if a non-blank row does not have exactly two columns, or
            an ID cannot be converted to ``int``.
    """
    parent = {}  # record id (as read from the file) -> parent id; insertion-ordered

    def find(node):
        # Iterative find with path compression: no recursion limit on long chains.
        root = node
        while parent[root] != root:
            root = parent[root]
        while parent[node] != root:
            following = parent[node]
            parent[node] = root
            node = following
        return root

    def union(left, right):
        root_left, root_right = find(left), find(right)
        if root_left != root_right:
            parent[root_right] = root_left

    with open(file_path, newline='', encoding='MacRoman') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            if len(row) != 2:
                raise ValueError(f"expected 2 columns per ground-truth row, got {len(row)}: {row}")
            ltable_id, rtable_id = row
            parent.setdefault(ltable_id, ltable_id)
            parent.setdefault(rtable_id, rtable_id)
            union(ltable_id, rtable_id)

    entity_groups = {}
    for record_id in parent:
        entity_groups.setdefault(find(record_id), []).append(record_id)

    print(len(entity_groups))
    return [[int(record_id) for record_id in members] for members in entity_groups.values()]


In [41]:
def get_data(id_list, file_path):
    """Return the CSV rows for ``id_list`` as newline-separated text.

    The file is read as MacRoman-encoded CSV with a header row. For each ID in
    ``id_list`` (in that order) the first row whose ``ID`` column equals
    ``str(id)`` contributes one line made of its other columns joined by
    commas. IDs with no matching row are skipped.

    Raises:
        KeyError: if the file has data rows but no ``ID`` column.
    """
    with open(file_path, 'r', encoding='MacRoman') as file:
        reader = csv.DictReader(file)
        value_columns = [key for key in (reader.fieldnames or []) if key != 'ID']
        # Index the rows once instead of rescanning the whole file for every id;
        # setdefault keeps the first row when an ID is repeated.
        row_by_id = {}
        for row in reader:
            row_by_id.setdefault(row['ID'], row)

    lines = []
    for r_id in id_list:
        row = row_by_id.get(str(r_id))
        if row is not None:
            lines.append(','.join(str(row[key]) for key in value_columns))
    return '\n'.join(lines)

def vectorize_data(text):
    """Embed each line of ``text`` with the ``all-MiniLM-L6-v2`` sentence model.

    ``text`` is split on ``'\\n'`` and the resulting list of lines is passed
    to ``SentenceTransformer.encode``; whatever ``encode`` returns is returned
    unchanged.

    The model is loaded on the first call and cached on the function object,
    so repeated calls (one per block in the pipeline) do not reload it.
    """
    model = getattr(vectorize_data, "_model", None)
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        vectorize_data._model = model
    embeddings = model.encode(text.split('\n'))
    return embeddings

def elbow_method(embeddings, max_k=5):
    """Pick a cluster count for ``embeddings`` with the elbow heuristic.

    KMeans is fitted for ``k = 1 .. min(max_k, n_samples)`` and the inertia of
    each fit is recorded. The chosen ``k`` is the one whose (normalized)
    inertia lies furthest below the straight line joining the first and last
    points of the curve, i.e. the point where adding clusters stops paying off.
    Taking the minimum inertia instead would nearly always return the largest
    ``k``, because inertia keeps shrinking as ``k`` grows.

    Args:
        embeddings: 2-D array with one row per sample, or ``None``.
        max_k: largest cluster count to try.

    Returns:
        The selected ``k``. Returns 1 when there are fewer than two samples,
        when fewer than two candidate values exist, or when more clusters do
        not reduce the inertia at all. With exactly two candidates there is no
        interior point to examine, so 2 is returned.
    """
    if embeddings is None or embeddings.shape[0] < 2:
        return 1

    largest_k = min(max_k, embeddings.shape[0])
    if largest_k < 2:
        # Nothing to compare against; previously this crashed on an empty argmin.
        return 1

    candidate_ks = list(range(1, largest_k + 1))
    distortions = []
    for k in candidate_ks:
        # n_init is pinned so results do not depend on the installed sklearn default
        # and match the setting used by kmeans_clustering.
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
        kmeans.fit(embeddings)
        distortions.append(kmeans.inertia_)

    total_drop = distortions[0] - distortions[-1]
    if total_drop <= 0:
        optimal_k = 1
    elif len(candidate_ks) < 3:
        optimal_k = candidate_ks[-1]
    else:
        k_span = candidate_ks[-1] - candidate_ks[0]
        gaps = []
        for k, distortion in zip(candidate_ks, distortions):
            x = (k - candidate_ks[0]) / k_span            # 0 at the first k, 1 at the last
            y = (distortion - distortions[-1]) / total_drop  # 1 at the first k, 0 at the last
            gaps.append((1.0 - x) - y)                    # vertical gap below the chord
        optimal_k = candidate_ks[int(np.argmax(gaps))]

    print(f"best is : {optimal_k}")
    return optimal_k

def kmeans_clustering(embeddings, n_clusters):
    """Cluster ``embeddings`` into ``n_clusters`` groups and return the labels.

    Args:
        embeddings: 2-D array-like with one row per sample, or ``None``.
        n_clusters: number of clusters requested.

    Returns:
        An integer label per sample. When ``embeddings`` is ``None`` an empty
        integer array is returned; when there are fewer samples than clusters
        every sample gets label 0. Otherwise the labels come from a KMeans fit
        (``k-means++`` init, ``random_state=42``, ``n_init=10``).
    """
    if embeddings is None:
        # len(None) would raise TypeError, so handle the missing input explicitly.
        return np.zeros(0, dtype=int)
    if len(embeddings) < n_clusters:
        return np.zeros(len(embeddings), dtype=int)

    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42,n_init=10)
    kmeans.fit(embeddings)
    return kmeans.labels_

def format_output(id_list, labels):
    """Group IDs by cluster label and return the groups, largest first.

    ``labels[i]`` is the cluster of ``id_list[i]``. Groups keep the order in
    which their IDs appear; groups of equal size keep the order in which their
    label was first seen. Every ID is converted with ``int``.
    """
    members_by_label = {}
    for position, label in enumerate(labels):
        members_by_label.setdefault(label, []).append(id_list[position])

    # sorted() is stable, so equally sized groups stay in first-seen order.
    ordered_groups = sorted(members_by_label.values(), key=len, reverse=True)
    return [[int(member) for member in members] for members in ordered_groups]

def read_csv_to_2d_array(file_path):
    """Read a MacRoman-encoded CSV file and return its rows as lists of strings."""
    with open(file_path, 'r', encoding='MacRoman') as handle:
        return [row for row in csv.reader(handle)]

def get_prompt_from_ids(id_list, file_path):
    """Render the CSV rows for ``id_list`` as ``Record <id>: <fields>`` lines.

    The file is read as MacRoman-encoded CSV with a header row. For each ID in
    ``id_list`` (in that order) the first row whose ``ID`` column equals
    ``str(id)`` contributes one line: ``"Record <id>: "`` followed by the
    row's other columns joined by commas. IDs with no matching row are
    skipped. Lines are joined with newlines.

    Raises:
        KeyError: if the file has data rows but no ``ID`` column.
    """
    with open(file_path, 'r', encoding='MacRoman') as file:
        reader = csv.DictReader(file)
        value_columns = [key for key in (reader.fieldnames or []) if key != 'ID']
        # Index the rows once instead of rescanning the whole file for every id;
        # setdefault keeps the first row when an ID is repeated.
        row_by_id = {}
        for row in reader:
            row_by_id.setdefault(row['ID'], row)

    lines = []
    for r_id in id_list:
        row = row_by_id.get(str(r_id))
        if row is not None:
            lines.append(f"Record {r_id}: " + ','.join(str(row[key]) for key in value_columns))
    return '\n'.join(lines)



In [42]:
def dynamic_sampling(original_array):
    """Interleave the rows of ``original_array`` into batches of at most 10 IDs.

    IDs are drawn round-robin, one from each non-exhausted row per sweep,
    always sweeping from the first row. A batch is closed as soon as it holds
    10 IDs (the next batch starts a fresh sweep from the first row) or when
    every row is exhausted. The input rows are not modified.

    Raises:
        ValueError: if the number of emitted IDs differs from the input total.
    """
    cursors = [0] * len(original_array)  # next unread position in each row
    expected_total = sum(len(row) for row in original_array)
    batches = []

    while True:
        batch = []
        while len(batch) < 10:
            took_any = False
            for row_number, row in enumerate(original_array):
                if cursors[row_number] < len(row):
                    batch.append(row[cursors[row_number]])
                    cursors[row_number] += 1
                    took_any = True
                if len(batch) == 10:
                    break
            if not took_any:
                break
        if batch:
            batches.append(batch)
        exhausted = all(
            cursors[row_number] >= len(row)
            for row_number, row in enumerate(original_array)
        )
        if exhausted:
            break

    emitted = [member for batch in batches for member in batch]
    if len(emitted) != expected_total:
        raise ValueError("ID is not match ！")

    return batches


In [43]:

def process_sampled_ids(csv_file,sample_ids_list):
    """Ask the chat model to group each batch of record IDs and parse its replies.

    For every ID batch in ``sample_ids_list`` the records are rendered with
    ``get_prompt_from_ids``, sent to ``gpt-4o-mini`` together with
    ``pre_prompt``, and the reply is parsed into a list of integer groups.

    Parsing keeps only digits, brackets and commas from the reply, then reads
    every innermost ``[...]`` as one group. Empty tokens (from stray or
    doubled commas) are ignored and groups without any ID are dropped, so a
    slightly malformed reply cannot crash the parser or hand an empty group to
    later stages.

    Returns:
        ``(all_classified_results, execution_time, use_number,
        total_tokens_call)``: one list of groups per batch, the summed
        wall-clock seconds spent in the API calls, the number of calls, and
        the summed ``usage.total_tokens``.
    """
    execution_time = 0
    use_number = 0
    total_tokens_call = 0

    all_classified_results = []
    for ids in sample_ids_list:
        content_prompt = get_prompt_from_ids(id_list = ids, file_path = csv_file)
        start_time = time.time()
        completion = client.chat.completions.create(
                            model = "gpt-4o-mini",
                            messages=[
                            {"role": "system", "content": "You are a worker with rich experience performing Entity Resolution tasks. You specialize in clustering and classification within ER."},
                            {"role": "user", "content": pre_prompt + content_prompt},
                            ]
                        )
        execution_time += (time.time() - start_time)
        use_number += 1
        total_tokens_call += completion.usage.total_tokens

        # A reply without text content is treated as "no groups".
        content = completion.choices[0].message.content or ''
        # Whitespace and any prose are removed here; only the list skeleton survives.
        content_cleaned = re.sub(r"[^\d\[\],]", "", content)
        matches = re.findall(r'\[([^\[\]]*?)\]', content_cleaned)

        result_llm = []
        for match in matches:
            group = [int(token) for token in match.split(',') if token]
            if group:
                result_llm.append(group)
        all_classified_results.append(result_llm)
    return all_classified_results,execution_time,use_number,total_tokens_call
        

In [44]:

def the_most_importent_one(vector_data,classified_results):
    """Pick, for every cluster, the member whose vector is closest to the cluster mean.

    Args:
        vector_data: container indexed by record ID that yields that record's vector.
        classified_results: list of batches; each batch is a list of clusters
            (lists of record IDs).

    Returns:
        One list per batch holding the representative ID of each of its
        clusters, in cluster order. Ties go to the earliest member.
    """
    representatives_per_batch = []
    for batch in classified_results:
        picked = []
        for members in batch:
            member_vectors = np.array([vector_data[member] for member in members])
            centroid = np.mean(member_vectors, axis=0)
            offsets = [
                np.linalg.norm(member_vectors[position] - centroid)
                for position in range(len(members))
            ]
            picked.append(members[np.argmin(offsets)])
        representatives_per_batch.append(picked)
    return representatives_per_batch


In [45]:


def read_2d_array_from_file(file_path):
    """Read whitespace-separated integers, one row per line, into a list of lists.

    A missing file or a token that is not an integer is reported with a
    printed message and results in an empty list.
    """
    try:
        with open(file_path, 'r') as handle:
            return [[int(token) for token in line.strip().split()] for line in handle]
    except FileNotFoundError:
        print(f" {file_path}  is not found")
    except ValueError as e:
        print(f"can not found：{e}")
    return []



In [46]:

def the_most_importent_one_1(classified_results):
    """Use the first member of every cluster as its representative.

    ``classified_results`` is a list of batches, each a list of clusters; the
    result has one list per batch with the first ID of each cluster.
    """
    return [[members[0] for members in batch] for batch in classified_results]

In [47]:
from typing import List, Optional

def find_most_similar(
    current_id: Optional[int], 
    candidate_ids: List[int], 
    similarity_matrix: List[List[float]]
) -> Optional[int]:
    """Return the candidate most similar to ``current_id``.

    With no candidates the result is ``None``. With no current ID the first
    candidate is returned. Otherwise the candidate with the highest
    ``similarity_matrix[current_id][candidate]`` wins; on ties the earliest
    candidate is kept.
    """
    if not candidate_ids:
        return None
    if current_id is None:
        return candidate_ids[0]

    best_id = None
    best_score = -float('inf')
    for contender in candidate_ids:
        score = similarity_matrix[current_id][contender]
        # Strict comparison: a later candidate must beat, not equal, the best so far.
        if score > best_score:
            best_id, best_score = contender, score
    return best_id

def filter_available_ids(next_row_ids: List[int], id_assigned: set) -> List[int]:
    """Return the IDs of ``next_row_ids`` that are not in ``id_assigned``, in order."""
    remaining = []
    for candidate in next_row_ids:
        if candidate in id_assigned:
            continue
        remaining.append(candidate)
    return remaining

def process_rounds(
    id_matrix: List[List[int]],
    similarity_matrix: List[List[float]],
    max_length: int
) -> List[List[int]]:
    """Chain IDs across the rows of ``id_matrix`` into successive rounds.

    In each round the rows are visited in order. From every row that still has
    unused IDs, the one most similar to the previously chosen ID is taken
    (the first unused ID when nothing has been chosen yet in this round).
    Rounds repeat until a round selects nothing.

    ``max_length`` is accepted but not used: a round can hold up to one ID per row.
    """
    rounds = []
    used_ids = set()

    while True:
        chain = []
        previous_id = None

        for row in id_matrix:
            free_ids = filter_available_ids(row, used_ids)
            if not free_ids:
                continue
            chosen = find_most_similar(previous_id, free_ids, similarity_matrix)
            if chosen is None:
                continue
            chain.append(chosen)
            used_ids.add(chosen)
            previous_id = chosen

        if not chain:
            return rounds
        rounds.append(chain)



def traverse_ids_to_2d(
    id_matrix: List[List[int]], 
    similarity_matrix: List[List[float]],
    max_length: int = 10, 
    batch_size: int = 10
) -> List[List[int]]:
    """Return the rounds produced by ``process_rounds`` for ``id_matrix``.

    ``max_length`` is forwarded to ``process_rounds``; ``batch_size`` is
    accepted but not used.
    """
    return process_rounds(id_matrix, similarity_matrix, max_length)

In [48]:
def merge_find(x, pa):
    """Return the root of ``x`` in the parent table ``pa``, compressing the path.

    After the call every node on the path from ``x`` to the root points
    directly at the root.
    """
    root = x
    while pa[root] != root:
        root = pa[root]

    # Second pass: re-point every node on the walked path at the root.
    node = x
    while pa[node] != root:
        following = pa[node]
        pa[node] = root
        node = following
    return root


def merge_union(x, y, pa):
    """Merge the sets containing ``x`` and ``y``; the root of ``y`` is attached to the root of ``x``."""
    root_of_x = merge_find(x, pa)
    root_of_y = merge_find(y, pa)
    if root_of_x == root_of_y:
        return
    pa[root_of_y] = root_of_x


def find_simi_nex(small_clusters, now_cluster , pa , ini_simi,the_max_nex):
    """Merge small clusters whose members are similar enough to each other.

    Args:
        small_clusters: list of clusters (lists of record IDs).
        now_cluster: the record IDs to compare pairwise.
        pa: union-find parent table with one entry per small cluster; it is
            updated in place.
        ini_simi: similarity lookup indexed as ``ini_simi[id_a][id_b]``.
        the_max_nex: similarity at or above which two IDs link their clusters.

    Returns:
        The merged clusters, each as a sorted list of unique IDs, in order of
        the first small cluster that reaches each merged group.

    Every pair of IDs in ``now_cluster`` with similarity ``>= the_max_nex``
    links the first small cluster containing each ID. A pair is skipped when
    either ID appears in no small cluster; there is nothing to merge for it,
    and no state is carried over from an earlier pair.
    """
    # Record id -> index of the first small cluster that contains it.
    cluster_of = {}
    for index, cluster in enumerate(small_clusters):
        for member in cluster:
            cluster_of.setdefault(member, index)

    for i in range(len(now_cluster) - 1):
        for j in range(i + 1, len(now_cluster)):
            x, y = now_cluster[i], now_cluster[j]
            if ini_simi[x][y] >= the_max_nex:
                index_x = cluster_of.get(x)
                index_y = cluster_of.get(y)
                if index_x is None or index_y is None:
                    continue
                merge_union(index_x, index_y, pa)

    merged_groups = {}
    for index, cluster in enumerate(small_clusters):
        merged_groups.setdefault(merge_find(index, pa), []).extend(cluster)
    result = [sorted(set(values)) for values in merged_groups.values()]
    return result


In [49]:

def llm_seperate(data_list , data_file,ini_simi,the_max_nex):
    """Group ``data_list`` with the chat model in slices of 10, then merge across slices.

    Each slice of up to 10 IDs is rendered with ``get_prompt_from_ids`` and
    sent to ``gpt-4o-mini`` together with ``pre_prompt``. The groups parsed
    from all replies are then merged by ``find_simi_nex`` using ``ini_simi``
    and ``the_max_nex``.

    Reply parsing keeps only digits, brackets and commas, then reads every
    innermost ``[...]`` as one group. Empty tokens (from stray or doubled
    commas) are ignored and groups without any ID are dropped, so a slightly
    malformed reply cannot crash the parser.

    Returns:
        ``(array_new, api_call_time, use_time, use_token,
        seperate_input_token, seperate_output_token)``: the merged groups, the
        number of API calls, the summed wall-clock seconds per slice (prompt
        building included), and the summed total / prompt / completion token
        counts.
    """
    api_call_time = 0
    use_time = 0
    use_token = 0
    seperate_input_token = 0
    seperate_output_token = 0
    result_sliced = []
    for offset in range(0, len(data_list), 10):
        one_slice = data_list[offset:offset + 10]
        api_call_time += 1
        start_time = time.time()
        prompt_sliced = get_prompt_from_ids(one_slice, data_file)
        completion = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                            {"role": "system",
                            "content": "You are a worker specialize in clustering and classification within Entity Resolution."},
                            {"role": "user", "content": pre_prompt + prompt_sliced},
                        ]
                    )
        use_time += time.time() - start_time
        seperate_input_token += completion.usage.prompt_tokens
        seperate_output_token += completion.usage.completion_tokens
        use_token += completion.usage.total_tokens

        # A reply without text content is treated as "no groups".
        content = completion.choices[0].message.content or ''
        # Whitespace and any prose are removed here; only the list skeleton survives.
        content_cleaned = re.sub(r"[^\d\[\],]", "", content)
        matches = re.findall(r'\[([^\[\]]*?)\]', content_cleaned)
        for match in matches:
            group = [int(token) for token in match.split(',') if token]
            if group:
                result_sliced.append(group)

    parent = list(range(len(result_sliced)))
    array_new = find_simi_nex(result_sliced,data_list,parent,ini_simi,the_max_nex)
    return array_new,api_call_time,use_time,use_token , seperate_input_token , seperate_output_token



In [50]:

def find_back(two_d_array, three_d_array):
    """Expand each ID in ``two_d_array`` to the full row it came from.

    A lookup from ID to the first row of ``three_d_array`` that contains it is
    built first. Every row of ``two_d_array`` is then replaced, in place, by
    the concatenation of the looked-up rows of its IDs with duplicates
    removed (first occurrence kept). IDs without a lookup entry contribute
    nothing. The same ``two_d_array`` object is returned.
    """
    source_row_of = {}
    for matrix in three_d_array:
        for source_row in matrix:
            for number in source_row:
                source_row_of.setdefault(number, source_row)

    for position, row in enumerate(two_d_array):
        expanded = []
        for number in row:
            expanded.extend(source_row_of.get(number, ()))
        # dict.fromkeys removes repeats while preserving first-seen order.
        two_d_array[position] = list(dict.fromkeys(expanded))

    return two_d_array

In [51]:

def cal_total_simi_vector(data_file_path,model_file):
    """Embed every CSV row and compute the pairwise cosine similarity.

    The CSV is read with MacRoman encoding. For each row, all values after the
    first column are stringified and joined with spaces into a new
    ``combined_text`` column, which is then encoded row by row with the
    SentenceTransformer loaded from ``model_file``.

    Returns:
        ``(vectors, simi_matrix, data)``: the list of per-row embeddings, the
        cosine-similarity matrix between them, and the DataFrame including the
        added ``combined_text`` column.
    """
    model = SentenceTransformer(model_file)
    data = pd.read_csv(data_file_path,encoding="MacRoman")
    data['combined_text'] = data.apply(
        lambda row: ' '.join(str(value) for value in row[1:]),
        axis=1,
    )
    vectors = [model.encode(text) for text in data['combined_text']]
    simi_matrix = cosine_similarity(vectors)
    print("calculate similarity matrix done")
    return vectors,simi_matrix,data

In [52]:


def bipartite_clustering(data, similarity_matrix):
    """Split ``data`` into two groups by two-colouring the similarity graph.

    A graph with one node per row of ``similarity_matrix`` is built, with an
    edge between two different nodes whenever either direction of their
    similarity is positive. ``nx.bipartite.color`` assigns each node colour 0
    or 1; items of ``data`` are placed in the first or second group
    accordingly.

    The diagonal is ignored: a positive self-similarity would add a self-loop,
    and a graph with a self-loop can never be two-coloured.

    Returns:
        ``[cluster1, cluster2]``: the items coloured 0 and the items coloured 1.

    Raises:
        networkx.NetworkXError: if the graph is not bipartite.
    """
    n = len(similarity_matrix)
    G = nx.Graph()
    G.add_nodes_from(range(n))

    for i in range(n):
        for j in range(n):
            if i != j and similarity_matrix[i][j] > 0:
                G.add_edge(i, j)

    node_partition = nx.bipartite.color(G)

    cluster1 = []
    cluster2 = []
    for i, color in node_partition.items():
        if color == 0:
            cluster1.append(data[i])
        else:
            cluster2.append(data[i])

    return [cluster1, cluster2]


In [53]:

import csv
from collections import Counter

def read_clusters_from_csv(filename):
    """Read clusters from a CSV file: one cluster per row, one integer ID per non-empty cell."""
    with open(filename, newline='') as csvfile:
        return [
            [int(cell) for cell in row if cell]
            for row in csv.reader(csvfile)
        ]

def calculate_purity(true_clusters, predicted_clusters):
    """Return the purity of ``predicted_clusters`` with respect to ``true_clusters``.

    For each predicted cluster, the size of its largest overlap with any
    single true cluster is added up; the sum is divided by the total number of
    predicted members. Returns 0 when there are no predicted members.
    """
    predicted_total = sum(map(len, predicted_clusters))
    if not predicted_total > 0:
        return 0

    matched = 0
    for predicted in predicted_clusters:
        # One vote per (member, true cluster containing it) pair.
        votes = Counter(
            tuple(truth)
            for member in predicted
            for truth in true_clusters
            if member in truth
        )
        if votes:
            matched += max(votes.values())
    return matched / predicted_total

def calculate_inverse_purity(true_clusters, predicted_clusters):
    """Return the inverse purity of ``predicted_clusters`` against ``true_clusters``.

    For each non-empty true cluster, the size of its largest overlap with any
    single predicted cluster is added up; the sum is divided by the total
    number of true members. Returns 0 when there are no true members.
    """
    true_total = sum(map(len, true_clusters))
    if not true_total > 0:
        return 0

    matched = 0
    for truth in true_clusters:
        if not truth:
            continue
        # One vote per (member, predicted cluster containing it) pair.
        votes = Counter(
            tuple(predicted)
            for member in truth
            for predicted in predicted_clusters
            if member in predicted
        )
        if votes:
            matched += max(votes.values())
    return matched / true_total

def calculate_fp_measure(true_clusters, predicted_clusters):
    """Return the harmonic mean of purity and inverse purity (0 when both are 0)."""
    purity = calculate_purity(true_clusters, predicted_clusters)
    inverse_purity = calculate_inverse_purity(true_clusters, predicted_clusters)

    denominator = purity + inverse_purity
    if denominator == 0:
        return 0
    return 2 * (purity * inverse_purity) / denominator

from scipy.optimize import linear_sum_assignment

def convert_to_labels(clusters, n_samples):
    """Turn a list of clusters into a per-sample label list of length ``n_samples``.

    ``labels[sample]`` is the index of the cluster containing ``sample``; the
    last cluster wins when a sample appears in several. Samples in no cluster
    keep the label -1.
    """
    labels = [-1 for _ in range(n_samples)]
    for cluster_id in range(len(clusters)):
        for sample in clusters[cluster_id]:
            labels[sample] = cluster_id
    return labels

from sklearn.metrics import adjusted_rand_score
def calculate_ari(true_clusters, predicted_clusters):
    """Return the adjusted Rand index between two clusterings of record IDs.

    Labels are compared only over the IDs that occur in at least one of the
    two clusterings. An ID that is missing from one side gets the label -1 on
    that side. When an ID appears in several clusters of the same clustering,
    the last cluster wins.

    Restricting the comparison to IDs that actually occur keeps unused ID
    values (gaps in the numbering) from being counted as extra samples that
    both sides agree on, and makes the result independent of how large or
    sparse the IDs are.

    Raises:
        ValueError: if neither clustering contains any ID.
    """
    def label_lookup(clusters):
        lookup = {}
        for cluster_id, cluster in enumerate(clusters):
            for sample in cluster:
                lookup[sample] = cluster_id
        return lookup

    true_lookup = label_lookup(true_clusters)
    predicted_lookup = label_lookup(predicted_clusters)

    all_samples = list(set(true_lookup) | set(predicted_lookup))
    if not all_samples:
        raise ValueError("calculate_ari needs at least one clustered sample")

    true_labels = [true_lookup.get(sample, -1) for sample in all_samples]
    predicted_labels = [predicted_lookup.get(sample, -1) for sample in all_samples]

    ari = adjusted_rand_score(true_labels, predicted_labels)
    return ari

def calculate_accuracy(original_list, predicted_list):
    """Score ``predicted_list`` against ``original_list`` with a custom counting scheme.

    A running ``correct_count`` is adjusted in several passes (see the inline
    comments) and finally divided by the number of items in ``predicted_list``
    (duplicates included). Returns 0.0 when ``predicted_list`` has no items.

    NOTE(review): the passes below are described exactly as coded; several of
    them (crediting every item of a mixed cluster, crediting items missing from
    the prediction) look unusual for an accuracy measure and should be confirmed
    against the intended definition.
    """
 
    # Map each item to the tuple form of the original group that contains it
    # (the last group wins if an item appears in several).
    original_mapping = {}
    for sublist in original_list:
        group = tuple(sublist)  
        for item in sublist:
            original_mapping[item] = group

 
    # Denominator: number of predicted items, duplicates included.
    total_predicted_count = sum(len(sublist) for sublist in predicted_list)

 
    correct_count = 0

 
    # Items already seen inside a mixed predicted cluster (see pass 1).
    checked_items = set()

 
    # Pass 1: credit predicted clusters.
    for sublist in predicted_list:
        # Distinct original groups hit by this cluster; None stands for "not in any original group".
        original_groups = set(original_mapping.get(item, None) for item in sublist)
        if len(original_groups) == 1 and None not in original_groups:
 
            # Every item comes from one original group: credit the whole cluster.
            correct_count += len(sublist)
        else:
 
            for item in sublist:
                # This membership test is always true, because original_groups was built
                # from exactly these lookups; so each item of a mixed cluster is credited
                # once, the first time it is seen in any mixed cluster.
                if original_mapping.get(item, None) in original_groups:
                    if item not in checked_items: 
                        correct_count += 1
                checked_items.add(item)


    # Pass 2: take back the credit of single-group clusters that are smaller than
    # the original group they come from.
    for sublist in predicted_list:
        original_groups = set(original_mapping.get(item, None) for item in sublist)
        if len(original_groups) == 1 and None not in original_groups:
 
            original_group = original_groups.pop()
            if len(sublist) < len(original_group):
        
                correct_count -= len(sublist)

    # Pass 3: items present in the original groups but absent from the prediction
    # are added to the count.
    all_original_items = set(item for sublist in original_list for item in sublist)
    all_predicted_items = set(item for sublist in predicted_list for item in sublist)
    missed_items = all_original_items - all_predicted_items

    correct_count += len(missed_items)

    # Pass 4: subtract one for every repeated occurrence of an item in the prediction.
    predicted_item_counts = {}
    for sublist in predicted_list:
        for item in sublist:
            if item in predicted_item_counts:
                predicted_item_counts[item] += 1
            else:
                predicted_item_counts[item] = 1

    for item, count in predicted_item_counts.items():
        if count > 1:
            correct_count -= (count - 1)  

    # Pass 5: subtract one for every predicted occurrence of an item unknown to the original groups.
    for sublist in predicted_list:
        for item in sublist:
            if item not in all_original_items:
                correct_count -= 1  

    accuracy = correct_count / total_predicted_count if total_predicted_count > 0 else 0.0
    return accuracy





In [54]:
def seperate(vectors,simi_matrix,merge_clusters_pre,data_file_path,the_max_nex):
    """Refine every block of ``merge_clusters_pre`` into entity clusters with the LLM.

    For each block of record IDs:
      1. embed the records, choose a cluster count, run KMeans and interleave
         the KMeans groups into prompt batches;
      2. let the model group each batch (``process_sampled_ids``);
      3. choose one representative per group, chain the representatives into
         rounds and let the model group and merge each round (``llm_seperate``);
      4. expand the merged representatives back to their full groups
         (``find_back``).

    Returns:
        ``(clusters, api_calls, seconds, total_tokens, input_tokens,
        output_tokens)``. Input and output token counts cover only the
        ``llm_seperate`` calls; calls, seconds and total tokens cover both
        LLM stages.
    """
    total_api_calls = 0
    total_seconds = 0
    total_tokens_used = 0
    input_tokens = 0
    output_tokens = 0
    final_clusters = []

    for id_list in merge_clusters_pre:
        print(id_list)
        embeddings = vectorize_data(get_data(id_list, data_file_path))
        labels = kmeans_clustering(embeddings, elbow_method(embeddings))
        sample_batches = dynamic_sampling(format_output(id_list, labels))

        classified_results, elapsed, calls, tokens = process_sampled_ids(data_file_path, sample_batches)
        total_seconds += elapsed
        total_api_calls += calls
        total_tokens_used += tokens

        representatives = the_most_importent_one(vectors, classified_results)
        rounds = traverse_ids_to_2d(representatives, simi_matrix, max_length=10, batch_size=10)

        merged_representatives = []
        for row_slice in rounds:
            print(row_slice)
            (merged, calls, elapsed, tokens,
             prompt_tokens, completion_tokens) = llm_seperate(row_slice, data_file_path, simi_matrix, the_max_nex)
            total_api_calls += calls
            total_seconds += elapsed
            total_tokens_used += tokens
            input_tokens += prompt_tokens
            output_tokens += completion_tokens
            merged_representatives.extend(merged)

        final_clusters.extend(find_back(merged_representatives, classified_results))

    print("seperate done")
    return final_clusters, total_api_calls, total_seconds, total_tokens_used, input_tokens, output_tokens

Use canopy clustering for blocking


In [72]:

# --- Paths -------------------------------------------------------------------
file_path = './dataset/cora/'
data_file_path = file_path+'cora.csv'
gt_path = file_path+'gt.csv'

# --- Jaccard similarity over normalized field values --------------------------
cleaned_data = read_csv_and_clean_with_ids(data_file_path)
inverted_index = build_inverted_index(cleaned_data)
ini_simi, record_ids = calculate_similarity_matrix(cleaned_data, inverted_index)

# --- Canopy blocking -----------------------------------------------------------
T1 = 0.27
T2 = 0.1
block_time = time.time()
# canopy_clustering returns the pair (canopies, block_T1). Iterating that pair directly
# would loop over the two outer lists instead of over individual blocks, so unpack it.
# The T1 blocks are the ones that assign every record exactly once, hence they are
# used as the blocks to refine.
# NOTE(review): blocks hold row positions of ini_simi, not values from record_ids; the
# rest of this cell uses them as record IDs, which presumably relies on the CSV IDs
# being 0..n-1 in file order -- confirm for other datasets.
canopies, canopies_block = canopy_clustering(ini_simi,  T1, T2)
execute_block_time = time.time() -  block_time
print("done")
print(f"execute_jaccard_block_time (s) : {execute_block_time}")
import warnings
warnings.filterwarnings('ignore') 

# Blocks with at most 5 records are accepted as final clusters; larger ones go to the LLM stage.
not_done_any = []
merge_done = []
for row_line in canopies_block: 
    if len(row_line)<=5:
        not_done_any.append(row_line)
    else:
        merge_done.append(row_line)
print(len(merge_done))
seperate_threshold = 0.1

# NOTE(review): seperate_jac is not defined in the cells shown above (only `seperate`,
# which additionally takes the embedding vectors as its first argument) -- make sure the
# defining cell is run before this one.
sperate_result, api_call_time_all ,sperate_time, sperate_token, seperate_input_token , seperate_output_token = seperate_jac(ini_simi,merge_done,data_file_path,seperate_threshold)

for row in not_done_any:
    sperate_result.append(row)
print(f"api_call_time_seperate: {api_call_time_all}")
print(f"time_seperate (s): {sperate_time}")
print(f"token_seperate : {sperate_token}")
print(f"token_seperate_input : {seperate_input_token}")
print(f"token_seperate_output : {seperate_output_token}")

# --- Evaluation ----------------------------------------------------------------
true_clusters = get_ground_truth(gt_path)
# true_clusters = read_2d_array_from_file(gt_path)
# predicted_clusters = merge_clusters_pre
predicted_clusters = sperate_result
# predicted_clusters = clusters_block
purity = calculate_purity(true_clusters, predicted_clusters)
inverse_purity = calculate_inverse_purity(true_clusters, predicted_clusters)
fp_measure = calculate_fp_measure(true_clusters, predicted_clusters)
acc = calculate_accuracy(true_clusters, predicted_clusters)
ari = calculate_ari(true_clusters, predicted_clusters)
print(f"FP-Measure: {fp_measure}")
print(f"ACC: {acc}")
print(f"ARI: {ari}")


done
execute_jaccard_block_time (s) : 0.6442053318023682
