In [None]:
import json
import os
import sys
import time

import h5py
import torch

In [None]:
def load_h5py_files_to_tensor(directory):
    """Load every dataset from the ``embeddings_<index>.h5`` files of a directory into one tensor.

    Files are visited in ascending order of the integer found between the first
    underscore and the following dot of the file name. Inside each file the
    datasets are visited in ascending order of their keys interpreted as
    integers. Each dataset is read fully into memory, converted with
    ``torch.tensor`` and all of them are stacked along a new leading dimension.

    Args:
        directory: Path of the directory to scan (sub-directories are not searched).

    Returns:
        A tensor whose first dimension indexes the datasets in load order.

    Raises:
        RuntimeError: If no dataset was found at all (no matching file, or the
            matching files contain no datasets). ``torch.stack`` may also raise
            it when the loaded datasets cannot be stacked.
        ValueError: If a file-name index or a dataset key cannot be parsed as an integer.
    """
    # Keep only the embedding shards and order them numerically, so that
    # "embeddings_10.h5" comes after "embeddings_2.h5".
    files = sorted(
        [f for f in os.listdir(directory) if f.startswith("embeddings_") and f.endswith(".h5")],
        key=lambda x: int(x.split("_")[1].split(".")[0]),
    )

    data_list = []
    for file in files:
        file_path = os.path.join(directory, file)
        with h5py.File(file_path, "r") as f:
            # Keys are numeric strings; sort them as integers, not lexicographically.
            for key in sorted(f.keys(), key=int):
                # `[:]` materialises the dataset in memory before the file is closed.
                data = f[key][:]
                data_list.append(torch.tensor(data))

    # torch.stack rejects an empty list with an opaque message; fail early with
    # a message that tells the user which directory was empty.
    if not data_list:
        raise RuntimeError(
            f"No embeddings found in {directory!r}: expected 'embeddings_<index>.h5' "
            f"files containing integer-keyed datasets (matched {len(files)} file(s))."
        )

    return torch.stack(data_list)

In [None]:
# Result directories of the two runs under comparison: the "epoch_0" output of
# one run and the "epoch_0_kmupdate" output of the other.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
path_1 = "/project/Deep-Clustering/res/20240723_170228_flickr30k-preextracted/epoch_0"
path_2 = "/project/Deep-Clustering/res/20240723_165510_flickr30k-preextracted/epoch_0_kmupdate"

# Load both sets with the same loader so they are ordered identically.
label_embeddings = load_h5py_files_to_tensor(directory=path_1)
label_embeddings_kmupdate = load_h5py_files_to_tensor(directory=path_2)

In [None]:
# Guard against silent broadcasting: `!=` accepts compatible-but-unequal shapes
# (e.g. (1, D) vs (N, D)), in which case the count below would be meaningless.
assert label_embeddings.shape == label_embeddings_kmupdate.shape, (
    f"shape mismatch: {tuple(label_embeddings.shape)} vs {tuple(label_embeddings_kmupdate.shape)}"
)

# Element-wise inequality reduced over dim 1: an entry is True when at least one
# position along dim 1 differs. For 2-D inputs this is one flag per row
# (assumes the tensors are (rows, features); TODO confirm).
differences = torch.any(label_embeddings != label_embeddings_kmupdate, dim=1)

# Count the number of different rows
num_different_rows = torch.sum(differences).item()

print(num_different_rows)

In [None]:
# Total absolute element-wise difference between the two embedding sets,
# printed as a single-value tensor.
diff = torch.sub(label_embeddings, label_embeddings_kmupdate)
print(torch.sum(torch.abs(diff)))