In [2]:
# Mount Google Drive at /content/drive so later cells can read from and write to it.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Copy the feature file from the mounted Drive folder into /content,
# where the loading cell opens it by its bare filename.
!cp /content/drive/MyDrive/features_uni2h.hdf5 /content

In [4]:
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load every embedding stored in the HDF5 file (category -> patient -> embedding
# dataset) into one float32 matrix, plus a metadata table with one row per vector.
hdf5_file = "features_uni2h.hdf5"

embeddings = None  # allocated once the first dataset reveals the vector shape
metadata = []

with h5py.File(hdf5_file, "r") as f:
    # First pass: count the datasets so the matrix can be allocated once.
    # Collecting the vectors in a Python list and converting with np.array()
    # afterwards keeps two full copies alive at the same time, which doubles
    # peak memory for a matrix of this size.
    total_embeddings = sum(
        len(f[category_name][patient_name])
        for category_name in f.keys()
        for patient_name in f[category_name].keys()
    )

    embedding_index = 0
    for category_name in f.keys():
        category_group = f[category_name]
        for patient_name in tqdm(category_group.keys()):
            patient_group = category_group[patient_name]
            for embedding_name in patient_group.keys():
                dataset = patient_group[embedding_name]

                if embeddings is None:
                    # float32 is the dtype FAISS operates on; values stored in
                    # another dtype are cast when written into this array.
                    embeddings = np.empty(
                        (total_embeddings,) + dataset.shape, dtype=np.float32
                    )
                elif dataset.shape != embeddings.shape[1:]:
                    # Fail loudly instead of letting NumPy broadcast a
                    # differently-shaped vector into the row.
                    raise ValueError(
                        f"Embedding {category_name}/{patient_name}/{embedding_name} "
                        f"has shape {dataset.shape}, expected {embeddings.shape[1:]}"
                    )

                # Store the vector directly in its row of the matrix
                embeddings[embedding_index] = dataset[()]

                # Store metadata; "index" is the row of this vector in `embeddings`
                metadata.append({
                    "index": embedding_index,
                    "category": category_name,
                    "patient": patient_name,
                    "embedding": embedding_name
                })

                embedding_index += 1

if embeddings is None:
    # The file contained no datasets; keep `embeddings` a (empty) 2-D array.
    embeddings = np.empty((0, 0), dtype=np.float32)

# Convert metadata to a dataframe
metadata_df = pd.DataFrame(metadata)

print("Embeddings shape:", embeddings.shape)
print(metadata_df.head())

100%|██████████| 112/112 [01:11<00:00,  1.57it/s]
100%|██████████| 666/666 [07:29<00:00,  1.48it/s]


Embeddings shape: (2036270, 1536)
   index category                                            patient  \
0      0      MSI  TCGA-5M-AAT6-01Z-00-DX1.8834C952-14E3-4491-815...   
1      1      MSI  TCGA-5M-AAT6-01Z-00-DX1.8834C952-14E3-4491-815...   
2      2      MSI  TCGA-5M-AAT6-01Z-00-DX1.8834C952-14E3-4491-815...   
3      3      MSI  TCGA-5M-AAT6-01Z-00-DX1.8834C952-14E3-4491-815...   
4      4      MSI  TCGA-5M-AAT6-01Z-00-DX1.8834C952-14E3-4491-815...   

                                embedding  
0  TCGA-5M-AAT6-01Z-00-DX1_(100372,12166)  
1  TCGA-5M-AAT6-01Z-00-DX1_(100372,13180)  
2  TCGA-5M-AAT6-01Z-00-DX1_(100372,14194)  
3  TCGA-5M-AAT6-01Z-00-DX1_(100372,15207)  
4  TCGA-5M-AAT6-01Z-00-DX1_(100372,16221)  


In [5]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [6]:
import faiss

# Build, train and save an IVF-PQ index for cosine-similarity search over the
# patch embeddings loaded above.
N, D = embeddings.shape  # number of embeddings, embedding dimension

# Normalize embeddings in place: on unit-length vectors the inner product
# equals cosine similarity.
print("Normalizing...")
faiss.normalize_L2(embeddings)

# Index parameters
nlist = 4096   # number of IVF clusters (inverted lists)
m = 64         # number of PQ sub-quantizers; each one encodes D // m dimensions
nbits = 8      # bits per sub-quantizer code

if D % m != 0:
    raise ValueError(
        f"Embedding dimension {D} must be divisible by the number of "
        f"PQ sub-quantizers m={m}"
    )

quantizer = faiss.IndexFlatIP(D)
# IndexIVFPQ defaults to METRIC_L2. Pass METRIC_INNER_PRODUCT explicitly so the
# index metric matches the inner-product coarse quantizer and searches return
# cosine similarities (higher = more similar) for the normalized vectors.
index = faiss.IndexIVFPQ(quantizer, D, nlist, m, nbits, faiss.METRIC_INNER_PRODUCT)

# Train the coarse quantizer and PQ codebooks (required for IVF/PQ), then add
# all vectors. Row i of `embeddings` gets id i in the index.
index.train(embeddings)
index.add(embeddings)
assert index.ntotal == N, f"expected {N} vectors in the index, found {index.ntotal}"

# Save index
index_file = "uni2h_index.faiss"
faiss.write_index(index, index_file)

print("Index saved to", index_file)

Normalizing...
Index saved to uni2h_index.faiss


In [7]:
# Move the saved FAISS index file into the mounted Drive folder.
!mv uni2h_index.faiss /content/drive/MyDrive

In [10]:
# Write the per-embedding metadata table to a Parquet file, without the
# DataFrame's own index (the "index" column already holds the row number).
metadata_path = "patch_metadata.parquet"
metadata_df.to_parquet(path=metadata_path, index=False)

In [11]:
# Move the metadata Parquet file into the mounted Drive folder.
!mv patch_metadata.parquet /content/drive/MyDrive