In [None]:
import datasets

# Load the dataset stored on disk under ./partial_data (a local directory, not a hub download).
dataset = datasets.load_from_disk('./partial_data')

# Keep a handle on the 'train' split and show its first record as a quick sanity check.
train_split = dataset['train']
train_split[0]

In [None]:
import json
import numpy as np
from tqdm.notebook import tqdm

def load_embeddings(file_path):
    """Load embeddings and their source indices from a JSON Lines file.

    Every non-blank line must be a JSON object holding an ``'embedding'``
    entry (a sequence of numbers) and an ``'index'`` entry. Blank lines are
    skipped.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A tuple ``(embeddings, original_indices)``:
        ``embeddings`` is a float32 NumPy array built from the ``'embedding'``
        values in file order, and ``original_indices`` is a list with the
        matching ``'index'`` values (same order, same length).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'embedding'`` or ``'index'``.
    """
    embeddings = []
    original_indices = []
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            # json.loads rejects empty input, so tolerate blank lines
            # (for example an extra newline at the end of the file).
            if not line.strip():
                continue
            data = json.loads(line)
            embeddings.append(data['embedding'])
            original_indices.append(data['index'])
    # Build the float32 array directly; going through np.array(...).astype(...)
    # would first materialise a full float64 copy of all embeddings.
    return np.array(embeddings, dtype='float32'), original_indices

In [None]:
import faiss

def find_most_similar_pairs(embeddings, k=2, use_gpu=True):
    """Search ``embeddings`` against themselves with a flat L2 FAISS index.

    Args:
        embeddings: 2-D array of vectors; its second dimension is used as the
            index dimensionality. Presumably float32, as FAISS expects
            (verify against the caller).
        k: Number of neighbours wanted per vector. ``k + 1`` results are
            requested, presumably because each vector's closest hit is
            expected to be itself.
        use_gpu: When true, the index is moved to GPU device 0 before the
            vectors are added; otherwise the CPU index is used as-is.

    Returns:
        The ``(distances, indices)`` pair returned by the index search, each
        with ``k + 1`` columns per query row.
    """
    neighbors_to_fetch = k + 1
    vector_dim = embeddings.shape[1]

    index = faiss.IndexFlatL2(vector_dim)
    if use_gpu:
        # Swap the CPU index for a copy living on GPU device 0.
        index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)

    # The same vectors serve as both the database and the queries.
    index.add(embeddings)
    neighbor_distances, neighbor_ids = index.search(embeddings, neighbors_to_fetch)
    return neighbor_distances, neighbor_ids

In [None]:
def extract_pairs(indices, original_indices, distances, num_pairs=50000):
    """Turn a nearest-neighbour index matrix into a list of unique item pairs.

    Rows are visited in order. For row ``i``, up to ``indices.shape[1] - 1``
    neighbours are taken from left to right, skipping the row's own position
    and any negative entry (FAISS pads with ``-1`` when fewer neighbours are
    available). A pair is recorded as
    ``(original_indices[i], original_indices[neighbour])`` unless the reversed
    pair has already been recorded.

    Args:
        indices: 2-D integer array of neighbour positions, one row per item.
        original_indices: Sequence mapping a row position to the identifier
            reported in the output pairs.
        distances: Accepted for call compatibility; not read. Pairs are
            collected in row order, not ranked by distance.
        num_pairs: Collection stops as soon as this many pairs are recorded.

    Returns:
        A list of 2-tuples in the order the pairs were first seen. It never
        contains an item paired with itself through its own row position.
    """
    # A dict is used as an insertion-ordered set so the output order is
    # reproducible across runs (plain set order varies for string ids).
    unique_pairs = {}
    # One column per row is budgeted for the self-match.
    neighbors_per_row = indices.shape[1] - 1

    for i in range(indices.shape[0]):
        taken = 0
        for neighbor in indices[i]:
            if taken == neighbors_per_row:
                break
            neighbor = int(neighbor)
            # The self-match is not guaranteed to sit in column 0 (ties between
            # duplicate vectors can reorder it), and -1 padding must not be
            # used as a Python negative index.
            if neighbor < 0 or neighbor == i:
                continue
            taken += 1

            pair = (original_indices[i], original_indices[neighbor])
            if pair[::-1] in unique_pairs:
                continue
            unique_pairs[pair] = None

            # Stop as soon as enough pairs have been collected.
            if len(unique_pairs) == num_pairs:
                return list(unique_pairs)
    return list(unique_pairs)

In [None]:
# Read every embedding and its 'index' value from the JSONL file into memory.
embeddings, original_indices = load_embeddings("image_embeddings.jsonl")

In [None]:
# Search the embeddings against themselves on the CPU index (use_gpu=False), with the default k=2.
distances, indices = find_most_similar_pairs(embeddings, use_gpu=False)

In [None]:
# Collect up to 100,000 unique pairs; `distances` is passed along but extract_pairs does not read it.
similar_pairs = extract_pairs(indices, original_indices, distances, num_pairs=100000)

In [None]:
import pickle

# Serialise the list of pairs to similar_pairs.pkl; pickle needs the file opened in binary mode.
with open('similar_pairs.pkl', 'wb') as file:
    pickle.dump(similar_pairs, file)