# Using P-Hash with ScaNN

---



In [None]:
import os
import time
import numpy as np
from PIL import Image
import imagehash
import scann

# Colab-only setup: mount Google Drive so the image paths used below
# (under "/content/drive/My Drive") are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')



def calculate_phash(image_path):
    """Compute the perceptual hash (pHash) of an image as a float vector.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        A 1-D ``np.float32`` array containing the flattened pHash bits, or
        ``None`` if the file could not be opened or hashed (the error is
        printed rather than raised).
    """
    try:
        # Use the image as a context manager so its file handle is released
        # deterministically; this matters when hashing a whole directory.
        with Image.open(image_path) as image:
            bits = imagehash.phash(image).hash
        return np.array(bits.flatten(), dtype=np.float32)
    except Exception as e:
        # Deliberately best-effort: callers skip anything that is not a
        # readable image instead of aborting the whole run.
        print(f"Error processing image {image_path}: {e}")
        return None


def build_phash_dataset(dataset_folder):
    """Hash every regular file in a folder and collect the results.

    Entries that are not regular files (e.g. sub-directories) are skipped,
    as are files for which ``calculate_phash`` returns ``None``.

    Args:
        dataset_folder: Directory whose files should be hashed.

    Returns:
        A ``(hashes, file_names)`` tuple. ``hashes`` is a float32 NumPy
        array with one entry per successfully hashed file, and
        ``file_names[i]`` is the file name that produced ``hashes[i]``.
    """
    phash_list = []
    file_names = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # row index of each image reproducible between runs.
    for filename in sorted(os.listdir(dataset_folder)):
        image_path = os.path.join(dataset_folder, filename)
        if not os.path.isfile(image_path):
            # Sub-directories and other non-files cannot be images.
            continue
        phash = calculate_phash(image_path)
        if phash is not None:
            phash_list.append(phash)
            file_names.append(filename)
    # Report how many images made it into the dataset.
    print(len(file_names))
    return np.array(phash_list, dtype=np.float32), file_names

# # Brute-force nearest-neighbor search over pHash vectors (disabled; kept as a baseline for ScaNN)
# def brute_force_search(target_hash, dataset_hashes, file_names, top_n=5):
#     distances = np.linalg.norm(dataset_hashes - target_hash, axis=1)
#     top_indices = np.argsort(distances)[:top_n]
#     return [(file_names[i], distances[i]) for i in top_indices]

# Function to use ScaNN for fast nearest-neighbor search
def scann_search(target_hash, dataset_hashes, file_names, top_n=10):
    """Find the dataset images most similar to a target hash using ScaNN.

    A ScaNN searcher is built over ``dataset_hashes`` on every call and then
    queried once with ``target_hash``.

    Args:
        target_hash: 1-D query vector, as produced by ``calculate_phash``.
        dataset_hashes: 2-D array with one hash vector per dataset image.
        file_names: File names aligned with the rows of ``dataset_hashes``.
        top_n: Maximum number of matches to return (default 10). It is
            clamped to the dataset size.

    Returns:
        A list of ``(file_name, score)`` tuples in the order returned by
        ScaNN. The searcher is configured with the ``"dot_product"``
        measure, so the scores are inner products: a larger score means a
        closer match, despite the variable being called ``distances``.
        An empty list is returned when there is nothing to search.
    """
    # Never ask for more neighbors than there are images, and skip index
    # construction entirely when the dataset (or the request) is empty.
    k = min(top_n, len(file_names))
    if k <= 0:
        return []

    # NOTE(review): the tree parameters are fixed; presumably they were
    # chosen for a dataset of a few hundred images. Confirm before reusing
    # this on much smaller or larger datasets.
    searcher = (
        scann.scann_ops_pybind.builder(dataset_hashes, k, "dot_product")
        .tree(num_leaves=100, num_leaves_to_search=20, training_sample_size=361)
        .score_ah(2, anisotropic_quantization_threshold=0.2)
        .build()
    )

    neighbors, distances = searcher.search(target_hash, final_num_neighbors=k)

    print(f"Neighbors: {neighbors}")
    print(f"Distances: {distances}")

    # Translate row indices back into file names.
    result = [(file_names[idx], score) for idx, score in zip(neighbors, distances)]

    print(result)
    return result

# Main function
if __name__ == "__main__":
    # Script entry point: hash every image in the dataset folder, hash the
    # target image, and print the closest dataset images found by ScaNN.

    target_image = "/content/drive/My Drive/target_image1.jpeg"
    dataset_directory = "/content/drive/My Drive/Photos"

    # `raise SystemExit` is used instead of the `exit()` helper throughout:
    # `exit` is installed for interactive use and is not guaranteed to stop
    # execution immediately in every environment (e.g. notebook kernels).
    if not os.path.exists(target_image):
        print(f"Target image {target_image} not found.")
        raise SystemExit(1)

    print("Building pHash dataset...")
    start_time = time.time()
    dataset_hashes, file_names = build_phash_dataset(dataset_directory)

    # Sanity check: every hash row must have a matching file name.
    if len(dataset_hashes) != len(file_names):
        print(f"Dataset size mismatch! dataset_hashes: {len(dataset_hashes)}, file_names: {len(file_names)}")
        raise SystemExit(1)

    # An index cannot be built over zero vectors, so stop with a clear
    # message instead of failing inside the search.
    if len(file_names) == 0:
        print(f"No images could be hashed in {dataset_directory}.")
        raise SystemExit(1)

    print(f"Dataset built in {time.time() - start_time:.2f} seconds.")

    target_hash = calculate_phash(target_image)
    if target_hash is None:
        print("Could not calculate pHash for the target image.")
        raise SystemExit(1)

    # Disabled brute-force baseline, kept for comparison with ScaNN.
    # print("Brute-force search with pHash...")
    # start_time = time.time()
    # results_phash = brute_force_search(target_hash, dataset_hashes, file_names)
    # print(f"Brute-force search took {time.time() - start_time:.2f} seconds.")
    # print("Top matches using pHash:")
    # for match in results_phash:
    #     print(f"Image: {match[0]}, Distance: {match[1]}")

    print("ScaNN search...")
    # The measured time covers the whole scann_search() call.
    start_time = time.time()
    results_scann = scann_search(target_hash, dataset_hashes, file_names)
    print(f"ScaNN search took {time.time() - start_time:.2f} seconds.")
    print("Top matches using ScaNN:")
    for match in results_scann:
        print(f"Image: {match[0]}, Distance: {match[1]}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Building pHash dataset...
357
Dataset built in 3.08 seconds.
ScaNN search...
Neighbors: [313  16 129 206 177 340 138 261 176 111]
Distances: [22.841244 22.566551 22.320303 21.390764 21.093891 21.074299 20.515936
 20.421228 20.352655 20.116205]
[('054412.jpg', 22.841244), ('000011.jpg', 22.566551), ('000170.jpg', 22.320303), ('054292.jpg', 21.390764), ('000050.jpg', 21.093891), ('054381.jpg', 21.074299), ('000174.jpg', 20.515936), ('054344.jpg', 20.421228), ('000171.jpg', 20.352655), ('000090.jpg', 20.116205)]
ScaNN search took 0.03 seconds.
Top matches using ScaNN:
Image: 054412.jpg, Distance: 22.841243743896484
Image: 000011.jpg, Distance: 22.566551208496094
Image: 000170.jpg, Distance: 22.320302963256836
Image: 054292.jpg, Distance: 21.390764236450195
Image: 000050.jpg, Distance: 21.093891143798828
Image: 054381.jpg, Distance: 21.074298858642578
Image: 0001