In [1]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting numpy (from imagehash)
  Downloading numpy-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, imagehash
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.22.4
[31mERROR: pip's dependenc

In [7]:
import pandas as pd
import imagehash
from PIL import Image
import aiohttp
import asyncio
from io import BytesIO
from tqdm.asyncio import tqdm as async_tqdm
from tqdm.notebook import tqdm
import gc
import random
from collections import defaultdict
import itertools

# Read the CSV containing the image data and drop rows without an image path,
# since the path in 'small_image' is needed to build each download URL.
df = pd.read_csv('image_data.csv')
df = df.dropna(subset=['small_image'])

# Randomly select up to 100 images (fixed seed so the sample is reproducible).
# The request is capped at the number of available rows because DataFrame.sample
# raises ValueError when n exceeds the population and replace=False.
sample_size = 100
sample_df = df.sample(n=min(sample_size, len(df)), random_state=42)

# Host serving the images, and the resolution path segments to compare;
# each URL is built as "<base_url>/<resolution>/catalog/product<small_image>".
base_url = "https://d1it09c4puycyh.cloudfront.net"
resolutions = ["83x110", "128x128", "355x503", "500x500", "1000x1000", "920x1300"]

async def fetch_image(session, url):
    """Download ``url`` through ``session`` and decode the body as an RGB image.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``read()`` (an ``aiohttp.ClientSession`` in this notebook).
        url: Address of the image to download.

    Returns:
        The decoded image, converted to RGB mode.

    Raises:
        IOError: If the response status is anything other than 200.
    """
    async with session.get(url) as resp:
        # Guard clause: reject every non-200 answer before touching the body.
        if resp.status != 200:
            raise IOError(f"Failed to fetch image from {url}, status code: {resp.status}")
        payload = await resp.read()
        decoded = Image.open(BytesIO(payload))
        return decoded.convert('RGB')

def calculate_phash(img, canvas_size=512):
    """Compute the perceptual hash of ``img`` after padding it to a white square.

    The image is scaled so that its longer side equals ``canvas_size`` (aspect
    ratio preserved), centred on a white ``canvas_size`` x ``canvas_size`` RGB
    canvas, and the canvas is passed to ``imagehash.phash``.

    Args:
        img: PIL image to hash.
        canvas_size: Side length in pixels of the square canvas (int).
            Defaults to 512.

    Returns:
        Whatever ``imagehash.phash`` returns for the padded canvas.
    """
    width, height = img.size
    long_side = max(width, height)

    # Integer arithmetic keeps the long side exactly at canvas_size. The float
    # form int(side * (canvas_size / long_side)) can land just below the target
    # and truncate to canvas_size - 1 (e.g. for a long side of 49 or 98 px).
    # The max(1, ...) clamp stops a very thin image from collapsing to 0 px,
    # which resize() rejects.
    new_size = (
        max(1, width * canvas_size // long_side),
        max(1, height * canvas_size // long_side),
    )
    resized = img.resize(new_size, Image.Resampling.LANCZOS)

    # White square canvas with the resized image centred on it.
    canvas = Image.new('RGB', (canvas_size, canvas_size), (255, 255, 255))
    paste_pos = ((canvas_size - new_size[0]) // 2, (canvas_size - new_size[1]) // 2)
    canvas.paste(resized, paste_pos)

    # Perceptual hash of the padded image.
    return imagehash.phash(canvas)

async def process_image(session, row, pbar):
    """Hash one catalogue image at every configured resolution.

    For each entry of the module-level ``resolutions`` list, the image URL is
    built from ``base_url``, the resolution and the stripped ``small_image``
    value of ``row``; the image is fetched and hashed. Any failure for a
    resolution is printed and recorded as ``None`` so the remaining
    resolutions are still processed.

    Args:
        session: HTTP session handed through to ``fetch_image``.
        row: Mapping-like record providing the ``'small_image'`` path.
        pbar: Progress bar; advanced by one after all resolutions were tried.

    Returns:
        dict mapping each resolution to the hash as a string, or to ``None``
        when fetching or hashing failed.
    """
    hashes_by_resolution = {}
    for res in resolutions:
        url = f"{base_url}/{res}/catalog/product{row['small_image'].strip()}"
        try:
            digest = str(calculate_phash(await fetch_image(session, url)))
        except Exception as exc:
            # Best effort: report the failure and keep going with the next size.
            print(f"Failed to process image at {url}: {exc}")
            digest = None
        hashes_by_resolution[res] = digest
    pbar.update(1)
    return hashes_by_resolution

async def process_sample():
    """Hash every row of the module-level ``sample_df`` concurrently.

    One ``process_image`` coroutine is created per row; all of them share a
    single ``aiohttp.ClientSession`` and one progress bar, and are awaited
    together through ``async_tqdm.gather``.

    Returns:
        The list produced by ``async_tqdm.gather``: one result dict from
        ``process_image`` per sampled row.
    """
    async with aiohttp.ClientSession() as session:
        # Using the bar as a context manager closes it even when gather()
        # propagates an exception, so a failed run leaves no dangling bar.
        with tqdm(total=len(sample_df), desc="Processing Images") as pbar:
            tasks = [process_image(session, row, pbar) for _, row in sample_df.iterrows()]
            return await async_tqdm.gather(*tasks)

# Run the async pipeline and keep one result dict per sampled image.
# Top-level `await` is valid in a notebook cell because IPython runs the cell inside
# an event loop; in a plain Python script use asyncio.run(process_sample()) instead.
results = await process_sample()

Processing Images:   0%|          | 0/100 [00:00<?, ?it/s]


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:15<25:45, 15.61s/it][A
 62%|██████▏   | 62/100 [00:16<00:07,  5.30it/s][A
 86%|████████▌ | 86/100 [00:16<00:01,  8.07it/s][A
100%|██████████| 100/100 [00:17<00:00,  5.88it/s][A


In [8]:
# Analyze results: for every image, compare the hashes of each ordered pair of
# resolutions (self-pairs included, so the matrix diagonal is 1.0).
HASH_BITS = 64.0  # 64 bits in the hash (phash with its default 8x8 hash size)

similarity_data = []
for i, result in tqdm(enumerate(results), total=len(results), desc="Analyzing Results"):
    # Decode each hex string once per image rather than once per pair;
    # resolutions whose download or hashing failed (None) are left out.
    hashes = {res: imagehash.hex_to_hash(result[res]) for res in resolutions if result[res]}
    for res1, res2 in itertools.product(resolutions, repeat=2):
        if res1 in hashes and res2 in hashes:
            # Subtracting two hashes gives the number of differing bits.
            similarity = 1 - (hashes[res1] - hashes[res2]) / HASH_BITS
            similarity_data.append({
                'image_index': i,
                'resolution1': res1,
                'resolution2': res2,
                'similarity': similarity
            })

# Explicit columns keep the frame well-formed even when no hash was computed.
similarity_df = pd.DataFrame(
    similarity_data,
    columns=['image_index', 'resolution1', 'resolution2', 'similarity'],
)

if similarity_df.empty:
    # Every fetch failed: there is nothing to pivot.
    pivot_df = pd.DataFrame()
    print("No hashes were computed; nothing to compare.")
else:
    # Create a pivot table for easier visualization of the comparison matrix
    pivot_df = similarity_df.pivot_table(values='similarity',
                                         index='resolution1',
                                         columns='resolution2',
                                         aggfunc='mean')

    # Print the comparison matrix
    print(pivot_df)

# Optionally, save results to CSV
similarity_df.to_csv('hash_similarity_results_detailed.csv', index=False)
pivot_df.to_csv('hash_similarity_results_matrix.csv')

Analyzing Results:   0%|          | 0/100 [00:00<?, ?it/s]

resolution2  1000x1000   128x128   355x503   500x500    83x110  920x1300
resolution1                                                             
1000x1000     1.000000  0.982812  0.609062  0.992188  0.649375  0.610938
128x128       0.982812  1.000000  0.606250  0.983437  0.646563  0.607812
355x503       0.609062  0.606250  1.000000  0.610938  0.896875  0.982500
500x500       0.992188  0.983437  0.610938  1.000000  0.651563  0.612812
83x110        0.649375  0.646563  0.896875  0.651563  1.000000  0.904062
920x1300      0.610938  0.607812  0.982500  0.612812  0.904062  1.000000
