In [1]:
# %pip installs into the environment of the running kernel; the version is
# pinned so a fresh run resolves the same ImageHash release.
%pip install imagehash==4.3.1

Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting numpy (from imagehash)
  Downloading numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m115.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, imagehash
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.22.4
[31mERROR: pip's dependenc

In [8]:
import pandas as pd
import imagehash
from PIL import Image
import aiohttp
import asyncio
from io import BytesIO
from tqdm.notebook import tqdm
import boto3
import gc
import json
from botocore.exceptions import ClientError

# Parts of the image URL that process_batch assembles for every row.
base_url = "https://d1it09c4puycyh.cloudfront.net"
dimensions = "707x1000"

# Source rows; rows with no small_image value are dropped up front because
# the image URL is built from that column.
df = pd.read_csv('image_data_b1.csv').dropna(subset=['small_image'])

# DynamoDB table that receives one item per hashed image.
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('hash_store')

async def fetch_image(session, url):
    """Download ``url`` through ``session`` and return it as an RGB PIL image.

    Args:
        session: object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``.
        url: address of the image to download.

    Returns:
        The decoded image converted to ``'RGB'`` mode.

    Raises:
        IOError: if the response status is anything other than 200.
    """
    async with session.get(url) as resp:
        # Guard clause: bail out before reading the body of an error response.
        if resp.status != 200:
            raise IOError(f"Failed to fetch image from {url}, status code: {resp.status}")
        payload = await resp.read()
        return Image.open(BytesIO(payload)).convert('RGB')

def calculate_phash(img, canvas_size=512):
    """Return the perceptual hash of ``img`` after letterboxing it on a white square.

    The image is scaled so that its longer side matches ``canvas_size``
    (aspect ratio preserved), centred on a white ``canvas_size`` x
    ``canvas_size`` RGB canvas, and the canvas is passed to
    ``imagehash.phash``.

    Args:
        img: PIL image to hash.
        canvas_size: side length, in pixels, of the square canvas. Defaults
            to 512, the value that was previously hard-coded.

    Returns:
        The value returned by ``imagehash.phash`` for the padded canvas.
    """
    width, height = img.size
    ratio = canvas_size / max(width, height)

    # int() truncation is kept so results stay comparable with hashes that
    # were already computed. The clamp to 1 covers very elongated images,
    # whose short side would otherwise truncate to 0 and be rejected by resize().
    new_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
    img = img.resize(new_size, Image.Resampling.LANCZOS)

    new_img = Image.new('RGB', (canvas_size, canvas_size), (255, 255, 255))
    # Centre the resized image; the remaining border stays white.
    paste_pos = ((canvas_size - new_size[0]) // 2, (canvas_size - new_size[1]) // 2)
    new_img.paste(img, paste_pos)

    return imagehash.phash(new_img)

async def process_batch(batch_df, batch_number):
    """Hash every image referenced in ``batch_df`` and store the result in DynamoDB.

    For each row the image URL is built from ``base_url``, ``dimensions`` and
    the row's ``small_image`` value, the image is fetched and hashed, and an
    item with ``entity_id``, ``sku``, ``small_image`` and ``phash`` is written
    to ``table``. Rows are handled one after another over a single HTTP
    session; a failure on one row is reported and recorded, and the loop
    moves on to the next row.

    Args:
        batch_df: DataFrame slice with ``entity_id``, ``sku`` and
            ``small_image`` columns.
        batch_number: 1-based batch number, used only for progress output.

    Returns:
        A list of ``{'entity_id': ...}`` dicts, one per row that could not be
        fetched, hashed or stored, so the caller can retry them. Empty when
        every row succeeded.
    """
    failed_rows = []
    async with aiohttp.ClientSession() as session:
        for _, row in tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Processing Batch {batch_number}"):
            try:
                # Built inside the try so that a malformed small_image value
                # (e.g. a non-string) fails only this row, not the whole batch.
                image_url = f"{base_url}/{dimensions}/catalog/product{row['small_image'].strip()}"
                img = await fetch_image(session, image_url)
                phash = calculate_phash(img)

                item = {
                    'entity_id': row['entity_id'],
                    'sku': row['sku'],
                    'small_image': row['small_image'],
                    'phash': str(phash)
                }
                try:
                    table.put_item(Item=item)  # Insert into DynamoDB
                except ClientError as e:
                    print(f"Failed to insert item into DynamoDB: {e}")
                    failed_rows.append({'entity_id': row['entity_id']})

            except Exception as e:
                print(f"Failed to process image for SKU {row['sku']}: {e}")
                failed_rows.append({'entity_id': row['entity_id']})

        print(f"Batch {batch_number} processing complete.")
        if failed_rows:
            print(f"Failed to process {len(failed_rows)} items.")

        gc.collect()  # Freeing up memory

    return failed_rows

async def process_all_batches(start_batch=1, end_batch=None, batch_size=100):
    """Run ``process_batch`` over consecutive slices of ``df``.

    Args:
        start_batch: 1-based number of the first batch to process. Values
            below 1 are treated as 1.
        end_batch: 1-based number of the last batch to process (inclusive).
            ``None`` means "through the final batch"; values past the final
            batch are clamped so no empty batches are run.
        batch_size: number of rows per batch. Defaults to 100, the value that
            was previously hard-coded.

    Returns:
        A list with the failure records returned by the individual batches,
        in batch order. Empty when no batch reported a failure.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Ceiling division: a partial final slice still counts as a batch.
    num_batches = -(-len(df) // batch_size)

    if end_batch is None:
        end_batch = num_batches

    first_batch = max(start_batch, 1)
    last_batch = min(end_batch, num_batches)

    failed_rows = []
    for batch_number in range(first_batch, last_batch + 1):
        offset = (batch_number - 1) * batch_size
        batch_df = df.iloc[offset:offset + batch_size]
        batch_failures = await process_batch(batch_df, batch_number)
        # process_batch may return nothing; only collect an actual list.
        if batch_failures:
            failed_rows.extend(batch_failures)
        gc.collect()

    return failed_rows

In [9]:
# Start at batch 1; end_batch is left at its default, so every batch is processed.
await process_all_batches(start_batch=1)

Processing Batch 1:   0%|          | 0/100 [00:00<?, ?it/s]

Batch 1 processing complete.


Processing Batch 2:   0%|          | 0/100 [00:00<?, ?it/s]

Batch 2 processing complete.


Processing Batch 3:   0%|          | 0/100 [00:00<?, ?it/s]

Batch 3 processing complete.


Processing Batch 4:   0%|          | 0/100 [00:00<?, ?it/s]

Batch 4 processing complete.


Processing Batch 5:   0%|          | 0/100 [00:00<?, ?it/s]

Batch 5 processing complete.


Processing Batch 6:   0%|          | 0/100 [00:00<?, ?it/s]

CancelledError: 