In [None]:
!pip install imagehash

In [18]:
import pandas as pd
import imagehash
from PIL import Image
import aiohttp
import asyncio
from io import BytesIO
from tqdm.notebook import tqdm
import gc
import json
import boto3
from botocore.exceptions import ClientError

# Handle to the DynamoDB table that receives one item per hashed image.
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('LuluHashStore')

# Load the image catalogue, keeping only rows that have a 'small_image' value
# (that column is used later to build each image URL).
df = pd.read_csv('image_data.csv').dropna(subset=['small_image'])

async def fetch_image(session, url):
    """Download ``url`` through ``session`` and return it as an RGB PIL image.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (an ``aiohttp.ClientSession`` in this file).
        url: Address of the image to download.

    Returns:
        The downloaded image, opened with PIL and converted to ``'RGB'``.

    Raises:
        IOError: If the response status is anything other than 200.
    """
    async with session.get(url) as response:
        # Guard clause: only a 200 response is treated as a usable download.
        if response.status != 200:
            raise IOError(f"Failed to fetch image from {url}, status code: {response.status}")

        payload = await response.read()
        decoded = Image.open(BytesIO(payload))
        return decoded.convert('RGB')
            
async def calculate_phash(session, image_url):
    """Fetch an image and return its perceptual hash as a string.

    The image is scaled so that its longer side is 512 px (aspect ratio
    preserved), centred on a white 512x512 canvas, and hashed with
    ``imagehash.phash``.

    Args:
        session: HTTP session passed through to ``fetch_image``.
        image_url: URL of the image to hash.

    Returns:
        ``str()`` of the computed hash, or ``None`` if fetching or processing
        fails. Failures are printed rather than raised so that one bad image
        does not abort the caller's loop.
    """
    canvas_side = 512
    try:
        img = await fetch_image(session, image_url)

        ratio = canvas_side / max(img.size)
        # Clamp each side to at least 1 px: for very elongated images the
        # int() truncation would otherwise yield a 0-pixel side, which
        # resize() rejects, and the image would be dropped.
        new_size = (
            max(1, int(img.size[0] * ratio)),
            max(1, int(img.size[1] * ratio)),
        )
        img = img.resize(new_size, Image.Resampling.LANCZOS)

        # Pad onto a white square so every image is hashed at the same size.
        new_img = Image.new('RGB', (canvas_side, canvas_side), (255, 255, 255))
        paste_pos = (
            (canvas_side - new_size[0]) // 2,
            (canvas_side - new_size[1]) // 2,
        )
        new_img.paste(img, paste_pos)

        hash_value = imagehash.phash(new_img)
        return str(hash_value)
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip it.
        print(f"Failed to process image at {image_url}: {e}")
        return None
    

# Prefix of every image URL built in process_batch.
base_url = "https://d1it09c4puycyh.cloudfront.net"
# Size segment placed between base_url and the catalog path; presumably
# WIDTHxHEIGHT of the requested rendition (TODO confirm). Chosen because it is
# the most commonly used size; larger dimensions are better when available.
dimensions = "355x503"

async def process_batch(batch_df, batch_number):
    """Hash every image referenced in ``batch_df`` and store the results.

    For each row, the image URL is built from ``base_url``, ``dimensions`` and
    the row's ``small_image`` path, the perceptual hash is computed, and an
    item with ``entity_id``, ``sku``, ``small_image`` and ``phash`` is written
    to the DynamoDB ``table``. Rows are processed one at a time.

    Args:
        batch_df: DataFrame slice with ``entity_id``, ``sku`` and
            ``small_image`` columns.
        batch_number: Batch label used only in progress and log output.

    Returns:
        None. Progress and failures are reported via ``print``.
    """
    async with aiohttp.ClientSession() as session:
        failed_rows = []

        rows = tqdm(
            batch_df.iterrows(),
            total=len(batch_df),
            desc=f"Processing Batch {batch_number}",
        )
        for _, row in rows:
            image_url = f"{base_url}/{dimensions}/catalog/product{row['small_image'].strip()}"
            phash = await calculate_phash(session, image_url)
            if phash is None:
                # Hashing failed (details already printed by calculate_phash).
                # Record it so the end-of-batch summary counts every row that
                # was not stored, not only DynamoDB write failures.
                failed_rows.append({'entity_id': row['entity_id']})
                continue

            item = {
                'entity_id': row['entity_id'],
                'sku': row['sku'],
                'small_image': row['small_image'],
                'phash': phash
            }
            try:
                table.put_item(Item=item)
            except ClientError as e:
                print(f"Failed to insert item into DynamoDB: {e}")
                failed_rows.append({'entity_id': row['entity_id']})

        print(f"Batch {batch_number} processing complete.")
        if failed_rows:
            print(f"Failed to process {len(failed_rows)} items.")

        gc.collect()  # freeing up memory

async def process_all_batches(start_batch=1, end_batch=None, batch_size=100):
    """Run ``process_batch`` over consecutive slices of the module-level ``df``.

    Batches are numbered from 1. The requested range is clamped to the batches
    that actually exist, so an out-of-range ``start_batch`` or ``end_batch``
    no longer triggers runs over empty slices.

    Args:
        start_batch: 1-based number of the first batch to process.
        end_batch: 1-based number of the last batch to process (inclusive);
            ``None`` means through the final batch.
        batch_size: Number of rows per batch.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Ceiling division: a trailing partial batch still counts as a batch.
    num_batches = -(-len(df) // batch_size)

    first_batch = max(start_batch, 1)
    last_batch = num_batches if end_batch is None else min(end_batch, num_batches)

    for batch_number in range(first_batch, last_batch + 1):
        offset = (batch_number - 1) * batch_size
        batch_df = df.iloc[offset:offset + batch_size]
        await process_batch(batch_df, batch_number)

        gc.collect()

In [None]:
# Process only batch 1 (start and end are both 1). The bare top-level `await`
# relies on the notebook running cells inside an event loop.
await process_all_batches(start_batch=1, end_batch=1)