In [None]:
# Notebook shell commands: install the PyTorch packages imported in the next cell.
# NOTE(review): the other imports (pandas, aiohttp, boto3, tqdm, PIL) are not
# installed here — presumably they are already present in the environment.
!pip install torch
!pip install torchvision

In [None]:
import pandas as pd
import torch
from torchvision import models, transforms
from PIL import Image
import aiohttp
import asyncio
from io import BytesIO
from tqdm.notebook import tqdm
import gc
import os
import json
import boto3
from botocore.exceptions import ClientError

# DynamoDB handle (us-east-1) and the table that receives one item per image
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('LuluFeatureStore')

# Load the image catalogue and discard rows that have no 'small_image' value,
# since that column is needed to build each image URL
df = pd.read_csv('image_data.csv').dropna(subset=['small_image'])


# ResNet-50 used as a fixed feature extractor.
# `weights=` replaces the deprecated `pretrained=True` flag (torchvision >= 0.13);
# IMAGENET1K_V1 is the checkpoint `pretrained=True` used to load, so the
# features produced are the same as before.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Keep every child module except the last one (the classification layer), so
# the network outputs features instead of class scores.
model = torch.nn.Sequential(*list(model.children())[:-1])
model.eval()  # evaluation mode: this model is only used for inference

# Image -> tensor pipeline applied to every downloaded image before inference.
preprocess = transforms.Compose([
    transforms.Resize(224),      # shorter side scaled to 224 px
    transforms.CenterCrop(224),  # square 224x224 crop from the centre
    transforms.ToTensor(),
    # Per-channel normalisation; these are the mean/std values conventionally
    # used with torchvision's ImageNet-trained models.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

async def fetch_image(session, url):
    """Download `url` through `session` and return it as an RGB PIL image.

    Args:
        session: object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``read()``
            (an ``aiohttp.ClientSession`` in this notebook).
        url: address of the image to download.

    Returns:
        The decoded image converted to RGB mode.

    Raises:
        IOError: if the response status is anything other than 200.
        Errors raised while reading or decoding the body are not caught here.
    """
    async with session.get(url) as response:
        # Guard clause: any non-200 status is treated as a failed download.
        if response.status != 200:
            raise IOError(f"Failed to fetch image from {url}, status code: {response.status}")

        raw_bytes = await response.read()
        decoded = Image.open(BytesIO(raw_bytes))
        return decoded.convert('RGB')

async def extract_features(session, image_url):
    """Download one image and run it through `model` to get a feature vector.

    Args:
        session: HTTP session handed on to `fetch_image`.
        image_url: address of the image to process.

    Returns:
        The model output flattened into a 1-D NumPy array, or ``None`` when
        any step (download, preprocessing, inference) raises; in that case
        the error is printed rather than propagated.
    """
    try:
        picture = await fetch_image(session, image_url)
        # unsqueeze(0) adds a leading dimension so the single image forms a batch of one
        batch = preprocess(picture).unsqueeze(0)
        with torch.no_grad():  # inference only, so no gradient tracking
            return model(batch).flatten().numpy()
    except Exception as e:
        # Best-effort by design: a bad image must not abort the whole run.
        print(f"Failed to process image at {image_url}: {e}")
        return None

# Host that serves the product images; process_batch prepends it to every image URL.
base_url = "https://d1it09c4puycyh.cloudfront.net"
# Size segment placed in the URL path right after the host; 224x224 matches the
# crop size used by `preprocess`.
dimensions = "224x224" # dimensions accepted by resnet50

async def process_batch(batch_df, batch_number):
    """Extract features for every row of `batch_df` and store them in DynamoDB.

    For each row the image URL is built from `base_url`, `dimensions` and the
    row's stripped 'small_image' value, the features are computed with
    `extract_features`, and one item (entity_id, sku, small_image, features
    as a JSON string) is written to `table`.

    Rows fail in two ways, and both are now counted in the batch summary:
    the image could not be downloaded/processed (`extract_features` returned
    ``None``), or DynamoDB rejected the write with a ``ClientError``.
    Previously only the DynamoDB failures were counted, so the printed total
    understated the number of rows that were not stored.

    Args:
        batch_df: DataFrame slice with 'entity_id', 'sku' and 'small_image' columns.
        batch_number: 1-based batch label, used only for progress output.

    Returns:
        None. Progress and failures are reported via ``print``.
    """
    failed_rows = []

    async with aiohttp.ClientSession() as session:
        progress = tqdm(batch_df.iterrows(), total=len(batch_df), desc=f"Processing Batch {batch_number}")
        for _, row in progress:
            image_url = f"{base_url}/{dimensions}/catalog/product{row['small_image'].strip()}"
            features = await extract_features(session, image_url)

            if features is None:
                # extract_features already printed the reason; record the row
                # so the summary below reflects download/inference failures too.
                failed_rows.append({'entity_id': row['entity_id']})
                continue

            item = {
                'entity_id': row['entity_id'],
                'sku': row['sku'],
                'small_image': row['small_image'],
                'features': json.dumps(features.tolist()),  # serializing features as json string
            }
            try:
                table.put_item(Item=item)  # making a new entry in the table
            except ClientError as e:
                print(f"Failed to insert item into DynamoDB: {e}")
                failed_rows.append({'entity_id': row['entity_id']})

    print(f"Batch {batch_number} processing complete.")
    if failed_rows:
        print(f"Failed to process {len(failed_rows)} items.")

    gc.collect()  # freeing up memory

async def process_all_batches(start_batch=1, end_batch=None, batch_size=100):
    """Run `process_batch` over consecutive slices of the global `df`.

    Batches are numbered from 1. The requested range is clamped to the
    batches that actually exist: a `start_batch` below 1 is treated as 1
    (negative offsets would otherwise select rows from the end of `df`), and
    an `end_batch` past the last batch stops at the last batch instead of
    submitting empty slices.

    Args:
        start_batch: first batch to process (1-based, inclusive).
        end_batch: last batch to process (inclusive); ``None`` means the last
            available batch.
        batch_size: number of rows per batch. Defaults to 100, the value that
            was previously hard-coded.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    num_batches = (len(df) + batch_size - 1) // batch_size  # ceiling division

    first_batch = max(start_batch, 1)
    last_batch = num_batches if end_batch is None else min(end_batch, num_batches)

    for batch_number in range(first_batch, last_batch + 1):
        offset = (batch_number - 1) * batch_size
        batch_df = df.iloc[offset:offset + batch_size]
        await process_batch(batch_df, batch_number)

        gc.collect()  # release memory held by the finished batch

In [None]:
# Process only batches 101 through 200 (1-based, inclusive) in this run.
# Top-level `await` works here because the notebook runs cells inside an event loop.
await process_all_batches(start_batch=101, end_batch=200)