In [13]:
import boto3
import pandas as pd
import base64
import json
import requests
from tqdm import tqdm

# Bedrock settings: the Titan image-embedding model id and the AWS region to call
BEDROCK_MODEL_ID = "amazon.titan-embed-image-v1"
REGION = "us-east-1"

# Shared AWS handles used by the functions in this notebook
bedrock_client = boto3.client(
    service_name="bedrock-runtime",
    region_name=REGION,
    endpoint_url=f"https://bedrock-runtime.{REGION}.amazonaws.com",
)
dynamodb = boto3.resource("dynamodb", region_name=REGION)

def _is_resolution_segment(segment):
    """Return True when a URL path segment looks like '<width>x<height>' (e.g. '355x503')."""
    width, separator, height = segment.partition('x')
    return bool(separator) and width.isdigit() and height.isdigit()


def modify_image_url(original_url, resolution='448x448'):
    """
    Modify the image URL to use the specified resolution

    The first path segment of the form '<digits>x<digits>' is replaced. Only
    such segments are considered, so a host name or file name that merely
    contains the letter 'x' is never overwritten.

    Args:
        original_url (str): Original image URL
        resolution (str): Desired image resolution (default: 448x448)

    Returns:
        str: Modified image URL. When the URL has no resolution segment, the
            resolution is inserted right after the first 'cloudfront.net/';
            a URL without that marker is returned unchanged.
    """
    parts = original_url.split('/')

    for position, part in enumerate(parts):
        if _is_resolution_segment(part):
            parts[position] = resolution
            return '/'.join(parts)

    # No resolution segment found: insert one after the host (first occurrence only)
    return original_url.replace('cloudfront.net/', f'cloudfront.net/{resolution}/', 1)

def download_image(url):
    """
    Fetch the raw bytes of an image over HTTP

    The request uses a 10 second timeout. Any requests.RequestException
    (including a non-2xx status raised by raise_for_status) is reported on
    stdout and turned into a None result.

    Args:
        url (str): URL of the image to download

    Returns:
        bytes or None: Response body, or None when the download failed
    """
    payload = None
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        payload = reply.content
    except requests.RequestException as e:
        print(f"Error downloading image from {url}: {e}")
    return payload

def create_image_embedding(image):
    """
    Request an embedding for an image from the Bedrock model BEDROCK_MODEL_ID

    Args:
        image (str): Base64 encoded image; None short-circuits to None

    Returns:
        The "embedding" field of the model response, or None when the input
        is None or the call / response parsing raised an exception
    """
    if image is None:
        return None

    # Serialised outside the try block so a non-serialisable input still raises
    request_body = json.dumps({"inputImage": image})

    result = None
    try:
        raw_response = bedrock_client.invoke_model(
            modelId=BEDROCK_MODEL_ID,
            body=request_body,
            contentType="application/json",
            accept="application/json",
        )
        payload = json.loads(raw_response.get("body").read())

        # A "message" field is reported, but the embedding lookup still runs
        embedding_error = payload.get("message")
        if embedding_error is not None:
            print(f"Error creating embeddings: {embedding_error}")

        result = payload.get("embedding")
    except Exception as e:
        print(f"Error creating embedding: {e}")

    return result
    
def store_embedding_in_dynamodb(product_id, embedding):
    """
    Store image embedding in the 'reverse_image_search' DynamoDB table

    Args:
        product_id (int): Product identifier, written as 'product_id'
        embedding (list): Image embedding vector; stored JSON-encoded as the
            'image_embedding' string attribute

    Returns:
        bool: True when the item was written, False when DynamoDB rejected
            the request with a ClientError
    """
    table = dynamodb.Table('reverse_image_search')

    # ClientError was never imported in this notebook; boto3 exposes it on every
    # client's `exceptions` namespace, so resolve it from the existing resource.
    client_error = dynamodb.meta.client.exceptions.ClientError

    try:
        # Convert embedding list to a string for DynamoDB storage
        embedding_str = json.dumps(embedding)

        table.put_item(
            Item={
                'product_id': product_id,
                'image_embedding': embedding_str
            }
        )
        return True
    except client_error as e:
        print(f"Error storing embedding for product {product_id}: {e}")
        return False

def process_image_embeddings(dataframe):
    """
    Embed and store the image of every row in a dataframe, with progress tracking

    For each row the 'image_one' URL is rewritten with modify_image_url,
    downloaded, base64-encoded, embedded with create_image_embedding and
    written with store_embedding_in_dynamodb. A row that fails at any stage
    (download, embedding, storage, or an unexpected exception) is counted as
    skipped, so stored + skipped equals the number of rows.

    Args:
        dataframe (pd.DataFrame): DataFrame with 'product_id' and 'image_one' columns

    Returns:
        pd.DataFrame: 'product_id' and 'image_embedding' columns, one row per
            embedding that was stored successfully
    """
    # Accumulators for the result frame and the summary
    product_ids = []
    image_embeddings = []
    skipped_count = 0
    stored_count = 0

    # Needed by tqdm because iterrows() is a generator with no length
    total_rows = len(dataframe)

    for index, row in tqdm(dataframe.iterrows(), total=total_rows, desc="Processing Images"):
        try:
            # Modify URL to the default 448x448 resolution
            modified_url = modify_image_url(row['image_one'])

            image_data = download_image(modified_url)
            if image_data is None:
                skipped_count += 1
                continue

            base64_encoded_image = base64.b64encode(image_data).decode('utf-8')
            embedding = create_image_embedding(base64_encoded_image)

            # A row only counts as stored when both the embedding and the write succeed
            if embedding is not None and store_embedding_in_dynamodb(row['product_id'], embedding):
                product_ids.append(row['product_id'])
                image_embeddings.append(embedding)
                stored_count += 1
            else:
                skipped_count += 1

        except Exception as e:
            print(f"Error processing row {index}: {e}")
            skipped_count += 1

    # Create result DataFrame
    result_df = pd.DataFrame({
        'product_id': product_ids,
        'image_embedding': image_embeddings
    })

    # Print summary
    print(f"\nTotal rows processed: {total_rows}")
    print(f"Successful embeddings: {len(result_df)}")
    print(f"Stored in DynamoDB: {stored_count}")
    print(f"Skipped rows: {skipped_count}")

    return result_df


In [18]:
# Load the sample dataset, keeping only the first row seen for each product_id
df = pd.read_csv('sample_dataset.csv').drop_duplicates(subset='product_id', keep='first')

In [20]:
# Run the embedding pipeline over the loaded frame
embeddings_df = process_image_embeddings(dataframe=df)

# Preview the first rows of the returned embeddings
print(embeddings_df.head(n=5))

Processing Images: 100%|██████████| 60/60 [00:08<00:00,  7.40it/s]


Total rows processed: 60
Successful embeddings: 60
Stored in DynamoDB: 60
Skipped rows: 0
   product_id                                    image_embedding
0         385  [0.004117468, -0.013859891, 0.0027899782, -0.0...
1         428  [0.011929379, -0.00026768225, 0.012019073, -0....
2         451  [0.007898778, 0.007898778, -0.0014391841, -0.0...
3         455  [0.041136667, 0.0032840197, 0.016765784, -0.00...
4         467  [0.012511388, 0.011890256, -0.011136023, -0.01...





In [19]:
# Preview the first rows of the input frame that is passed to process_image_embeddings
print(df.head())

    product_id category sub_category sub_sub_category main_color  \
0          385    Women      Dresses              NaN      Black   
4          428    Women      Dresses              NaN      Black   
7          451    Women      Dresses              NaN     Silver   
10         455    Women      Dresses              NaN   8,61,850   
14         467    Women      Dresses              NaN        Red   

                                            image_one  
0   https://d1it09c4puycyh.cloudfront.net/355x503/...  
4   https://d1it09c4puycyh.cloudfront.net/355x503/...  
7   https://d1it09c4puycyh.cloudfront.net/355x503/...  
10  https://d1it09c4puycyh.cloudfront.net/355x503/...  
14  https://d1it09c4puycyh.cloudfront.net/355x503/...  
