In [7]:
import boto3
import requests
import json
from decimal import Decimal
from requests.auth import HTTPBasicAuth

# Custom JSON encoder to handle Decimal types
class DecimalEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        return super(DecimalEncoder, self).default(obj)

# OpenSearch Configuration
opensearch_url = 'https://search-reverseimagesearch-j3nx2t2f42fy7wfayhbh3zyenq.aos.us-east-1.on.aws'  # Replace with your OpenSearch endpoint
index_name = 'image_embeddings_index'  # Choose a descriptive index name
auth = HTTPBasicAuth('admin', '1337@Open')  # Replace with your credentials

# DynamoDB Configuration
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('reverse_image_search')  # Your DynamoDB table name

# OpenSearch Index Creation Configuration
create_index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 512,
            "knn.algo_param.ef_construction": 512
        }
    },
    "mappings": {
        "properties": {
            "product_id": {
                "type": "long"  # Matching your DynamoDB primary key type
            },
            "vector": {
                "type": "knn_vector",
                "dimension": 1024,  # Adjust based on your embedding size
                "method": {
                    "name": "hnsw",  # Use hnsw instead of nmslib
                    "space_type": "l2"  # Euclidean distance
                }
            }
        }
    }
}

def convert_embedding(embedding):
    """
    Convert embedding to a list of floats
    """
    if isinstance(embedding, str):
        try:
            # Parse string and convert Decimals to floats
            parsed_embedding = json.loads(
                embedding, 
                parse_float=float
            )
            return parsed_embedding
        except json.JSONDecodeError:
            print(f"Failed to parse embedding: {embedding}")
            return None
    
    if isinstance(embedding, list):
        # Convert any Decimal types to float
        return [float(x) if isinstance(x, Decimal) else x for x in embedding]
    
    print(f"Unexpected embedding type: {type(embedding)}")
    return None

def export_embeddings_to_opensearch():
    """
    Export embeddings from DynamoDB to OpenSearch
    """
    create_index()
    
    # Pagination variables
    batch_size = 100  # number of items to scan from DynamoDB in each iteration
    exclusive_start_key = None
    total_items_processed = 0
    total_items_indexed = 0

    # Bulk request variables
    bulk_actions = []
    bulk_size_limit = 5 * 1024 * 1024  # 5 mb limit for bulk requests
    current_bulk_size = 0

    while True:
        # Scan DynamoDB table
        if exclusive_start_key:
            response = table.scan(
                ExclusiveStartKey=exclusive_start_key, 
                Limit=batch_size
            )
        else:
            response = table.scan(Limit=batch_size)
        
        items = response.get('Items', [])
        
        for item in items:
            try:
                # Convert embedding
                embedding = convert_embedding(item.get('image_embedding'))
                
                if embedding is None or len(embedding) == 0:
                    print(f"Skipping item with invalid embedding: {item}")
                    continue
                
                # Prepare OpenSearch bulk indexing actions
                action = {
                    "index": {
                        "_index": index_name,
                        "_id": str(item['product_id'])
                    }
                }
                
                # Prepare data for indexing
                data = {
                    "product_id": item['product_id'],
                    "vector": embedding
                }
                
                # Add to bulk actions using custom JSON encoder
                bulk_actions.extend([
                    json.dumps(action), 
                    json.dumps(data, cls=DecimalEncoder)
                ])
                current_bulk_size += len(bulk_actions[-2]) + len(bulk_actions[-1])
                
                # Send bulk request if size limit is reached
                if current_bulk_size >= bulk_size_limit:
                    send_bulk_request("\n".join(bulk_actions) + "\n")
                    bulk_actions = []
                    current_bulk_size = 0
                
                total_items_indexed += 1
            
            except Exception as e:
                print(f"Error processing item: {e}")
                continue
        
        total_items_processed += len(items)
        print(f"Processed {total_items_processed} items so far")
        
        # Check for more items to process
        exclusive_start_key = response.get('LastEvaluatedKey')
        if not exclusive_start_key:
            break
    
    # Send any remaining items
    if bulk_actions:
        send_bulk_request("\n".join(bulk_actions) + "\n")
    
    print(f"Total items processed: {total_items_processed}")
    print(f"Total items indexed: {total_items_indexed}")

def create_index():
    """
    Create OpenSearch index if it doesn't exist
    """
    if not check_index_exists():
        print(f"Creating index: {index_name}")
        response = requests.put(
            f"{opensearch_url}/{index_name}", 
            auth=auth, 
            json=create_index_body,
            headers={"Content-Type": "application/json"}
        )
        if response.status_code in [200, 201]:
            print(f"Index {index_name} created successfully.")
        else:
            print(f"Failed to create index: {response.status_code} - {response.text}")
    else:
        print(f"Index {index_name} already exists.")

def check_index_exists():
    """
    Check if the OpenSearch index already exists
    """
    try:
        response = requests.get(f"{opensearch_url}/{index_name}", auth=auth)
        return response.status_code == 200
    except Exception as e:
        print(f"Error checking index: {e}")
        return False

def send_bulk_request(bulk_data):
    """
    Send bulk indexing request to OpenSearch
    """
    headers = {"Content-Type": "application/json"}
    response = requests.post(
        f"{opensearch_url}/_bulk", 
        auth=auth, 
        headers=headers, 
        data=bulk_data
    )
    
    if response.status_code == 200:
        print(f"Bulk request successful. Items indexed: {len(bulk_data.splitlines()) // 2}")
    else:
        print(f"Failed to send bulk request: {response.status_code} - {response.text}")

In [8]:
export_embeddings_to_opensearch()

Index image_embeddings_index already exists.
Processed 60 items so far
Bulk request successful. Items indexed: 60
Total items processed: 60
Total items indexed: 60


In [9]:
view_opensearch_data()

Total hits: 60
{
  "product_id": 978.0,
  "vector": [
    0.026627311,
    0.03826574,
    0.017193131,
    -0.008508396,
    0.016311433,
    0.03667868,
    -0.022395156,
    -0.01789849,
    0.042850573,
    -0.062776975,
    0.039500117,
    0.0021160778,
    -0.02856705,
    0.0034606687,
    -0.00017771746,
    0.023276854,
    0.010359964,
    0.00943418,
    -0.007274017,
    0.0043203253,
    0.072652005,
    0.03685502,
    0.02856705,
    -0.0018736105,
    0.015870582,
    -0.0010194646,
    0.04655371,
    -0.04690639,
    0.029801428,
    -0.034033585,
    0.009919114,
    -0.0081116315,
    0.04902247,
    0.0006254553,
    -0.041087177,
    -0.050080508,
    -0.0044746227,
    -0.0085524805,
    0.023805875,
    0.0058192136,
    0.033680905,
    0.006304148,
    -0.009566435,
    0.034033585,
    -0.03667868,
    0.010007285,
    0.014459864,
    -0.012960976,
    -0.011373918,
    0.007979376,
    0.010139539,
    0.0012949955,
    0.05607606,
    -0.012960976,
    -0