In [1]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting numpy (from imagehash)
  Downloading numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m93.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, imagehash
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.22.4
[31mERRO

In [12]:
import getpass
import json
import os
from decimal import Decimal

import boto3
import requests
from requests.auth import HTTPBasicAuth

# DynamoDB and OpenSearch configuration
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('LuluHashStore')
opensearch_url = 'https://search-imagehash-beqqt46rp2xv6agh7tohq5it7i.us-east-1.es.amazonaws.com'
index_name = 'phash_index'

# Authentication
# Credentials are not stored in the notebook. The user name comes from
# OPENSEARCH_USER (default 'admin') and the password from OPENSEARCH_PASSWORD;
# when that variable is unset or empty the password is prompted for instead.
# NOTE: the password that used to be hard-coded here should be treated as
# leaked and rotated.
auth = HTTPBasicAuth(
    os.environ.get('OPENSEARCH_USER', 'admin'),
    os.environ.get('OPENSEARCH_PASSWORD') or getpass.getpass('OpenSearch password: '),
)

# KNN index definition: index-level k-NN settings plus the field mappings
create_index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 512,
            "knn.algo_param.ef_construction": 512,
        },
    },
    "mappings": {
        "properties": {
            # product identifiers and the image path are stored as keyword fields
            "entity_id": {"type": "keyword"},
            "sku": {"type": "keyword"},
            "small_image": {"type": "keyword"},
            # one vector component per bit produced by hex_to_vector (64 of them)
            "phash": {"type": "knn_vector", "dimension": 64},
        },
    },
}

def check_index_exists():
    """Return True when a GET on the index URL answers with HTTP 200.

    Any other status code (not only 404) is reported as False.
    """
    index_url = f"{opensearch_url}/{index_name}"
    status = requests.get(index_url, auth=auth).status_code
    return status == 200

def create_index():
    """Create the index from ``create_index_body`` unless it already exists.

    The outcome (already present, created, or failed with status and body)
    is printed; nothing is returned.
    """
    if check_index_exists():
        print(f"Index {index_name} already exists.")
        return

    print(f"Creating index: {index_name}")
    reply = requests.put(f"{opensearch_url}/{index_name}", auth=auth, json=create_index_body)
    if reply.status_code != 200:
        print(f"Failed to create index: {reply.status_code} - {reply.text}")
        return
    print(f"Index {index_name} created successfully.")

def hex_to_vector(hex_string):
    """Expand a hexadecimal string into a list of 0/1 integers.

    The parsed value is rendered in binary, most significant bit first, and
    left-padded with zeros to at least 64 digits. Values wider than 64 bits
    produce a correspondingly longer list.

    Raises:
        ValueError: if ``hex_string`` is not valid base-16 text.
    """
    bits = bin(int(hex_string, 16))[2:].zfill(64)
    return list(map(int, bits))

def convert_decimal_to_float(item):
    """Recursively replace every ``Decimal`` inside ``item`` with a ``float``.

    Lists and dicts are rebuilt: dict keys are kept as they are and values are
    converted. Any other value is returned unchanged. Integral Decimals become
    floats too (``Decimal("3")`` -> ``3.0``).
    """
    if isinstance(item, Decimal):
        return float(item)
    if isinstance(item, dict):
        return {key: convert_decimal_to_float(value) for key, value in item.items()}
    if isinstance(item, list):
        return [convert_decimal_to_float(element) for element in item]
    return item

def send_bulk_request(bulk_data):
    """POST a newline-delimited payload to the OpenSearch ``_bulk`` endpoint.

    Args:
        bulk_data: String of alternating action and document JSON lines,
            terminated by a newline.

    The outcome is printed; nothing is returned. The bulk API can answer
    HTTP 200 while rejecting individual actions, so the ``errors`` flag of the
    response body is inspected and per-item failures are reported instead of
    being counted as indexed.
    """
    headers = {"Content-Type": "application/json"}
    response = requests.post(f"{opensearch_url}/_bulk", auth=auth, headers=headers, data=bulk_data)

    if response.status_code != 200:
        print(f"Failed to send bulk request: {response.status_code} - {response.text}")
        return

    # Every item contributes two lines: the action and the document.
    item_count = len(bulk_data.splitlines()) // 2

    try:
        result = response.json()
    except ValueError:
        # Body was not JSON; only the status code is available to judge success.
        result = None

    if not isinstance(result, dict) or not result.get("errors"):
        print(f"Bulk request successful. Items indexed: {item_count}")
        return

    # Each entry of "items" maps the action name (e.g. "index") to its outcome;
    # an outcome carrying an "error" key was rejected.
    failures = [
        outcome
        for entry in result.get("items", [])
        if isinstance(entry, dict)
        for outcome in entry.values()
        if isinstance(outcome, dict) and "error" in outcome
    ]
    print(
        f"Bulk request completed with errors. "
        f"Items indexed: {item_count - len(failures)}, failed: {len(failures)}"
    )
    # Show only the first few failures to keep the cell output readable.
    for failed in failures[:5]:
        print(f"  _id={failed.get('_id')}: {failed.get('status')} - {failed.get('error')}")

def export_data_to_opensearch():
    """Copy every item of the DynamoDB table into the OpenSearch index.

    Ensures the index exists, then scans the table page by page, turns each
    item's ``phash`` hex string into a bit vector and queues an ``index``
    action keyed by ``entity_id``. Queued lines are flushed through
    ``send_bulk_request`` whenever the buffered payload reaches the size
    limit, and once more at the end for any remainder.

    Items with a missing key, or whose values cannot be converted or
    serialised (for example a malformed ``phash``), are reported and skipped
    rather than aborting the export. The progress counters count scanned
    items, including skipped ones.

    NOTE(review): the item is passed through ``convert_decimal_to_float``
    before ``entity_id`` is read, so a numeric ``entity_id`` is sent as a float
    (e.g. ``433359.0``) both as ``_id`` and in the document. Confirm this is
    intended.
    """
    create_index()

    # pagination variables
    batch_size = 100  # no. of items to scan from DynamoDB in each iteration
    scan_kwargs = {"Limit": batch_size}
    total_items_processed = 0

    # bulk request variables
    bulk_actions = []
    bulk_size_limit = 5 * 1024 * 1024  # 5 mb limit for bulk requests
    current_bulk_size = 0

    while True:
        response = table.scan(**scan_kwargs)
        items = response.get('Items', [])

        for item in items:
            try:
                phash_vector = hex_to_vector(item['phash'])
                item = convert_decimal_to_float(item)
                # Serialise each line once; the same strings are used for
                # both the payload and the size accounting.
                action_line = json.dumps({
                    "index": {
                        "_index": index_name,
                        "_id": item['entity_id']
                    }
                })
                document_line = json.dumps({
                    "entity_id": item['entity_id'],
                    "sku": item['sku'],
                    "small_image": item['small_image'],
                    "phash": phash_vector
                })
            except KeyError as e:
                print(f"Skipping item due to missing key: {e}")
                continue
            except (TypeError, ValueError) as e:
                # e.g. a phash that is not valid hex, or a value json cannot encode
                print(f"Skipping item due to invalid data: {e}")
                continue

            bulk_actions.extend([action_line, document_line])
            # +2 for the newline that terminates each of the two lines
            current_bulk_size += len(action_line) + len(document_line) + 2

            if current_bulk_size >= bulk_size_limit:
                send_bulk_request("\n".join(bulk_actions) + "\n")
                bulk_actions = []
                current_bulk_size = 0

        total_items_processed += len(items)
        print(f"Processed {total_items_processed} items so far")

        # A missing LastEvaluatedKey means the scan reached the end of the table.
        exclusive_start_key = response.get('LastEvaluatedKey')
        if not exclusive_start_key:
            break
        scan_kwargs["ExclusiveStartKey"] = exclusive_start_key

    if bulk_actions:
        send_bulk_request("\n".join(bulk_actions) + "\n")

    print(f"Total items processed: {total_items_processed}")

# view data in OpenSearch
def view_opensearch_data(size=10):
    """Print up to ``size`` documents from the index using a match_all search.

    Args:
        size: Maximum number of hits requested (default 10).

    Prints the total hit count followed by each hit's ``_source`` as indented
    JSON, or a failure message when the search does not answer HTTP 200.
    """
    search_url = f"{opensearch_url}/{index_name}/_search"
    payload = json.dumps({"query": {"match_all": {}}, "size": size})

    reply = requests.get(
        search_url,
        auth=auth,
        headers={"Content-Type": "application/json"},
        data=payload,
    )

    if reply.status_code != 200:
        print(f"Failed to retrieve data: {reply.status_code} - {reply.text}")
        return

    hits = reply.json()['hits']
    print(f"Total hits: {hits['total']['value']}")
    for hit in hits['hits']:
        print(json.dumps(hit['_source'], indent=2))

# export data function call: ensures the index exists, then scans the DynamoDB
# table and bulk-indexes its items into OpenSearch
export_data_to_opensearch()

Creating index: phash_index2
Index phash_index2 created successfully.
Processed 100 items so far
Processed 100 items so far
Bulk request successful. Items indexed: 100
Total items processed: 100


In [13]:
def view_opensearch_data(size=10):
    """Run a match_all search on the index and print the returned documents.

    NOTE: this cell re-defines a function that is already defined in the
    export cell; once this cell runs, this later definition is the one in
    effect.

    Args:
        size: Maximum number of hits requested (default 10).

    Prints the total hit count and each hit's ``_source`` as indented JSON,
    or a failure message when the search does not answer HTTP 200.
    """
    body = json.dumps({"query": {"match_all": {}}, "size": size})
    json_headers = {"Content-Type": "application/json"}
    target = f"{opensearch_url}/{index_name}/_search"

    resp = requests.get(target, auth=auth, headers=json_headers, data=body)
    if resp.status_code != 200:
        print(f"Failed to retrieve data: {resp.status_code} - {resp.text}")
        return

    found = resp.json()['hits']
    print(f"Total hits: {found['total']['value']}")
    for document in found['hits']:
        print(json.dumps(document['_source'], indent=2))

In [14]:
# Print documents from the index using the default size of 10
view_opensearch_data()

Total hits: 100
{
  "entity_id": 433359.0,
  "sku": "5317-1-RED",
  "small_image": "/5/3/5317-1-RED_1.jpg",
  "phash": [
    1,
    0,
    1,
    1,
    1,
    0,
    1,
    1,
    1,
    1,
    0,
    0,
    0,
    0,
    0,
    1,
    0,
    1,
    0,
    0,
    1,
    0,
    1,
    1,
    1,
    0,
    0,
    0,
    1,
    1,
    0,
    1,
    0,
    0,
    1,
    1,
    0,
    0,
    1,
    0,
    1,
    0,
    0,
    1,
    1,
    1,
    1,
    0,
    1,
    1,
    0,
    0,
    1,
    0,
    0,
    1,
    0,
    1,
    1,
    0,
    0,
    0,
    1,
    0
  ]
}
{
  "entity_id": 433649.0,
  "sku": "22891-WHITE",
  "small_image": "/2/2/22891-WHITE_1.jpg",
  "phash": [
    1,
    0,
    1,
    1,
    1,
    0,
    1,
    0,
    1,
    1,
    0,
    0,
    0,
    0,
    1,
    1,
    0,
    0,
    1,
    1,
    0,
    0,
    1,
    1,
    0,
    1,
    1,
    0,
    1,
    0,
    0,
    0,
    0,
    0,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    1,
    0,
    0,
    0,
    