In [None]:
from   collections import defaultdict
import hashlib
import re
import time

from   datalabs.access.aws import AWSClient
import pandas

In [None]:
import json
# Load the mapping records from disk. The encoding is given explicitly so the
# read does not depend on the platform's default text encoding.
with open("snomed_cpt_mappings.json", encoding="utf-8") as file:
    items = json.load(file)

# Loader

## Create Hash Items

In [None]:
# Start from a shallow copy: appending to an alias of `items` would grow the
# very list being iterated, leave the hash records inside `items`, and add a
# second set of hash records every time this cell is re-run.
hashed_items = list(items)

for item in items:
    # Only records whose sort key starts with one of these prefixes get a
    # companion hash record.
    if item["sk"].startswith(("UNMAPPABLE:", "CPT:")):
        # sort_keys=True makes the serialized text, and therefore the digest,
        # independent of the key order inside the record.
        md5 = hashlib.md5(json.dumps(item, sort_keys=True).encode('utf-8')).hexdigest()
        # The hash record is keyed by "<pk>:<sk>" of the record it describes.
        hashed_items.append(dict(pk=f'{item["pk"]}:{item["sk"]}', sk=f"MD5:{md5}"))

hashed_items

## Verify No Duplicates

In [None]:
from bisect import bisect_left

In [None]:
# Serialize every record once, with sorted keys so equal records produce
# equal text.
serialized_items = [json.dumps(record, sort_keys=True) for record in hashed_items]

json_items = sorted(serialized_items)
unique_items = set(serialized_items)

duplicate_json_items = []

for candidate in unique_items:
    # bisect_left finds the first occurrence in the sorted list; the record is
    # duplicated exactly when the entry right after it is the same text. The
    # slice is empty when the first occurrence is the last element.
    successor = bisect_left(json_items, candidate) + 1
    if json_items[successor:successor + 1] == [candidate]:
        duplicate_json_items.append(candidate)

duplicate_json_items

## Initial Load

In [None]:
def write_items(items: list, table):
    with table.batch_writer() as batch:
        for item in hashed_items:
            batch.put_item(Item=item)


In [None]:
start_time = time.perf_counter()

with AWSClient("dynamodb").resource as db:
    table = db.Table("CPT-API-snomed-dev")

    write_items(items, table)

time.perf_counter() - start_time

## Fetch Hashes

In [None]:
def paginate(db, statement):
    """Yield the items of every result page produced by ``statement``.

    The statement is sent through ``db.execute_statement`` with ``Limit=600000``.
    While a response contains ``"NextToken"``, the same request is repeated
    with that token added, and the items of each response are yielded in order.

    Args:
        db: Client object exposing ``execute_statement(**kwargs)``.
        statement: Statement text passed as ``Statement``.

    Yields:
        Each element of ``response["Items"]``, page after page.
    """
    request = dict(Statement=statement, Limit=600000)

    while True:
        page = db.execute_statement(**request)

        yield from page["Items"]

        # No continuation token means this was the last page.
        if "NextToken" not in page:
            return

        request["NextToken"] = page["NextToken"]

In [None]:
start_time = time.perf_counter()

with AWSClient("dynamodb") as dynamodb:
    # paginate() is a generator, so it is drained into a list here, inside the
    # `with` block, before the timing is reported.
    results = list(
        paginate(dynamodb, "SELECT * FROM \"CPT-API-snomed-sbx\".\"SearchIndex\" WHERE begins_with(\"sk\", 'MD5:')")
    )

print(time.perf_counter() - start_time)
results

In [None]:
# Pivot the fetched records into one list per attribute name. Every attribute
# value is read through its "S" entry.
current_hashes_columns = defaultdict(list)

for record in results:
    for name in record:
        current_hashes_columns[name].append(record[name]["S"])

current_hashes = pandas.DataFrame(current_hashes_columns)
current_hashes

## Mock Incoming Hashes

In [None]:
# Work on a copy so `current_hashes` itself is not modified by the mock-up.
incoming_hashes = current_hashes.copy()

# Keys of two existing hash rows; they are re-added below under different
# hash values while their original rows are dropped.
rehashed_pks = [
    incoming_hashes.pk[incoming_hashes.sk == old_sk].iloc[0]
    for old_sk in ("MD5:133a480aaf6b8d4de199db48813dc0e4", "MD5:4e4842764faf66388936416f61c9255b")
]

# Four extra rows: two with keys made up for this mock, two reusing the keys
# looked up above.
mock_rows = pandas.DataFrame(
    dict(
        sk=["MD5:1234567890abcdefghijklmnopqrstuv", "MD5:abcdefghijklmnopqurstvwxyz123456", "MD5:d4de199db48813dc0e4133a480aaf6b8", "MD5:388936416f61c9255b4e4842764faf66"],
        pk=['CONCEPT:123456789:CPT:12345', 'CONCEPT:987654321:CPT:54321', *rehashed_pks]
    )
)

# Hash values removed from the incoming set.
dropped_sks = ("MD5:133a480aaf6b8d4de199db48813dc0e4", "MD5:4e4842764faf66388936416f61c9255b", "MD5:adfec4b4db82f70b2fc7d4c6f261cc7b", "MD5:d176d88ac597ec61d825ff50bd02e02f")

incoming_hashes = pandas.concat((incoming_hashes, mock_rows))
incoming_hashes = incoming_hashes[~incoming_hashes.sk.isin(dropped_sks)]

incoming_hashes

## Sift Hash Records

In [None]:
start_time = time.perf_counter()

# Current rows whose key does not occur in the incoming data at all.
# (The frame is named `incoming_hashes`; `incoming_hashes_df` is never defined.)
deleted_hashes = current_hashes[~current_hashes.pk.isin(incoming_hashes.pk)]

# Incoming rows whose hash value is not among the current hash values.
new_or_updated_hashes = incoming_hashes[~incoming_hashes.sk.isin(current_hashes.sk)]

# Of those, rows whose key is not present in the current data ...
new_hashes = new_or_updated_hashes[~new_or_updated_hashes.pk.isin(current_hashes.pk)]

# ... and rows whose key is already present, i.e. same key, different hash.
updated_hashes = new_or_updated_hashes[new_or_updated_hashes.pk.isin(current_hashes.pk)]

print(time.perf_counter() - start_time)

In [None]:
# Display the `deleted_hashes` frame built in the sift cell.
deleted_hashes

In [None]:
# Display the `new_hashes` frame built in the sift cell.
new_hashes

In [None]:
# Display the `updated_hashes` frame built in the sift cell.
updated_hashes

## Get Deleted Keywords for Hashes

In [None]:
start_time = time.perf_counter()
deleted_keywords = []

with AWSClient("dynamodb") as dynamodb:
    for pk in deleted_hashes.pk:
        # A separate name is used so the `results` list produced by the hash
        # fetch cell is not overwritten.
        keyword_items = paginate(dynamodb, f"SELECT * FROM \"CPT-API-snomed-sbx\" WHERE pk = '{pk}' AND begins_with(\"sk\", 'KEYWORD:')")

        # paginate() is a generator and runs nothing until it is consumed.
        # Materialize it here, once per key and inside the `with` block, so
        # every key's rows are collected rather than only the last one.
        deleted_keywords.append(list(keyword_items))

print(time.perf_counter() - start_time)
# One inner list of rows per deleted key, in the order of `deleted_hashes.pk`.
deleted_keywords