feat: do not copy to /original_metadata if already exists (#69)
* refactor: remove redundant code

* feat: do not copy to /original_metadata if already copied

* remove extra import

* test: remove redundant unit test

* fix incorrect method name

* ci: pin pydantic version <2.7

* test: unit test for build_metadata_record_from_prefix exception

* docs: runs linters

---------

Co-authored-by: jtyoung84 <104453205+jtyoung84@users.noreply.github.com>
helen-m-lin and jtyoung84 committed Jun 7, 2024
1 parent f87cbb6 commit 912f658
Showing 12 changed files with 451 additions and 565 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -20,7 +20,7 @@ dependencies = [
     "boto3",
     "boto3-stubs[s3]",
     "pydantic-settings>=2.0",
-    "pydantic>=2.0",
+    "pydantic>=2.0,<2.7",
     "pymongo==4.3.3",
     "dask==2023.5.0",
     "aind-data-schema==0.33.3",
43 changes: 11 additions & 32 deletions src/aind_data_asset_indexer/aind_bucket_indexer.py
@@ -18,10 +18,9 @@
     build_docdb_location_to_id_map,
     build_metadata_record_from_prefix,
     compute_md5_hash,
-    copy_then_overwrite_core_json_files,
+    cond_copy_then_sync_core_json_files,
     create_metadata_object_key,
     does_s3_object_exist,
-    does_s3_prefix_exist,
     download_json_file_from_s3,
     get_dict_of_file_info,
     get_s3_bucket_and_prefix,
@@ -30,7 +29,6 @@
     is_record_location_valid,
     iterate_through_top_level,
     paginate_docdb,
-    sync_core_json_files,
     upload_metadata_json_str_to_s3,
 )
@@ -123,35 +121,16 @@ def _process_docdb_record(
                 else s3_object_info["e_tag"].strip('"')
             )
             if record_md5_hash != s3_object_hash:
-                copy_exists_in_s3 = does_s3_prefix_exist(
-                    s3_client=s3_client,
+                cond_copy_then_sync_core_json_files(
+                    metadata_json=record_as_json_str,
                     bucket=s3_bucket,
-                    prefix=f"{prefix}/original_metadata",
+                    prefix=prefix,
+                    s3_client=s3_client,
+                    log_flag=True,
+                    copy_original_md_subdir=(
+                        self.job_settings.copy_original_md_subdir
+                    ),
                 )
-                if copy_exists_in_s3:
-                    # if /original_metadata exists, then we only need to
-                    # overwrite the top-level jsons of updated core_fields
-                    sync_core_json_files(
-                        metadata_json=record_as_json_str,
-                        bucket=s3_bucket,
-                        prefix=prefix,
-                        s3_client=s3_client,
-                        log_flag=True,
-                    )
-                else:
-                    # if /original_metadata does not exist, then we need
-                    # to copy and overwrite all the core jsons using the
-                    # new metadata.nd.json
-                    copy_then_overwrite_core_json_files(
-                        metadata_json=record_as_json_str,
-                        bucket=s3_bucket,
-                        prefix=prefix,
-                        s3_client=s3_client,
-                        log_flag=True,
-                        copy_original_md_subdir=(
-                            self.job_settings.copy_original_md_subdir
-                        ),
-                    )
                 logging.info(
                     f"Uploading metadata record for: "
                     f"{docdb_record['location']}"
@@ -301,7 +280,7 @@ def _process_prefix(
                         upsert=True,
                     )
                     logging.info(response.raw_result)
-                    copy_then_overwrite_core_json_files(
+                    cond_copy_then_sync_core_json_files(
                         metadata_json=json.dumps(
                             json_contents, default=str
                         ),
@@ -345,7 +324,7 @@ def _process_prefix(
                     )
                     if new_metadata_contents is not None:
                         # noinspection PyTypeChecker
-                        copy_then_overwrite_core_json_files(
+                        cond_copy_then_sync_core_json_files(
                             metadata_json=new_metadata_contents,
                             bucket=bucket,
                             prefix=s3_prefix,
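
The upsert=True and response.raw_result lines in the _process_prefix hunk above follow the standard pymongo update_one upsert pattern. A hedged illustration — the endpoint, database, and collection names are placeholders, not the project's real configuration:

from pymongo import MongoClient

client = MongoClient(host="localhost", port=27017)  # placeholder endpoint
collection = client["metadata_index"]["data_assets"]  # placeholder names

record = {"_id": "abc-123", "location": "s3://my-bucket/ecephys_123456"}
response = collection.update_one(
    filter={"_id": record["_id"]},
    update={"$set": record},
    upsert=True,  # insert the record if no document matches the filter
)
print(response.raw_result)  # e.g. {'n': 1, 'upserted': 'abc-123', ...}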
8 changes: 2 additions & 6 deletions src/aind_data_asset_indexer/models.py
@@ -72,9 +72,7 @@ def from_param_store(cls, param_store_name: str):
         parameters = response["Parameter"]["Value"]
         parameters_json = json.loads(parameters)
         if "doc_db_secret_name" not in parameters:
-            raise ValueError(
-                "doc_db_secret_name not found in parameters."
-            )
+            raise ValueError("doc_db_secret_name not found in parameters.")
         secrets_client = boto3.client("secretsmanager")
         docdb_secret = secrets_client.get_secret_value(
             SecretId=parameters_json["doc_db_secret_name"]
@@ -91,9 +89,7 @@ def from_param_store(cls, param_store_name: str):

         for secret_key, job_setting in secret_to_job_settings_map.items():
             if secret_key not in docdb_secret_json:
-                raise ValueError(
-                    f"{secret_key} not found in docdb secret."
-                )
+                raise ValueError(f"{secret_key} not found in docdb secret.")
             parameters_json[job_setting] = docdb_secret_json[secret_key]
         return cls.model_validate_json(json.dumps(parameters_json))
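
Both hunks above compress multi-line raises in the settings-resolution flow: a JSON parameter from SSM Parameter Store names a DocDB secret, which is fetched from Secrets Manager and merged into the job settings before validation. A sketch of that flow in plain boto3 — the parameter name and the secret-to-settings mapping are assumptions for illustration:

import json

import boto3

param_store_name = "/aind/data-asset-indexer/settings"  # assumed name

ssm_client = boto3.client("ssm")
response = ssm_client.get_parameter(Name=param_store_name, WithDecryption=True)
parameters_json = json.loads(response["Parameter"]["Value"])
if "doc_db_secret_name" not in parameters_json:
    raise ValueError("doc_db_secret_name not found in parameters.")

secrets_client = boto3.client("secretsmanager")
docdb_secret = secrets_client.get_secret_value(
    SecretId=parameters_json["doc_db_secret_name"]
)
docdb_secret_json = json.loads(docdb_secret["SecretString"])

# Assumed mapping of secret keys onto job-settings fields.
secret_to_job_settings_map = {"host": "doc_db_host", "password": "doc_db_password"}
for secret_key, job_setting in secret_to_job_settings_map.items():
    if secret_key not in docdb_secret_json:
        raise ValueError(f"{secret_key} not found in docdb secret.")
    parameters_json[job_setting] = docdb_secret_json[secret_key]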
@@ -14,7 +14,7 @@
 from aind_data_asset_indexer.models import IndexJobSettings
 from aind_data_asset_indexer.utils import (
     build_metadata_record_from_prefix,
-    copy_then_overwrite_core_json_files,
+    cond_copy_then_sync_core_json_files,
     get_s3_location,
     is_prefix_valid,
     iterate_through_top_level,
@@ -73,7 +73,7 @@ def _process_prefix(self, prefix: str, s3_client: S3Client):
             bucket=bucket,
         )
         if md_record is not None:
-            copy_then_overwrite_core_json_files(
+            cond_copy_then_sync_core_json_files(
                 metadata_json=md_record,
                 bucket=bucket,
                 prefix=prefix,