Skip to content

Commit

Permalink
feat: add check for _id in existing metadata.nd.json (#65)
Browse files Browse the repository at this point in the history
  • Loading branch information
helen-m-lin committed May 31, 2024
1 parent 04eb5f6 commit 5b8098a
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 21 deletions.
47 changes: 26 additions & 21 deletions src/aind_data_asset_indexer/aind_bucket_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,8 @@ def _process_prefix(
Processes a prefix in S3.
# If metadata record exists in S3 and DocDB, do nothing.
# If record is in S3 but not DocDb, then copy it to DocDb if the
# location in the metadata record matches the actual location.
# Otherwise, log a warning.
# location in the metadata record matches the actual location and
# the record has an _id field. Otherwise, log a warning.
# If record does not exist in both DocDB and S3, build a new metadata
# file and save it to S3 (assume Lambda function will save to DocDB).
# In both cases above, we also copy the original core json files to a
Expand Down Expand Up @@ -285,25 +285,30 @@ def _process_prefix(
collection = db[
self.job_settings.doc_db_collection_name
]
response = collection.update_one(
{"_id": json_contents["_id"]},
{"$set": json_contents},
upsert=True,
)
logging.info(response.raw_result)
# ensure core jsons are synced with metadata.nd.json
copy_then_overwrite_core_json_files(
metadata_json=json.dumps(
json_contents, default=str
),
bucket=bucket,
prefix=s3_prefix,
s3_client=s3_client,
log_flag=True,
copy_original_md_subdir=(
self.job_settings.copy_original_md_subdir
),
)
if "_id" in json_contents:
response = collection.update_one(
{"_id": json_contents["_id"]},
{"$set": json_contents},
upsert=True,
)
logging.info(response.raw_result)
copy_then_overwrite_core_json_files(
metadata_json=json.dumps(
json_contents, default=str
),
bucket=bucket,
prefix=s3_prefix,
s3_client=s3_client,
log_flag=True,
copy_original_md_subdir=(
self.job_settings.copy_original_md_subdir
),
)
else:
logging.warning(
f"Metadata record for {location} "
f"does not have an _id field!"
)
else:
logging.warning(
f"Location field in record "
Expand Down
54 changes: 54 additions & 0 deletions tests/test_aind_bucket_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,60 @@ def test_process_prefix_no_record_yes_file_good_file(
)
mock_upload_metadata_json_str_to_s3.assert_not_called()

@patch(
    "aind_data_asset_indexer.aind_bucket_indexer."
    "upload_metadata_json_str_to_s3"
)
@patch(
    "aind_data_asset_indexer.aind_bucket_indexer."
    "copy_then_overwrite_core_json_files"
)
@patch(
    "aind_data_asset_indexer.aind_bucket_indexer."
    "download_json_file_from_s3"
)
@patch("aind_data_asset_indexer.aind_bucket_indexer.does_s3_object_exist")
@patch("aind_data_asset_indexer.aind_bucket_indexer.MongoClient")
@patch("boto3.client")
@patch("logging.warning")
def test_process_prefix_no_record_yes_file_good_file_no__id(
    self,
    mock_log_warn: MagicMock,
    mock_s3_client: MagicMock,
    mock_docdb_client: MagicMock,
    mock_does_s3_object_exist: MagicMock,
    mock_download_json_file_from_s3: MagicMock,
    mock_copy_then_overwrite_core_json_files: MagicMock,
    mock_upload_metadata_json_str_to_s3: MagicMock,
):
    """Tests _process_prefix method when there is no record in DocDb,
    there is a metadata.nd.json file in S3, and the file can be
    serialized to json, but there is no _id in the file. Nothing should
    be written to DocDb or S3; only a warning should be logged."""
    # Patch decorators are applied bottom-up, so the mock arguments above
    # are listed in the reverse order of the decorators.
    mock_db = MagicMock()
    mock_docdb_client.__getitem__.return_value = mock_db
    mock_collection = MagicMock()
    mock_db.__getitem__.return_value = mock_collection

    # The file exists in S3 and parses, but its _id field is missing.
    mock_does_s3_object_exist.return_value = True
    mocked_downloaded_record = deepcopy(self.example_md_record)
    del mocked_downloaded_record["_id"]
    mock_download_json_file_from_s3.return_value = mocked_downloaded_record

    # Empty map: no DocDb record is known for any location.
    location_to_id_map = {}
    self.basic_job._process_prefix(
        s3_prefix="ecephys_642478_2023-01-17_13-56-29",
        docdb_client=mock_docdb_client,
        s3_client=mock_s3_client,
        location_to_id_map=location_to_id_map,
    )
    mock_collection.assert_not_called()
    # assert_not_called() on the collection mock only checks that the mock
    # object itself was never invoked; it does not cover its methods. The
    # write to DocDb happens through collection.update_one, so assert on
    # that explicitly to verify no upsert was attempted.
    mock_collection.update_one.assert_not_called()
    mock_copy_then_overwrite_core_json_files.assert_not_called()
    mock_upload_metadata_json_str_to_s3.assert_not_called()
    mock_log_warn.assert_called_once_with(
        "Metadata record for s3://aind-ephys-data-dev-u5u0i5/"
        "ecephys_642478_2023-01-17_13-56-29 does not have an _id field!"
    )

@patch(
"aind_data_asset_indexer.aind_bucket_indexer."
"upload_metadata_json_str_to_s3"
Expand Down

0 comments on commit 5b8098a

Please sign in to comment.