In [1]:
from utils import create_co_client, get_co_files, get_doc_db_records
import logging
from datetime import datetime
import boto3
from botocore import UNSIGNED
from botocore.config import Config

from aind_data_schema.core.data_description import DataDescription

In [2]:
log_file_name = "/results/log_" + datetime.now().strftime("%Y%m%d_%H%M%S") + ".log"
error_file_name = "/results/error_log_" + datetime.now().strftime("%Y%m%d_%H%M%S") + ".log"
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler(log_file_name)
fh.setLevel(logging.DEBUG)

# create console handler, can set the level to info or warning if desired
# You can remove the console handler if you don't want to see these messages in the
# notebook.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
fh.setFormatter(formatter)
# add the handlers to logger
logger.addHandler(ch)
logger.addHandler(fh)

In [3]:
docdb_query = { "data_description.name": { "$regex": "ecephys", "$options": "i" } }

docdb_records = get_doc_db_records(filter_query=docdb_query)

2024-02-02 22:14:43,592 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): api.allenneuraldynamics.org:443
2024-02-02 22:14:43,998 - urllib3.connectionpool - DEBUG - https://api.allenneuraldynamics.org:443 "GET /v1/metadata/data_assets?limit=0&skip=0&filter=%7B%22data_description.name%22%3A+%7B%22%24regex%22%3A+%22ecephys%22%2C+%22%24options%22%3A+%22i%22%7D%7D&projection=%7B%22_name%22%3A+1%2C+%22_created%22%3A+1%2C+%22_location%22%3A+1%2C+%22data_description.name%22%3A+1%7D HTTP/1.1" 200 493329


In [4]:
co_client = create_co_client()

co_files = get_co_files(co_client, docdb_records.keys())

weird_names = []
for file_key in co_files.keys():
    file = co_files[file_key]
    print(file)
    if "ecephys" not in file['name']:
        print("wrong name: ", file['name'])
        weird_names.append(file['name'])

present_ids = [db_id for db_id in docdb_records.keys() if db_id in co_files.keys()]
logging.critical(f'missing ids: {present_ids}')

2024-02-02 22:14:44,037 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): codeocean.allenneuraldynamics.org:443
2024-02-02 22:14:44,514 - urllib3.connectionpool - DEBUG - https://codeocean.allenneuraldynamics.org:443 "GET /api/v1/data_assets?start=0&limit=1000 HTTP/1.1" 200 None
2024-02-02 22:14:44,519 - aind-codeocean-api - INFO - https://codeocean.allenneuraldynamics.org/api/v1/data_assets?start=0&limit=1000
2024-02-02 22:14:44,953 - urllib3.connectionpool - DEBUG - https://codeocean.allenneuraldynamics.org:443 "GET /api/v1/data_assets?start=1000&limit=1000 HTTP/1.1" 200 None
2024-02-02 22:14:44,957 - aind-codeocean-api - INFO - https://codeocean.allenneuraldynamics.org/api/v1/data_assets?start=1000&limit=1000
2024-02-02 22:14:45,491 - urllib3.connectionpool - DEBUG - https://codeocean.allenneuraldynamics.org:443 "GET /api/v1/data_assets?start=2000&limit=1000 HTTP/1.1" 200 None
2024-02-02 22:14:45,495 - aind-codeocean-api - INFO - https://codeocean.allenneuraldyna

{'id': '0c3ec871-ba12-4c4c-85cf-0f970a384b7a', 'name': 'ecephys_692498_2024-01-17_14-52-53_sorted_2024-01-19_23-19-07', 'custom_metadata': {'data level': 'derived data', 'experiment type': 'ecephys', 'modality': 'Extracellular electrophysiology', 'subject id': '692498'}}
{'id': 'cec0bb4c-c997-4739-960f-05942e313cd4', 'name': 'ecephys_689650_2024-01-17_16-05-08_sorted_2024-01-19_22-27-16', 'custom_metadata': {'data level': 'derived data', 'experiment type': 'ecephys', 'modality': 'Extracellular electrophysiology', 'subject id': '689650'}}
{'id': '553ca886-c4b0-4693-a1c0-f9e74de9a865', 'name': 'ecephys_689650_2024-01-16_16-20-58_sorted_2024-01-19_22-07-57', 'custom_metadata': {'data level': 'derived data', 'experiment type': 'ecephys', 'modality': 'Extracellular electrophysiology', 'subject id': '689650'}}
{'id': '6630e47a-ff63-476a-9b4d-7a4bc8ff23e3', 'name': 'ecephys_693182_2024-01-16_15-08-21_sorted_2024-01-19_22-02-45', 'custom_metadata': {'data level': 'derived data', 'experiment ty

In [5]:
name_issues = []

for key in present_ids:
    if docdb_records[key].data_description['name'] != co_files[key]['name']:
        print(key)
        print("docdb: ", docdb_records[key].data_description['name'])
        print("location: ", docdb_records[key]._location)
        print("co: ", co_files[key]['name'])
        name_issues.append(key)
        
print(len(present_ids))
print(len(name_issues))

0c3ec871-ba12-4c4c-85cf-0f970a384b7a
docdb:  ecephys_692498_2024-01-17_14-52-53_sorted_2024-01-19_23-17-37
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/0c3ec871-ba12-4c4c-85cf-0f970a384b7a
co:  ecephys_692498_2024-01-17_14-52-53_sorted_2024-01-19_23-19-07
cec0bb4c-c997-4739-960f-05942e313cd4
docdb:  ecephys_689650_2024-01-17_16-05-08_sorted_2024-01-19_22-25-32
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/cec0bb4c-c997-4739-960f-05942e313cd4
co:  ecephys_689650_2024-01-17_16-05-08_sorted_2024-01-19_22-27-16
553ca886-c4b0-4693-a1c0-f9e74de9a865
docdb:  ecephys_689650_2024-01-16_16-20-58_sorted_2024-01-19_22-04-50
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/553ca886-c4b0-4693-a1c0-f9e74de9a865
co:  ecephys_689650_2024-01-16_16-20-58_sorted_2024-01-19_22-07-57
6630e47a-ff63-476a-9b4d-7a4bc8ff23e3
docdb:  ecephys_693182_2024-01-16_15-08-21_sorted_2024-01-19_22-01-47
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/6630e47a-ff63-476a-9b4d-7a4bc8ff23e3

co:  ecephys_668755_2023-08-28_13-06-40_sorted-ks2.5_2023-08-31_09-13-52
0235bbc4-ebb7-4529-8bd9-081d556773f7
docdb:  ecephys_644866_2023-02-07_17-08-15_Spike Sorting_2023-09-12_23-06-42
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/0235bbc4-ebb7-4529-8bd9-081d556773f7
co:  ecephys_644866_2023-02-07_17-08-15_sorted-ks2.5_2023-09-12_23-11-25
eaea9aea-fc63-453e-a6ec-2a738ea06c40
docdb:  ecephys_662892_2023-08-23_14-02-12_Spike Sorting_2023-08-27_18-36-52
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/eaea9aea-fc63-453e-a6ec-2a738ea06c40
co:  ecephys_662892_2023-08-23_14-02-12_sorted-ks2.5_2023-08-27_18-42-54
d527db85-39b7-4c4f-a465-9ca499b0ca47
docdb:  ecephys_662892_2023-08-21_12-43-45_Spike Sorting_2023-08-25_16-16-32
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/d527db85-39b7-4c4f-a465-9ca499b0ca47
co:  ecephys_662892_2023-08-21_12-43-45_sorted-ks2.5_2023-08-25_16-23-02
5c1bc1b5-4839-456b-a6c3-227ec0b1d40f
docdb:  ecephys_689383_2023-10-11_14-43-33_sorted_

937a691b-e6bd-46f8-a841-79da26d9bd7d
docdb:  ecephys_684804_2023-08-09_12-45-20_Spike Sorting_2023-08-19_19-23-07
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/937a691b-e6bd-46f8-a841-79da26d9bd7d
co:  ecephys_684804_2023-08-09_12-45-20_sorted-ks2.5_2023-08-19_19-24-06
eed8bc36-e900-43ee-80ef-f413fcc12c46
docdb:  ecephys_634571_2022-08-04_14-27-05
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/eed8bc36-e900-43ee-80ef-f413fcc12c46
co:  ecephys_634571_2022-08-04_14-27-05_sorted-ks25_2023-01-14_09-28-12
6f9b8c55-aa64-4f38-8787-245158df6d0e
docdb:  ecephys_699888_2023-11-06_15-31-29_sorted_2023-11-11_08-51-04
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/6f9b8c55-aa64-4f38-8787-245158df6d0e
co:  ecephys_699888_2023-11-06_15-31-29_sorted_2023-11-11_08-54-40
6bfa2b2f-e154-4b78-ba34-a96c78f9f36b
docdb:  ecephys_699888_2023-11-07_17-21-59_sorted_2023-11-11_08-13-34
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/6bfa2b2f-e154-4b78-ba34-a96c78f9f36b
co:  ece

co:  ecephys_634569_2022-08-09_16-14-38_sorted-ks25_2023-05-11_19-29-48
f1b3c2c3-1aca-4028-a0ae-0a4ca4b73dbd
docdb:  ecephys_634568_2022-08-05_15-59-46
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/f1b3c2c3-1aca-4028-a0ae-0a4ca4b73dbd
co:  ecephys_634568_2022-08-05_15-59-46_sorted-ks25_2023-05-11_15-54-29
cc649a26-cdd7-4403-9119-235822b6d157
docdb:  ecephys_625749_2022-08-03_15-15-06
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/cc649a26-cdd7-4403-9119-235822b6d157
co:  ecephys_625749_2022-08-03_15-15-06_sorted-ks25_2023-05-11_15-52-21
65921397-522c-4a2a-858d-8e41d6710596
docdb:  ecephys_625098_2022-08-15_08-51-36
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/65921397-522c-4a2a-858d-8e41d6710596
co:  ecephys_625098_2022-08-15_08-51-36_sorted_2023-05-11_15-43-44
e0d51e03-1b3d-4de3-a486-7969724a4357
docdb:  ecephys_699890_2023-12-15_12-24-26_sorted_2023-12-16_10-49-49
location:  s3://codeocean-s3datasetsbucket-1u41qdg42ur9/e0d51e03-1b3d-4de3-a486-7969724a435

In [19]:
bucket = "aind-open-data"

def update_names(keys, files):
        s3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
        
        for problem_key in keys:
            print(key)
            problem_file = files[key]
            
            file_key = problem_file._name + "/data_description.json"
            print(file_key)
            s3_response = s3_client.get_object(Bucket=bucket, Key=file_key)
            text = s3_response["Body"].read().decode()
            data_description = DataDescription(json.loads(text))
            
            print("data_desc.name: ", data_description.name)
            print("co name: ", problem_file["name"])

#             body = data_description.json(indent=3).encode("utf-8")

#             if not self.configs.dryrun:
#                 s3_client.put_object(Body=body, Bucket=bucket, Key=file_key)
#             else:
#                 print(f"DRYRUN: s3_client.put_object(Body={body}, Bucket={bucket}, Key={file_key})")

        s3_client.close()

In [20]:
update_names(name_issues, docdb_records)

2024-02-02 22:32:48,279 - botocore.hooks - DEBUG - Event choose-service-name: calling handler <function handle_service_name_alias at 0x7f8df0082310>
2024-02-02 22:32:48,281 - botocore.hooks - DEBUG - Event creating-client-class.s3: calling handler <function add_generate_presigned_post at 0x7f8df01568b0>
2024-02-02 22:32:48,282 - botocore.hooks - DEBUG - Event creating-client-class.s3: calling handler <function lazy_call.<locals>._handler at 0x7f8def5003a0>
2024-02-02 22:32:48,283 - botocore.hooks - DEBUG - Event creating-client-class.s3: calling handler <function add_generate_presigned_url at 0x7f8df0156670>
2024-02-02 22:32:48,283 - botocore.configprovider - DEBUG - Looking for endpoint for s3 via: environment_service
2024-02-02 22:32:48,284 - botocore.configprovider - DEBUG - Looking for endpoint for s3 via: environment_global
2024-02-02 22:32:48,285 - botocore.configprovider - DEBUG - Looking for endpoint for s3 via: config_service
2024-02-02 22:32:48,285 - botocore.configprovider -

2024-02-02 22:32:48,358 - botocore.retryhandler - DEBUG - No retry needed.
2024-02-02 22:32:48,358 - botocore.hooks - DEBUG - Event needs-retry.s3.GetObject: calling handler <bound method S3RegionRedirectorv2.redirect_from_error of <botocore.utils.S3RegionRedirectorv2 object at 0x7f8dec2c78e0>>


ddd3139b-d0d9-422d-8806-0f3cec7681ba
ecephys_667252_2023-09-28_15-00-38_sorted_2023-10-02_02-21-35/data_description.json


NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.