## Collect newly deposited datasets and add them to the MongoDB instance

### Load data

In [1]:
import sys
# Specify the path to the scripts folder.
# The entry is relative, so it is resolved against the kernel's current
# working directory when the import below runs.
sys.path.append('../scripts/')
# Project-local helper; presumably located in ../scripts/mongodb.py — TODO confirm.
from mongodb import load_data

In [2]:
# Handle returned by the project helper; it is queried with find_one() further down.
# NOTE(review): presumably a MongoDB collection object — confirm in scripts/mongodb.
collection = load_data()

### Find latest date of deposit

In [12]:
# Find the document with the latest value of "ore:describes.dateOfDeposit".
# No datetime conversion is performed here: the query sorts on the stored
# value in descending order and find_one returns the first document.
# The value displayed for this field is a 'YYYY-MM-DD' string, for which
# string order equals chronological order.
# NOTE(review): confirm every document stores the date in that format.
# NOTE(review): find_one presumably yields None when no document matches —
# check before indexing into the result.
latest_entry = collection.find_one(
    {"ore:describes.dateOfDeposit": {"$exists": True}},  # only documents that have the field
    sort=[("ore:describes.dateOfDeposit", -1)]  # -1 = descending
)
print(latest_entry)




{'_id': ObjectId('67a9c000ec45b2e66fe68613'), 'dcterms:modified': '2025-01-13', 'dcterms:creator': 'DANS Data Station Archaeology', '@type': 'ore:ResourceMap', 'schema:additionalType': 'Dataverse OREMap Format v1.0.0', 'dvcore:generatedBy': {'@type': 'schema:SoftwareApplication', 'schema:name': 'Dataverse', 'schema:version': '6.3 build DANS-DataStation-PATCH-9', 'schema:url': 'https://github.com/iqss/dataverse'}, '@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/95SGMI', 'ore:describes': {'dansDataVaultMetadata:dansNbn': 'urn:nbn:nl:ui:13-7cba5b1e-2225-473f-9ac1-eca94c097797', 'dansDataVaultMetadata:dansBagId': 'urn:uuid:c79c3a84-b81d-413d-a497-ca1e11b1934f', 'dansDataVaultMetadata:dansDataversePid': 'doi:10.17026/AR/95SGMI', 'dansDataVaultMetadata:dansDataversePidVersion': '1.0', 'citation:productionDate': '2025-01-13', 'dansTemporalSpatial:dansSpatialCoverageControlled': 'Netherlands', 'dansTemporalSpatial:dansSp

In [13]:
# Show only the deposit date of the latest entry found above.
latest_entry["ore:describes"]["dateOfDeposit"]

'2025-01-13'

### Extract all metadata for datasets after latest_entry 
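Query `q_2025` selects all datasets deposited to the Archaeology Data Station with the Published status, filtered on `dateSort` from 2025-01-13T00:00:00Z (the latest deposit date found above) to 2025-03-27T00:00:00Z. The upper bound is the day before the date of writing (2025-03-28).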


In [73]:
# Search API request URL, written as adjacent string literals (one per query
# parameter) that Python joins into a single string.
q_2025 = (
    "https://archaeology.datastations.nl/api/search"
    "?q=*"
    "&per_page=1000"
    "&type=dataset"
    "&show_facets=true"
    "&fq=publicationDate:2025"
    "&fq=dateSort:[2025-01-13T00:00:00Z+TO+2025-03-27T00:00:00Z]"
    "&fq=publicationStatus:Published"
)

In [74]:
# Echo the query URL for inspection.
q_2025

'https://archaeology.datastations.nl/api/search?q=*&per_page=1000&type=dataset&show_facets=true&fq=publicationDate:2025&fq=dateSort:[2025-01-13T00:00:00Z+TO+2025-03-27T00:00:00Z]&fq=publicationStatus:Published'

In [None]:
## Automate the process of fetching all the data from the API
#
# Pages through the Search API `rows` results at a time and prints the name
# and type of every returned item, until `start` reaches the total_count
# reported by the server.
# NOTE(review): the query here is the unfiltered `q=*`; it does not apply the
# type/date/status filters of `q_2025` — confirm that this is intended.

# `import urllib` alone does not load the `request` submodule, so
# `urllib.request.urlopen` could fail on a fresh kernel; import it explicitly.
import urllib.request
import json
base = 'https://archaeology.datastations.nl'
rows = 10
start = 0
page = 1
condition = True # emulate do-while
while (condition):
    # Send per_page explicitly so the server's page size always matches the
    # `rows` step used to advance `start` below.
    url = base + '/api/search?q=*' + "&start=" + str(start) + "&per_page=" + str(rows)
    # Close the HTTP response as soon as the JSON body has been parsed.
    with urllib.request.urlopen(url) as response:
        data = json.load(response)
    total = data['data']['total_count']
    print("=== Page", page, "===")
    print("start:", start, " total:", total)
    for i in data['data']['items']:
        print("- ", i['name'], "(" + i['type'] + ")")
    start = start + rows
    page += 1
    # Continue while there are results beyond the ones already requested.
    condition = start < total