## Collect newly deposited datasets and add them to the MongoDB instance

### Load data

In [15]:
import sys
# Specify the path to the scripts folder
sys.path.append('../scripts/')
from mongodb import load_data

In [16]:
# Obtain the collection handle from the project's `mongodb` helper module.
# NOTE(review): presumably a pymongo Collection (it is queried below with
# `find_one(..., sort=...)`) — confirm in ../scripts/mongodb.py.
collection = load_data()

### Find latest date of deposit

In [17]:
# Fetch the single document with the greatest "ore:describes.dateOfDeposit":
# keep only documents that have the field, sort descending, take the first hit.
# NOTE(review): the dates appear to be stored as YYYY-MM-DD strings, for which
# a descending string sort is also chronological — confirm the whole
# collection uses that format.
deposit_date_field = "ore:describes.dateOfDeposit"
has_deposit_date = {deposit_date_field: {"$exists": True}}
newest_first = [(deposit_date_field, -1)]

latest_entry = collection.find_one(has_deposit_date, sort=newest_first)
print(latest_entry)

{'_id': ObjectId('67ee8b3a3b1fa0355f16a5c4'), 'dcterms:modified': '2025-04-02', 'dcterms:creator': 'DANS Data Station Archaeology', '@type': 'ore:ResourceMap', 'schema:additionalType': 'Dataverse OREMap Format v1.0.0', 'dvcore:generatedBy': {'@type': 'schema:SoftwareApplication', 'schema:name': 'Dataverse', 'schema:version': '6.3 build DANS-DataStation-PATCH-10', 'schema:url': 'https://github.com/iqss/dataverse'}, '@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/8PRDNI', 'ore:describes': {'author': {'citation:authorName': 'Walcherse Archeologische Dienst'}, 'citation:distributor': {'citation:distributorName': 'Walcherse Archeologische Dienst'}, 'citation:otherId': [{'citation:otherIdValue': '47582271000 (vondstmeldingsnummer)'}, {'citation:otherIdValue': '0 (waarnemingsnummer)'}, {'citation:otherIdValue': 'VLIS_019_004 (projectcode)'}, {'citation:otherIdAgency': 'ARCHEO', 'citation:otherIdValue': '73523551_b0445a9

In [18]:
# Show only the deposit date of the newest entry found above.
latest_entry["ore:describes"]["dateOfDeposit"]

'2025-04-02'

### Extract all metadata for datasets after latest_entry 
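Query `q_2025` selects all datasets with the Published status that were deposited to the Archaeology Data Station between 2025-03-28 and 2025-04-03 (the date of writing). Both bounds are taken at 00:00:00 UTC, so the range ends at the very start of 2025-04-03.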


In [10]:
# Bounds of the date window, as YYYY-MM-DD strings; both are interpolated
# into the `dateSort` range filter of the search query `q_2025`.
# NOTE(review): hard-coded — min_date is earlier than the latest dateOfDeposit
# reported by MongoDB (2025-04-02); confirm the overlap is intentional.
min_date = "2025-03-28"
max_date = "2025-04-03"

In [11]:
# Dataverse search URL, assembled from adjacent literals so that each filter
# sits on its own line; the resulting string is identical to the one-line form.
q_2025 = (
    "https://archaeology.datastations.nl/api/search"
    "?q=*&per_page=1000&type=dataset&show_facets=true"
    "&fq=publicationDate:2025"
    f"&fq=dateSort:[{min_date}T00:00:00Z+TO+{max_date}T00:00:00Z]"
    "&fq=publicationStatus:Published"
)

In [12]:
# Display the final URL to check the interpolated date range.
q_2025

'https://archaeology.datastations.nl/api/search?q=*&per_page=1000&type=dataset&show_facets=true&fq=publicationDate:2025&fq=dateSort:[2025-03-28T00:00:00Z+TO+2025-04-03T00:00:00Z]&fq=publicationStatus:Published'

In [13]:
## Automate the process of fetching all the data from the API
# Page through the search results and collect the `url` field of every item.
#
# `import urllib` alone does not load the `request` submodule, so it is
# imported explicitly here; otherwise `urllib.request.urlopen` can fail with
# AttributeError on a fresh kernel.
import json
import urllib.request

dois = []

base = q_2025
rows = 1000  # page size; keep equal to the per_page value in the query URL
start = 0
page = 1
while True:  # do-while: always request at least one page
    url = base + "&start=" + str(start)
    # Close the HTTP response as soon as its body has been parsed.
    with urllib.request.urlopen(url) as response:
        data = json.load(response)
    total = data['data']['total_count']
    print("=== Page", page, "===")
    print("start:", start, " total:", total)
    dois.extend(item['url'] for item in data['data']['items'])
    start = start + rows
    page += 1
    if start >= total:
        # The next offset is past the last result, so every page has been read.
        break

=== Page 1 ===
start: 0  total: 138


In [14]:
# Persist the collected URLs to a text file, one per line.
with open('../../data/dois_2025-04-03.txt', 'w') as out_file:
    out_file.writelines("{}\n".format(doi) for doi in dois)