# Sprint 1: Identify missing coordinate schemes 


## Read all data 
Make sure the collected data is up to date. Run [`find_new_deposits.ipynb`](../collect_data/find_new_deposits.ipynb) and [`collect_metadata.ipynb`](../collect_data/collect_metadata.ipynb) first to download metadata up to the current date.

In [114]:
import json
import sys
from pathlib import Path

# Specify the path to the scripts folder
sys.path.append('../scripts/')
from mongodb import load_data

In [115]:
# Handle returned by load_data() (imported from the `mongodb` module in the
# scripts folder); the cells below query it with .find(query, projection).
# Presumably a pymongo collection of dataset metadata — TODO confirm.
collection = load_data()


## Identify instances where the scheme is missing but the coordinates (point or box) are present

In [97]:
# Coordinate fields that must exist and the scheme field that must be missing.
# Only one geometry is active at a time; the Point variant is kept below,
# commented out, as the alternative input for this section.

# Point variant:
# _point_prefix = "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPoint"
# fields_present = [_point_prefix + axis for axis in ("X", "Y")]
# fields_absent = [_point_prefix + "Scheme"]

# Box variant (active):
_box_prefix = "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBox"
fields_present = [_box_prefix + side for side in ("North", "South", "East", "West")]
fields_absent = [_box_prefix + "Scheme"]

In [98]:
# Match documents in which every field of `fields_present` exists and every
# field of `fields_absent` is missing. A flat filter document is an implicit
# AND of its keys, so no "$and"/"$or" wrapper is needed (a dict comprehension
# wrapped in "$or" would only yield a single-condition "$or", i.e. the same
# all-of semantics written in a misleading way).
# NOTE(review): documents holding only some of the coordinate fields are not
# matched here; confirm that "all coordinates present" is the intended rule.
query = {
    **{field: {"$exists": True} for field in fields_present},
    **{field: {"$exists": False} for field in fields_absent},
}

# Return only the dataset identifier and the spatial point/box fields;
# the `_id` field is excluded from the returned documents.
projection = {
    '@id': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxNorth': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxSouth': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxEast': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxWest': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxScheme': 1,
    '_id': 0,
}



In [99]:
# Run the query and materialise every matching document as a plain dict.
results = collection.find(query, projection)
data = [dict(result) for result in results]

# Show a count and a short preview rather than printing every document;
# the complete result set remains available in `data`.
preview_size = 3
print(f"{len(data)} matching documents; showing the first {min(preview_size, len(data))}:")
for doc in data[:preview_size]:
    print(doc)

{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/JIHOJG', 'ore:describes': {'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '402735', 'dansTemporalSpatial:dansSpatialBoxEast': '148400', 'dansTemporalSpatial:dansSpatialBoxSouth': '402686', 'dansTemporalSpatial:dansSpatialBoxWest': '148311'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/GFWYG9', 'ore:describes': {'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '425246', 'dansTemporalSpatial:dansSpatialBoxEast': '171940', 'dansTemporalSpatial:dansSpatialBoxSouth': '425194', 'dansTemporalSpatial:dansSpatialBoxWest': '171908'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/WCWGWE', 'ore:describes': {'dansTemporalSpatial:dansSpatialBox': {'dansTemp

In [100]:
# Save the documents that have box coordinates but no scheme.
# The parent directory is created first so that the write does not fail on a
# fresh checkout where the output folder does not exist yet.
out_path = Path("../../data/sprint1/sp1_no_box_scheme.json")
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8") as outfile:
    json.dump(data, outfile, indent=4)

## Identify cases where Points/Boxes are incomplete 

In [116]:
# Points 
points = ["ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX", "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY"]

# Box
box = ["ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxNorth", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxSouth", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxEast", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxWest"]



In [125]:
# A geometry counts as "incomplete" when at least one of its coordinate fields
# exists while at least one other is missing. Each geometry is tested on its
# own and the two tests are OR-ed, so a dataset with, for example, a complete
# point but a partial box is still reported. (Requiring "not all point fields"
# AND "not all box fields" at the top level would drop such documents.)
#
# List comprehensions are used so that each "$or" receives one condition per
# field rather than a single dict holding all fields.
# NOTE(review): if a dataset stores several points/boxes as an array, these
# dotted-path checks apply across all elements rather than per element —
# confirm against the data.
def _incomplete(fields):
    """Return a filter matching documents that hold some, but not all, of `fields`."""
    return {
        "$and": [
            # At least one of the fields is present ...
            {"$or": [{field: {"$exists": True}} for field in fields]},
            # ... and at least one of them is missing.
            {"$or": [{field: {"$exists": False}} for field in fields]},
        ]
    }


query = {"$or": [_incomplete(points), _incomplete(box)]}


# Return only the dataset identifier and the spatial point/box fields;
# the `_id` field is excluded from the returned documents.
projection = {
    '@id': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxNorth': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxSouth': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxEast': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxWest': 1,
    'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxScheme': 1,
    '_id': 0,
}

In [126]:
# Run the query and materialise every matching document as a plain dict.
results = collection.find(query, projection)
data = [dict(result) for result in results]

# Show a count and a short preview rather than printing every document;
# the complete result set remains available in `data`.
preview_size = 3
print(f"{len(data)} matching documents; showing the first {min(preview_size, len(data))}:")
for doc in data[:preview_size]:
    print(doc)

{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/HD2TLR', 'ore:describes': {'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '201463', 'dansTemporalSpatial:dansSpatialBoxEast': '357111', 'dansTemporalSpatial:dansSpatialBoxScheme': 'RD (in m.)'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/FRDQG8', 'ore:describes': {'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '121924', 'dansTemporalSpatial:dansSpatialBoxEast': '478749', 'dansTemporalSpatial:dansSpatialBoxScheme': 'RD (in m.)'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/JRFSCS', 'ore:describes': {'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '185031', 'dansTemporalSpatial:dansSpatialBoxEast': '422730'

In [120]:
# Save the documents whose point/box coordinates are incomplete.
# The parent directory is created first so that the write does not fail on a
# fresh checkout where the output folder does not exist yet.
out_path = Path("../../data/sprint1/sp1_incomplete.json")
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("w", encoding="utf-8") as outfile:
    json.dump(data, outfile, indent=4)