# Sprint 1: Identify missing coordinate schemes 


## Read all data 
Make sure the collected data is up to date: run [`find_new_deposits.ipynb`](../collect_data/find_new_deposits.ipynb) and [`collect_metadata.ipynb`](../collect_data/collect_metadata.ipynb) first to download the metadata up to the current date. 

In [1]:
import sys
# Specify the path to the scripts folder
sys.path.append('../scripts/')
from mongodb import load_data

In [2]:
# Obtain the collection handle from the project's `mongodb` helper module.
# NOTE(review): later cells call `.find(...)` on it, so it is presumably a
# pymongo collection — confirm in ../scripts/mongodb.py.
collection = load_data()


#### Print a sample 

In [27]:
# Fetch a small sample of spatial points that use one particular coordinate scheme.
# Choose the scheme to inspect by (un)commenting one of the assignments below.

# scheme = "longitude/latitude (degrees)"
scheme = "RD (in m.)"

# Filter on the point scheme and return only the X, Y and scheme fields
# of at most five matching documents.
sample = collection.find(
    {
        "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme": scheme,
    },
    {
        "_id": 0,
        "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX": 1,
        "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY": 1,
        "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme": 1,
    },
).limit(5)

In [28]:
# Print every sampled document, each followed by one blank line.
for document in sample:
    print(document, end="\n\n")


{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '108240', 'dansTemporalSpatial:dansSpatialPointY': '447370', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '77230', 'dansTemporalSpatial:dansSpatialPointY': '490214', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '203505', 'dansTemporalSpatial:dansSpatialPointY': '324100', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '95873', 'dansTemporalSpatial:dansSpatialPointY': '421799', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': [{'dansTemporalSpatial:dansSpatialPointX': 

## PART 1: IDENTIFY

## Identify instances where the Scheme is missing but not the points. 

In [25]:
# Field selection for POINTS: the coordinate fields that must be present ...
fields_present = [
    f"ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPoint{axis}"
    for axis in ("X", "Y")
]
# ... and the scheme fields (point scheme first, then box scheme) that must be absent.
fields_absent = [
    f"ore:describes.dansTemporalSpatial:dansSpatial{shape}.dansTemporalSpatial:dansSpatial{shape}Scheme"
    for shape in ("Point", "Box")
]

# Field selection for BOXES: uncomment the two assignments below to analyse boxes instead.
# fields_present = [
#     f"ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBox{side}"
#     for side in ("North", "South", "East", "West")
# ]
# fields_absent = [
#     f"ore:describes.dansTemporalSpatial:dansSpatial{shape}.dansTemporalSpatial:dansSpatial{shape}Scheme"
#     for shape in ("Box", "Point")
# ]

In [26]:
# Select the documents in which EVERY field of `fields_present` exists and NONE of the
# scheme fields in `fields_absent` exists.
#
# Each condition is a separate element of the `$and` list. A dict comprehension placed
# inside a one-element `$or` list merges all conditions into a single sub-document, which
# MongoDB evaluates as an implicit AND; such an `$or` wrapper never means "any of", so the
# all-of requirement is spelled out here instead.
query = {
    "$and": [
        *[{field: {"$exists": True}} for field in fields_present],
        *[{field: {"$exists": False}} for field in fields_absent],
    ],
}

# Fields returned for every hit: the record id, all point and box coordinate/scheme
# fields, and the author name. MongoDB's own `_id` is suppressed.
projection = {
    '@id': 1,
    **{
        f"ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPoint{suffix}": 1
        for suffix in ("X", "Y", "Scheme")
    },
    **{
        f"ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBox{suffix}": 1
        for suffix in ("North", "South", "East", "West", "Scheme")
    },
    'ore:describes.author.citation:authorName': 1,
    '_id': 0,
}



In [27]:
# Run the query, echo every hit and keep a plain-dict copy of each one in `data`.
results = collection.find(query, projection)
data = []
for document in results:
    print(document)
    data.append(dict(document))

{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/FAJGJ9', 'ore:describes': {'author': {'citation:authorName': 'Schamp, C.R.C.'}, 'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '216.335', 'dansTemporalSpatial:dansSpatialPointY': '597.730'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/79FSNP', 'ore:describes': {'author': {'citation:authorName': 'Yoshua Csonka'}, 'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '245213.33', 'dansTemporalSpatial:dansSpatialPointY': '594708.97'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/5TMN6J', 'ore:describes': {'author': {'citation:authorName': 'Exaltus, R.P., & Tulp, C.'}, 'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '

In [50]:
# import json
# with open("../../data/sprint1/sp1_no_box_scheme.json", "w") as outfile: 
#     json.dump(data, outfile, indent=4)

In [6]:
# Print the ten most frequent authors among the selected records.
from collections import Counter

authors = []

for result in data:
    describes = result.get('ore:describes')
    if not isinstance(describes, dict):
        continue  # record carries no description block

    # `author` is a single dict for one author and a list of dicts for several authors.
    # Normalise to a list so that co-authored records are counted instead of skipped.
    author_field = describes.get('author')
    if isinstance(author_field, dict):
        author_field = [author_field]
    elif not isinstance(author_field, list):
        continue  # no author information in this record

    for entry in author_field:
        if isinstance(entry, dict) and 'citation:authorName' in entry:
            authors.append(entry['citation:authorName'])


# Count the frequency of each author
author_counts = Counter(authors)

# most_common() returns (author, count) pairs ordered from most to least frequent.
sorted_authors = author_counts.most_common()
print("authors:")
for author, count in sorted_authors[:10]:
    print(f"{author}: {count}")

authors:
Boter, Reinoud: 158
Kroes, R.A.C.: 6
D. Schaars: 3
Schorn, E. A.: 2
Leuvering, J.H.F.: 2
Bulambo, F.M.: 2
Jong, C. de: 2
B.N.J. Verschuren: 2
G. Korenberg, F. Haans: 2
G. Korenberg: 2


## Identify cases where Points/Boxes are incomplete 

In [56]:
# Coordinate fields that together make up a complete point (X and Y).
points = [
    f"ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPoint{axis}"
    for axis in ("X", "Y")
]

# Coordinate fields that together make up a complete box (all four sides).
box = [
    f"ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBox{side}"
    for side in ("North", "South", "East", "West")
]



In [57]:
# Note on the following query:
# A field group (the point fields, or the box fields) is INCOMPLETE when at least one of
# its fields exists and at least one of its fields is missing. A document is selected when
# the points group OR the box group is incomplete.
#
# The two groups are tested independently. Requiring "not all point fields AND not all box
# fields" at the top level would drop a document whose point is complete but whose box is
# only partially filled in (and vice versa).
#
# Every condition is its own element of the `$or` lists: a list comprehension yields one
# dict per field, which is what `$or` expects.
query = {
    "$or": [
        {
            "$and": [
                # At least one field of the group is present ...
                {"$or": [{field: {"$exists": True}} for field in group]},
                # ... and at least one field of the same group is missing.
                {"$or": [{field: {"$exists": False}} for field in group]},
            ]
        }
        for group in (points, box)
    ]
}


# Fields returned for every hit: the record id, all point and box coordinate/scheme
# fields, and the author name. MongoDB's own `_id` is suppressed.
projection = {
    '@id': 1,
    **{
        f"ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPoint{suffix}": 1
        for suffix in ("X", "Y", "Scheme")
    },
    **{
        f"ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBox{suffix}": 1
        for suffix in ("North", "South", "East", "West", "Scheme")
    },
    'ore:describes.author.citation:authorName': 1,
    '_id': 0,
}

In [58]:
# Run the query, echo every hit and keep a plain-dict copy of each one in `data`.
# NOTE(review): this rebinds `data`, the same name the inference loop in PART 2 iterates
# over — on a top-to-bottom run PART 2 therefore receives these records. Confirm that is
# the intended input.
results = collection.find(query, projection)
data = []
for document in results:
    print(document)
    data.append(dict(document))

{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/HD2TLR', 'ore:describes': {'author': [{'citation:authorName': 'Fens, RL'}, {'citation:authorName': 'Nater, CI'}], 'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '201463', 'dansTemporalSpatial:dansSpatialBoxEast': '357111', 'dansTemporalSpatial:dansSpatialBoxScheme': 'RD (in m.)'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/FRDQG8', 'ore:describes': {'author': {'citation:authorName': 'P. Seinen'}, 'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '121924', 'dansTemporalSpatial:dansSpatialBoxEast': '478749', 'dansTemporalSpatial:dansSpatialBoxScheme': 'RD (in m.)'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/JRFSCS', 'ore:describes': {'author

In [51]:
# import json
# with open("../../data/sprint1/sp1_incomplete.json", "w") as outfile: 
#     json.dump(data, outfile, indent=4)

In [92]:
# Proof of concept: try a coordinate-format regular expression on a single record

In [21]:
# One hand-copied record (point coordinates, no scheme) used as proof-of-concept input.
record = {
    "@id": "https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/79FSNP",
    "ore:describes": {
        "author": {"citation:authorName": "Yoshua Csonka"},
        "dansTemporalSpatial:dansSpatialPoint": {
            "dansTemporalSpatial:dansSpatialPointX": "245213.33",
            "dansTemporalSpatial:dansSpatialPointY": "594708.97",
        },
    },
}

In [22]:
# Display the proof-of-concept record defined in the previous cell.
record

{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/79FSNP',
 'ore:describes': {'author': {'citation:authorName': 'Yoshua Csonka'},
  'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '245213.33',
   'dansTemporalSpatial:dansSpatialPointY': '594708.97'}}}

In [23]:
# Imported here so that this cell also runs on a freshly started kernel.
import re

# Proof of concept: does the X coordinate consist of digits followed by exactly two
# decimals (e.g. '245213.33')? `re.match` anchors at the start of the string, `$` at the end.
# NOTE(review): the name `format` shadows Python's built-in `format()` for the rest of the session.
format = re.match(r'(\d+\.\d{2}$)', record['ore:describes']['dansTemporalSpatial:dansSpatialPoint']['dansTemporalSpatial:dansSpatialPointX'])

In [24]:
# Display the match object created in the previous cell. That cell binds the result to
# `format`; `formatY` is not defined at this point on a top-to-bottom run.
format

<re.Match object; span=(0, 9), match='245213.33'>

## PART 2: INFER

Infer the coordinate scheme for the records in which it is missing. 

In [42]:
import re

# Scheme labels written into the records.
_RD_SCHEME = "RD (in m.)"
_LONLAT_SCHEME = "longitude/latitude (degrees)"

# Keys used inside a record's 'ore:describes' block.
_POINT_KEY = 'dansTemporalSpatial:dansSpatialPoint'
_X_KEY = 'dansTemporalSpatial:dansSpatialPointX'
_Y_KEY = 'dansTemporalSpatial:dansSpatialPointY'
_INFERRED_KEY = 'dansTemporalSpatial:inferredScheme'


def _infer_point_scheme(x, y):
    """Guess the coordinate scheme of a point from the textual shape of its coordinates.

    Args:
        x: the point's X coordinate as a string.
        y: the point's Y coordinate as a string.

    Returns:
        A ``(scheme, problem)`` tuple. ``scheme`` is the inferred scheme label, or ``None``
        when no pattern matches. ``problem`` is a short description of a detected data
        problem, or ``None``.
    """
    # 'XXX.XXX' (exactly three decimals); only X is inspected for this pattern.
    if re.match(r'^\d+\.\d{3}$', x):
        return _RD_SCHEME, None

    # General RD format: plain integers, a 6-digit Y with a 5- or 6-digit X.
    if re.match(r'^\d{6}$', y) and re.match(r'^\d{5,6}$', x):
        return _RD_SCHEME, None

    # One integer digit in X and two in Y, five decimals each: treated as degrees.
    # NOTE(review): the notebook flags this shape as "reversed X and Y" — confirm the
    # axis order expected for this scheme.
    if re.match(r'^\d{1}\.\d{5}$', x) and re.match(r'^\d{2}\.\d{5}$', y):
        return _LONLAT_SCHEME, "reversed X and Y"

    # 'XXXXXX.XX' (exactly two decimals) in both coordinates.
    if re.match(r'(\d+\.\d{2}$)', x) and re.match(r'(\d+\.\d{2}$)', y):
        return _RD_SCHEME, None

    return None, None


# Annotate every record in `data` in place with an inferred scheme where one can be derived.
# NOTE(review): `data` is whatever the most recently executed query cell produced — confirm
# it holds the missing-scheme records before running this cell.
for record in data:
    describes = record.get('ore:describes')
    point = describes.get(_POINT_KEY) if isinstance(describes, dict) else None

    # Skip records without a single point (no point at all, or a list of points).
    if not isinstance(point, dict):
        continue

    point_x = point.get(_X_KEY)
    point_y = point.get(_Y_KEY)
    # Skip points whose coordinates are missing or not text; the patterns need strings.
    if not (isinstance(point_x, str) and isinstance(point_y, str)):
        continue

    scheme, problem = _infer_point_scheme(point_x, point_y)
    if scheme is not None:
        # Add the inferred scheme and a marker showing that the record was changed.
        point[_INFERRED_KEY] = scheme
        record['changed'] = True
        if problem is not None:
            # Add a marker describing the detected problem.
            record['incorrect'] = [True, problem]

    # Records by "Boter, Reinoud" always get "longitude/latitude (degrees)", overriding any
    # scheme inferred from the coordinate shape. Only a single-author dict is compared;
    # records with a list of authors or without an author are left as they are.
    author = describes.get('author')
    if isinstance(author, dict) and author.get('citation:authorName') == "Boter, Reinoud":
        point[_INFERRED_KEY] = _LONLAT_SCHEME
        record['changed'] = True



In [43]:
# Display the records after the inference step (this shows the complete list).
data

[{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/FAJGJ9',
  'ore:describes': {'author': {'citation:authorName': 'Schamp, C.R.C.'},
   'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '216.335',
    'dansTemporalSpatial:dansSpatialPointY': '597.730',
    'dansTemporalSpatial:inferredScheme': 'RD (in m.)'}},
  'changed': True},
 {'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/79FSNP',
  'ore:describes': {'author': {'citation:authorName': 'Yoshua Csonka'},
   'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '245213.33',
    'dansTemporalSpatial:dansSpatialPointY': '594708.97',
    'dansTemporalSpatial:inferredScheme': 'RD (in m.)'}},
  'changed': True},
 {'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/5

In [44]:
import json

# Write the annotated records to disk as JSON indented by four spaces.
with open("../../data/sprint1/sp1_point_inferred.json", "w") as handle:
    handle.write(json.dumps(data, indent=4))