# Sprint 1: Identify missing coordinate schemes 


## Read all data 
Make sure the collected data is up-to-date. Run [`find_new_deposits.ipynb`](../collect_data/find_new_deposits.ipynb) and [`collect_metadata.ipynb`](../collect_data/collect_metadata.ipynb) first to download metadata until current date. 

In [51]:
import sys
# Specify the path to the scripts folder
sys.path.append('../scripts/')
from mongodb import load_data

In [52]:
collection = load_data()


In [27]:
#scheme = "longitude/latitude (degrees)"
scheme = "RD (in m.)"

sample = collection.find({"ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme":f"{scheme}"}, {"_id": 0, "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX": 1, "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY": 1 ,"ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme": 1}).limit(5)

In [28]:
for l in sample: 
    print(l)
    print()


{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '108240', 'dansTemporalSpatial:dansSpatialPointY': '447370', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '77230', 'dansTemporalSpatial:dansSpatialPointY': '490214', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '203505', 'dansTemporalSpatial:dansSpatialPointY': '324100', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '95873', 'dansTemporalSpatial:dansSpatialPointY': '421799', 'dansTemporalSpatial:dansSpatialPointScheme': 'RD (in m.)'}}}

{'ore:describes': {'dansTemporalSpatial:dansSpatialPoint': [{'dansTemporalSpatial:dansSpatialPointX': 

## PART 1: IDENTIFY

## Identify instances where the Scheme is missing but not the points. 

In [63]:
# # For Points: 
# fields_present = ["ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX", "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY"]
# fields_absent = ["ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxScheme"]

# For Box: 
fields_present = ["ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxNorth", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxSouth", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxEast", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxWest"]
fields_absent = ["ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxScheme", "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme"]

In [64]:
# Select the documents that contain a value for at least one of the fields in fields_present (points + boxes), and do not contain a value specifying the coordinate scheme 
query = {
    "$and":[
        {"$or": [{x: {"$exists": True} for x in fields_present}]},
        {"$and": [{x: {"$exists": False} for x in fields_absent}]}
    ], 
}

# Select the fields to be included in the output
projection = {
        '@id': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxNorth': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxSouth': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxEast': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxWest': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxScheme': 1,
        'ore:describes.author.citation:authorName': 1,
        '_id': 0
    }



In [65]:
results = collection.find(query, projection)
data = []
for result in results:
    print(result)
    data.append(dict(result))

{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/JIHOJG', 'ore:describes': {'author': [{'citation:authorName': 'Kruithof, L.'}, {'citation:authorName': 'Hagens, D.'}], 'dansTemporalSpatial:dansSpatialPoint': {'dansTemporalSpatial:dansSpatialPointX': '148345', 'dansTemporalSpatial:dansSpatialPointY': '402715'}, 'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '402735', 'dansTemporalSpatial:dansSpatialBoxEast': '148400', 'dansTemporalSpatial:dansSpatialBoxSouth': '402686', 'dansTemporalSpatial:dansSpatialBoxWest': '148311'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/SWIA0R', 'ore:describes': {'author': {'citation:authorName': 'Bongers, J.'}, 'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '601581', 'dansTemporalSpatial:dansSpatialBoxEast': '225187', 'dansTemporalSpatia

In [50]:
# import json
# with open("../../data/sprint1/sp1_no_box_scheme.json", "w") as outfile: 
#     json.dump(data, outfile, indent=4)

In [66]:
# Read json as dataframe 
import pandas as pd
df = pd.DataFrame(data)
df.to_csv("../../data/sprint1/sp1_no_box_scheme.csv", index=False)

In [49]:
print(len(data))

3


In [91]:
# Print the frequency of the authors
from collections import Counter

authors = []

for result in data:
    try: 
        author = result['ore:describes']['author']['citation:authorName']
        authors.append(author)
    except TypeError: 
        continue


# Count the frequency of each author
author_counts = Counter(authors)
    

sorted_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)
print("authors:")
for author, count in sorted_authors:
    print(f"{author}: {count}")

authors:
Boter, Reinoud: 158
Kroes, R.A.C.: 6
D. Schaars: 3
Schorn, E. A.: 2
Leuvering, J.H.F.: 2
Bulambo, F.M.: 2
Jong, C. de: 2
B.N.J. Verschuren: 2
G. Korenberg, F. Haans: 2
G. Korenberg: 2
F. Haans, D. Schaars: 2
L. Kleij: 2
Schiltmans, D.E.A.: 2
Schamp, C.R.C.: 1
Yoshua Csonka: 1
Exaltus, R.P., & Tulp, C.: 1
A.G.J. Hullegie: 1
Rijk, de, T.E.: 1
Modderkolk, M.W.J.: 1
Verhoeven, M. Dr.: 1
Hullegie, A.G.J.: 1
Geraeds, J.J.G.: 1
Boer, de, A. G.: 1
C. Brühl: 1
Kroes, R.A.C: 1
I. Korver: 1
Biggelaar, van den, D.F.A.M.: 1
C. van der Esch: 1
Jans, J.E.A.: 1
Iris de Fuijk: 1
J.J. Hekman: 1
A.T.L.E van Bussel: 1
C.R.C. Schamp: 1
K.T. Salomons,: 1
S.M. Koeman: 1
T. Nales: 1
J. van der Kroon: 1
Steekproef, De: 1
Westra, F.P.: 1
Vosselman, J.: 1
Vaessen, Dr. R.A.: 1
Wijnen. J.: 1
Groot, R. de: 1
Zielman, G.: 1
Ouwerkerk, L.P.: 1
J.J.G. Geraeds: 1
N.Vlieks/J.J.G. Geraeds: 1
Bakker, S.: 1
J. de Munnik: 1
J.P.L. Vaars: 1
de Ridder, J.A.A.: 1
van Boldrik, J.M.L.: 1
E. A. Schorn: 1
Moerman, S.: 1
D

## Identify cases where Points/Boxes are incomplete 

In [56]:
# Points 
points = ["ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX", "ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY"]

# Box
box = ["ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxNorth", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxSouth", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxEast", "ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxWest"]



In [57]:
# Note on the following query: 
# The * operator is used to unpack the list of dictionaries created by the list comprehension. 
# This is required because the $or operator expects a list of conditions, not a list of lists.

query = {
    "$and": [
        # At least one field exists in 'points' OR 'box'
        {
            "$or": [
                *[{field: {"$exists": True}} for field in points],  # Any field in 'points'
                *[{field: {"$exists": True}} for field in box]      # Any field in 'box'
            ]
        },
        # Exclude documents with ALL fields in 'points'
        {"$nor": [{field: {"$exists": True} for field in points}]},
        # Exclude documents with ALL fields in 'box'
        {"$nor": [{field: {"$exists": True} for field in box}]}
    ]
}


projection = {
        '@id': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointX': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointY': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialPoint.dansTemporalSpatial:dansSpatialPointScheme': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxNorth': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxSouth': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxEast': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxWest': 1,
        'ore:describes.dansTemporalSpatial:dansSpatialBox.dansTemporalSpatial:dansSpatialBoxScheme': 1,
        'ore:describes.author.citation:authorName': 1,
        '_id': 0
    }

In [58]:
results = collection.find(query, projection)
data = []
for result in results:
    print(result)
    data.append(dict(result))

{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/HD2TLR', 'ore:describes': {'author': [{'citation:authorName': 'Fens, RL'}, {'citation:authorName': 'Nater, CI'}], 'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '201463', 'dansTemporalSpatial:dansSpatialBoxEast': '357111', 'dansTemporalSpatial:dansSpatialBoxScheme': 'RD (in m.)'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/FRDQG8', 'ore:describes': {'author': {'citation:authorName': 'P. Seinen'}, 'dansTemporalSpatial:dansSpatialBox': {'dansTemporalSpatial:dansSpatialBoxNorth': '121924', 'dansTemporalSpatial:dansSpatialBoxEast': '478749', 'dansTemporalSpatial:dansSpatialBoxScheme': 'RD (in m.)'}}}
{'@id': 'https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.17026/AR/JRFSCS', 'ore:describes': {'author

In [51]:
# import json
# with open("../../data/sprint1/sp1_incomplete.json", "w") as outfile: 
#     json.dump(data, outfile, indent=4)

In [92]:
# mioforj
# proof of concept  

## PART 2: INFER

Infer the coordinate schemes for the instances where it is missing. 