# Explore metadata fields in the full archaeology dataset

### Load data
Here we connect to the MongoDB database in which the JSON documents are stored. They contain the full metadata of all datasets in the Archaeology Data Station.

In [32]:
from pymongo import MongoClient

# Replace with your MongoDB connection string
CONNECTION_STRING = "mongodb://127.0.0.1:27018"  # For local MongoDB

# Connect to the server, then select the database and collection to explore.
# The rest of the notebook only reads from this collection, so it is expected
# to be populated with the dataset metadata documents already.
client = MongoClient(CONNECTION_STRING)
db = client["archaeology_metadata"] # Database that holds the metadata
collection = db["collection"] # Collection with one document per dataset

### Inspect data
Let's get an idea of what's in the metadata. First we count the total number of datasets, then we see how many datasets have values for a selection of fields that may include geospatial data.

In [53]:
# An empty filter matches every document, so this is the size of the whole collection
count = collection.count_documents({})
print(f"Total documents in collection: {count}")

Total documents in collection: 158254


In [56]:
# This function counts the number of documents that have the specified field in the metadata

def get_count(path, target_collection=None):
    """
    Print how many documents have a non-null value for a metadata field.

    The result is printed as ``<last path segment>: <count>``; nothing is
    returned.

    :param path: Dot-separated path to the field in the document,
        e.g. "ore:describes.citation:distributor"
    :param target_collection: Object exposing ``count_documents(filter)``.
        When omitted, the notebook-level ``collection`` is used.
    """
    if target_collection is None:
        target_collection = collection

    # The field must be present and must not be null
    field_filter = {path: {"$exists": True, "$ne": None}}

    # count_documents yields 0 when nothing matches, so no empty-result
    # handling is needed (unlike a $match/$count aggregation)
    count = target_collection.count_documents(field_filter)

    print(f"{path.split('.')[-1]}: {count}")

In [55]:
# Report, for each metadata field of interest, how many documents have a value for it
for field_path in (
    "ore:describes.dansTemporalSpatial:dansSpatialCoverageText",
    "ore:describes.dansTemporalSpatial:dansSpatialPoint",
    "ore:describes.dansTemporalSpatial:dansSpatialBox",
    "ore:describes.dansTemporalSpatial:dansSpatialCoverageControlled",
    "ore:describes.citation:dsDescription",
    "ore:describes.ore:aggregates.schema:name",
    "ore:describes.dansRelationMetadata:dansCollection",
    "ore:describes.citation:distributor",
):
    get_count(field_path)



dansTemporalSpatial:dansSpatialCoverageText: 154396
dansTemporalSpatial:dansSpatialPoint: 56485
dansTemporalSpatial:dansSpatialBox: 4445
dansTemporalSpatial:dansSpatialCoverageControlled: 8274
citation:dsDescription: 158254
schema:name: 158253
dansRelationMetadata:dansCollection: 43907
citation:distributor: 50123


### Take a look at the datasets with missing geospatial data

In [51]:
# Count the number of documents where BOTH geospatial fields are missing,
# i.e. datasets that have neither a spatial point nor a spatial box.
# NOTE(review): only absent fields are matched here; a field that is present
# but null is not counted as missing — confirm this is intended.
query = {
    "$and": [
        {"ore:describes.dansTemporalSpatial:dansSpatialPoint": {"$exists": False}},
        {"ore:describes.dansTemporalSpatial:dansSpatialBox": {"$exists": False}}
    ]}

# Count the documents that match the query
missing_fields_count = collection.count_documents(query)

# Print the result
print(f" {missing_fields_count}")




 98412


In [43]:
# Count how many documents share each distinct value of the field
# "citation:distributor", most frequent value first. Documents with no
# distributor value end up in the group whose _id is None.
pipeline = [
    { "$group": { "_id": "$ore:describes.citation:distributor", "count": { "$sum": 1 } } },
    { "$sort": { "count": -1 } }
]

# The $sort stage already returns the groups in descending order of count,
# so no additional client-side sort is needed
result = list(collection.aggregate(pipeline))


In [44]:
# Show the per-distributor document counts, largest group first
result

[{'_id': None, 'count': 108131},
 {'_id': {'citation:distributorName': 'RAAP Archeologisch Adviesbureau'},
  'count': 5077},
 {'_id': {'citation:distributorName': 'ADC ArcheoProjecten'}, 'count': 3943},
 {'_id': {'citation:distributorName': 'BAAC bv'}, 'count': 2530},
 {'_id': {'citation:distributorName': 'De Steekproef'}, 'count': 1640},
 {'_id': {'citation:distributorName': 'SOB Research'}, 'count': 1475},
 {'_id': {'citation:distributorName': 'RAAP Archeologisch Adviesbureau BV'},
  'count': 1246},
 {'_id': {'citation:distributorName': 'Econsultancy'}, 'count': 1155},
 {'_id': {'citation:distributorName': 'IDDS Archeologie'}, 'count': 1059},
 {'_id': {'citation:distributorName': 'Laagland Archeologie'}, 'count': 965},
 {'_id': {'citation:distributorName': 'Transect'}, 'count': 938},
 {'_id': {'citation:distributorName': 'RAAP Archeologisch Adviesbureau B.V.'},
  'count': 920},
 {'_id': {'citation:distributorName': 'ARC bv'}, 'count': 912},
 {'_id': {'citation:distributorName': 'RAAP