# Explore the datasets that are not from PAN (Portable Antiquities of the Netherlands)
- Find out the distribution/population of the geospatial metadata fields 
- Can we directly use GeoNames to get the entities? 
- Intermediate step: add administrative levels

In [77]:
import sys
# Specify the path to the scripts folder
sys.path.append('../scripts/')
from mongodb import load_data

In [78]:
# Obtain the collection handle from the project's local `mongodb` helper module.
# The cells below use it as a PyMongo collection (they call `.find` on it).
collection = load_data()

In [79]:
# Select every record that has neither a spatial point nor a spatial box
# and whose author is not PAN. Each condition is named separately so the
# combined filter reads as a sentence.
no_spatial_point = {
    "ore:describes.dansTemporalSpatial:dansSpatialPoint": {"$exists": False}
}
no_spatial_box = {
    "ore:describes.dansTemporalSpatial:dansSpatialBox": {"$exists": False}
}
# The author name must differ from the PAN author string.
author_is_not_pan = {
    "ore:describes.author.citation:authorName": {
        "$ne": "Portable Antiquities of the Netherlands"
    }
}

query = {"$and": [no_spatial_point, no_spatial_box, author_is_not_pan]}

In [80]:
# Run the filter against the collection; `find` hands back a cursor over the
# matching documents (see the `type(nopan)` output further down).
nopan = collection.find(query)

In [81]:
# Count the matching records on the collection rather than on the cursor:
# Cursor.count() is deprecated since PyMongo 3.7 and removed in PyMongo 4.0,
# whereas Collection.count_documents() is the supported replacement.
collection.count_documents(query)

  nopan.count()


5878

In [82]:
# Quick check of what kind of object `find` returned.
type(nopan)

pymongo.cursor.Cursor

In [83]:
# Collect, for every matching record, its `_id` and its free-text spatial
# coverage. The two lists stay index-aligned; a record without the coverage
# field (or without "ore:describes" at all) contributes None.

# A cursor can be consumed only once. Rewinding makes this cell safe to
# re-run: without it a second execution would silently produce empty lists.
nopan.rewind()

ids = []
texts = []

for doc in nopan:
    # No variable named `id` here, so the builtin is not shadowed for the
    # rest of the kernel session.
    record_id = doc["_id"]
    coverage_text = doc.get("ore:describes", {}).get(
        "dansTemporalSpatial:dansSpatialCoverageText"
    )
    ids.append(record_id)
    texts.append(coverage_text)


In [85]:
import pandas as pd

# Pair each record id with its spatial coverage text and tabulate the pairs,
# one row per record.
column_names = ['id', 'dansSpatialCoverageText']
id_text_pairs = list(zip(ids, texts))
df = pd.DataFrame(id_text_pairs, columns=column_names)

In [86]:
# Display the assembled table; pandas abbreviates the middle rows of a long frame.
df

Unnamed: 0,id,dansSpatialCoverageText
0,67a63bebb0aa0e074bbe3569,"[Zuid-Holland, Nederland, Rotterdam, Noordsing..."
1,67a63bebb0aa0e074bbe357c,"[Nederland, Noorddijk, Zuid-Holland, Bernisse,..."
2,67a63bf1b0aa0e074bbe35a0,"[Rotterdam, Zuid-Holland, Lage Limiet, Nederla..."
3,67a63bf1b0aa0e074bbe35a3,"[Nesserlaan, Witmarsum, Gemeente Sudwest Frysl..."
4,67a63c04b0aa0e074bbe362c,Kloetinge
...,...,...
5873,67a9c4ddec45b2e66fe6fb56,"[Noord-Brabant, Zundert en Etten-Leur, Etten-L..."
5874,67a9c4ddec45b2e66fe6fb5c,"[Drenthe, Tynaarlo, Oudemolen, Groot Boerenbos]"
5875,67a9c4ddec45b2e66fe6fb6c,"[Noord-Brabant, Gilze en Rijen, Gilze, Burgeme..."
5876,67a9c4ddec45b2e66fe6fb6d,"[Drenthe, Noordenveld en Tynaarlo, Peize en Ee..."


In [88]:
# Persist the table as CSV for later exploration; the row index is left out.
output_csv_path = "../../data/explore/nopan_no_spatial_info.csv"
df.to_csv(output_csv_path, index=False)