# Exploration of JSONs 
Small random batch of 100 datasets

#### Use [Nominatim]() for mapping of placenames/streets to coordinates 


In [2]:
from scripts.json_to_df import extract_data
import pandas as pd

# Extract data from json files
data_nogeo = extract_data(prefix = 'nogeo')
df_nogeo = pd.DataFrame(data_nogeo)

In [3]:
df_nogeo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       110 non-null    object
 1   title                    110 non-null    object
 2   dsDescriptionValues      110 non-null    object
 3   dansSpatialCoverageText  110 non-null    object
dtypes: object(4)
memory usage: 3.6+ KB


In [4]:
df_nogeo.head()

Unnamed: 0,id,title,dsDescriptionValues,dansSpatialCoverageText
0,10.17026/dans-zh2-6ube,PAN-00132785 - seal with connecting strip and pin,This find is registered at Portable Antiquitie...,"[Amsterdam, NLD]"
1,10.17026/dans-286-dy7s,"Enschede (OV) - Wesselerbrink, deelgebied 5",onderzoeksrapport,"[Nederland, Het Leunenberg, Wesselerbrinklaan,..."
2,10.17026/AR/WIPZRT,"PAN-00105621 - coin/coin-related, provincie, duit",This find is registered at Portable Antiquitie...,"[Katwijk, NLD]"
3,10.17026/dans-zdc-usah,PAN-00036502 - button with solid convex head a...,This find is registered at Portable Antiquitie...,"[Goes, NLD]"
4,10.17026/dans-xup-tcgu,"PAN-00005610 - coin/coin-related, (maker : Han...",This find is registered at Portable Antiquitie...,"[Amsterdam, NLD]"


In [48]:
# df.dsDescriptionValues = df.dsDescriptionValues.str.replace('<p>', '')
# df.dsDescriptionValues = df.dsDescriptionValues.str.replace('</p>', '')

In [10]:
# df.to_csv('../data/missing_geo_test.csv', index=False)

In [11]:
geo_text = df_nogeo.dansSpatialCoverageText.tolist()

In [16]:
geo_text[:10]

[['Amsterdam', 'NLD'],
 ['Nederland',
  'Het Leunenberg',
  'Wesselerbrinklaan',
  'Wesselerbrink',
  'Enschede',
  'Overijssel',
  'e-ne (MARC21)'],
 ['Katwijk', 'NLD'],
 ['Goes', 'NLD'],
 ['Amsterdam', 'NLD'],
 ['Zaltbommel', 'NLD'],
 ['Vught', 'NLD'],
 ['Maasdriel', 'NLD'],
 ['Echt-Susteren', 'NLD'],
 ['Utrecht', 'NLD']]

In [21]:
import time
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="DANS-geomapping")

for text in geo_text[:100]:
    for entry in text: 
        print(entry)
        location = geolocator.geocode(entry)
        if location:
            print(location.latitude, location.longitude)
            print()
            time.sleep(1)
        else: 
            print("NOT FOUND: ", entry)
            print()


Amsterdam
52.3730796 4.8924534

NLD
27.443653849999997 -99.56786380287167

Nederland
52.2434979 5.6343227

Het Leunenberg
52.1927163 6.8840567

Wesselerbrinklaan
52.1968624 6.8785649

Wesselerbrink
52.191438649999995 6.875410720999296

Enschede
52.2209855 6.8940537

Overijssel
52.4254143 6.4610611

e-ne (MARC21)
NOT FOUND:  e-ne (MARC21)

Katwijk
52.18985485 4.414558264500643

NLD
27.443653849999997 -99.56786380287167

Goes
51.5031033 3.8897596

NLD
27.443653849999997 -99.56786380287167

Amsterdam
52.3730796 4.8924534

NLD
27.443653849999997 -99.56786380287167

Zaltbommel
51.782164 5.189826737158967

NLD
27.443653849999997 -99.56786380287167

Vught
51.6511806 5.23680014410728

NLD
27.443653849999997 -99.56786380287167

Maasdriel
51.7853378 5.316792505584603

NLD
27.443653849999997 -99.56786380287167

Echt-Susteren
51.082135199999996 5.899125547533698

NLD
27.443653849999997 -99.56786380287167

Utrecht
52.0907006 5.1215634

NLD
27.443653849999997 -99.56786380287167

Deventer
52.26948965

### Evaluation 

##### Step 1: Extract the full metadata JSONs for a random selection 

In [1]:
# Grab 250 random datapoints that have geospatial data 
import pandas as pd
df = pd.read_csv('../data/archaeology_metadata.csv')
df_geo = df[df['dansSpatialBoxNorth'].notnull() | df['dansSpatialPointX'].notnull()]
df_geo = df_geo.sample(250)

In [2]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 122292 to 147710
Data columns (total 13 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   dsPersistentId                                        250 non-null    object 
 1   publicationStatus                                     250 non-null    object 
 2   title                                                 250 non-null    object 
 3   dsDescriptionValue                                    250 non-null    object 
 4   dansSpatialPointX                                     237 non-null    object 
 5   dansSpatialPointY                                     237 non-null    object 
 6   dansSpatialPointScheme                                237 non-null    object 
 7   dansSpatialBoxNorth                                   15 non-null     object 
 8   dansSpatialBoxEast                                    15 

In [3]:
geo_dois = df_geo.dsPersistentId.tolist()

In [None]:
# from doi_to_json import get_json

# for doi in geo_dois:
#     get_json(doi)

##### Step 2: Apply geocoding package

In [2]:
from scripts.json_to_df import extract_data
import pandas as pd

# use the 'geo' prefix to get the data that contains geospatial information
data = extract_data(prefix='geo')
df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,id,title,dsDescriptionValues,dansSpatialCoverageText,dansSpatialPoints,dansSpatialSchemes
0,10.17026/dans-z2t-fr88,Beerseweg 6 Oirschot,KSP Archeologie heeft een archeologisch bureau...,"[Oirschot, Noord-Brabant]","148651, 389591",RD (in m.)
1,10.17026/dans-xx7-w66k,NABO - locatie Beesel,Om het aantal incidenten op overwegen te vermi...,"[Beesel, Gemeente Beesel, Limburg]","202395, 364385",RD (in m.)
2,10.17026/dans-xdh-uzt5,"Bloemendaal, Brederodelaan 34 Gemeente Bloemen...",In februari 2015 is een archeologisch vooronde...,"[Brederodelaan 34, Bloemendaal, Gemeente Bloem...","102990, 492811",RD (in m.)
3,10.17026/dans-zeu-pe5h,"Locatie 'Dr. van Noortstraat 6' te Lienden, ge...",onderzoeksrapport,"[Dr. van Noortstraat, Nederland, Buren, Liende...","164000, 439847",RD (in m.)
4,10.17026/dans-z3c-p6t9,"bbbley 1, 2, 3 en 4 (Leyweg)",4 projecten samengevoegd: Leyweg 1412-1482 bor...,"[Den Haag, 's-Gravenhage, Zuid-Holland, Leyweg...","78919, 451961",RD (in m.)
...,...,...,...,...,...,...
245,10.17026/dans-24d-su5u,Bureauonderzoek en Inventariserend Veldonderzo...,Met dit onderzoek is de archeologische verwach...,"[Kampsestraat, Angeren, Gemeente Lingewaard, P...","194286, 436293",RD (in m.)
246,10.17026/AR/THR7DS,"Plangebied Resort Haamstede, gemeente Schouwen...",\nIn opdracht van CV Resort Haamstede UA heeft...,"[Daleboutsweg 4, Burgh, Gemeente Schouwen-Duiv...","40097, 412089",RD (in m.)
247,10.17026/dans-zfj-v2ak,"Stadsrandzone Zuid te Enschede, Deelgebied 2 en 3",Synthegra heeft in opdracht van de Gemeente En...,"[34F, Stadsrandzone Zuid, Enschede, Overijssel]","254892, 467617",RD (in m.)
248,10.17026/dans-zk6-dzdy,Een inventariserend archeologisch veldonderzoe...,Op 31 augustus 2004 is aan de Oosterseveldweg ...,[Weststellingwerf; Oosterstreek; Oosterseveldw...,"207900, 544400",RD (in m.)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       250 non-null    object
 1   title                    250 non-null    object
 2   dsDescriptionValues      250 non-null    object
 3   dansSpatialCoverageText  250 non-null    object
 4   dansSpatialPoints        250 non-null    object
 5   dansSpatialSchemes       237 non-null    object
dtypes: object(6)
memory usage: 11.8+ KB


In [5]:
texts = df.dansSpatialCoverageText.tolist()
for text in texts[:5]: 
    print(text)
    print(type(text))
    print()

['Oirschot', 'Noord-Brabant']
<class 'list'>

['Beesel', 'Gemeente Beesel', 'Limburg']
<class 'list'>

['Brederodelaan 34', 'Bloemendaal', 'Gemeente Bloemendaal', 'Noord-Holland']
<class 'list'>

['Dr. van Noortstraat', 'Nederland', 'Buren', 'Lienden', 'Gelderland', 'e-ne (MARC21)']
<class 'list'>

['Den Haag', "'s-Gravenhage", 'Zuid-Holland', 'Leyweg, Genemuidenstraat, Zuidlarenstraat']
<class 'list'>



In [24]:
import time
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="DANS-geomapping")


all_coords = []

for text in texts: 
    text_coords = []
    for entry in text: 
        location = geolocator.geocode(entry)
        if location:
            text_coords.append((location.latitude, location.longitude)) 
            time.sleep(1.3)
        else: 
            text_coords.append(f"NOT FOUND: {entry}")
            time.sleep(1.3)
    all_coords.append(text_coords)


GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Staphorst&format=json&limit=1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7aea443a2d80>: Failed to resolve 'nominatim.openstreetmap.org' ([Errno -2] Name or service not known)"))

In [25]:
print(len(all_coords))

218


In [33]:
all_coords

[[(51.488961599999996, 5.300594979407693), (51.6017723, 5.4441391)],
 [(51.26998845, 6.073401860697201),
  'NOT FOUND: Gemeente Beesel',
  (51.2015196, 5.9046302)],
 [(52.2699055, 6.135358),
  (52.3742917, 4.591534676879699),
  (52.3930017, 4.5957038319879455),
  (52.7212825, 4.820665)],
 [(52.095804351429294, 4.47140172755453),
  (52.2434979, 5.6343227),
  (51.92963705, 5.420000238944224),
  (51.9457517, 5.5152908),
  (52.1014041, 5.9515701),
  'NOT FOUND: e-ne (MARC21)'],
 [(52.07494555, 4.2696802205364515),
  (52.0799838, 4.3113461),
  (51.9966792, 4.5597397),
  'NOT FOUND: Leyweg, Genemuidenstraat, Zuidlarenstraat'],
 ['NOT FOUND: Gemeente Lingewaard',
  (51.8915382, 5.8984809),
  (51.883676960323236, 5.879734666963888),
  (52.1014041, 5.9515701)],
 [],
 [(51.9966792, 4.5597397),
  (52.193046249999995, 4.621636649034843),
  (52.2239922, 4.6724358),
  (52.21395582729452, 4.6475165647645555),
  'NOT FOUND: 31A (kaartblad)'],
 [(53.04309255, 4.9349507674219355),
  (53.084180200000006,

In [27]:
df = df[:218]

In [30]:
df['pred_coordinate'] = all_coords

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pred_coordinate'] = all_coords


In [31]:
df

Unnamed: 0,id,title,dsDescriptionValues,dansSpatialCoverageText,dansSpatialPoints,dansSpatialSchemes,pred_coordinate
0,10.17026/dans-z2t-fr88,Beerseweg 6 Oirschot,KSP Archeologie heeft een archeologisch bureau...,"[Oirschot, Noord-Brabant]","148651, 389591",RD (in m.),"[(51.488961599999996, 5.300594979407693), (51...."
1,10.17026/dans-xx7-w66k,NABO - locatie Beesel,Om het aantal incidenten op overwegen te vermi...,"[Beesel, Gemeente Beesel, Limburg]","202395, 364385",RD (in m.),"[(51.26998845, 6.073401860697201), NOT FOUND: ..."
2,10.17026/dans-xdh-uzt5,"Bloemendaal, Brederodelaan 34 Gemeente Bloemen...",In februari 2015 is een archeologisch vooronde...,"[Brederodelaan 34, Bloemendaal, Gemeente Bloem...","102990, 492811",RD (in m.),"[(52.2699055, 6.135358), (52.3742917, 4.591534..."
3,10.17026/dans-zeu-pe5h,"Locatie 'Dr. van Noortstraat 6' te Lienden, ge...",onderzoeksrapport,"[Dr. van Noortstraat, Nederland, Buren, Liende...","164000, 439847",RD (in m.),"[(52.095804351429294, 4.47140172755453), (52.2..."
4,10.17026/dans-z3c-p6t9,"bbbley 1, 2, 3 en 4 (Leyweg)",4 projecten samengevoegd: Leyweg 1412-1482 bor...,"[Den Haag, 's-Gravenhage, Zuid-Holland, Leyweg...","78919, 451961",RD (in m.),"[(52.07494555, 4.2696802205364515), (52.079983..."
...,...,...,...,...,...,...,...
213,10.17026/AR/3NN2CO,"Janninkcomplex Haaksbergerstraat 147, Enschede.",Onderzoek naar de uit 1900-1901 daterende kato...,"[Haaksbergerstraat 147, 7513 EL Enschede, Rijk...","6.88671, 52.21384",longitude/latitude (degrees),"[(52.213835, 6.8866696), NOT FOUND: Rijksmonum..."
214,10.17026/dans-zxj-k3yb,Archeologisch bureauonderzoek Ugchelseweg 78 t...,Uit het bureauonderzoek blijkt dat er terreind...,"[33B, Ugchelseweg 78, Apeldoorn, Gelderland]","192747, 466636",RD (in m.),"[(-22.03348025, -59.9783108380642), (52.187238..."
215,10.17026/dans-zkw-qwn8,Westdorpe Molenstraat. Gemeente Terneuzen.,Op basis van de beschikbare informatie over de...,"[Zeeland, Gemeente Terneuzen, Westdorpe, Molen...","47235, 360067",RD (in m.),"[(51.4162975, 3.7028061), (51.331782399999994,..."
216,10.17026/dans-zt5-ayk6,Gameren 't Slot' (gemeente Zaltbommel),onderzoeksrapport,"[t Slot, Nederland, Gameren, Gelderland, 45A, ...","142220, 423710",RD (in m.),"[(52.7522238, 5.0759089), (52.2434979, 5.63432..."


In [32]:
df.to_csv('../data/eval_spatialCoverageText.csv', index=False)