# Exploration of JSONs 
Small random batch of 50 datasets

#### Use [Nominatim](https://nominatim.org/) to map place names/streets to coordinates


In [7]:
from geopy.geocoders import Nominatim

# Smoke test: geocode one known address to confirm the Nominatim client works.
geolocator = Nominatim(user_agent="DANS-geomapping")
location = geolocator.geocode("Gruttersdijk 2A, Utrecht")

# geocode() yields None when the query has no match, so guard before
# reading the coordinate attributes instead of failing with AttributeError.
if location is None:
    print("No geocoding result found")
else:
    print(location.latitude, location.longitude)

52.0971657 5.117116


In [55]:
# Extract info from JSONs 
import json
import pandas as pd
import pprint
import os

# Collect the path of every 'nogeo*.json' file found below ../jsons.
json_files = []

for dirpath, _subdirs, filenames in os.walk('../jsons'):
    json_files.extend(
        os.path.join(dirpath, name)
        for name in filenames
        if name.startswith('nogeo') and name.endswith('.json')
    )


# List of JSON file paths
#json_files = ['../jsons/nogeo_doi:10.17026%dans-z5m-ktc9.json']




In [56]:
# Sanity check: number of file paths collected in json_files.
len(json_files)

110

In [69]:
def _as_list(value):
    """Return ``value`` unchanged when it is a list, otherwise wrap it in a one-element list."""
    return value if isinstance(value, list) else [value]


def _extract_doi(raw_id):
    """Strip the resolver/scheme prefix from an '@id' value.

    Returns None when the id is missing, so a file without '@id' no longer
    raises a TypeError on the substring check.
    """
    if raw_id is None:
        return None
    if 'doi.org' in raw_id:
        return raw_id.split('doi.org/')[-1]  # keep only the DOI part
    return raw_id.split('doi:')[-1]


def _extract_record(json_data):
    """Build one output row from a parsed JSON document.

    Parameters
    ----------
    json_data : dict
        Parsed content of one JSON file.

    Returns
    -------
    dict
        Keys 'id', 'title', 'dsDescriptionValues' (first non-empty
        description, or None when there is none) and
        'dansSpatialCoverageText' (list of non-empty coverage entries).
    """
    # 'ore:describes' may be a single object or a list of objects; normalise
    # BEFORE reading anything from it (reading the title first crashed on lists
    # and on the missing-key default).
    describes = _as_list(json_data.get('ore:describes', []))

    title = None
    all_descriptions = []
    all_spatial_coverage = []
    for describe in describes:
        # Keep the first title encountered.
        if title is None:
            title = describe.get('title', None)

        # Collect every non-empty dsDescriptionValue.
        for desc in _as_list(describe.get('citation:dsDescription', [])):
            descr = desc.get('citation:dsDescriptionValue', None)
            if descr:
                all_descriptions.append(descr)

        # Collect every non-empty dansSpatialCoverageText entry.
        for coverage in _as_list(
            describe.get('dansTemporalSpatial:dansSpatialCoverageText', [])
        ):
            if coverage:
                all_spatial_coverage.append(coverage)

    return {
        'id': _extract_doi(json_data.get('@id', None)),
        'title': title,
        # Only the first description is kept; None avoids an IndexError when
        # a dataset carries no description at all.
        'dsDescriptionValues': all_descriptions[0] if all_descriptions else None,
        'dansSpatialCoverageText': all_spatial_coverage,
    }


# Create an empty list to store the extracted rows
data = []

# Loop through each JSON file and extract one row per file
for path in json_files:
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as handle:
        json_data = json.load(handle)
    data.append(_extract_record(json_data))

# Create a DataFrame from the extracted data
df = pd.DataFrame(data)

In [70]:
# Summarise the extracted frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       110 non-null    object
 1   title                    110 non-null    object
 2   dsDescriptionValues      110 non-null    object
 3   dansSpatialCoverageText  110 non-null    object
dtypes: object(4)
memory usage: 3.6+ KB


In [71]:
# Remove the literal <p> / </p> tags from the description text.
# regex=False treats both patterns as plain substrings.
df['dsDescriptionValues'] = (
    df['dsDescriptionValues']
    .str.replace('<p>', '', regex=False)
    .str.replace('</p>', '', regex=False)
)

In [72]:
# Display the extracted frame for a visual check.
df

Unnamed: 0,id,title,dsDescriptionValues,dansSpatialCoverageText
0,10.17026/dans-zh2-6ube,PAN-00132785 - seal with connecting strip and pin,This find is registered at Portable Antiquitie...,"[Amsterdam, NLD]"
1,10.17026/dans-286-dy7s,"Enschede (OV) - Wesselerbrink, deelgebied 5",onderzoeksrapport,"[Nederland, Het Leunenberg, Wesselerbrinklaan,..."
2,10.17026/AR/WIPZRT,"PAN-00105621 - coin/coin-related, provincie, duit",This find is registered at Portable Antiquitie...,"[Katwijk, NLD]"
3,10.17026/dans-zdc-usah,PAN-00036502 - button with solid convex head a...,This find is registered at Portable Antiquitie...,"[Goes, NLD]"
4,10.17026/dans-xup-tcgu,"PAN-00005610 - coin/coin-related, (maker : Han...",This find is registered at Portable Antiquitie...,"[Amsterdam, NLD]"
...,...,...,...,...
105,10.17026/dans-28p-8nab,PAN-00074886 - tobacco box,This find is registered at Portable Antiquitie...,"[Vlissingen, NLD]"
106,10.17026/dans-x85-r3ux,PAN-00054775 - horseshoe type 3,This find is registered at Portable Antiquitie...,"[Maastricht, NLD]"
107,10.17026/dans-25h-m25t,PAN-00043998 - terminal knob (unspecified; Rom...,This find is registered at Portable Antiquitie...,"[Zaltbommel, NLD]"
108,10.17026/dans-za4-s88t,PAN-00123486 - net sinkers made from Roman cer...,This find is registered at Portable Antiquitie...,"[Zevenaar, NLD]"


In [67]:
# Write the extracted frame to CSV, omitting the index column.
# NOTE(review): this cell's execution count (67) is lower than that of the
# extraction cell (69), so the file on disk may predate the current df;
# re-run this cell after the extraction.
df.to_csv('../data/missing_geo_test.csv', index=False)

In [None]:
# Pull the spatial-coverage column out as a plain Python list (one entry per row).
geo_text = df['dansSpatialCoverageText'].tolist()

In [80]:
# Peek at the spatial-coverage entries of the first row.
print(geo_text[0])

['Amsterdam', 'NLD']
