# RCE - geographical metadata exploration 

## Explore

In [2]:
import pandas as pd
import zipfile

In [3]:
### Inspect zip file contents

# Archive to inspect (metadata-1.zip sits in the same directory)
zip_file_path = '../data/metadata-2.zip'

# Read the member list first, then print it one entry per line
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    zip_contents = archive.namelist()
    print("Files in the ZIP archive:")
    for member_name in zip_contents:
        print(member_name)



Files in the ZIP archive:
metadata-16-01-2025/
metadata-16-01-2025/getmetadata.sh
metadata-16-01-2025/metadata-from-prod-server-0.csv
metadata-16-01-2025/metadata-from-prod-server-1.csv
metadata-16-01-2025/metadata-from-prod-server-10.csv
metadata-16-01-2025/metadata-from-prod-server-11.csv
metadata-16-01-2025/metadata-from-prod-server-12.csv
metadata-16-01-2025/metadata-from-prod-server-13.csv
metadata-16-01-2025/metadata-from-prod-server-14.csv
metadata-16-01-2025/metadata-from-prod-server-15.csv
metadata-16-01-2025/metadata-from-prod-server-16.csv
metadata-16-01-2025/metadata-from-prod-server-17.csv
metadata-16-01-2025/metadata-from-prod-server-18.csv
metadata-16-01-2025/metadata-from-prod-server-19.csv
metadata-16-01-2025/metadata-from-prod-server-2.csv
metadata-16-01-2025/metadata-from-prod-server-20.csv
metadata-16-01-2025/metadata-from-prod-server-21.csv
metadata-16-01-2025/metadata-from-prod-server-22.csv
metadata-16-01-2025/metadata-from-prod-server-23.csv
metadata-16-01-2025/

In [4]:
# Locations of the two metadata ZIP archives
zip_file_1, zip_file_2 = (
    '../data/metadata-1.zip',
    '../data/metadata-2.zip',
)

In [5]:
# Module-level accumulator: one DataFrame per CSV file read from the archives
dfs = []


def read_csv_from_zip(zip_file, frames=None):
    """Read every CSV member of a ZIP archive into its own DataFrame.

    Parameters
    ----------
    zip_file : str or path-like
        Path of the ZIP archive to read.
    frames : list, optional
        List that receives the DataFrames. When omitted, the module-level
        ``dfs`` list is used, which is what the one-argument calls in this
        notebook rely on.

    Returns
    -------
    list of pandas.DataFrame
        The DataFrames read from this archive, in ``namelist()`` order.
        Empty when the archive holds no member whose name ends in '.csv'.

    Notes
    -----
    The target list is only extended after every CSV in the archive has been
    parsed, so a failure part-way through does not leave a partially loaded
    archive behind.
    """
    target = dfs if frames is None else frames

    with zipfile.ZipFile(zip_file, 'r') as archive:
        # Only members whose name ends in '.csv' are parsed; every other
        # entry in the archive is ignored
        csv_members = [name for name in archive.namelist() if name.endswith('.csv')]

        loaded = []
        for member in csv_members:
            with archive.open(member) as handle:
                loaded.append(pd.read_csv(handle))

    target.extend(loaded)
    return loaded

# Load the CSV exports from each archive in turn
for archive_path in (zip_file_1, zip_file_2):
    read_csv_from_zip(archive_path)

# Stack all frames into one table with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

In [6]:
# Show the column names of the combined metadata table
combined_df.columns

Index(['dsPersistentId', 'publicationStatus', 'title', 'dsDescriptionValue',
       'dccd-projectType', 'dccd-category', 'dccd-taxon', 'dccd-objectType',
       'dccd-elementType', 'dccd-latitude', 'dccd-longitude',
       'dccd-periodStart', 'dccd-periodEnd', 'metadata_type_ss',
       'dansSpatialPointX', 'dansSpatialPointY', 'dansSpatialPointScheme',
       'dansSpatialBoxNorth', 'dansSpatialBoxEast', 'dansSpatialBoxSouth',
       'dansSpatialBoxWest', 'dansSpatialBoxScheme',
       'dansSpatialCoverageControlleddansSpatialCoverageText'],
      dtype='object')

In [7]:
# Per-column non-null counts and dtypes of the combined table
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169705 entries, 0 to 169704
Data columns (total 23 columns):
 #   Column                                                Non-Null Count   Dtype  
---  ------                                                --------------   -----  
 0   dsPersistentId                                        169705 non-null  object 
 1   publicationStatus                                     169705 non-null  object 
 2   title                                                 169705 non-null  object 
 3   dsDescriptionValue                                    169389 non-null  object 
 4   dccd-projectType                                      5337 non-null    object 
 5   dccd-category                                         5337 non-null    object 
 6   dccd-taxon                                            5033 non-null    object 
 7   dccd-objectType                                       5117 non-null    object 
 8   dccd-elementType                            

In [8]:
# Tally how often each publicationStatus value occurs
pubstatus = combined_df['publicationStatus'].value_counts()
print(pubstatus)

publicationStatus
Published                      166283
Unpublished,Draft                2796
Unpublished,Draft,In Review       247
Draft                             199
Deaccessioned                     174
Draft,In Review                     6
Name: count, dtype: int64


In [9]:
# Number of missing (NaN) cells in each column
nan_counts = combined_df.isna().sum(axis=0)


In [10]:
# Keep the rows that carry a spatial point or a spatial bounding box;
# this notebook treats those as the archaeological datasets
has_point = combined_df['dansSpatialPointX'].notna()
has_box = combined_df['dansSpatialBoxNorth'].notna()
arch_df = combined_df[has_point | has_box]


In [11]:
# Per-column non-null counts and dtypes after the spatial filter
arch_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60313 entries, 10416 to 169704
Data columns (total 23 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   dsPersistentId                                        60313 non-null  object 
 1   publicationStatus                                     60313 non-null  object 
 2   title                                                 60313 non-null  object 
 3   dsDescriptionValue                                    60313 non-null  object 
 4   dccd-projectType                                      0 non-null      object 
 5   dccd-category                                         0 non-null      object 
 6   dccd-taxon                                            0 non-null      object 
 7   dccd-objectType                                       0 non-null      object 
 8   dccd-elementType                                      0 

In [12]:
# Restrict to rows whose publicationStatus is exactly 'Published'
df_pub = arch_df.loc[arch_df['publicationStatus'] == 'Published']

In [13]:
# Per-column non-null counts and dtypes of the published subset
df_pub.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59837 entries, 10416 to 169704
Data columns (total 23 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   dsPersistentId                                        59837 non-null  object 
 1   publicationStatus                                     59837 non-null  object 
 2   title                                                 59837 non-null  object 
 3   dsDescriptionValue                                    59837 non-null  object 
 4   dccd-projectType                                      0 non-null      object 
 5   dccd-category                                         0 non-null      object 
 6   dccd-taxon                                            0 non-null      object 
 7   dccd-objectType                                       0 non-null      object 
 8   dccd-elementType                                      0 

In [14]:
# Collect the persistent identifiers (DOIs) of the published datasets
dois = df_pub['dsPersistentId'].to_list()

In [19]:
# Peek at a slice of ten DOIs (positions 1000-1009)
dois[1000:1010]

['doi:10.17026/dans-zpq-esut',
 'doi:10.17026/dans-xcv-5ezh',
 'doi:10.17026/dans-xaw-aezr',
 'doi:10.17026/dans-xjw-5j2s',
 'doi:10.17026/dans-zuh-n6ew',
 'doi:10.17026/dans-xen-ce72',
 'doi:10.17026/dans-xjw-fpfz',
 'doi:10.17026/dans-z78-xsm7',
 'doi:10.17026/dans-zed-xz3e',
 'doi:10.17026/dans-xby-cgqk']

In [21]:
# Use the first DOI in the list as the test case for the export request
test_doi = dois[0]
test_doi

'doi:10.17026/dans-zrj-unr7'

## Get OAI-ORE metadata

In [16]:
import requests
import json
import pprint

In [26]:
# Export endpoint of the archaeology data station: requests the OAI_ORE
# export of the dataset identified by `test_doi`
url = f"https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId={test_doi}"

try:
    # Always pass a timeout: without one, requests waits indefinitely on a
    # server that accepts the connection but never answers. A timeout raises
    # requests.exceptions.Timeout, which the handler below already covers.
    response = requests.get(url, timeout=30)
    print(url)

    # Turn HTTP error statuses (4xx/5xx) into an exception
    response.raise_for_status()

    # Parse the response body as JSON
    data = response.json()

    # Keep a pretty-printed local copy for inspection
    with open('../jsons/test.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)

    print("JSON data has been saved to 'test.json'.")

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")

https://archaeology.datastations.nl/api/datasets/export?exporter=OAI_ORE&persistentId=doi:10.17026/dans-zrj-unr7
JSON data has been saved to 'test.json'.
