# RCE - geographical metadata exploration 

## Explore

In [None]:
import pandas as pd
import zipfile

: 

In [None]:
### List the members of one metadata archive

# Archive to inspect (switch to the commented path for the other archive)
zip_file_path = '../data/metadata-2.zip'
#zip_file_path = '../data/metadata-1.zip'

# Collect the member names, then release the archive handle before printing
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    zip_contents = archive.namelist()

print("Files in the ZIP archive:")
for member_name in zip_contents:
    print(member_name)



In [None]:
# Relative paths of the two metadata archives that are merged below
zip_file_1, zip_file_2 = '../data/metadata-1.zip', '../data/metadata-2.zip'

In [None]:
# Accumulator for the DataFrames read out of both ZIP archives;
# reset to empty every time this cell runs
dfs: list = []

def read_csv_from_zip(zip_file, frames=None):
    """Load every CSV member of a ZIP archive into pandas DataFrames.

    Args:
        zip_file: Path or file-like object accepted by ``zipfile.ZipFile``.
        frames: Optional list that receives the loaded DataFrames. When
            omitted, the module-level ``dfs`` list is used, so existing
            one-argument calls keep working unchanged.

    Returns:
        list: The DataFrames read from this archive, in the order the
        members are listed in the archive.

    Raises:
        NameError: If ``frames`` is omitted and no module-level ``dfs``
            exists.
        Exception: Whatever ``zipfile.ZipFile`` or ``pandas.read_csv``
            raises for an unreadable archive or member; in that case
            nothing from this archive is added to the target list.
    """
    target = dfs if frames is None else frames

    loaded = []
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for member in zip_ref.namelist():
            # Compare the extension case-insensitively so members named
            # e.g. "DATA.CSV" are not silently skipped
            if not member.lower().endswith('.csv'):
                continue
            with zip_ref.open(member) as file:
                loaded.append(pd.read_csv(file))

    # Publish the frames only after the whole archive was read successfully
    target.extend(loaded)
    return loaded

# Load both archives; each call appends its DataFrames to dfs
for archive_path in (zip_file_1, zip_file_2):
    read_csv_from_zip(archive_path)

# Stack everything collected in dfs into one table with a fresh row index
combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Show the column labels of the combined DataFrame
combined_df.columns

In [None]:
# Print a summary of the combined DataFrame (columns, non-null counts, dtypes)
combined_df.info()

In [None]:
# Tally how often each publicationStatus value occurs
status_column = combined_df["publicationStatus"]
pubstatus = status_column.value_counts()
print(pubstatus)

In [None]:
# Number of missing entries per column.
# NOTE(review): the result is stored but never displayed in this cell.
nan_counts = combined_df.isnull().sum(axis=0)


In [None]:
# Keep the rows that carry spatial information: either a point X coordinate
# or a bounding-box north value is present
has_point = combined_df["dansSpatialPointX"].notna()
has_box = combined_df["dansSpatialBoxNorth"].notna()
arch_df = combined_df[has_point | has_box]


In [None]:
# Print a summary of the spatially filtered subset
arch_df.info()

In [None]:
# Restrict the spatial subset to rows whose publicationStatus is 'Published'
is_published = arch_df["publicationStatus"] == 'Published'
df_pub = arch_df[is_published]

In [None]:
# Print a summary of the published subset
df_pub.info()

In [None]:
# Collect the persistent identifiers (DOIs) of the published datasets
# as a plain Python list
dois = df_pub["dsPersistentId"].to_list()

In [None]:
# Preview the first ten identifiers
dois[:10]

## Get OAI-ORE metadata

In [None]:
import requests
import json
import pprint

In [None]:
# URL of the OAI_ORE JSON export for one dataset (persistentId is URL-encoded)
url = "https://dataverse.nl/api/datasets/export?exporter=OAI_ORE&persistentId=doi%3A10.34894/1TQV3K"

try:
    # Always pass a timeout: without one, requests can wait indefinitely
    # on an unresponsive server
    response = requests.get(url, timeout=30)

    # Turn HTTP error statuses into an exception
    response.raise_for_status()

    # Parse the JSON data
    data = response.json()

except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers JSON decoding failures on requests versions whose
    # decode error is not a RequestException subclass
    print(f"An error occurred: {e}")

else:
    # Only reached when download and parsing succeeded; file-system errors
    # raised here propagate instead of being reported as request failures
    with open('../jsons/test.json', 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

    print("JSON data has been saved to 'test.json'.")
    pprint.pprint(data)