# RCE - geographical metadata exploration 

## Explore

In [10]:
import pandas as pd
import zipfile

In [11]:
### Inspect zip file contents

# Archive to inspect (swap in '../data/metadata-1.zip' to look at the other one)
zip_file_path = '../data/metadata-2.zip'

# Grab the member names while the archive is open ...
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()

# ... then report them, one name per line
print("Files in the ZIP archive:")
for file in zip_contents:
    print(file)



Files in the ZIP archive:
metadata-16-01-2025/
metadata-16-01-2025/getmetadata.sh
metadata-16-01-2025/metadata-from-prod-server-0.csv
metadata-16-01-2025/metadata-from-prod-server-1.csv
metadata-16-01-2025/metadata-from-prod-server-10.csv
metadata-16-01-2025/metadata-from-prod-server-11.csv
metadata-16-01-2025/metadata-from-prod-server-12.csv
metadata-16-01-2025/metadata-from-prod-server-13.csv
metadata-16-01-2025/metadata-from-prod-server-14.csv
metadata-16-01-2025/metadata-from-prod-server-15.csv
metadata-16-01-2025/metadata-from-prod-server-16.csv
metadata-16-01-2025/metadata-from-prod-server-17.csv
metadata-16-01-2025/metadata-from-prod-server-18.csv
metadata-16-01-2025/metadata-from-prod-server-19.csv
metadata-16-01-2025/metadata-from-prod-server-2.csv
metadata-16-01-2025/metadata-from-prod-server-20.csv
metadata-16-01-2025/metadata-from-prod-server-21.csv
metadata-16-01-2025/metadata-from-prod-server-22.csv
metadata-16-01-2025/metadata-from-prod-server-23.csv
metadata-16-01-2025/

In [12]:
# Locations of the two metadata archives that are loaded below
zip_file_1, zip_file_2 = (
    '../data/metadata-1.zip',
    '../data/metadata-2.zip',
)

In [13]:
# List to store DataFrames from both ZIPs
dfs = []


def read_csv_from_zip(zip_file, frames=None):
    """Read every CSV member of a ZIP archive into its own DataFrame.

    Parameters
    ----------
    zip_file : str or path-like
        Path to the ZIP archive to read.
    frames : list, optional
        List that receives one DataFrame per CSV member, in the order the
        members are listed in the archive. When omitted, the module-level
        ``dfs`` list is used, so existing ``read_csv_from_zip(path)`` calls
        keep working unchanged.

    Returns
    -------
    list
        The list the DataFrames were appended to (``frames`` or ``dfs``).
    """
    # Fall back to the shared accumulator only when the caller did not pass one
    if frames is None:
        frames = dfs

    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        # Keep only the CSV members; directory entries and scripts are skipped
        csv_files = [name for name in zip_ref.namelist() if name.endswith('.csv')]

        # Parse each member straight from the archive, without extracting it
        for csv_file in csv_files:
            with zip_ref.open(csv_file) as file:
                frames.append(pd.read_csv(file))

    return frames

# Load the CSVs of each archive in turn; every call appends to `dfs`
for zip_path in (zip_file_1, zip_file_2):
    read_csv_from_zip(zip_path)

# Stack all collected frames into one DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

In [14]:
# Display the combined frame (pandas truncates the rendered rows and columns)
combined_df

Unnamed: 0,dsPersistentId,publicationStatus,title,dsDescriptionValue,dccd-projectType,dccd-category,dccd-taxon,dccd-objectType,dccd-elementType,dccd-latitude,...,metadata_type_ss,dansSpatialPointX,dansSpatialPointY,dansSpatialPointScheme,dansSpatialBoxNorth,dansSpatialBoxEast,dansSpatialBoxSouth,dansSpatialBoxWest,dansSpatialBoxScheme,dansSpatialCoverageControlleddansSpatialCoverageText
0,doi:10.34894/JBUI0T,Published,Effect of surgical margin status after radical...,,,,,,,,...,"citation,dansDataVaultMetadata",,,,,,,,,
1,doi:10.34894/0LXXER,Published,"Dietary changes and dietary supplement use, an...",,,,,,,,...,"citation,dansDataVaultMetadata",,,,,,,,,
2,doi:10.34894/BV1PBM,Published,Traditionele jongeren en onafhankelijke oudere...,,,,,,,,...,"citation,dansDataVaultMetadata",,,,,,,,,
3,doi:10.34894/ZYFNAM,Published,Impact of therapy and disease-related symptoms...,,,,,,,,...,"citation,dansDataVaultMetadata",,,,,,,,,
4,doi:10.34894/OIQJXX,Published,Elderly multiple myeloma patients experience l...,,,,,,,,...,"citation,dansDataVaultMetadata",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169700,doi:10.17026/AR/H4IDSE,Published,"IVO-karterende fase zonnepark Groot Roodehaan,...",Laagland Archeologie heeft in april 2021 een k...,,,,,,,...,,238853238979239404239437239175,579376579083579242579530579532,"RD (in m.),RD (in m.),RD (in m.),RD (in m.)",,,,,,
169701,doi:10.17026/AR/19BCUB,Published,Gemeente Moerdijk plangebied de Kogelvangers t...,Op basis van het bureauonderzoek geldt er voor...,,,,,,,...,,,,,411813,90282,411486,89850,RD (in m.),
169702,doi:10.17026/AR/QXOI5Q,Published,Archeologisch bureauonderzoek perceel tussen K...,In november 2021 is een archeologisch bureauon...,,,,,,,...,,101.391,497.243,RD (in m.),,,,,,
169703,doi:10.17026/AR/QHHAGE,Published,Archeologisch bureauonderzoek Den Ilp 53a te D...,In november 2021 is in opdracht van een partic...,,,,,,,...,,122.592,495.815,,,,,,,


In [15]:
# List the column labels of the combined frame
combined_df.columns

Index(['dsPersistentId', 'publicationStatus', 'title', 'dsDescriptionValue',
       'dccd-projectType', 'dccd-category', 'dccd-taxon', 'dccd-objectType',
       'dccd-elementType', 'dccd-latitude', 'dccd-longitude',
       'dccd-periodStart', 'dccd-periodEnd', 'metadata_type_ss',
       'dansSpatialPointX', 'dansSpatialPointY', 'dansSpatialPointScheme',
       'dansSpatialBoxNorth', 'dansSpatialBoxEast', 'dansSpatialBoxSouth',
       'dansSpatialBoxWest', 'dansSpatialBoxScheme',
       'dansSpatialCoverageControlleddansSpatialCoverageText'],
      dtype='object')

In [16]:
# Inspect titles: print the first ten, each followed by a blank line
titles = combined_df['title'].tolist()

for title in titles[:10]:
    print(title, end='\n\n')

Effect of surgical margin status after radical prostatectomy on health-related quality of life and illness perception in patients with prostate cancer [Dataset]

Dietary changes and dietary supplement use, and underlying motives for these habits reported by colorectal cancer survivors of the Patient Reported Outcomes Following Initial Treatment and Long-Term Evaluation of Survivorship (PROFILES) registry [Dataset]

Traditionele jongeren en onafhankelijke ouderen [Dataset]

Impact of therapy and disease-related symptoms on health-related quality of life in patients with follicular lymphoma: Results of the population-based PHAROS-registry [Dataset]

Elderly multiple myeloma patients experience less deterioration in health-related quality of life than younger patients compared to a normative population: A study from the population-based PROFILES registry [Dataset]

Low subjective health literacy with adverse health behaviors and worse health-realted quality of life among colorectal cancer

In [17]:
# Summarise the combined frame: row count, per-column non-null counts and dtypes
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169705 entries, 0 to 169704
Data columns (total 23 columns):
 #   Column                                                Non-Null Count   Dtype  
---  ------                                                --------------   -----  
 0   dsPersistentId                                        169705 non-null  object 
 1   publicationStatus                                     169705 non-null  object 
 2   title                                                 169705 non-null  object 
 3   dsDescriptionValue                                    169389 non-null  object 
 4   dccd-projectType                                      5337 non-null    object 
 5   dccd-category                                         5337 non-null    object 
 6   dccd-taxon                                            5033 non-null    object 
 7   dccd-objectType                                       5117 non-null    object 
 8   dccd-elementType                            

In [18]:
# Tally how many rows carry each publicationStatus value
pubstatus = combined_df['publicationStatus'].value_counts()
print(pubstatus)

publicationStatus
Published                      166283
Unpublished,Draft                2796
Unpublished,Draft,In Review       247
Draft                             199
Deaccessioned                     174
Draft,In Review                     6
Name: count, dtype: int64


In [19]:
# Number of missing (NaN) entries in each column of the combined frame
nan_counts = combined_df.isna().sum(axis=0)


In [20]:
# Select archeological datasets.
# The filter keeps every row that has a spatial point X value or a spatial
# box north value; NOTE(review): this treats "has spatial coordinates" as the
# marker for archaeological records - confirm that assumption holds.
has_spatial_point = combined_df['dansSpatialPointX'].notna()
has_spatial_box = combined_df['dansSpatialBoxNorth'].notna()
arch_df = combined_df[has_spatial_point | has_spatial_box]


In [21]:
# Summarise the spatially-filtered frame: row count, non-null counts and dtypes
arch_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60313 entries, 10416 to 169704
Data columns (total 23 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   dsPersistentId                                        60313 non-null  object 
 1   publicationStatus                                     60313 non-null  object 
 2   title                                                 60313 non-null  object 
 3   dsDescriptionValue                                    60313 non-null  object 
 4   dccd-projectType                                      0 non-null      object 
 5   dccd-category                                         0 non-null      object 
 6   dccd-taxon                                            0 non-null      object 
 7   dccd-objectType                                       0 non-null      object 
 8   dccd-elementType                                      0 

In [22]:
# Select only published datasets
is_published = arch_df['publicationStatus'] == 'Published'
df_pub = arch_df[is_published]

In [23]:
# Summarise the published subset: row count, non-null counts and dtypes
df_pub.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59837 entries, 10416 to 169704
Data columns (total 23 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   dsPersistentId                                        59837 non-null  object 
 1   publicationStatus                                     59837 non-null  object 
 2   title                                                 59837 non-null  object 
 3   dsDescriptionValue                                    59837 non-null  object 
 4   dccd-projectType                                      0 non-null      object 
 5   dccd-category                                         0 non-null      object 
 6   dccd-taxon                                            0 non-null      object 
 7   dccd-objectType                                       0 non-null      object 
 8   dccd-elementType                                      0 

In [24]:
# Make a list of DOIs
dois = df_pub['dsPersistentId'].tolist()

In [25]:
# Preview the first ten persistent identifiers
dois[:10]

['doi:10.17026/dans-zrj-unr7',
 'doi:10.17026/dans-299-9dpm',
 'doi:10.17026/dans-zqy-ymw8',
 'doi:10.17026/dans-z8d-9c6h',
 'doi:10.17026/dans-x9v-j3qu',
 'doi:10.17026/dans-2xv-vrz6',
 'doi:10.17026/dans-zfx-fuem',
 'doi:10.17026/dans-z4f-4hp3',
 'doi:10.17026/dans-24x-uv4g',
 'doi:10.17026/dans-zm6-226h']

## Get OAI-ORE metadata

In [30]:
import requests
import json
import pprint

In [32]:
# URL of the JSON file (OAI_ORE export of one dataset, addressed by its DOI)
url = "https://dataverse.nl/api/datasets/export?exporter=OAI_ORE&persistentId=doi%3A10.34894/1TQV3K"

try:
    # Send a GET request to the URL. An explicit timeout is required because
    # requests otherwise waits indefinitely on an unresponsive server, which
    # would hang the cell; a timeout surfaces as a RequestException below.
    response = requests.get(url, timeout=30)

    # Raise an HTTPError for 4xx/5xx responses instead of parsing an error page
    response.raise_for_status()

    # Parse the JSON data
    data = response.json()

    # Keep a pretty-printed copy on disk for later inspection
    with open('../jsons/test.json', 'w') as json_file:
        json.dump(data, json_file, indent=4)

    print("JSON data has been saved to 'test.json'.")
    pprint.pprint(data)

except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts and HTTP error statuses
    print(f"An error occurred: {e}")

JSON data has been saved to 'test.json'.
{'@context': {'author': 'http://purl.org/dc/terms/creator',
              'authorIdentifier': 'http://purl.org/spar/datacite/AgentIdentifier',
              'authorIdentifierScheme': 'http://purl.org/spar/datacite/AgentIdentifierScheme',
              'citation': 'https://dataverse.org/schema/citation/',
              'content': '@value',
              'dansDataVaultMetadata': 'https://dataverse.nl/schema/dansDataVaultMetadata#',
              'dateOfDeposit': 'http://purl.org/dc/terms/dateSubmitted',
              'dccd': 'https://dataverse.nl/schema/dccd#',
              'dcterms': 'http://purl.org/dc/terms/',
              'dvcore': 'https://dataverse.org/schema/core#',
              'lang': '@language',
              'ore': 'http://www.openarchives.org/ore/terms/',
              'schema': 'http://schema.org/',
              'scheme': 'http://www.w3.org/2004/02/skos/core#inScheme',
              'subject': 'http://purl.org/dc/terms/subject',
