<a href="https://colab.research.google.com/github/CSUC/RDR-scripts/blob/main/extract_metadata/extract_metadata_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extreure metadades a un fitxer tabular

OBSERVACIÓ: Si teniu dubtes sobre el codi, contacteu amb rdr-contacte@csuc.cat

## OBJECTIU DE L'SCRIPT

L'objectiu principal d'aquest script és exportar les metadades d'un dataset a un fitxer tabular CSV o Excel.

## IMPORTANT: Només cal emplenar les variables token i identifier (els últims dígits del DOI) i executar-ho tot

In [None]:
# @title First click the &#x25B6; button to execute the script. </p> Then, enter the token (If you don't have your API token, you can get it from the following link <a href='https://dataverse.csuc.cat/dataverseuser.xhtml?selectTab=apiTokenTab' target='_blank'>Get API Token</a>).</p> After that, enter the LAST DIGITS of the DOI (for example, if the DOI ends in <strong>dataXYZ</strong>, only write the number <strong>XYZ</strong> ).</p> Finally click <strong>Download</strong> to download the file.
import os
import subprocess
import sys

# Function to install required packages
def install_packages():
    """
    Install the third-party packages this script depends on.

    Runs pip through the current interpreter twice: once to upgrade pip
    itself and once to install pyDataverse. Both calls are quiet (-q) and
    raise subprocess.CalledProcessError if pip exits with a non-zero status.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]

    # pip is upgraded before anything else is installed.
    pip_targets = (
        ["--upgrade", "pip", "-q"],
        ["pyDataverse", "-q"],
    )
    for target_args in pip_targets:
        subprocess.check_call([*pip_install, *target_args])

    print("Libraries have been downloaded or updated.")

# Install libraries if they are not installed already
try:
    import pyDataverse
except ImportError:
    print("Installing libraries...")
    install_packages()

# Detect whether the cell is running inside Google Colab; the download UI
# further down depends on this flag.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

import ipywidgets as widgets
from IPython.display import display, FileLink, HTML

if IN_COLAB:
    # google.colab only exists inside Colab, so importing it unconditionally
    # would abort the whole cell with ImportError in a plain Jupyter session.
    from google.colab import files

from pyDataverse.api import NativeApi, DataAccessApi
from pyDataverse.models import Dataverse
import pandas as pd

# Provide input values
# The "# @param" annotations are Colab form markers for token and identifier;
# keep them on the same line as the assignment.
token = ""  # @param {type:"string"}
identifier = ""  # @param {type:"string"}
base_url = "https://dataverse.csuc.cat/"
# The DOI is the fixed prefix plus the user-supplied trailing part, so this
# string is never empty even when identifier is left blank.
doi = 'doi:10.34810/data'+identifier

def extract_value(data_dict):
    """
    Walk a JSON metadata structure and collect (type name, value) pairs.

    Parameters:
    - data_dict: dict or list. JSON metadata structure; any other type
      yields two empty lists.

    Returns:
    - type_names: list. Type names found, in traversal order.
    - values: list. Values found, aligned index by index with type_names.
    """
    names = []
    found = []

    # Anything that is neither a dict nor a list carries no metadata.
    if not isinstance(data_dict, (dict, list)):
        return names, found

    # Lists contribute the concatenated results of their elements.
    if isinstance(data_dict, list):
        for element in data_dict:
            sub_names, sub_found = extract_value(element)
            names.extend(sub_names)
            found.extend(sub_found)
        return names, found

    for key, entry in data_dict.items():
        is_type_name = key == 'typeName'

        # A field's own typeName/value pair; a list value yields one pair
        # per element.
        if is_type_name and 'value' in data_dict:
            payload = data_dict['value']
            elements = payload if isinstance(payload, list) else [payload]
            for element in elements:
                names.append(entry)
                found.append(element)
            continue

        # A nested field stored directly under this key.
        if isinstance(entry, dict) and 'typeName' in entry and 'value' in entry:
            names.append(entry['typeName'])
            found.append(entry['value'])
            continue

        # A typeName with no sibling 'value': the name doubles as the value.
        if is_type_name and isinstance(entry, str):
            names.append(entry)
            found.append(entry)
            continue

        # Everything else is searched recursively.
        sub_names, sub_found = extract_value(entry)
        names.extend(sub_names)
        found.extend(sub_found)

    return names, found

def exportmetadata(base_url, token, doi,
                   citation_keys, citation_values,
                   geo_keys, geo_values,
                   social_keys, social_values,
                   astronomy_keys, astronomy_values,
                   biomedical_keys, biomedical_values,
                   journal_keys, journal_values,
                   computationalworkflow_keys, computationalworkflow_values,
                   LocalContextsCVoc_keys, LocalContextsCVoc_values,
                   darwincore_keys, darwincore_values):
    """
    Export metadata from a Dataverse dataset using its DOI.

    Parameters:
    - base_url: str. Base URL passed to NativeApi.
    - token: str. API token passed to NativeApi.
    - doi: str. Persistent identifier passed to NativeApi.get_dataset.
    - *_keys / *_values: list. One pair of lists per metadata block
      (citation, geospatial, socialscience, astrophysics, biomedical,
      journal, computationalworkflow, LocalContextsCVoc, darwincore).
      Each pair is extended in place with the type names and values that
      extract_value returns for that block's fields.

    Returns:
        None. Populates provided lists with extracted metadata.

    Any exception raised while fetching or parsing the dataset is caught
    and reported with print(); the lists keep whatever was appended before
    the failure.
    """
    from pyDataverse.api import NativeApi

    api = NativeApi(base_url, token)

    # Metadata blocks mapping: (block_name, keys_list, values_list)
    metadata_blocks = [
        ("citation", citation_keys, citation_values),
        ("geospatial", geo_keys, geo_values),
        ("socialscience", social_keys, social_values),
        ("astrophysics", astronomy_keys, astronomy_values),
        ("biomedical", biomedical_keys, biomedical_values),
        ("journal", journal_keys, journal_values),
        ("computationalworkflow", computationalworkflow_keys, computationalworkflow_values),
        ("LocalContextsCVoc", LocalContextsCVoc_keys, LocalContextsCVoc_values),
        ("darwincore", darwincore_keys, darwincore_values)
    ]

    try:
        dataset = api.get_dataset(doi)
        metadata = dataset.json()['data']['latestVersion']['metadataBlocks']

        for block_name, keys_list, values_list in metadata_blocks:
            # Blocks the dataset does not use are simply absent from the JSON.
            if block_name not in metadata:
                continue
            # extract_value already emits one (typeName, value) pair for
            # every string-valued field, so the pairs are appended as-is.
            # Re-assigning by list.index() lookup would target the first
            # matching name in the whole list, which may be another entry.
            extracted_keys, extracted_values = extract_value(metadata[block_name]['fields'])
            keys_list.extend(extracted_keys)
            values_list.extend(extracted_values)

    except Exception as e:
        # Top-level boundary of the export: report the failure and let the
        # caller continue with whatever was collected.
        print(f"Error exporting metadata for DOI {doi}: {e}")

def extract_metadata(data,
                     citation_keys, citation_values,
                     geo_keys, geo_values,
                     social_keys, social_values,
                     astronomy_keys, astronomy_values,
                     biomedical_keys, biomedical_values,
                     journal_keys, journal_values,
                     computationalworkflow_keys, computationalworkflow_values,
                     LocalContextsCVoc_keys, LocalContextsCVoc_values,
                     darwincore_keys, darwincore_values,
                     flatten_dicts=False):
    """
    Append [key, value] rows to `data` for every metadata block, in order.

    Keys and values of a block are paired positionally; pairing stops at
    the shorter of the two lists. Non-dict values are appended as
    [key, value]. Dict values are skipped unless `flatten_dicts` is true,
    in which case each item becomes its own row keyed "<key>.<sub_key>".

    Returns None; `data` is modified in place.
    """
    all_keys = (
        citation_keys, geo_keys, social_keys, astronomy_keys,
        biomedical_keys, journal_keys, computationalworkflow_keys,
        LocalContextsCVoc_keys, darwincore_keys,
    )
    all_values = (
        citation_values, geo_values, social_values, astronomy_values,
        biomedical_values, journal_values, computationalworkflow_values,
        LocalContextsCVoc_values, darwincore_values,
    )

    for block_keys, block_values in zip(all_keys, all_values):
        for name, content in zip(block_keys, block_values):
            # Plain values map to exactly one row.
            if not isinstance(content, dict):
                data.append([name, content])
                continue
            # Dict values only produce rows when flattening is requested.
            if flatten_dicts:
                for inner_name, inner_content in content.items():
                    data.append([f"{name}.{inner_name}", inner_content])


# Checking if both DOI and token are provided.
# `doi` always contains its fixed prefix, so it is never empty; the
# user-supplied `identifier` is what has to be tested.
if not identifier or not token:
    print("Please enter both DOI and Token.")
else:
    # Extracting path from DOI
    path = doi.replace("doi:10.34810/", "")
    # Initializing API access
    api = NativeApi(base_url, token)
    data_api = DataAccessApi(base_url, token)
    # Getting dataset metadata
    # NOTE(review): exportmetadata() requests the dataset again and this
    # response is not used below; the call looks redundant - confirm.
    dataset = api.get_dataset(doi)
    # Lists for metadata
    citation_keys, geo_keys, social_keys, astronomy_keys, biomedical_keys, journal_keys, computationalworkflow_keys, LocalContextsCVoc_keys, darwincore_keys  = [[] for _ in range(9)]
    citation_values, geo_values, social_values, astronomy_values, biomedical_values, journal_values, computationalworkflow_values, LocalContextsCVoc_values, darwincore_values = [[] for _ in range(9)]
    data=[]
    # Exporting metadata
    exportmetadata(base_url, token, doi, citation_keys, citation_values, geo_keys, geo_values, social_keys,
                      social_values, astronomy_keys, astronomy_values, biomedical_keys, biomedical_values,
                      journal_keys, journal_values,computationalworkflow_keys, computationalworkflow_values,
                      LocalContextsCVoc_keys, LocalContextsCVoc_values, darwincore_keys, darwincore_values)
    extract_metadata(
                      data,
                      citation_keys, citation_values,
                      geo_keys, geo_values,
                      social_keys, social_values,
                      astronomy_keys, astronomy_values,
                      biomedical_keys, biomedical_values,
                      journal_keys, journal_values,
                      computationalworkflow_keys, computationalworkflow_values,
                      LocalContextsCVoc_keys, LocalContextsCVoc_values,
                      darwincore_keys, darwincore_values,
                      flatten_dicts=False  # Optional: True to flatten nested dicts
                  )
    # Creating a DataFrame
    df = pd.DataFrame(data, columns=['Metadata', 'Value'])

    # Writing DataFrame to a CSV file
    csv_file = path + '_' + 'metadata.csv'
    df.to_csv(csv_file, index=False)

    # Writing DataFrame to an Excel file
    excel_file = path + '_' + 'metadata.xlsx'
    df.to_excel(excel_file, index=False)

    # The download controls live inside this branch because they refer to
    # the files written above; offering them when the inputs are missing
    # would fail with NameError on `path` / `excel_file`.
    if IN_COLAB:
        # Running in Google Colab
        # Providing a download button for CSV file
        csv_download_button = widgets.Button(description="Download metadata (CSV)")
        display(csv_download_button)

        # Function to be executed when the CSV download button is clicked
        def on_csv_download_button_click(b):
            # Downloading the metadata CSV file in Google Colab
            files.download(csv_file)

        # Event handler for the CSV download button
        csv_download_button.on_click(on_csv_download_button_click)

        # Providing a download button for Excel file
        excel_download_button = widgets.Button(description="Download metadata (Excel)")
        display(excel_download_button)

        # Function to be executed when the Excel download button is clicked
        def on_excel_download_button_click(b):
            # Downloading the metadata Excel file in Google Colab
            files.download(excel_file)

        # Event handler for the Excel download button
        excel_download_button.on_click(on_excel_download_button_click)

    else:
        # Running in Jupyter Notebook
        # Providing a download link for CSV file
        csv_download_link = FileLink(csv_file, result_html_prefix="Click here to download metadata (CSV): ")
        display(csv_download_link)

        # Providing a download link for Excel file
        excel_download_link = FileLink(excel_file, result_html_prefix="Click here to download metadata (Excel): ")
        display(excel_download_link)
