# Extreure metadades a un fitxer tabular

OBSERVACIÓ: No modifiqueu aquest document. Si necessiteu modificar-lo, feu-ne una còpia. Si teniu dubtes sobre el codi, contacteu amb rdr-contacte@csuc.cat.

## OBJECTIU DE L'SCRIPT

L'objectiu principal d'aquest script és exportar les metadades d'un dataset a un fitxer tabular CSV o Excel.

## IMPORTANT: Només cal emplenar les variables token i doi i executar-ho tot

In [None]:
# @title Install or Update Libraries. Click the Run button &#x25B6;
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

def install_packages(b):
    """
    Install or update the Python libraries this notebook depends on.

    Used as the click callback of the install button: it clears the current
    cell output, upgrades pip, installs pyDataverse and html2text, and then
    prints a confirmation message. The confirmation is printed
    unconditionally; the result of the pip commands is not checked.

    Args:
    b: Button object supplied by the widget click event (not used in the body).

    Returns:
    None
    """
    # wait=True delays the clearing until new output is ready to replace it
    clear_output(wait=True)
    # The lines starting with "!" are IPython shell escapes, so this function
    # only runs inside a notebook/IPython session; -q keeps pip output quiet
    !pip install --upgrade pip -q
    !pip install pyDataverse -q
    !pip install html2text -q
    print("Libraries have been downloaded or updated.")

# Tell the user what the button below is for
install_prompt = HTML("<p style='font-size:14px;'><b>Click the button below to install the libraries.</b></p>")
display(install_prompt)

# Build the button and register install_packages as its click callback
install_button = widgets.Button(description='Install Libraries')
install_button.on_click(install_packages)

# Render the button in the cell output
display(install_button)


In [None]:
# @title Enter DOI (doi:10.34810/dataXXX), token, and the name of the Excel file with the .xlsx extension and the repository URL. Click the Run button ▶️
from IPython.display import display, FileLink
try:
    from google.colab import files
    import ipywidgets as widgets
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
import os
from pyDataverse.api import NativeApi, DataAccessApi
from pyDataverse.models import Dataverse
import pandas as pd

# Provide input values.
# The trailing "# @param" annotations are Colab form-field markers; keep them
# on the same line as the assignment.
# doi and token are both required: the script prints a message and exports
# nothing when either one is empty.
doi = ""  # @param {type:"string"}
token = ""  # @param {type:"string"}
# NOTE(review): excel_file_name is not read anywhere in the visible code; the
# output is written as "<path>_metadata.csv" — confirm whether it is still needed.
excel_file_name = "" # @param {type:"string"}
# Base URL of the Dataverse repository handed to the pyDataverse API client
base_url = "https://dataverse.csuc.cat/"  # @param {type:"string"}

def extract_value(data_dict):
    """
    Walk a JSON metadata structure and collect field names with their values.

    Lists are flattened element by element. For a dictionary, every entry is
    examined in insertion order:

    * a 'typeName' entry whose dictionary also has a 'value' entry yields one
      (typeName, value) pair, or one pair per element when the value is a list;
    * an entry whose content is itself a dictionary holding both 'typeName'
      and 'value' yields that pair directly (its value is not descended into);
    * a string 'typeName' entry without a sibling 'value' yields the name as
      both key and value;
    * anything else is searched recursively.

    Args:
    data_dict (dict | list | any): JSON metadata structure to walk.

    Returns:
    tuple: Two parallel lists, (type names, values). Both are empty when the
    input is neither a dictionary nor a list.
    """
    names, found = [], []

    # Lists: concatenate whatever each element yields, preserving order
    if isinstance(data_dict, list):
        for element in data_dict:
            sub_names, sub_found = extract_value(element)
            names.extend(sub_names)
            found.extend(sub_found)
        return names, found

    # Scalars (str, bool, numbers, None, ...) carry no field information
    if not isinstance(data_dict, dict):
        return names, found

    has_value = 'value' in data_dict
    for field, content in data_dict.items():
        # A field description: pair its name with its value(s)
        if field == 'typeName' and has_value:
            payload = data_dict['value']
            entries = payload if isinstance(payload, list) else [payload]
            names.extend([content] * len(entries))
            found.extend(entries)
            continue

        # A nested field description stored under an arbitrary key
        if isinstance(content, dict) and 'typeName' in content and 'value' in content:
            names.append(content['typeName'])
            found.append(content['value'])
            continue

        # A name with no accompanying value: the name doubles as the value
        if field == 'typeName' and isinstance(content, str):
            names.append(content)
            found.append(content)
            continue

        # Everything else may contain further fields deeper down
        sub_names, sub_found = extract_value(content)
        names.extend(sub_names)
        found.extend(sub_found)

    return names, found

def export_metadata(base_url, token, doi, citation_keys, citation_values, geo_keys, geo_values, social_keys,
                    social_values, astronomy_keys, astronomy_values, biomedical_keys, biomedical_values,
                    journal_keys, journal_values):
    """
    Function to export metadata from a dataset and store it in respective lists.

    The dataset is fetched once through the native API. For every metadata
    block present in its latest version, the block's fields are flattened with
    extract_value and appended to the matching pair of lists, which are
    modified in place. Errors while reading the metadata are reported with
    print instead of being raised; lists filled before the error keep their
    content.

    Args:
    base_url (str): Base URL of the Dataverse repository.
    token (str): API token for authentication.
    doi (str): DOI of the dataset.
    citation_keys (list): List to store citation metadata keys.
    citation_values (list): List to store citation metadata values.
    geo_keys (list): List to store geospatial metadata keys.
    geo_values (list): List to store geospatial metadata values.
    social_keys (list): List to store social science metadata keys.
    social_values (list): List to store social science metadata values.
    astronomy_keys (list): List to store astronomy metadata keys.
    astronomy_values (list): List to store astronomy metadata values.
    biomedical_keys (list): List to store biomedical metadata keys.
    biomedical_values (list): List to store biomedical metadata values.
    journal_keys (list): List to store journal metadata keys.
    journal_values (list): List to store journal metadata values.

    Returns:
    None
    """
    from pyDataverse.api import NativeApi

    # Name of each metadata block in the dataset JSON -> the (keys, values)
    # lists that receive its content. Dict order fixes the processing order.
    targets = {
        'citation': (citation_keys, citation_values),
        'geospatial': (geo_keys, geo_values),
        'socialscience': (social_keys, social_values),
        'astrophysics': (astronomy_keys, astronomy_values),
        'biomedical': (biomedical_keys, biomedical_values),
        'journal': (journal_keys, journal_values),
    }

    api = NativeApi(base_url, token)  # Function to access the API
    try:
        dataset = api.get_dataset(doi)  # Retrieve dataset metadata
        # Parse the response once instead of once per lookup
        metadata_blocks = dataset.json()['data']['latestVersion']['metadataBlocks']
        for block_name, (keys, values) in targets.items():
            if block_name not in metadata_blocks:
                continue
            fields = metadata_blocks[block_name]['fields']
            extracted_keys, extracted_values = extract_value(fields)
            keys.extend(extracted_keys)
            values.extend(extracted_values)
            # Re-assert the value of plain string fields at the first position
            # where their type name appears in the key list
            for item in fields:
                if isinstance(item['value'], str):
                    index_change = keys.index(item['typeName'])
                    values[index_change] = item['value']
    except (KeyError, ValueError) as e:
        # ValueError replaces the previously undefined name InvalidSchema:
        # requests' InvalidSchema derives from ValueError, as do JSON decoding
        # errors, so both are still reported here without an extra import.
        print(f"Error occurred: {e}")
        print('There was an error reading metadata for the dataset: ' + doi)

def extractmetadata(data, citation_keys, citation_values, geo_keys, geo_values, social_keys, social_values,
                    astronomy_keys, astronomy_values, biomedical_keys, biomedical_values, journal_keys,
                    journal_values):
    """
    Append [key, value] rows for every metadata block to data.

    The blocks are processed in the order citation, geospatial, social
    science, astronomy, biomedical, journal. Keys and values are paired
    positionally; pairs whose value is a dictionary are skipped.

    Args:
    data (list): Output list that receives the [key, value] rows (modified in place).
    citation_keys, citation_values (list): Citation metadata keys and values.
    geo_keys, geo_values (list): Geospatial metadata keys and values.
    social_keys, social_values (list): Social science metadata keys and values.
    astronomy_keys, astronomy_values (list): Astronomy metadata keys and values.
    biomedical_keys, biomedical_values (list): Biomedical metadata keys and values.
    journal_keys, journal_values (list): Journal metadata keys and values.

    Returns:
    None
    """
    paired_blocks = (
        (citation_keys, citation_values),
        (geo_keys, geo_values),
        (social_keys, social_values),
        (astronomy_keys, astronomy_values),
        (biomedical_keys, biomedical_values),
        (journal_keys, journal_values),
    )
    for names, contents in paired_blocks:
        rows = [[name, content] for name, content in zip(names, contents) if not isinstance(content, dict)]
        data.extend(rows)
# Checking if both DOI and token are provided
if not doi or not token:
    print("Please enter both DOI and Token.")
else:
    # Extracting path from DOI: the dataset identifier becomes the file stem
    path = doi.replace("doi:10.34810/", "")
    # Single definition of the output file, shared by the CSV writer and by
    # both download mechanisms below
    file_path = path + '_' + 'metadata.csv'
    # Lists for metadata (one independent list per block; filled in place)
    citation_keys, geo_keys, social_keys, astronomy_keys, biomedical_keys, journal_keys = [[] for _ in range(6)]
    citation_values, geo_values, social_values, astronomy_values, biomedical_values, journal_values = [[] for _ in range(6)]
    data = []
    # Exporting metadata. export_metadata creates its own API client and
    # fetches the dataset itself, so no separate request is made here.
    export_metadata(base_url, token, doi, citation_keys, citation_values, geo_keys, geo_values, social_keys,
                    social_values, astronomy_keys, astronomy_values, biomedical_keys, biomedical_values,
                    journal_keys, journal_values)
    # Extracting metadata and arranging it as [key, value] rows
    extractmetadata(data, citation_keys, citation_values, geo_keys, geo_values, social_keys, social_values,
                    astronomy_keys, astronomy_values, biomedical_keys, biomedical_values, journal_keys,
                    journal_values)
    # Creating a DataFrame
    df = pd.DataFrame(data, columns=['Metadata', 'Value'])

    # Writing DataFrame to a CSV file
    df.to_csv(file_path, index=False)

    # The download UI is only offered once the file has actually been written
    if IN_COLAB:
        # Running in Google Colab
        # Providing a download button
        download_button = widgets.Button(description="Download metadata")
        display(download_button)

        # Function to be executed when the download button is clicked
        def on_download_button_click(b):
            # Downloading the metadata.csv file in Google Colab
            files.download(file_path)

        # Event handler for the download button
        download_button.on_click(on_download_button_click)

    else:
        # Running in Jupyter Notebook
        # Providing a download link
        download_link = FileLink(file_path, result_html_prefix="Click here to download: ")
        display(download_link)
