# Script per a la Generació de Metadades de Publicacions Relacionades
# OBSERVACIÓ:
Si teniu dubtes sobre el codi, poseu-vos en contacte amb rdr-contacte@csuc.cat
## OBJECTIU DE L'SCRIPT
Aquest script permet avaluar i generar un tauler de metadades relacionades amb les publicacions associades a conjunts de dades. L'objectiu és extreure i agregar metadades específiques de cada conjunt de dades per a la seva posterior visualització i anàlisi en un fitxer Excel.

In [None]:
# @title Instal·leu o actualitzeu les llibreries (feu clic al botó Executar &#x25B6; )
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

def install_packages(b):
    """
    Button callback that installs or upgrades the packages this notebook uses.

    The pip commands are issued through IPython ``!`` shell escapes, so this
    function only runs inside an IPython/Jupyter/Colab kernel, not as plain
    Python.

    Args:
    b (widget): Button widget that triggered the click. It is supplied by
        ``on_click`` and is not used inside the function.

    Returns:
    None
    """
    # Clear the cell output once new output is ready, so repeated clicks do
    # not stack pip logs.
    clear_output(wait=True)
    !pip install --upgrade pip -q  # Upgrade pip silently
    !pip install pyDataverse -q    # Install or update pyDataverse silently
    !pip install requests -q       # Install or update requests silently
    # NOTE(review): this success message is printed unconditionally; the exit
    # status of the pip commands above is not checked.
    print("Les llibreries s'han descarregat o actualitzat correctament.")

# Show a short HTML prompt (user-facing text is in Catalan) above the button
display(HTML("<p style='font-size:14px;'><b>Feu clic al botó següent per instal·lar les llibreries necessàries.</b></p>"))

# Build the button and register install_packages as its click handler;
# nothing is installed until the user actually clicks it
install_button = widgets.Button(description='Instal·lar llibreries')
install_button.on_click(install_packages)

# Render the button in the cell output
display(install_button)

In [None]:
# @title Feu clic al botó Executar &#x25B6; , ompliu el token i trieu les institucions i les metadades
import subprocess
import smtplib
import os
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from pyDataverse.api import NativeApi, DataAccessApi, MetricsApi
from pyDataverse.models import Dataverse
import pandas as pd
import requests
import logging
from datetime import datetime
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
class UtilsConnection:
    """Small HTTP helper that calls a Dataverse endpoint with the API token.

    The token is taken from the ``config`` object given to the constructor
    (anything exposing ``get_token()``), not from a module-level variable, so
    several connections with different configurations can coexist.
    """

    def __init__(self, config):
        """Store the configuration used to authenticate requests.

        Args:
            config: Object exposing ``get_token()``.
        """
        self.config = config

    def call_api(self, url, method, data=None):
        """Send an authenticated request and return the decoded JSON body.

        Args:
            url (str): Full URL of the endpoint.
            method (str): HTTP verb to use. A falsy value falls back to
                ``"GET"``, which is the only verb the original code ever sent.
            data: Optional request body. ``None`` sends an empty payload, as
                the original implementation did.

        Returns:
            The parsed JSON response.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        # Use the instance configuration: the previous code read a global
        # named ``config``, which only worked by accident inside the notebook.
        headers = {'X-Dataverse-key': self.config.get_token()}
        payload = {} if data is None else data

        response = requests.request(method or "GET", url, headers=headers, data=payload)
        response.raise_for_status()
        return response.json()

class Config:
    """Plain holder for the settings shared across the notebook.

    Keeps the Dataverse base URL, the logger and the API token together and
    exposes each of them through a getter.
    """

    def __init__(self, api_url, logger, token):
        """Remember the three settings exactly as they were passed in."""
        self.api_url, self.logger, self.token = api_url, logger, token

    def get_app_config(self):
        """Return this configuration object itself."""
        return self

    def get_token(self):
        """Return the stored API token."""
        return self.token

    def get_logger(self):
        """Return the stored logger."""
        return self.logger

    def get_api_url(self):
        """Return the stored base URL."""
        return self.api_url

class DatasetProcessor:
    """Collect the DOIs of every dataset under a Dataverse collection.

    Starting from one collection identifier, the processor walks the
    ``/api/dataverses/<id>/contents`` endpoint, queues every sub-collection
    it finds and records each dataset whose protocol is ``doi`` as
    ``doi:<authority>/<identifier>``.
    """

    def __init__(self, config, identifier):
        """Create an empty processor.

        Args:
            config: Object exposing ``get_api_url()``, ``get_logger()`` and
                ``get_token()``.
            identifier: Identifier of the root collection.
        """
        self.config = config
        self.list_datasets = []
        self.list_dataverse_children = []
        self.identifier = identifier

    def update_list_dataset(self, dataset_id):
        """Append a dataset identifier to the collected list."""
        self.list_datasets.append(dataset_id)

    def update_list_dataverse_children(self, dataseverse_id):
        """Queue a sub-collection id so its contents are read later."""
        self.list_dataverse_children.append(dataseverse_id)

    def remove_id_list_dataverse_children(self, dataseverse_id):
        """Remove the first occurrence of a sub-collection id from the queue."""
        self.list_dataverse_children.remove(dataseverse_id)

    def get_list_datasets(self):
        """Return the list of collected dataset identifiers."""
        return self.list_datasets

    def get_list_dataverse_children(self):
        """Return the queue of sub-collections still to be read."""
        return self.list_dataverse_children

    def count(self):
        """Return how many datasets have been collected so far."""
        return len(self.list_datasets)

    def _register_contents(self, contents):
        """Sort one ``contents`` listing into sub-collections and datasets.

        Entries without a ``type`` or ``protocol`` key are skipped instead of
        raising ``KeyError``; datasets whose protocol is not ``doi`` are
        ignored, as before.
        """
        for value in contents:
            kind = value.get('type')
            if kind == 'dataverse':
                self.update_list_dataverse_children(value['id'])
            elif kind == 'dataset' and value.get('protocol') == 'doi':
                self.update_list_dataset(
                    f"{value['protocol']}:{value['authority']}/{value['identifier']}"
                )

    def create_list_datasets(self, identifier):
        """Read ``identifier`` and every queued sub-collection, breadth-first.

        The traversal is an explicit loop over the pending queue. The former
        implementation recursed once per sub-collection, so its call depth
        grew with the total number of collections and could exceed Python's
        recursion limit on large installations.

        Args:
            identifier: Identifier of the collection to start from.

        Raises:
            requests.HTTPError: Propagated from ``UtilsConnection.call_api``.
        """
        conn = UtilsConnection(self.config)
        # Strip a trailing slash so the request path never starts with "//api".
        base_url = self.config.get_api_url().rstrip('/')

        current = identifier
        while True:
            url_api = f"{base_url}/api/dataverses/{current}/contents"
            object_json = conn.call_api(url_api, "GET")

            if object_json:
                self.config.get_logger().info("Reading the API values")
                self._register_contents(object_json.get("data") or [])
            else:
                self.config.get_logger().error("Call API ERROR")

            if not self.list_dataverse_children:
                break
            # Take the oldest queued child; its own children were appended at
            # the end, which keeps the original visiting order.
            current = self.list_dataverse_children.pop(0)
def extract_value(data_dict):
    """
    Walk a JSON metadata structure and flatten it into parallel lists.

    Lists are traversed element by element and dictionaries key by key (in
    insertion order). Anything that is neither a list nor a dictionary
    contributes nothing.

    Args:
    data_dict (dict | list | any): JSON metadata fragment.

    Returns:
    tuple: ``(type_names, values)``, two lists of equal length.
    """
    names, payloads = [], []

    # Lists: concatenate whatever each element yields.
    if isinstance(data_dict, list):
        for element in data_dict:
            sub_names, sub_payloads = extract_value(element)
            names.extend(sub_names)
            payloads.extend(sub_payloads)
        return names, payloads

    # Scalars and other types carry no field information.
    if not isinstance(data_dict, dict):
        return names, payloads

    carries_value = 'value' in data_dict
    for key, entry in data_dict.items():
        # A field record: emit one pair per value (list values are expanded).
        if key == 'typeName' and carries_value:
            raw = data_dict['value']
            expanded = raw if isinstance(raw, list) else [raw]
            names.extend([entry] * len(expanded))
            payloads.extend(expanded)
            continue

        # A nested field record stored under an arbitrary key.
        if isinstance(entry, dict) and 'typeName' in entry and 'value' in entry:
            names.append(entry['typeName'])
            payloads.append(entry['value'])
            continue

        # A bare type name without a value: the name doubles as the value.
        if key == 'typeName' and isinstance(entry, str):
            names.append(entry)
            payloads.append(entry)
            continue

        # Anything else is searched recursively.
        sub_names, sub_payloads = extract_value(entry)
        names.extend(sub_names)
        payloads.extend(sub_payloads)

    return names, payloads

def export_metadata(base_url, token, doi, citation_keys, citation_values, customUAB_keys, customUAB_values, stateDataset):
    """
    Fetch a dataset's latest version and append its metadata to the given lists.

    The ``citation`` and ``customUAB`` metadata blocks are flattened with
    ``extract_value`` and appended to their key/value accumulators. A block
    that is absent from the dataset is skipped.

    Args:
    base_url (str): Base URL of the Dataverse repository. A trailing slash is
        removed before it is handed to pyDataverse.
    token (str): API token for authentication.
    doi (str): Persistent identifier of the dataset.
    citation_keys (list): Extended in place with citation field names.
    citation_values (list): Extended in place with citation field values.
    customUAB_keys (list): Extended in place with customUAB field names.
    customUAB_values (list): Extended in place with customUAB field values.
    stateDataset (list): Not used by this function; kept so existing callers
        keep working.

    Returns:
    None
    """
    from pyDataverse.api import NativeApi
    from requests.exceptions import InvalidSchema

    api = NativeApi(base_url.rstrip('/'), token)  # Function to access the API
    targets = (
        ('citation', citation_keys, citation_values),
        ('customUAB', customUAB_keys, customUAB_values),
    )
    try:
        dataset = api.get_dataset(doi)  # Retrieve dataset metadata
        # Parse the response once instead of re-decoding it for every lookup.
        metadata_blocks = dataset.json()['data']['latestVersion']['metadataBlocks']

        for block_name, keys, values in targets:
            if block_name not in metadata_blocks:
                continue
            fields = metadata_blocks[block_name]['fields']
            # Remember where this dataset's entries start so the fix-up below
            # never touches entries that were already in the accumulators.
            offset = len(keys)
            extracted_keys, extracted_values = extract_value(fields)
            keys.extend(extracted_keys)
            values.extend(extracted_values)
            for item in fields:
                if isinstance(item['value'], str):
                    values[keys.index(item['typeName'], offset)] = item['value']

    # A tuple is required here: ``except KeyError or InvalidSchema`` evaluates
    # the ``or`` first and therefore only ever caught KeyError.
    except (KeyError, InvalidSchema):
        print('S\'ha produït un error en llegir les metadades del conjunt de dades: ' + doi)

def extract_metadata(data, citation_keys, citation_values, customUAB_keys, customUAB_values):
    """
    Append ``[key, value]`` rows to ``data`` for every non-dictionary value.

    Citation pairs are added first, then customUAB pairs. Pairs whose value
    is a dictionary are left out.

    Args:
    data (list): Row accumulator, extended in place.
    citation_keys (list): Citation field names.
    citation_values (list): Citation field values, parallel to the keys.
    customUAB_keys (list): customUAB field names.
    customUAB_values (list): customUAB field values, parallel to the keys.

    Returns:
    None
    """
    sources = (
        (citation_keys, citation_values),
        (customUAB_keys, customUAB_values),
    )
    for names, payloads in sources:
        data.extend(
            [name, payload]
            for name, payload in zip(names, payloads)
            if not isinstance(payload, dict)
        )

# Configuration and execution
import logging
import ipywidgets as widgets
from IPython.display import display

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Ask the user for the token. getpass is used instead of input() so the token
# is not echoed into the cell output, where it would be stored with the
# notebook. Surrounding whitespace from copy/paste is removed.
from getpass import getpass
token = getpass("Introduïu el vostre token i premeu enter: ").strip()

# Use the token in your configuration. The base URL has no trailing slash
# because "/api/..." is appended to it when request URLs are built.
config = Config(api_url="https://dataverse.csuc.cat", logger=logger, token=token)

# List of institutions. Each entry is later passed unchanged as the collection
# identifier when the Dataverse contents are requested, so the spelling must
# match the identifier used by the repository.
institucions = [
    'UB', 'UAB', 'UPC', 'UPF', 'UdG', 'UdL', 'URV', 'UOC', 'UVIC-UCC',
    'URL', 'UIC', 'UIB', 'Agrotecnio', 'CED', 'CRAG', 'CREAF', 'CRM', 'CTFC',
    'i2CAT', 'I3PT', 'IBEC', 'IBEI', 'ICAC-CERCA', 'ICFO-CERCA', 'ICN2',
    'ICRA-CERCA', 'IDIBAPS', 'IDIBELL', 'IDIBGI-CERCA', 'IFAE', 'IJC',
    'IPHES-CERCA', 'IRBBarcelona-CERCA', 'IRB', 'IRSICAIXA', 'IRTA',
    'ISGLOBAL', 'VHIR'
]
# Add an option for selecting all; this label is the sentinel the selection
# callback checks for ("Totes les institucions" = "all institutions")
options_institucions = ['Totes les institucions'] + institucions

# Create widgets for instructions and selections
instruction_text_institucions = widgets.HTML(
    value="<b>Trieu una o més institucions:</b>"
)

# Create the widget for multiple selection of institutions; it starts with
# nothing selected
institucions_widget = widgets.SelectMultiple(
    options=options_institucions,
    value=[],
    description='Institucions:',
    disabled=False
)
# Accumulated set of chosen institutions, shared with the cells that follow
opcions = set()
def save_selection_institucions(change):
    """
    Widget observer that folds the current selection into ``opcions``.

    Choosing 'Totes les institucions' replaces the set with every known
    institution; any other selection is added to what was already chosen
    (earlier picks are kept). The resulting set is printed.

    Args:
    change (dict): Change notification; only ``change['new']`` is read.

    Returns:
    None
    """
    global opcions
    chosen = set(change['new'])

    if 'Totes les institucions' not in chosen:
        opcions |= chosen  # in-place union keeps earlier selections
    else:
        opcions = set(institucions)  # the sentinel selects everything

    print(f"Institucions: {list(opcions)}")

# Function to clear the selection and reset the institutions widget
def restart_selection_institucions(button):
    """
    Button callback that empties both the widget selection and ``opcions``.

    Args:
    button (widget): The clicked button, supplied by ``on_click``; unused.

    Returns:
    None
    """
    global opcions
    # NOTE(review): assigning to the widget value presumably notifies the
    # observer registered on 'value' when the value actually changes, so the
    # old selection may be printed once before it is cleared below.
    institucions_widget.value = []  # Clear the selections in the widget
    opcions.clear()  # Clear the global opcions set
    print("La selecció de la institució s'ha restablert.")

# Observe changes in the widget selection for institutions: every change of
# the widget's 'value' trait calls save_selection_institucions
institucions_widget.observe(save_selection_institucions, names='value')

# Create a button to restart the selection for institutions
restart_button_institucions = widgets.Button(description="Reiniciar la selecció d'institucions")
restart_button_institucions.on_click(restart_selection_institucions)

# Display the widgets and button with instructions, in this order:
# instruction text, selection list, reset button

display(instruction_text_institucions, institucions_widget, restart_button_institucions)


########################



In [None]:
# @title Un cop escollides les institucions i les metadades, cliqueu el botó &#x25B6; per generar el tauler de metadades
# Metadata fields that end up as columns of the final table
selected_metadata = ["publicationRelationType","publicationCitation","publicationIDType","publicationIDNumber","publicationURL","reviewLibrary"]

# Parallel accumulators: position k in each list describes the same dataset
metadata_keys_list = []
metadata_values_list = []
list_doi = []
instancia = []
states = []  # kept for compatibility; nothing in this cell fills it

for element in opcions:
    # Gather the DOI of every dataset below the selected institution
    processor = DatasetProcessor(config, element)
    processor.create_list_datasets(element)
    sigles = element
    for i in processor.get_list_datasets():
        # Fresh accumulators per dataset, so fields never leak between datasets
        citation_keys, citation_values = [], []
        customUAB_keys, customUAB_values = [], []
        data = []
        stateDataset = []
        # Exporting metadata
        export_metadata(config.get_api_url(), config.get_token(), i, citation_keys, citation_values, customUAB_keys, customUAB_values, stateDataset)
        # Extracting metadata and arranging it as [key, value] rows
        extract_metadata(data, citation_keys, citation_values, customUAB_keys, customUAB_values)
        # Split the rows straight into the two parallel lists; the previous
        # detour through a temporary DataFrame produced the same two lists.
        metadata_keys_list.append([row[0] for row in data])
        metadata_values_list.append([row[1] for row in data])
        instancia.append(sigles)
        list_doi.append(i)
def aggregate_metadata(metadata_keys_list, metadata_values_list, list_doi, selected_metadata):
    """
    Build one '; '-joined string per selected field and dataset.

    For every dataset, the values of each selected field are de-duplicated,
    converted to text, sorted and joined with '; '. Sorting makes the output
    reproducible; plain set iteration order is not. Every position of
    ``list_doi`` is filled, including repeated DOIs, which share the values
    collected under that DOI.

    Args:
    metadata_keys_list (list[list]): Field names, one inner list per dataset.
    metadata_values_list (list[list]): Field values, parallel to the names.
        A list value contributes each of its elements; ``None`` is ignored.
    list_doi (list): Dataset identifiers, parallel to the two lists above.
    selected_metadata (list): Field names to keep.

    Returns:
    dict: ``{field: [joined string for each entry of list_doi]}``; the string
    is empty when a dataset has no value for that field.
    """
    from collections import defaultdict

    wanted = set(selected_metadata)

    # field -> doi -> set of textual values
    collected = defaultdict(lambda: defaultdict(set))
    for doi, keys, values in zip(list_doi, metadata_keys_list, metadata_values_list):
        for key, value in zip(keys, values):
            if key not in wanted:
                continue
            items = value if isinstance(value, list) else [value]
            # Store text so that joining can never fail on non-string values.
            collected[key][doi].update(str(item) for item in items if item is not None)

    # .get() avoids creating empty entries while reading the result back.
    return {
        field: [
            '; '.join(sorted(collected.get(field, {}).get(doi, ())))
            for doi in list_doi
        ]
        for field in selected_metadata
    }
# Aggregate metadata values: one joined string per selected field and dataset
metadata = aggregate_metadata(metadata_keys_list, metadata_values_list, list_doi, selected_metadata)

# Create the data dictionary; list_doi and instancia are parallel lists, so
# each row pairs a dataset identifier with the institution it was found under
data = {
    'DOI': list_doi,
    'Institution': instancia
}

# Add the dynamic metadata fields as extra columns
for field in selected_metadata:
    data[field] = metadata[field]

# Create the DataFrame
df = pd.DataFrame(data)

# Extract the numeric part of the DOI and convert it to int for sorting
# NOTE(review): this assumes every identifier contains "data<digits>"; an
# identifier that does not match would presumably yield a missing value and
# make the integer conversion fail. Confirm against the repository's DOIs.
df['DOI_Number'] = df['DOI'].str.extract(r'data(\d+)').astype(int)

# Sort the DataFrame based on the DOI_Number column (numeric, not textual)
df = df.sort_values(by='DOI_Number')

# Format the DOI column as 'https://doi.org/10.34810/dataXXX'
# The 10.34810 prefix is hard-coded here; the authority part of the original
# identifier is not reused.
df['DOI'] = 'https://doi.org/10.34810/data' + df['DOI_Number'].astype(str)

# Drop the DOI_Number column (optional)
df = df.drop(columns=['DOI_Number'])

# Bare expression: shows the table as the cell output in the notebook
df


In [9]:
# @title Clicar el botó &#x25B6; per guardar les dades en un fitxer excel.
# google.colab is only importable inside Google Colab
from google.colab import files
# Save the DataFrame to an Excel file in the current working directory,
# without the index column
excel_filename = 'estudi_publicacio_relacionada.xlsx'
df.to_excel(excel_filename, index=False)

# Provide download link for the file (hands the file to the Colab download helper)
files.download(excel_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>