#### Script per extreure les mides dels datasets d'una instància
# OBSERVACIÓ:
Si teniu dubtes sobre el codi, poseu-vos en contacte amb rdr-contacte@csuc.cat
## OBJECTIU DE L'SCRIPT
Aquest script permet calcular la mida total dels conjunts de dades (datasets) allotjats a una instància del Repositori de Dades de Recerca (https://dataverse.csuc.cat/). Utilitza l'API de Dataverse per obtenir la mida de tots els fitxers associats als datasets d'una instància i retorna la mida total en bytes, KB, MB, GB o TB.


In [None]:
# @title Instal·leu o actualitzeu les llibreries (feu clic al botó Executar &#x25B6; )
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

def install_packages(b):
    """
    Install or upgrade the Python packages this notebook depends on.

    pip is invoked as ``<kernel interpreter> -m pip`` so the packages land in
    the environment the notebook is actually running in, and every exit code
    is checked so a failed installation is reported instead of being followed
    by a success message.

    Args:
    b (widget): Button widget that triggered the callback (unused).

    Returns:
    None
    """
    import subprocess
    import sys

    clear_output(wait=True)

    # pip itself is upgraded first, then the two runtime dependencies.
    pip_targets = (
        ["--upgrade", "pip"],
        ["pyDataverse"],
        ["requests"],
    )

    failed = []
    for target in pip_targets:
        command = [sys.executable, "-m", "pip", "install", "-q", *target]
        result = subprocess.run(command, capture_output=True, text=True)
        if result.returncode != 0:
            failed.append(target[-1])
            # Surface pip's own diagnostics so the user can see what went wrong.
            if result.stderr:
                print(result.stderr)

    if failed:
        print(f"No s'han pogut instal·lar o actualitzar: {', '.join(failed)}")
    else:
        print("Les llibreries s'han descarregat o actualitzat correctament.")

# Show a short HTML prompt (in Catalan) asking the user to press the button below.
display(HTML("<p style='font-size:14px;'><b>Feu clic al botó següent per instal·lar les llibreries necessàries.</b></p>"))

# Build the button and register install_packages as its click callback;
# the installation only runs when the user clicks.
install_button = widgets.Button(description='Instal·lar llibreries')
install_button.on_click(install_packages)

# Render the button in the cell output.
display(install_button)

In [None]:
# @title Feu clic al botó Executar &#x25B6; , ompliu el token i trieu les institucions i les metadades
import subprocess
import smtplib
import os
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from pyDataverse.api import NativeApi, DataAccessApi, MetricsApi
from pyDataverse.models import Dataverse
import pandas as pd
import requests
import logging
from datetime import datetime
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
class UtilsConnection:
    """Small HTTP helper that calls a Dataverse API endpoint with the configured token."""

    def __init__(self, config):
        """Keep the configuration object that provides the API token via ``get_token()``."""
        self.config = config

    def call_api(self, url, method, data=None):
        """Send an authenticated request and return the decoded JSON body.

        Args:
            url: Full URL of the endpoint to call.
            method: HTTP verb (for example ``"GET"``); falls back to ``"GET"``
                when empty.
            data: Optional request payload; an empty dict is sent when omitted.

        Returns:
            The response body parsed as JSON.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status
                (raised by ``raise_for_status``).
        """
        # Read the token from this instance's configuration rather than from a
        # module-level ``config`` global, so the class works wherever it is used.
        headers = {'X-Dataverse-key': self.config.get_token()}
        payload = {} if data is None else data

        response = requests.request(method or "GET", url, headers=headers, data=payload)
        response.raise_for_status()
        return response.json()

class Config:
    """Bundle of the settings the notebook passes around: API URL, logger and token."""

    def __init__(self, api_url, logger, token):
        """Store the three settings unchanged on the instance."""
        self.api_url, self.logger, self.token = api_url, logger, token

    def get_app_config(self):
        """Return this configuration object itself."""
        return self

    def get_api_url(self):
        """Return the API URL given at construction time."""
        return self.api_url

    def get_logger(self):
        """Return the logger given at construction time."""
        return self.logger

    def get_token(self):
        """Return the API token given at construction time."""
        return self.token

class DatasetProcessor:
    """Collect the DOIs of the datasets found under a dataverse and its sub-dataverses.

    ``list_datasets`` accumulates identifiers of the form
    ``doi:<authority>/<identifier>``; ``list_dataverse_children`` is the FIFO
    queue of sub-dataverse ids that still have to be visited.
    """

    def __init__(self, config, identifier):
        """Remember the configuration and the identifier of the root dataverse."""
        self.config = config
        self.list_datasets = []
        self.list_dataverse_children = []
        self.identifier = identifier

    def update_list_dataset(self, dataset_id):
        """Append a dataset identifier to the result list."""
        self.list_datasets.append(dataset_id)

    def update_list_dataverse_children(self, dataseverse_id):
        """Queue a sub-dataverse id for a later visit."""
        self.list_dataverse_children.append(dataseverse_id)

    def remove_id_list_dataverse_children(self, dataseverse_id):
        """Drop the first occurrence of a sub-dataverse id from the queue."""
        self.list_dataverse_children.remove(dataseverse_id)

    def get_list_datasets(self):
        """Return the list of dataset identifiers collected so far."""
        return self.list_datasets

    def get_list_dataverse_children(self):
        """Return the queue of sub-dataverse ids still to be visited."""
        return self.list_dataverse_children

    def count(self):
        """Return how many dataset identifiers have been collected."""
        return len(self.list_datasets)

    def _register_contents(self, contents):
        """Sort the entries of one ``contents`` listing into datasets and sub-dataverses."""
        for item in contents:
            item_type = item.get('type')
            if item_type == 'dataverse':
                self.update_list_dataverse_children(item['id'])
            elif item_type == 'dataset' and item.get('protocol') == 'doi':
                # Only DOI-identified datasets are kept.
                self.update_list_dataset(
                    item['protocol'] + ':' + item['authority'] + '/' + item['identifier']
                )

    def _visit(self, conn, base_url, identifier):
        """Fetch the contents of one dataverse and register what it holds."""
        url_api = f"{base_url}/api/dataverses/{identifier}/contents"
        object_json = conn.call_api(url_api, "GET")

        if object_json:
            self.config.get_logger().info("Reading the API values")
            self._register_contents(object_json.get("data") or [])
        else:
            self.config.get_logger().error("Call API ERROR")

    def create_list_datasets(self, identifier):
        """Walk the dataverse tree starting at ``identifier`` and fill ``list_datasets``.

        The walk is iterative: the original one-recursion-per-dataverse scheme
        could exceed Python's recursion limit on instances with many
        sub-dataverses. Sub-dataverses are visited in the order they were
        discovered.

        Args:
            identifier: Alias or id of the dataverse to start from.

        Returns:
            None. Results are available through ``get_list_datasets()``.
        """
        conn = UtilsConnection(self.config)
        # The configured URL may end with "/"; strip it to avoid "//api/...".
        base_url = self.config.get_api_url().rstrip('/')
        pending = self.list_dataverse_children

        self._visit(conn, base_url, identifier)
        # A start point other than the root may itself be sitting in the queue.
        if identifier != self.identifier and identifier in pending:
            self.remove_id_list_dataverse_children(identifier)

        while pending:
            child = pending[0]
            self._visit(conn, base_url, child)
            self.remove_id_list_dataverse_children(child)

def filemetadata(base_url, token, doi, filemetadata_keys, filemetadata_values,
                 skip_dois=('doi:10.34810/data1872',)):
    """
    Function to extract metadata for files associated with a dataset identified by its DOI.

    For every file in the dataset's latest version, the keys and the values of
    its ``dataFile`` record are appended (as two parallel lists) to the
    caller-supplied accumulators.

    Parameters:
    - base_url: str. Base URL of the Dataverse instance.
    - token: str. API token for authentication.
    - doi: str. DOI of the dataset.
    - filemetadata_keys: list. List to store file metadata keys.
    - filemetadata_values: list. List to store file metadata values.
    - skip_dois: collection of str. DOIs that are not queried at all. The
      default keeps the single DOI the original code excluded
      (NOTE(review): the reason for that exclusion is not visible here).

    Returns:
    - None. Updates the provided lists with extracted file metadata.
    """
    # Skipped datasets need neither the API client nor a network round-trip.
    if doi in skip_dois:
        return

    from pyDataverse.api import NativeApi

    api = NativeApi(base_url, token)

    try:
        # Decode the response once instead of re-parsing it for every file.
        files = api.get_dataset(doi).json()['data']['latestVersion']['files']
        for file_entry in files:
            data_file = file_entry['dataFile']
            filemetadata_keys.append(list(data_file.keys()))
            filemetadata_values.append(list(data_file.values()))
    except KeyError:
        print('There was an error reading metadata for the files of the dataset: ' + doi)

# Configuration and execution
import logging
import ipywidgets as widgets
from IPython.display import display

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Ask the user for the token with a hidden prompt: the token is a credential,
# and a plain input() would leave it visible in the cell output.
# Surrounding whitespace from copy/paste is removed.
import getpass
token = getpass.getpass("Introduïu el vostre token i premeu enter: ").strip()

# Use the token in your configuration
config = Config(api_url="https://dataverse.csuc.cat/", logger=logger, token=token)

# List of institutions offered in the selector. The selected entries end up in
# `opcions` and are later passed as the dataverse identifier when the datasets
# are listed.
institucions = [
    'UB', 'UAB', 'UPC', 'UPF', 'UdG', 'UdL', 'URV', 'UOC', 'UVIC-UCC',
    'URL', 'UIC', 'UIB', 'Agrotecnio', 'CED', 'CRAG', 'CREAF', 'CRM', 'CTFC',
    'i2CAT', 'I3PT', 'IBEC', 'IBEI', 'ICAC-CERCA', 'ICFO-CERCA', 'ICN2',
    'ICRA-CERCA', 'IDIBAPS', 'IDIBELL', 'IDIBGI-CERCA', 'IFAE', 'IJC','IRSantPau','CVC','IRSJD',
    'IPHES-CERCA', 'IRBBarcelona-CERCA', 'IRB', 'IRSICAIXA', 'IRTA',
    'ISGLOBAL', 'VHIR'
]
# Prepend a pseudo-entry that stands for "all institutions"; the selection
# callback expands it to the full list.
options_institucions = ['Totes les institucions'] + institucions

# Heading shown above the selector (user-facing text is in Catalan).
instruction_text_institucions = widgets.HTML(
    value="<b>Trieu una o més institucions:</b>"
)

# Multi-select list of institutions; it starts with nothing selected.
institucions_widget = widgets.SelectMultiple(
    options=options_institucions,
    value=[],
    description='Institucions:',
    disabled=False
)
# Accumulated set of chosen institutions, shared with the other cells.
opcions = set()


def save_selection_institucions(change):
    """Fold a widget selection change into ``opcions`` and print the result.

    Args:
        change: Change notification whose ``'new'`` entry holds the currently
            selected option labels.

    Choosing 'Totes les institucions' replaces the set with every known
    institution; any other selection is added to what was already chosen.
    """
    global opcions
    picked = set(change['new'])

    wants_everything = 'Totes les institucions' in picked
    if wants_everything:
        opcions = set(institucions)
    else:
        opcions |= picked

    print(f"Institucions: {list(opcions)}")

# Function to clear the selection and reset the institutions widget
def restart_selection_institucions(button):
    """Click callback that empties both the widget selection and ``opcions``.

    Args:
        button: The widget that fired the click (unused).
    """
    global opcions
    # NOTE(review): the widget is reset before the set is cleared; presumably
    # this assignment fires the 'value' observer registered on the widget, so
    # swapping the two statements could change what gets printed — confirm
    # before reordering.
    institucions_widget.value = []  # Clear the selections in the widget
    opcions.clear()  # Clear the global opcions set
    print("La selecció de la institució s'ha restablert.")

# Call save_selection_institucions whenever the widget's 'value' trait changes.
institucions_widget.observe(save_selection_institucions, names='value')

# Button that lets the user discard the accumulated selection and start over.
restart_button_institucions = widgets.Button(description="Reiniciar la selecció d'institucions")
restart_button_institucions.on_click(restart_selection_institucions)

# Render the heading, the selector and the reset button in the cell output.

display(instruction_text_institucions, institucions_widget, restart_button_institucions)

In [None]:
# @title Un cop escollides les institucions, cliqueu el botó &#x25B6; per generar el taulell de mides dels datasets

from collections import defaultdict
import pandas as pd

import pandas as pd

def _sum_file_sizes(keys_per_file, values_per_file):
    """Add up original and archival sizes from parallel per-file key/value lists."""
    total_original = 0
    total_archival = 0
    for keys, values in zip(keys_per_file, values_per_file):
        # Pair each file's values with that file's own keys: the position of
        # 'filesize' is not guaranteed to be the same for every file.
        record = dict(zip(keys, values))

        archival = record.get('filesize')
        if not isinstance(archival, int):
            archival = 0

        # Prefer 'originalFileSize' when present; otherwise fall back to
        # 'filesize'.
        original = record.get('originalFileSize')
        if not isinstance(original, int):
            original = archival

        total_original += original
        total_archival += archival
    return total_original, total_archival


def get_dataset_sizes(base_url, token,doi):
    """Return the total size in bytes of the files of one dataset.

    Args:
        base_url: Base URL of the Dataverse instance.
        token: API token used for authentication.
        doi: Persistent identifier of the dataset.

    Returns:
        A tuple ``(total_original_size_bytes, total_archival_size_bytes)``.
        The first sums 'originalFileSize' where available (falling back to
        'filesize'); the second sums 'filesize' only. ``(0, 0)`` is returned
        when no file metadata could be read.
    """
    # filemetadata() performs the only API request that is needed; the extra
    # get_dataset() call the original made here was never used.
    filemetadata_keys = []
    filemetadata_values = []
    filemetadata(base_url, token, doi, filemetadata_keys, filemetadata_values)

    if not filemetadata_keys or not filemetadata_values:
        return 0, 0  # No metadata available for this dataset

    return _sum_file_sizes(filemetadata_keys, filemetadata_values)

def format_size(size_in_bytes):
    """Scale a byte count to the largest fitting unit.

    Args:
        size_in_bytes: Size to convert; anything ``float()`` accepts.

    Returns:
        A tuple ``(text, unit)`` where ``text`` is the scaled value with two
        decimals and a comma as decimal separator, and ``unit`` is one of
        "Bytes", "KB", "MB", "GB" or "TB".
    """
    remaining = float(size_in_bytes)
    *smaller_units, largest_unit = ("Bytes", "KB", "MB", "GB", "TB")

    chosen_unit = largest_unit
    for unit in smaller_units:
        # Stop at the first unit in which the value drops below 1024.
        if not remaining >= 1024:
            chosen_unit = unit
            break
        remaining /= 1024

    text = f"{remaining:.2f}".replace('.', ',')  # Replaces dot with comma
    return text, chosen_unit

# Base URL of the Dataverse instance queried for every dataset.
base_url = "https://dataverse.csuc.cat/"

# Column headers of the resulting table, in the same order as each row below.
column_names = [
    "DOI",
    "Institució",
    "Original Size (Bytes)",
    "Archival Size (Bytes)",
    "Formatted Original Size",
    "Unit (Original Size)",
    "Formatted Archival Size",
    "Unit (Archival Size)",
]

# One row per dataset of every selected institution.
data = []
for element in opcions:
    sigles = element
    processor = DatasetProcessor(config, element)
    processor.create_list_datasets(element)
    for i in processor.get_list_datasets():
        original_size, archival_size = get_dataset_sizes(base_url, token, i)
        formatted_original, unit_original = format_size(original_size)
        formatted_archival, unit_archival = format_size(archival_size)
        # format_size returns comma-decimal text; convert it back to a float
        # for the numeric "Formatted" columns.
        row = [
            i,
            sigles,
            original_size,
            archival_size,
            float(formatted_original.replace(',', '.')),
            unit_original,
            float(formatted_archival.replace(',', '.')),
            unit_archival,
        ]
        data.append(row)

df = pd.DataFrame(data, columns=column_names)
df



In [None]:
# @title Clicar el botó &#x25B6; per guardar les dades en un fitxer excel.
from google.colab import files
# Write the `df` table to an Excel workbook in the current working directory,
# without the DataFrame index column.
excel_filename = 'mida_datasets.xlsx'
df.to_excel(excel_filename, index=False)

# Hand the generated file to google.colab's download helper.
files.download(excel_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>