# Script per extreure mètriques dels datasets d'una instància

OBSERVACIÓ: Si teniu dubtes sobre el codi, contacteu amb rdr-contacte@csuc.cat

## OBJECTIU DE L'SCRIPT

L'objectiu principal d'aquest script és extreure les mètriques dels datasets d'una instància.

In [None]:
# @title Instal·lar o actualitzar llibreries (Clicar botó d'executar &#x25B6; )
from google.colab import output
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Function to install required packages
def install_packages(b):
    clear_output(wait=True)
    !pip install --upgrade pip -q
    !pip install pyDataverse -q
    !pip install numpy -q
    !pip install numpy pandas -q
    !pip install openpyxl -q
    print("S'han descarregat o actualitzat les llibreries.")

# Show the instruction text above the button.
install_prompt_html = "<p style='font-size:14px;'><b>Feu clic al botó següent per instal·lar les llibreries.</b></p>"
display(HTML(install_prompt_html))

# Build the button and register the installer as its click handler.
install_button = widgets.Button(description='Instal·lar llibreries')
install_button.on_click(install_packages)

# Render the button in the cell output.
display(install_button)

In [None]:
# @title Feu clic al botó Executar &#x25B6; , ompliu el token i l'alias de la institució

import subprocess
import smtplib
import os
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from pyDataverse.api import NativeApi, DataAccessApi, MetricsApi
from pyDataverse.models import Dataverse
import pandas as pd
import requests
import logging
from datetime import datetime

class UtilsConnection:
    """Small HTTP helper that sends token-authenticated requests."""

    def __init__(self, config):
        """Keep the configuration object that supplies the API token.

        Args:
            config: Object exposing ``get_token()``.
        """
        self.config = config

    def call_api(self, url, method, data=None, timeout=60):
        """Send a request to ``url`` and return the decoded JSON response.

        Args:
            url: Full URL of the endpoint to call.
            method: HTTP verb to use (for example ``"GET"``).
            data: Optional request body forwarded to ``requests``.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body parsed from JSON.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        # Read the token from this instance's config rather than from a
        # module-level ``config`` variable that may not exist.
        headers = {'X-Dataverse-key': self.config.get_token()}

        response = requests.request(
            method, url, headers=headers, data=data, timeout=timeout
        )
        response.raise_for_status()
        return response.json()

class Config:
    """Value holder for the API base URL, the logger and the API token."""

    def __init__(self, api_url, logger, token):
        """Store the three settings unchanged on the instance."""
        self.api_url, self.logger, self.token = api_url, logger, token

    def get_app_config(self):
        """Return this configuration object itself."""
        return self

    def get_api_url(self):
        """Return the API base URL given at construction."""
        url = self.api_url
        return url

    def get_logger(self):
        """Return the logger given at construction."""
        log = self.logger
        return log

    def get_token(self):
        """Return the API token given at construction."""
        api_token = self.token
        return api_token

class DatasetProcessor:
    """Collect the DOI identifiers of the datasets under a dataverse.

    Starting from a root dataverse, the ``/contents`` endpoint is queried for
    it and for every child dataverse it reports. Dataset entries whose
    protocol is ``doi`` are accumulated in ``list_datasets``; child
    dataverses still to be visited are kept in ``list_dataverse_children``.
    """

    def __init__(self, config, identifier):
        """Create an empty processor.

        Args:
            config: Object exposing ``get_api_url()``, ``get_logger()`` and
                whatever ``UtilsConnection`` needs.
            identifier: Identifier of the root dataverse. It is the only
                identifier that is never removed from the pending list.
        """
        self.config = config
        self.list_datasets = []
        self.list_dataverse_children = []
        self.identifier = identifier

    def update_list_dataset(self, dataset_id):
        """Append ``dataset_id`` to the collected datasets."""
        self.list_datasets.append(dataset_id)

    def update_list_dataverse_children(self, dataseverse_id):
        """Append a dataverse id to the list of dataverses still to visit."""
        self.list_dataverse_children.append(dataseverse_id)

    def remove_id_list_dataverse_children(self, dataseverse_id):
        """Remove the first occurrence of a dataverse id from the pending list.

        Raises:
            ValueError: If the id is not in the list.
        """
        self.list_dataverse_children.remove(dataseverse_id)

    def get_list_datasets(self):
        """Return the list of collected dataset identifiers."""
        return self.list_datasets

    def get_list_dataverse_children(self):
        """Return the list of dataverses still to visit."""
        return self.list_dataverse_children

    def count(self):
        """Return how many datasets have been collected."""
        return len(self.list_datasets)

    def create_list_datasets(self, identifier):
        """Walk the dataverse tree from ``identifier`` and collect dataset DOIs.

        The walk is iterative. The previous recursive form nested one call
        per visited dataverse and could exceed Python's recursion limit on
        installations with many dataverses. The visiting order is unchanged:
        the first pending dataverse is always processed next.

        Args:
            identifier: Dataverse to start from, normally the root
                identifier given to the constructor.

        Raises:
            ValueError: If ``identifier`` differs from the root identifier
                and is not in the pending list.
        """
        conn = UtilsConnection(self.config)
        logger = self.config.get_logger()
        current = identifier

        while True:
            url_api = f"{self.config.get_api_url()}/api/dataverses/{current}/contents"
            object_json = conn.call_api(url_api, "GET")

            if object_json:
                logger.info("Reading the API values")
                for value in object_json.get("data") or []:
                    kind = value.get('type')
                    if kind == 'dataverse':
                        self.update_list_dataverse_children(value['id'])
                    elif kind == 'dataset' and value.get('protocol') == 'doi':
                        self.update_list_dataset(
                            value['protocol'] + ':' + value['authority'] + '/' + value['identifier']
                        )
            else:
                logger.error("Call API ERROR")

            # The root was never queued, so only child dataverses are removed
            # from the pending list once they have been processed.
            if current != self.identifier:
                self.remove_id_list_dataverse_children(current)

            pending = self.get_list_dataverse_children()
            if not pending:
                break
            current = pending[0]

# Repository URL; for the PRE environment use https://dataverse.pre.csuc.cat
SERVER_URL = "https://dataverse.csuc.cat"

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Credentials are requested at run time. An API token must never be stored in
# the notebook itself; any token that was ever saved in it should be
# regenerated from the CORA.RDR interface.
token = input("Introduïu el vostre token i premeu enter: ").strip()
alias = input("Introduïu l'alias de la institució: ").strip()

# pyDataverse clients, built with the token the user just supplied.
api = NativeApi(SERVER_URL, token)
data_api = DataAccessApi(SERVER_URL, token)

# Reuse SERVER_URL (no trailing slash) so request URLs do not contain "//api".
config = Config(api_url=SERVER_URL, logger=logger, token=token)
processor = DatasetProcessor(config, alias)
processor.create_list_datasets(alias)

# Rows of the metrics table, one dict per dataset.
metrics_data = []
def fetch_metric(doi, metric):
    """Fetch one Make Data Count metric for a dataset.

    Args:
        doi: Persistent identifier of the dataset, sent as the
            ``persistentId`` query parameter.
        metric: Name of the metric endpoint, e.g. ``"viewsTotal"`` or
            ``"citations"``.

    Returns:
        For ``"citations"``, the number of entries in the response data.
        For other metrics, the value stored under ``metric`` when the
        response data is a dict. ``None`` when the request fails, the status
        is not 200/"OK", or the value is absent.
    """
    url = f"{SERVER_URL}/api/datasets/:persistentId/makeDataCount/{metric}"
    try:
        # ``params`` URL-encodes the DOI; the timeout keeps one unresponsive
        # request from blocking the whole run.
        response = requests.get(url, params={"persistentId": doi}, timeout=30)
        if response.status_code != 200:
            return None
        data = response.json()
        if data["status"] == "OK" and "data" in data:
            payload = data["data"]
            if metric == "citations":
                return len(payload)
            if isinstance(payload, dict):
                return payload.get(metric, None)
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best effort: report the problem and let the caller record a gap.
        print(f"An error occurred while fetching {metric} for {doi}: {e}")
    return None
# Output column name -> Make Data Count metric requested for it.
METRIC_COLUMNS = {
    "Total Views": "viewsTotal",
    "Unique Views": "viewsUnique",
    "Total Downloads": "downloadsTotal",
    "Unique Downloads": "downloadsUnique",
    "Citations": "citations",
}

for doi in processor.get_list_datasets():
    row = {"DOI": doi}
    for column, metric in METRIC_COLUMNS.items():
        row[column] = fetch_metric(doi, metric)
    metrics_data.append(row)

# Passing the columns explicitly keeps the table well-formed (and the code
# below working) even when no dataset was found.
df = pd.DataFrame(metrics_data, columns=["DOI", *METRIC_COLUMNS])

# Numeric suffix of identifiers shaped like 'data123', used only for sorting.
# DOIs that do not follow that shape get NaN and are placed at the end
# instead of aborting the run.
df['DOI_Number'] = pd.to_numeric(
    df['DOI'].str.extract(r'data(\d+)', expand=False), errors='coerce'
)

# Sort the DataFrame based on the DOI_Number column
df = df.sort_values(by='DOI_Number', na_position='last')

# Turn 'doi:<authority>/<identifier>' into 'https://doi.org/<authority>/<identifier>',
# keeping each dataset's own authority rather than assuming a fixed prefix.
df['DOI'] = 'https://doi.org/' + df['DOI'].str.replace(r'^doi:', '', regex=True)

# The helper column is not part of the report.
df = df.drop(columns=['DOI_Number'])

df

In [None]:
# @title Clicar el botó &#x25B6; per guardar les dades en un fitxer excel.
from google.colab import files
# Write the metrics DataFrame (`df`) to an Excel workbook in the working directory
excel_filename = 'dataverse_metrics.xlsx'
df.to_excel(excel_filename, index=False)
# Pass the generated file to Colab's download helper so the user can save it
files.download(excel_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>