<a href="https://colab.research.google.com/github/CSUC/RDR-scripts/blob/main/metrics/metrics_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script to Extract Metrics from Datasets in a Dataverse Instance
### OBSERVATION:
This script is available in the following GitHub repository: <a href='https://github.com/CSUC/RDR-scripts/tree/main/metrics' target='_blank'>RDR-scripts</a>.<br>If you have any questions about the code, please contact rdr-contacte@csuc.cat.
### SCRIPT OBJECTIVE:
This script allows you to extract metrics from datasets in a Dataverse instance using the Dataverse API. The metrics include total views, unique views, total downloads, unique downloads, and citations.

In [None]:
# @title First click the &#x25B6; button to execute the script. </p> Then, enter the token (If you don't have your API token, you can get it from the following link <a href='https://dataverse.csuc.cat/dataverseuser.xhtml?selectTab=apiTokenTab' target='_blank'>Get API Token</a>).</p> Finally choose one or more institutions.
# Function to install required packages
import os
import subprocess
import sys
def install_packages():
    """
    Upgrade pip, then install pyDataverse, using the running interpreter.

    Both pip invocations are quiet (``-q``). ``subprocess.check_call``
    raises ``subprocess.CalledProcessError`` if either of them fails.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]

    # pip itself is upgraded first, the required library second
    for pip_args in (["--upgrade", "pip"], ["pyDataverse"]):
        subprocess.check_call(pip_install + pip_args + ["-q"])

    print("Libraries have been downloaded or updated.")

# Install libraries if they are not installed already
try:
    import pyDataverse
except ImportError:
    print("Installing libraries...")
    install_packages()

# Detect Google Colab. The Colab-only helper is imported inside the same
# guard so that the notebook also starts in a plain Jupyter environment,
# where the google.colab package does not exist.
try:
    import google.colab
    from google.colab import output
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
    output = None  # Colab-only helper; unavailable outside Colab

import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from pyDataverse.api import NativeApi, DataAccessApi, MetricsApi
from pyDataverse.models import Dataverse
import pandas as pd
import requests
import logging
from datetime import datetime

class UtilsConnection:
    """Small HTTP helper that sends token-authenticated requests to the API."""

    def __init__(self, config):
        # Configuration object; call_api() reads the API token from it
        # through config.get_token().
        self.config = config

    def call_api(self, url, method, data=None):
        """
        Send a request to ``url`` and return the decoded JSON response.

        Args:
            url: Full URL of the endpoint to call.
            method: HTTP verb to use (for example ``"GET"``). A falsy value
                falls back to ``"GET"``.
            data: Optional request body. Nothing is sent when it is None.

        Returns:
            The response body parsed as JSON.

        Raises:
            requests.HTTPError: raised by ``raise_for_status()`` when the
                server answers with an error status.
        """
        # Use the configuration held by this instance rather than a global
        # variable, so the class works wherever it is instantiated.
        headers = {'X-Dataverse-key': self.config.get_token()}

        response = requests.request(method or "GET", url, headers=headers, data=data)
        response.raise_for_status()
        return response.json()

class Config:
    """Holder for the runtime settings: API base URL, logger and API token."""

    def __init__(self, api_url, logger, token):
        self.api_url, self.logger, self.token = api_url, logger, token

    def get_token(self):
        """Return the API token."""
        return self.token

    def get_logger(self):
        """Return the logger."""
        return self.logger

    def get_api_url(self):
        """Return the API base URL."""
        return self.api_url

    def get_app_config(self):
        """Return this configuration object itself."""
        return self

class DatasetProcessor:
    """
    Collect the DOIs of the datasets found under a dataverse collection,
    including those of its nested sub-dataverses.
    """

    def __init__(self, config, identifier):
        self.config = config
        # Persistent identifiers ("doi:<authority>/<identifier>") found so far
        self.list_datasets = []
        # Ids of sub-dataverses that still have to be visited
        self.list_dataverse_children = []
        # Identifier of the root collection; it is never in the pending list
        self.identifier = identifier

    def update_list_dataset(self, dataset_id):
        """Append ``dataset_id`` to the list of collected datasets."""
        self.list_datasets.append(dataset_id)

    def update_list_dataverse_children(self, dataseverse_id):
        """Queue the sub-dataverse ``dataseverse_id`` for a later visit."""
        self.list_dataverse_children.append(dataseverse_id)

    def remove_id_list_dataverse_children(self, dataseverse_id):
        """Remove ``dataseverse_id`` from the pending list (ValueError if absent)."""
        self.list_dataverse_children.remove(dataseverse_id)

    def get_list_datasets(self):
        """Return the list of collected dataset identifiers."""
        return self.list_datasets

    def get_list_dataverse_children(self):
        """Return the list of sub-dataverse ids still pending."""
        return self.list_dataverse_children

    def count(self):
        """Return how many datasets have been collected."""
        return len(self.list_datasets)

    def create_list_datasets(self, identifier):
        """
        Fill the dataset list starting from the collection ``identifier``.

        The ``contents`` endpoint of each collection is read; entries of type
        ``dataverse`` are queued and entries of type ``dataset`` whose
        protocol is ``doi`` are recorded. Queued collections are then visited
        one after another until none is left.

        The traversal is a loop rather than a recursive call, so its depth
        does not grow with the number of collections visited.

        Args:
            identifier: Alias or id of the collection to start from.

        Raises:
            Whatever ``UtilsConnection.call_api`` raises for a failed request.
        """
        conn = UtilsConnection(self.config)
        logger = self.config.get_logger()
        # Avoid "//api/..." when the configured base URL ends with a slash
        base_url = self.config.get_api_url().rstrip('/')

        current = identifier
        while True:
            url_api = f"{base_url}/api/dataverses/{current}/contents"
            object_json = conn.call_api(url_api, "GET")

            if object_json:
                logger.info("Reading the API values")
                for value in object_json.get("data", {}):
                    entry_type = value.get('type')
                    if entry_type == 'dataverse':
                        self.update_list_dataverse_children(value['id'])
                    elif entry_type == 'dataset' and value.get('protocol') == 'doi':
                        self.update_list_dataset(
                            value['protocol'] + ':' + value['authority'] + '/' + value['identifier']
                        )
            else:
                logger.error("Call API ERROR")

            # The root collection was never queued, so only visited children
            # are taken out of the pending list.
            if current != self.identifier:
                self.remove_id_list_dataverse_children(current)

            pending = self.get_list_dataverse_children()
            if not pending:
                break
            current = pending[0]

# Repository URL; for the PRE environment use https://dataverse.pre.csuc.cat
SERVER_URL = "https://dataverse.csuc.cat"
# Surrounding whitespace from a copy-paste would make the token invalid
token = input("Please enter your API token: ").strip()
api = NativeApi(SERVER_URL, token)
data_api = DataAccessApi(SERVER_URL, token)
# Configuration and execution
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Reuse SERVER_URL so that every request targets the same server and the
# API paths appended to it do not start with a double slash.
config = Config(api_url=SERVER_URL, logger=logger, token=token)
# List of institutions (each entry appears once; every string literal is
# followed by a comma so that adjacent literals are never concatenated)
institucions = [
    'UB', 'UAB', 'UPC', 'UPF', 'UdG', 'UdL', 'URV', 'UOC', 'UVIC-UCC',
    'URL', 'UIC', 'UIB', 'Agrotecnio', 'CED', 'CRAG', 'CREAF', 'CRM', 'CTFC', 'CVC',
    'i2CAT', 'I3PT', 'IBEC', 'IBEI', 'ICAC-CERCA', 'ICFO-CERCA', 'ICIQ', 'ICN2',
    'ICRA-CERCA', 'IDIBAPS', 'IDIBELL', 'IDIBGI-CERCA', 'IFAE', 'IJC', 'IRSantPau', 'IRSJD',
    'IPHES-CERCA', 'IRBBarcelona-CERCA', 'IRB', 'IRSICAIXA', 'IRTA',
    'ISGLOBAL', 'VHIR',
]
# Add an option for selecting all
options_institucions = ['All institutions'] + institucions

# Selection UI, part 1: a bold instruction line shown above the selector
instruction_text_institucions = widgets.HTML(value="<b>Choose one or more institutions:</b>")

# Selection UI, part 2: multi-select list offering 'All institutions' plus
# every individual institution; nothing is preselected
institucions_widget = widgets.SelectMultiple(
    description='Institutions:',
    options=options_institucions,
    value=[],
    disabled=False,
)
# Accumulated institution selection; filled by the observer below
opcions = set()
def save_selection_institucions(change):
    """
    Record the institutions picked in the widget and print the running total.

    ``change['new']`` holds the currently selected options. Picking
    'All institutions' replaces the accumulated set with the full list;
    any other selection is added to what was already accumulated.
    """
    global opcions
    picked = set(change['new'])

    if 'All institutions' not in picked:
        # Individual picks are merged into the existing selection
        opcions.update(picked)
    else:
        # 'All institutions' stands for the complete list
        opcions = set(institucions)

    print(f"Institutions: {list(opcions)}")

# Click handler that clears the widget selection and the accumulated set
def restart_selection_institucions(button):
    """
    Empty the institutions widget and the global ``opcions`` set.

    Args:
        button: The clicked button, as passed by ``on_click``; not used.
    """
    global opcions
    # Clear the widget first. NOTE(review): if this changes the value, the
    # observer registered on 'value' presumably runs here, before the set is
    # emptied -- keep this statement order.
    institucions_widget.value = []  # Clear the selections in the widget
    opcions.clear()  # Empty the accumulated selection in place
    print("The institution selection has been restored.")

# Button that lets the user start the institution selection over
restart_button_institucions = widgets.Button(description="Reset institution selection")
restart_button_institucions.on_click(restart_selection_institucions)

# Record every change of the widget's selected value
institucions_widget.observe(save_selection_institucions, names='value')

# Show the instruction text, the selector and the reset button, in that order
display(instruction_text_institucions, institucions_widget, restart_button_institucions)


In [None]:
# @title Once the institution is selected, click the &#x25B6; button to generate the data.</p> Finally, click the <strong>Download file</strong> button.
def fetch_metric(doi, metric, timeout=30):
    """
    Fetch one Make Data Count metric of a dataset.

    Args:
        doi: Persistent identifier of the dataset, sent as ``persistentId``.
        metric: Name of the metric endpoint, e.g. ``"viewsTotal"`` or
            ``"citations"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        For ``"citations"``, the number of items in the response data. For
        any other metric, the value stored under the metric name when the
        response data is a dict. ``None`` when the request fails, the status
        is not 200/"OK", or the value is missing.
    """
    url = f"{SERVER_URL}/api/datasets/:persistentId/makeDataCount/{metric}?persistentId={doi}"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None

        data = response.json()
        if data["status"] != "OK" or "data" not in data:
            return None

        payload = data["data"]
        if metric == "citations":
            return len(payload)
        if isinstance(payload, dict):
            return payload.get(metric, None)
    except Exception as e:
        # Best effort: one failing metric must not abort the whole report
        print(f"An error occurred while fetching {metric} for {doi}: {e}")
    return None
# Collect the metrics of every dataset of every selected institution.
# The accumulator is created once, before the loop, so that the rows of
# earlier institutions are kept and the name exists even when nothing
# was selected.
metrics_data = []
for element in opcions:
    processor = DatasetProcessor(config, element)
    processor.create_list_datasets(element)
    sigles = element
    for i in processor.get_list_datasets():
        metrics_data.append({
            "DOI": i,
            "Institution": sigles,
            "Total Views": fetch_metric(i, "viewsTotal"),
            "Unique Views": fetch_metric(i, "viewsUnique"),
            "Total Downloads": fetch_metric(i, "downloadsTotal"),
            "Unique Downloads": fetch_metric(i, "downloadsUnique"),
            "Citations": fetch_metric(i, "citations")
        })

# Create a DataFrame from the metrics data. Naming the columns explicitly
# keeps the table well-formed even when no dataset was found.
df = pd.DataFrame(
    metrics_data,
    columns=[
        "DOI", "Institution", "Total Views", "Unique Views",
        "Total Downloads", "Unique Downloads", "Citations",
    ],
)

# Extract the numeric part of the DOI to use as sort key. DOIs without a
# 'data<digits>' part get NaN instead of raising, and are sorted last.
df['DOI_Number'] = pd.to_numeric(
    df['DOI'].str.extract(r'data(\d+)', expand=False), errors='coerce'
)

# Sort the DataFrame based on the DOI_Number column
df = df.sort_values(by='DOI_Number')

# Format the DOI column as a resolvable URL ('https://doi.org/<authority>/<identifier>').
# Only the 'doi:' prefix is replaced, so the original authority and
# identifier (including any leading zeros) are preserved.
df['DOI'] = df['DOI'].str.replace(r'^doi:', 'https://doi.org/', regex=True)

# Drop the helper column used for sorting
df = df.drop(columns=['DOI_Number'])

# Save the DataFrame to an Excel file
excel_filename = 'datasets_sizes.xlsx'
df.to_excel(excel_filename, index=False)

# Display the DataFrame
display(df)

if IN_COLAB:
    # Running in Google Colab: the Colab-only module is imported here so
    # that this cell also runs in environments where it does not exist.
    from google.colab import files

    # Provide a download button
    download_button = widgets.Button(description="Download file")
    display(download_button)

    # Function to be executed when the download button is clicked
    def on_download_button_click(b):
        # Download the generated Excel file through the browser
        files.download(excel_filename)

    # Event handler for the download button
    download_button.on_click(on_download_button_click)

else:
    # Running in Jupyter Notebook: FileLink is imported here because the
    # file-level imports do not include it.
    from IPython.display import FileLink

    # Provide a download link
    download_link = FileLink(excel_filename, result_html_prefix="Click to download the file: ")
    display(download_link)