# Script to verify if the Readme file exists
### OBSERVATION:
This script is available in the following GitHub repository: <a href='https://github.com/CSUC/RDR-scripts/tree/main/related_publication_check' target='_blank'>RDR-scripts</a>.<br> If you have questions or doubts about the code, please contact rdr-contacte@csuc.cat.
### SCRIPT OBJECTIVE:
The main objective of this script is to verify whether each dataset of an institution contains a README file.

In [None]:
# @title First click the &#x25B6; button to execute the script. </p> Then, enter the token (If you don't have your API token, you can get it from the following link <a href='https://dataverse.csuc.cat/dataverseuser.xhtml?selectTab=apiTokenTab' target='_blank'>Get API Token</a>).</p> After that, select one or more institutions from the list (choose <strong>All institutions</strong> to check every institution).</p> Finally, run the next cell to generate and download the Readme verification file.
import os
import subprocess
import sys

# Function to install required packages
def install_packages():
    """
    Install or update the Python packages this notebook depends on.

    Runs ``pip`` through the current interpreter (``sys.executable -m pip``)
    twice: first to upgrade pip itself, then to install ``pyDataverse``.
    Both calls use ``-q`` and raise ``subprocess.CalledProcessError`` when
    pip exits with a non-zero status.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]

    # Order matters: pip is upgraded before the library is installed.
    for arguments in (["--upgrade", "pip", "-q"], ["pyDataverse", "-q"]):
        subprocess.check_call(pip_install + arguments)

    print("Libraries have been downloaded or updated.")

# Install libraries if they are not installed already
try:
    import pyDataverse
except ImportError:
    print("Installing libraries...")
    install_packages()

# Detect whether the notebook runs in Google Colab. The Colab-only `files`
# helper is imported inside the same guard so that the notebook can also be
# executed in a plain Jupyter environment, where `google.colab` does not exist.
try:
    import google.colab
    from google.colab import files
    IN_COLAB = True
except ImportError:
    files = None  # no Colab download helper available outside Colab
    IN_COLAB = False

import ipywidgets as widgets
from IPython.display import display, FileLink
from IPython.display import display, HTML

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from pyDataverse.api import NativeApi, DataAccessApi, MetricsApi
from pyDataverse.models import Dataverse
import pandas as pd
import requests
import logging
from datetime import datetime
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
class UtilsConnection:
    """Thin HTTP helper that sends authenticated requests to the Dataverse API."""

    def __init__(self, config):
        # Object exposing get_token(); the token is sent with every request.
        self.config = config

    def call_api(self, url, method, data=None):
        """
        Send an HTTP request and return the decoded JSON body.

        Args:
            url (str): Full URL of the endpoint to call.
            method (str): HTTP verb (for example "GET"). A falsy value falls
                back to "GET".
            data: Optional request body forwarded to ``requests.request``.

        Returns:
            The response body parsed as JSON.

        Raises:
            requests.HTTPError: when the server answers with an error status
                (via ``raise_for_status``).
        """
        # Use the configuration held by this instance, not a module-level
        # global that may not exist (or may belong to another session).
        headers = {'X-Dataverse-key': self.config.get_token()}

        response = requests.request(method or "GET", url, headers=headers, data=data)
        response.raise_for_status()
        return response.json()

class Config:
    """Container for the settings shared by the API helpers in this notebook."""

    def __init__(self, api_url, logger, token):
        # Stored as plain attributes; the getters below simply return them.
        self.api_url, self.logger, self.token = api_url, logger, token

    def get_token(self):
        """Return the API token given at construction time."""
        return self.token

    def get_logger(self):
        """Return the logger given at construction time."""
        return self.logger

    def get_api_url(self):
        """Return the API base URL given at construction time."""
        return self.api_url

    def get_app_config(self):
        """Return this configuration object itself."""
        return self

class DatasetProcessor:
    """
    Collect the DOIs of every dataset found under a dataverse, including the
    datasets of its nested sub-dataverses.

    ``list_datasets`` accumulates identifiers of the form
    ``doi:<authority>/<identifier>``; ``list_dataverse_children`` is the queue
    of sub-dataverse ids that still have to be visited.
    """

    def __init__(self, config, identifier):
        self.config = config
        self.list_datasets = []
        self.list_dataverse_children = []
        self.identifier = identifier

    def update_list_dataset(self, dataset_id):
        """Record one dataset identifier."""
        self.list_datasets.append(dataset_id)

    def update_list_dataverse_children(self, dataseverse_id):
        """Queue one sub-dataverse id for a later visit."""
        self.list_dataverse_children.append(dataseverse_id)

    def remove_id_list_dataverse_children(self, dataseverse_id):
        """Remove the first occurrence of a sub-dataverse id from the queue."""
        self.list_dataverse_children.remove(dataseverse_id)

    def get_list_datasets(self):
        """Return the list of collected dataset identifiers."""
        return self.list_datasets

    def get_list_dataverse_children(self):
        """Return the queue of sub-dataverses still pending."""
        return self.list_dataverse_children

    def count(self):
        """Return how many datasets have been collected so far."""
        return len(self.list_datasets)

    def _fetch_contents(self, identifier):
        """Return the JSON answer of the ``contents`` endpoint for one dataverse."""
        # Strip a trailing slash so the URL does not contain '//api'.
        base = self.config.get_api_url().rstrip('/')
        url_api = f"{base}/api/dataverses/{identifier}/contents"
        return UtilsConnection(self.config).call_api(url_api, "GET")

    def _collect_contents(self, identifier):
        """Read one dataverse and record its datasets and child dataverses."""
        object_json = self._fetch_contents(identifier)
        if not object_json:
            self.config.get_logger().error("Call API ERROR")
            return

        self.config.get_logger().info("Reading the API values")
        for value in object_json.get("data") or []:
            kind = value.get('type')
            if kind == 'dataverse':
                self.update_list_dataverse_children(value['id'])
            # .get(): an entry without a 'protocol' key is skipped, not fatal.
            elif kind == 'dataset' and value.get('protocol') == 'doi':
                self.update_list_dataset(
                    value['protocol'] + ':' + value['authority'] + '/' + value['identifier']
                )

    def create_list_datasets(self, identifier):
        """
        Walk the dataverse ``identifier`` and all of its descendants.

        The traversal is iterative (first-in, first-out over
        ``list_dataverse_children``) so that institutions with many nested
        sub-dataverses cannot exhaust Python's recursion limit. When it
        finishes, the pending queue is empty and ``list_datasets`` holds every
        DOI that was found.

        Args:
            identifier: Alias or id of the dataverse to start from.
        """
        current = identifier
        while True:
            self._collect_contents(current)

            # A visited child leaves the queue; the starting dataverse was
            # never queued, so there is nothing to remove for it.
            if current in self.list_dataverse_children:
                self.remove_id_list_dataverse_children(current)

            if not self.list_dataverse_children:
                break
            current = self.list_dataverse_children[0]
def export_metadata(base_url, token, doi, data_keys, data_values, stateDataset):
    """
    Append the publication date of a dataset to the given lists, if it has one.

    The dataset is fetched through pyDataverse's ``NativeApi``. When the
    ``latestVersion`` object of the response contains ``publicationDate``,
    the key name is appended to ``data_keys`` and its value to
    ``data_values``; otherwise both lists are left untouched.

    Args:
        base_url (str): Base URL of the Dataverse repository.
        token (str): API token for authentication.
        doi (str): DOI of the dataset.
        data_keys (list): Output list receiving the metadata key names.
        data_values (list): Output list receiving the metadata values.
        stateDataset (list): Accepted for interface compatibility; not used.

    Returns:
        None
    """
    from pyDataverse.api import NativeApi
    from requests.exceptions import InvalidSchema

    api = NativeApi(base_url, token)  # Function to access the API
    try:
        dataset = api.get_dataset(doi)  # Retrieve dataset metadata
        dataset_data = dataset.json()['data']['latestVersion']  # Extract once
    # A tuple is required to catch both types: `except KeyError or InvalidSchema`
    # evaluates the `or` first and therefore only ever caught KeyError.
    except (KeyError, InvalidSchema):
        # Best effort: a dataset whose metadata cannot be read contributes
        # nothing to the output lists.
        return

    if "publicationDate" in dataset_data:
        data_keys.append("publicationDate")
        data_values.append(dataset_data["publicationDate"])

def filemetadata(base_url, token, doi, filemetadata_keys, filemetadata_values,
                 skip_dois=('doi:10.34810/data1872',)):
    """
    Extract the metadata of every file of the dataset identified by ``doi``.

    For each file of the dataset's latest version, the keys of its
    ``dataFile`` object are appended (as one list) to ``filemetadata_keys``
    and the matching values (as one list) to ``filemetadata_values``.

    Parameters:
    - base_url: str. Base URL of the Dataverse instance.
    - token: str. API token for authentication.
    - doi: str. DOI of the dataset.
    - filemetadata_keys: list. List to store file metadata keys.
    - filemetadata_values: list. List to store file metadata values.
    - skip_dois: collection of str. DOIs that are skipped without contacting
      the API. The default keeps the single DOI that was previously
      hard-coded (NOTE(review): the reason for excluding it is not
      documented - confirm it is still needed).

    Returns:
    - None. Updates the provided lists with extracted file metadata.
    """
    # Checked first so that a skipped dataset costs no API work at all.
    if doi in skip_dois:
        return

    from pyDataverse.api import NativeApi

    # Instantiate API object for accessing Dataverse
    api = NativeApi(base_url, token)

    try:
        # Parse the response once instead of re-parsing it for every file.
        dataset_files = api.get_dataset(doi).json()['data']['latestVersion']['files']
        for file_entry in dataset_files:
            data_file = file_entry['dataFile']
            filemetadata_keys.append(list(data_file.keys()))
            filemetadata_values.append(list(data_file.values()))
    except KeyError:
        print('There was an error reading metadata for the files of the dataset: ' + doi)

# Configuration and execution
import logging
import ipywidgets as widgets
from IPython.display import display

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Ask the user for the token. getpass keeps the secret out of the notebook
# output, and strip() removes whitespace that often comes with a pasted token.
from getpass import getpass
token = getpass("Please enter your API token: ").strip()

# The base URL is always fixed, no need to ask the user
base_url = "https://dataverse.csuc.cat/"

# Use the token in your configuration (single source of truth for the URL)
config = Config(api_url=base_url, logger=logger, token=token)

# List of institutions (dataverse aliases). Every alias appears exactly once:
# the former duplicates of 'CVC' and 'IRSJD' were shown twice in the selector.
institucions = [
    'UB', 'UAB', 'UPC', 'UPF', 'UdG', 'UdL', 'URV', 'UOC', 'UVIC-UCC',
    'URL', 'UIC', 'UIB', 'Agrotecnio', 'CED', 'CRAG', 'CREAF', 'CRM', 'CTFC', 'CVC',
    'i2CAT', 'I3PT', 'IBEC', 'IBEI', 'ICAC-CERCA', 'ICFO-CERCA', 'ICIQ', 'ICN2',
    'ICRA-CERCA', 'IDIBAPS', 'IDIBELL', 'IDIBGI-CERCA', 'IFAE', 'IJC', 'IRSantPau', 'IRSJD',
    'IPHES-CERCA', 'IRBBarcelona-CERCA', 'IRB', 'IRSICAIXA', 'IRTA',
    'ISGLOBAL', 'VHIR',
]

# Add an option for selecting all
options_institucions = ['All institutions'] + institucions

# Instruction label shown above the selector
instruction_text_institucions = widgets.HTML(value="<b>Trieu una o més institucions:</b>")

# Multi-selection list offering 'All institutions' plus every institution;
# nothing is selected initially.
institucions_widget = widgets.SelectMultiple(
    description='Institutions:',
    options=options_institucions,
    value=[],
    disabled=False,
)
# Accumulated selection of institutions, shared with the data-generation cell
opcions = set()
def save_selection_institucions(change):
    """
    Widget observer: merge the newly selected institutions into `opcions`.

    `change['new']` holds the labels currently selected in the widget.
    Selecting 'All institutions' replaces the accumulated set with the full
    list; any other selection is added to what was already chosen. The
    resulting set is printed after every change.
    """
    global opcions
    chosen = set(change['new'])

    if 'All institutions' not in chosen:
        # Selections accumulate across clicks (in place, same set object).
        opcions.update(chosen)
    else:
        opcions = set(institucions)

    print(f"Institutions: {list(opcions)}")

# Function to clear the selection and reset the institutions widget
def restart_selection_institucions(button):
    """
    Button callback: forget every institution selected so far.

    The accumulated set is emptied BEFORE the widget value is reset, because
    resetting the widget triggers its change observer; clearing first means
    the observer no longer prints the stale selection during a reset.

    Args:
        button: The button instance passed by ipywidgets (unused).
    """
    opcions.clear()  # Clear the shared set in place
    institucions_widget.value = []  # Clear the selections in the widget
    print("La selecció de la institució s'ha restablert.")

# Button that resets the accumulated selection of institutions
restart_button_institucions = widgets.Button(
    description="Reiniciar la selecció d'institucions",
)
restart_button_institucions.on_click(restart_selection_institucions)

# Record every change of the selector's value
institucions_widget.observe(save_selection_institucions, names='value')

# Show the instructions, the selector and the reset button, in that order
display(
    instruction_text_institucions,
    institucions_widget,
    restart_button_institucions,
)

In [None]:
# @title Once the institutions are selected, click the &#x25B6; button to generate the data.</p> Finally, click the <strong>Download file</strong> button.
from collections import defaultdict
import pandas as pd
# Accumulators filled while walking the selected institutions. All of them
# get one entry per dataset, except the two file-metadata lists, which get
# one entry per file.
filemetadata_keys, filemetadata_values = [], []
list_doi, instancia = [], []
name_readme_file, validation_readme_file = [], []
published = []

for element in opcions:
    processor = DatasetProcessor(config, element)
    processor.create_list_datasets(element)
    sigles = element

    for i in processor.get_list_datasets():
        # A dataset counts as published when export_metadata returned a value.
        data_keys, data_values, stateDataset = [], [], []
        export_metadata(config.get_api_url(), config.get_token(), i, data_keys, data_values, stateDataset)
        published.append('Published' if len(data_values) != 0 else 'Draft')

        # File metadata of this dataset only; also merged into the global lists.
        filemetadata_keys_aux, filemetadata_values_aux = [], []
        filemetadata(base_url, token, i, filemetadata_keys_aux, filemetadata_values_aux)
        filemetadata_keys += filemetadata_keys_aux
        filemetadata_values += filemetadata_values_aux

        instancia.append(sigles)
        list_doi.append(i)

        # Names of the files whose 'filename' contains "readme" (any case).
        name_readme_file_aux = [
            file_values[file_keys.index('filename')]
            for file_keys, file_values in zip(filemetadata_keys_aux, filemetadata_values_aux)
            if 'filename' in file_keys
            and 'readme' in file_values[file_keys.index('filename')].lower()
        ]
        name_readme_file.append(name_readme_file_aux)
        validation_readme_file.append('Yes' if len(name_readme_file_aux) != 0 else 'No')
# Create the data dictionary (one row per dataset)
data = {
    'DOI': list_doi,
    'Published': published,
    'Institution': instancia,
    'Is there Readme?': validation_readme_file,
    'Readme file name': name_readme_file
}
# Create the DataFrame
df = pd.DataFrame(data)

# Numeric suffix of the DOI ('...dataNNN'), used only as the sort key.
# to_numeric(errors='coerce') turns a DOI without that suffix into NaN
# instead of aborting the whole report.
doi_number = pd.to_numeric(
    df['DOI'].str.extract(r'data(\d+)', expand=False), errors='coerce'
)

# Sort by that number; rows without one go last. The helper column is dropped.
df = (
    df.assign(DOI_Number=doi_number)
      .sort_values(by='DOI_Number', na_position='last')
      .drop(columns=['DOI_Number'])
)

# Turn 'doi:<authority>/<identifier>' into a resolvable URL, keeping the
# dataset's real authority and identifier instead of rebuilding them.
df['DOI'] = 'https://doi.org/' + df['DOI'].str.replace(r'^doi:', '', regex=True)

# Join the README names with ', ' (same output as before for ordinary names,
# but names containing quotes or brackets are no longer altered).
df['Readme file name'] = df['Readme file name'].map(', '.join)
# Save the DataFrame to an Excel file
excel_filename = 'datasets_sizes.xlsx'
df.to_excel(excel_filename, index=False)

# Display the DataFrame
display(df)

if IN_COLAB:
    # Running in Google Colab.
    # The Colab-only helper is imported here, inside the branch, so that this
    # cell also runs in a plain Jupyter environment.
    from google.colab import files

    # Provide a download button
    download_button = widgets.Button(description="Download file")
    display(download_button)

    # Function to be executed when the download button is clicked
    def on_download_button_click(b):
        # Send the generated Excel file to the user's browser
        files.download(excel_filename)

    # Event handler for the download button
    download_button.on_click(on_download_button_click)

else:
    # Running in Jupyter Notebook
    # Provide a download link
    download_link = FileLink(excel_filename, result_html_prefix="Click to download the file: ")
    display(download_link)