<a href="https://colab.research.google.com/github/CSUC/RDR-scripts/blob/main/upload_files/upload_files_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script for Uploading Files Automatically

If you have any questions about the code, contact rdr-contacte@csuc.cat

## Script Objective
The main objective of this script is to automatically upload files to a dataset with their respective metadata placed in an Excel file.

## Script Observation

Place the script, the Excel file, and the files to be uploaded to the dataset in the same folder on your computer.

### In COLAB
Upload the Excel file and the files to be added to the dataset using the Upload files icon.

### For the script to work correctly, the Excel file must meet these requirements:

- The first row is the header and must contain the name of the variables in this order:
    - File Name
    - Description
    - File Path
    - Tag
- Each row corresponds to a file.
- The file name (File Name) is the only mandatory metadata.
    - It must be written correctly and include its extension.
- If a cell has no information, leave it blank.
- If a metadata value is a number, write it within quotes.
- To assign several tags to a file, write them in the Tag column separated by commas.


In [None]:
# @title Enter last digits of DOI, the token and the name of the excel file with the extension .xlsx and the repository URL. Click the execute button cell &#x25B6;
import os
import subprocess
import sys

# Function to install required packages
def install_packages():
    """
    Install or update the Python packages this notebook needs.

    Runs pip through the current interpreter (``sys.executable -m pip``):
    first pip itself is upgraded, then pyDataverse is installed. Both runs
    are quiet (``-q``). ``subprocess.check_call`` raises
    ``subprocess.CalledProcessError`` if a pip run exits with a non-zero
    status.
    """
    pip_install = [sys.executable, "-m", "pip", "install"]

    # Order matters: pip is upgraded before the library is installed.
    for arguments in (["--upgrade", "pip", "-q"], ["pyDataverse", "-q"]):
        subprocess.check_call(pip_install + arguments)

    print("Libraries have been downloaded or updated.")

# Install libraries if they are not installed already
try:
    import pyDataverse
except ImportError:
    print("Installing libraries...")
    install_packages()

# Detect Google Colab. The Colab-only `files` helper is imported inside the
# same guard so the notebook also runs in a local Jupyter session, where the
# google.colab package does not exist and an unguarded import would abort
# the cell with ImportError.
try:
    import google.colab
    from google.colab import files
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

import ipywidgets as widgets
from IPython.display import FileLink, HTML, display

import pandas as pd
from pyDataverse.api import NativeApi, DataAccessApi
from pyDataverse.models import Dataverse, Datafile
from pathlib import Path

# Provide input values
# The trailing "# @param" comments are Colab form-field markers: keep each one
# on the same line as its assignment.
# `identifier` holds the last digits of the DOI (see the cell title); it is
# appended to the fixed prefix below to build the full DOI.
identifier = ""  # @param {type:"string"}
token = ""  # @param {type:"string"}
excel_file_name = "" # @param {type:"string"}
base_url = 'https://dataverse.csuc.cat/'  # @param {type:"string"}
# Full persistent identifier of the target dataset.
doi='doi:10.34810/data'+identifier
# Initialize API
# Module-level API clients built from the form values above.
api = NativeApi(base_url, token)
data_api = DataAccessApi(base_url, token)

def _cell_text(row, column):
    """Return the cell at *column* as text, or None when it is empty or absent."""
    # pandas reads empty Excel cells as NaN; short rows simply lack the column.
    if column >= len(row) or pd.isna(row[column]):
        return None
    return str(row[column])


def upload_files(base_url, token, doi, excel_file_name):
    """
    Function to upload files to a dataset based on metadata provided in an Excel file.

    The first row of the sheet is the header. Every following row describes
    one file through its first four columns, read by position: File Name,
    Description, File Path, Tag. Empty cells are skipped, non-text cells are
    converted to text, and the Tag cell is split on commas (surrounding
    whitespace is removed from each tag). Nothing is uploaded unless every
    listed file exists; after that, each file is uploaded independently and
    its outcome is printed.

    Args:
    base_url (str): Base URL of the repository.
    token (str): API token for authentication.
    doi (str): DOI of the dataset.
    excel_file_name (str): Name of the Excel file with metadata.

    Returns:
    None
    """
    # Read metadata from Excel file. Only this read is guarded, so a problem
    # with a data file later on is never reported as a missing metadata file.
    try:
        rows = pd.read_excel(excel_file_name).to_numpy().tolist()
    except FileNotFoundError:
        print('Metadata file not found: ' + excel_file_name)
        return

    # Check every listed file before uploading anything (all-or-nothing).
    all_found = True
    for row_number, row in enumerate(rows, start=2):  # row 1 is the header
        file_name = _cell_text(row, 0)
        if file_name is None:
            print('File name missing in Excel row ' + str(row_number))
            all_found = False
        elif not Path(file_name).is_file():
            print('File not found: ' + file_name)
            all_found = False
    if not all_found:
        print('No files uploaded. Please modify the file names that are incorrect.')
        return

    # Build the client from the arguments so the function honours its own
    # parameters instead of depending on a module-level object.
    api = NativeApi(base_url, token)

    # Make sure the dataset is reachable with this token before sending files.
    # Exception is caught broadly on purpose: this is the notebook's top-level
    # boundary and the error is reported to the user, not swallowed.
    try:
        dataset_resp = api.get_dataset(doi)
    except Exception as exc:
        print('Incorrect token or DOI not found: ' + doi)
        print('Details: ' + str(exc))
        return
    if dataset_resp.status_code >= 300:
        print('Incorrect token or DOI not found: ' + doi)
        return

    for row in rows:
        file_name = _cell_text(row, 0)
        metadata = {'pid': doi, 'filename': file_name}

        description = _cell_text(row, 1)
        if description is not None:
            metadata['description'] = description

        directory_label = _cell_text(row, 2)
        if directory_label is not None:
            metadata['directoryLabel'] = directory_label

        tags = _cell_text(row, 3)
        if tags is not None:
            categories = [tag.strip() for tag in tags.split(',') if tag.strip()]
            if categories:
                metadata['categories'] = categories

        # A failure on one file is reported and does not stop the others.
        try:
            datafile = Datafile()
            datafile.set(metadata)
            resp = api.upload_datafile(doi, file_name, datafile.json())
        except Exception as exc:
            print('Upload failed: ' + file_name + ' (' + str(exc) + ')')
            continue

        # Only report success when the server actually accepted the file.
        if resp.status_code >= 300:
            print('Upload failed: ' + file_name + ' (HTTP ' + str(resp.status_code) + ')')
        else:
            print('File uploaded: ' + file_name)

# Upload files
# Runs the upload with the repository URL, token, DOI and Excel file name
# defined earlier in this cell.
upload_files(base_url, token, doi, excel_file_name)

In [None]:
# @title Run to get the size of the dataset. Click the execute button cell &#x25B6;
def filemetadata(base_url, token, doi, filemetadata_keys, filemetadata_values):
    """
    Function to extract metadata for files associated with a dataset identified by its DOI.

    For every file in the dataset's latest version, the keys and the values of
    its 'dataFile' record are appended, as two lists, to the output lists. The
    two output lists therefore stay aligned: entry i of each describes file i.

    Parameters:
    - base_url: str. Base URL of the Dataverse instance.
    - token: str. API token for authentication.
    - doi: str. DOI of the dataset.
    - filemetadata_keys: list. List to store file metadata keys.
    - filemetadata_values: list. List to store file metadata values.

    Returns:
    - None. Updates the provided lists with extracted file metadata.
    """
    from pyDataverse.api import NativeApi

    # Instantiate API object for accessing Dataverse
    api = NativeApi(base_url, token)

    try:
        # Retrieve dataset metadata
        dataset = api.get_dataset(doi)

        # Parse the response body once instead of on every loop iteration.
        file_entries = dataset.json()['data']['latestVersion']['files']

        # Iterate through files and extract metadata
        for file_entry in file_entries:
            data_file = file_entry['dataFile']
            filemetadata_keys.append(list(data_file.keys()))
            filemetadata_values.append(list(data_file.values()))
    except KeyError:
        # A response without the expected structure (for example an error
        # payload) ends up here; entries appended before the failure are kept.
        print('There was an error reading metadata for the files of the dataset: ' + doi)
# Get the file metadata: fetch the dataset record, then collect the metadata
# keys and values of every file into two lists that are filled in step.
dataset = api.get_dataset(doi)
filemetadata_keys, filemetadata_values = [], []
filemetadata(
    base_url,
    token,
    doi,
    filemetadata_keys,
    filemetadata_values,
)
def format_size(size_in_bytes):
    """
    Convert a size in bytes into a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, and is then rendered with two decimals followed by
    the unit name, e.g. ``1536`` -> ``"1.50 KB"``.
    """
    labels = ("Bytes", "KB", "MB", "GB", "TB")
    value = float(size_in_bytes)

    # Every unit except the last can be outgrown; TB is the catch-all.
    for label in labels[:-1]:
        if not value >= 1024:
            return f"{value:.2f} {label}"
        value /= 1024

    return f"{value:.2f} {labels[-1]}"

# Determine indexes dynamically for each entry
def get_index(key_list, key):
    """Return the position of the first occurrence of *key* in *key_list*, or None if absent."""
    try:
        return key_list.index(key)
    except ValueError:
        return None

# Get indices for 'filesize' and 'originalFileSize'
# filemetadata_keys is empty when the dataset has no files or its metadata
# could not be read, so guard the [0] / [-1] lookups instead of raising
# IndexError; both indices are None in that case.
filesize_index = get_index(filemetadata_keys[0], 'filesize') if filemetadata_keys else None
original_size_index = get_index(filemetadata_keys[-1], 'originalFileSize') if filemetadata_keys else None

def get_size(entry, key_list):
    """
    Return the size to count for one file.

    *entry* is the list of metadata values of a file and *key_list* the list
    of the matching keys. The 'originalFileSize' value is returned when that
    key is present and its value is an int; otherwise the 'filesize' value is
    returned.
    """
    original_pos = get_index(key_list, 'originalFileSize')
    if original_pos is None or not isinstance(entry[original_pos], int):
        return entry[get_index(key_list, 'filesize')]
    return entry[original_pos]

# Compute total sizes
# Original-format total: per file, the original size when available,
# otherwise the stored size (see get_size).
sizes = [
    get_size(entry, keys)
    for keys, entry in zip(filemetadata_keys, filemetadata_values)
]
total_original_size_bytes = sum(sizes)

# Archival-format total: look up 'filesize' in each file's own key list.
# Every file carries its own list of keys, so the position found for the
# first file cannot be assumed to hold for the others.
total_archival_size_bytes = sum(
    entry[keys.index('filesize')]
    for keys, entry in zip(filemetadata_keys, filemetadata_values)
)

# Format and print the results
print("Total original format dataset size:", format_size(total_original_size_bytes))
print("Total archival format dataset size:", format_size(total_archival_size_bytes))
