## Uploading Files via DataverseNL API

In [None]:
The following notebook is intended to walk you through uploading files to DataverseNL via its API, using the pyDataverse library:

https://pydataverse.readthedocs.io/

In [2]:
#!pip install -U pyDataverse

Connecting to Native API

In [8]:
import os

from pyDataverse.api import NativeApi

# Base URL of the Dataverse installation this notebook talks to.
BASE_URL = 'https://dataverse.nl'

# Never store a real API token in a notebook: read it from the environment.
# Set it before starting Jupyter, e.g. `export DATAVERSE_API_TOKEN=<your token>`.
# A missing variable raises KeyError here instead of failing later with an
# authorization error. Any token that was ever committed in this notebook
# should be regenerated in the Dataverse account settings.
API_TOKEN = os.environ['DATAVERSE_API_TOKEN']

# Authenticated client used for creating, publishing and destroying datasets.
api = NativeApi(BASE_URL, API_TOKEN)

Checking the connection

In [11]:
# Ask the server for its version; this doubles as a connectivity check.
resp = api.get_info_version()
# Print the HTTP status code together with the decoded JSON body.
print(resp.status_code, resp.json())

200 {'status': 'OK', 'data': {'version': '5.6', 'build': 'dans-develop-68e6bbd70'}}


### Create a Dataverse collection
(we skip that for now)

## Create a Dataset

In [13]:
# Create an empty Dataset object; its metadata is filled in by the next cells.
from pyDataverse.models import Dataset
ds = Dataset()

In [15]:
from pyDataverse.utils import read_file

Example from https://github.com/gdcc/pyDataverse

In [17]:
# JSON file holding the dataset metadata (looked up relative to the working directory).
ds_filename = "dataset.json"
# Read the file and import its content as the metadata of `ds`.
ds.from_json(read_file(ds_filename))

In [19]:
# Display the metadata currently held by the Dataset object.
ds.get()

{'citation_displayName': 'Citation Metadata',
 'title': 'Youth in Austria 2005',
 'author': [{'authorName': 'LastAuthor1, FirstAuthor1',
   'authorAffiliation': 'AuthorAffiliation1'}],
 'datasetContact': [{'datasetContactEmail': 'ContactEmail1@mailinator.com',
   'datasetContactName': 'LastContact1, FirstContact1'}],
 'dsDescription': [{'dsDescriptionValue': 'DescriptionText'}],
 'subject': ['Medicine, Health and Life Sciences']}

In [20]:
# Validate the metadata before sending it to the server.
ds.validate_json()

True

In [21]:
# Read the current title; this cell only reads the value and does not change it.
ds.get()["title"]

'Youth in Austria 2005'

In [22]:
# Overwrite the title entry of the metadata ...
ds.set(dict(title="Youth in the Netherlands 2005"))
# ... and read it back to confirm the change.
ds.get()["title"]

'Youth in the Netherlands 2005'

https://dataverse.nl/dataverse/IDS

In [23]:
# Create the dataset inside the collection "IDS", sending the metadata of `ds`
# as JSON. The response is kept in `resp` so the new PID can be read from it.
resp = api.create_dataset("IDS", ds.json())

Dataset with pid 'doi:10.34894/7RAGMX' created.


In [24]:
# Inspect the decoded JSON body of the create_dataset response.
resp.json()

{'status': 'OK', 'data': {'id': 201008, 'persistentId': 'doi:10.34894/7RAGMX'}}

In [25]:
# Take the persistent identifier (DOI) from the create_dataset response.
# It must not be overwritten with a hard-coded DOI: every run creates a new
# dataset with a new DOI, so a literal value would point at a stale dataset.
ds_pid = resp.json()["data"]["persistentId"]
# Display the value so the reader can see which dataset was created.
ds_pid

'doi:10.34894/7RAGMX'

In [26]:
# Display the dataset PID set in the previous cell. It is deliberately not
# re-assigned to a literal DOI here: a hard-coded value goes stale as soon as
# a re-run of the notebook creates a new dataset.
ds_pid

'doi:10.34894/7RAGMX'

## Upload a Datafile

In [27]:
# Create an empty Datafile object; its metadata is set in a later cell.
# Note: `df` here is a pyDataverse Datafile, not a pandas DataFrame.
from pyDataverse.models import Datafile
df = Datafile()

Import your metadata with `from_json()`. Then set your PID and filename manually with `set()`, as they are required as metadata for the upload and are created during the import process:

In [28]:
# Display the (still empty) Datafile object.
df

<pyDataverse.models.Datafile at 0x7ff6a9d6dfa0>

In [29]:
# Name of the local file to upload; it sits next to this notebook.
df_filename = "datafile.txt"

# Attach the target dataset PID and the file name as Datafile metadata.
df.set(dict(pid=ds_pid, filename=df_filename))

# Show the resulting metadata.
df.get()


{'pid': 'doi:10.34894/7RAGMX', 'filename': 'datafile.txt'}

In [33]:
# Arguments: dataset identifier (the PID), path of the file to upload,
# and the Datafile metadata as a JSON string.
resp = api.upload_datafile(ds_pid, df_filename, df.json())
# Print the decoded JSON body describing the uploaded file.
print(resp.json())

{'status': 'OK', 'data': {'files': [{'description': '', 'label': 'datafile.txt', 'restricted': False, 'version': 1, 'datasetVersionId': 10573, 'dataFile': {'id': 201015, 'persistentId': '', 'pidURL': '', 'filename': 'datafile.txt', 'contentType': 'text/plain', 'filesize': 7, 'description': '', 'storageIdentifier': 'file://17d96570db5-0a44146c7d33', 'rootDataFileId': -1, 'md5': '8b8db3dfa426f6bdb1798d578f5239ae', 'checksum': {'type': 'MD5', 'value': '8b8db3dfa426f6bdb1798d578f5239ae'}, 'creationDate': '2021-12-07'}}]}}


In [30]:
api.upload_datafile?

[0;31mSignature:[0m [0mapi[0m[0;34m.[0m[0mupload_datafile[0m[0;34m([0m[0midentifier[0m[0;34m,[0m [0mfilename[0m[0;34m,[0m [0mjson_str[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mis_pid[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Add file to a dataset.

Add a file to an existing Dataset. Description and tags are optional:

HTTP Request:

.. code-block:: bash

    POST http://$SERVER/api/datasets/$id/add

The upload endpoint checks the content of the file, compares it with
existing files and tells if already in the database (most likely via
hashing).

`adding-files <http://guides.dataverse.org/en/latest/api/native-api.html#adding-files>`_.

Parameters
----------
identifier : str
    Identifier of the dataset.
filename : str
    Full filename with path.
json_str : str
    Metadata as JSON string.
is_pid : bool
    ``True`` to use persistent identifier. ``False``, if not.

Returns
-------
dict
    The json string responde

By uploading the Datafile, the attached Dataset gets an update. This means that a new unpublished Dataset version is created as a draft and the change is not yet publicly available. To make it available through creating a new Dataset version, publish the Dataset with publish_dataset(). Again, set the release_type="major" to create version 2.0, as a file change always leads to a major version change:

### This one definitely publishes the dataset

In [34]:
# CAUTION: this publishes the dataset, i.e. the draft becomes publicly available.
# release_type="major" requests a major version bump.
resp = api.publish_dataset(ds_pid, release_type="major")
# Print the response object to show the HTTP status.
print(resp)

Dataset doi:10.34894/7RAGMX published
<Response [200]>


## Download and save a dataset to disk

#### IMPORTANT
Note that if the dataset is public, you don’t need to have an API_TOKEN. Furthermore, you don’t even need to have a Dataverse account to use this functionality. The code would therefore look as follows:

However, you need to know the DOI of the dataset that you want to download. In this example, we use doi:10.34894/LRBQF5 (Lea data), which is hosted on the DataverseNL instance that we specify as `BASE_URL`. The code looks as follows:

In [36]:
# The dataset downloaded below is public, so both clients are created
# without an API token.
from pyDataverse.api import DataAccessApi, NativeApi
from pyDataverse.models import Dataverse

BASE_URL = "https://dataverse.nl"

# `api` is used below for dataset metadata and tree queries.
api = NativeApi(BASE_URL)
# `data_api` is used below to download the file contents.
data_api = DataAccessApi(BASE_URL)

In [37]:
# DOI of the public dataset to download.
DOI = "doi:10.34894/LRBQF5"
# Fetch the dataset record; its JSON body lists the files of the latest version.
dataset = api.get_dataset(DOI)

In [39]:
import os

# File entries of the latest version of the dataset fetched above.
files_list = dataset.json()['data']['latestVersion']['files']

for file in files_list:
    # We want to name the local file exactly as it is named in Dataverse, but
    # the name comes from the server: keep only its final path component so a
    # name containing directories (e.g. "../x") cannot write outside the
    # current working directory.
    filename = os.path.basename(file["dataFile"]["filename"])
    file_id = file["dataFile"]["id"]
    print("File name {}, id {}".format(filename, file_id))
    # Download the file content by its id ...
    response = data_api.get_datafile(file_id)
    # ... and write the raw bytes to disk under the same name.
    with open(filename, "wb") as f:
        f.write(response.content)

File name All_wow_aggregated_illustrations.csv, id 47243
File name get_bhl_images_resize.py, id 47245
File name search_aggregate_03.py, id 47244


This is how we save the data to disk.

## Retrieve data as Dataverse tree

In [41]:
# Example: list the datasets and datafiles below the collection "MUSTS".
# NOTE(review): the original comment called this the "Fasos example" — confirm
# which collection is meant. Replace "MUSTS" with the alias of your own
# Dataverse collection.
tree = api.get_children("MUSTS", children_types= ["datasets", "datafiles"])
# Display the resulting tree (a list of nested dicts).
tree

[{'dataset_id': 8586,
  'pid': 'doi:10.34894/DR3I2A',
  'type': 'dataset',
  'children': []},
 {'dataset_id': 36716,
  'pid': 'doi:10.34894/LRBQF5',
  'type': 'dataset',
  'children': [{'datafile_id': 47243,
    'filename': 'All_wow_aggregated_illustrations.csv',
    'label': 'All_wow_aggregated_illustrations.csv',
    'pid': '',
    'type': 'datafile'},
   {'datafile_id': 47245,
    'filename': 'get_bhl_images_resize.py',
    'label': 'get_bhl_images_resize.py',
    'pid': '',
    'type': 'datafile'},
   {'datafile_id': 47244,
    'filename': 'search_aggregate_03.py',
    'label': 'search_aggregate_03.py',
    'pid': '',
    'type': 'datafile'}]}]

In [42]:
# List the datasets and datafiles below the collection "IDS".
tree = api.get_children(
    "IDS",
    children_types=["datasets", "datafiles"],
)
# Display the resulting tree.
tree

[{'dataset_id': 201008,
  'pid': 'doi:10.34894/7RAGMX',
  'type': 'dataset',
  'children': [{'datafile_id': 201015,
    'filename': 'datafile.txt',
    'label': 'datafile.txt',
    'pid': '',
    'type': 'datafile'}]}]

In [44]:
# List the datasets and datafiles below the collection "maastricht".
tree = api.get_children(
    "maastricht",
    children_types=["datasets", "datafiles"],
)
# Display the resulting tree.
tree

[]

In [45]:
api.get_children?

[0;31mSignature:[0m
[0mapi[0m[0;34m.[0m[0mget_children[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mparent[0m[0;34m=[0m[0;34m':root'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mparent_type[0m[0;34m=[0m[0;34m'dataverse'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mchildren_types[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mauth[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Walk through children of parent element in Dataverse tree.

Default: gets all child dataverses if parent = dataverse or all

Example Dataverse Tree:

.. code-block:: bash

    data = {
        'type': 'dataverse',
        'dataverse_id': 1,
        'dataverse_alias': ':root',
        'children': [
            {
                'type': 'datasets',
                'dataset_id': 231,
                'pid': 'doi:10.11587/LYFDYC',
                'children': [
                    {
                

## Clean up and remove all created data

`ds_pid`: doi:10.34894/7RAGMX

In [48]:
# Double-check which dataset is about to be removed.
ds_pid

'doi:10.34894/7RAGMX'

In [51]:
# Clean-up: remove the dataset created earlier in this notebook.
# NOTE(review): `api` was re-created without a token in the download section,
# so on a fresh top-to-bottom run this call is made anonymously — confirm
# whether an authenticated client is needed here.
resp = api.destroy_dataset(ds_pid)

In [50]:
# Look the dataset up again by its PID.
# NOTE(review): the recorded execution counts (50 here, 51 for the destroy
# cell) show this cell was run before the destroy call, so the recorded 200
# response does not confirm that the dataset was removed.
api.get_dataset(ds_pid)

<Response [200]>