This implementation is mostly taken from https://github.com/AUSSDA/pyDataverse_demo_tromso/blob/master/pydataverse.ipynb

Author: Matthieu Pons (pons.matthieu@gmail.com or https://github.com/mpons for support)

In [1]:
# load Python modules
import json
import os
import subprocess as sp
import time
import pandas as pd

from pyDataverse.api import Api, NativeApi
from pyDataverse.models import Datafile, Dataset
from pyDataverse.utils import read_file_csv_to_dict
from pyDataverse.utils import read_file

from config import LOCAL_RESSOURCES_FOLDER, DV_ALIAS, BASE_URL, API_TOKEN


In [2]:
def parse_dataset_keys(dataset_row, data, terms_filename):
    """Turn one row of the datasets CSV into a metadata entry inside ``data``.

    A row whose ``organization.dataset_id`` is null is ignored. For any other
    row, every non-null ``dataverse.*`` column is copied (and JSON-decoded or
    wrapped where needed) into a metadata dict that is stored as
    ``data[dataset_id] = {'metadata': ...}``, replacing any existing entry
    for that id.

    Args:
        dataset_row: Row indexed by CSV column name (e.g. a pandas Series).
        data: Dict keyed by dataset id; it is updated in place.
        terms_filename: Path passed to ``read_file``; the result is stored
            under ``termsOfAccess``.

    Returns:
        The ``data`` dict that was passed in.
    """
    dataset_id = dataset_row['organization.dataset_id']
    if pd.isnull(dataset_id):
        return data

    metadata = {'termsOfAccess': read_file(terms_filename)}

    def keep(value):
        return value

    def as_description(value):
        return [{'dsDescriptionValue': value}]

    def as_single_item_list(value):
        return [value]

    # (CSV column, metadata key, converter) -- listed in output key order.
    field_specs = (
        ('dataverse.title', 'title', keep),
        ('dataverse.subtitle', 'subtitle', keep),
        ('dataverse.author', 'author', json.loads),
        ('dataverse.dsDescription', 'dsDescription', as_description),
        ('dataverse.keywordValue', 'keyword', json.loads),
        ('dataverse.topicClassification', 'topicClassification', json.loads),
        ('dataverse.language', 'language', json.loads),
        ('dataverse.subject', 'subject', as_single_item_list),
        ('dataverse.kindOfData', 'kindOfData', json.loads),
        ('dataverse.datasetContact', 'datasetContact', json.loads),
    )
    for column, key, convert in field_specs:
        raw_value = dataset_row[column]
        if not pd.isnull(raw_value):
            metadata[key] = convert(raw_value)

    data[dataset_id] = {'metadata': metadata}
    return data

In [3]:
def import_datafile(datafile_row, data):
    """Attach one row of the datafiles CSV to its dataset entry in ``data``.

    The row's non-null columns are collected into a metadata dict that is
    stored at ``data[dataset_id]['datafiles'][datafile_id]['metadata']``,
    replacing any metadata already stored for that datafile. A row without
    ``organization.datafile_id`` is stored under the key ``None``.

    Args:
        datafile_row: Row indexed by CSV column name (e.g. a pandas Series).
        data: Dict keyed by dataset id; it is updated in place.

    Returns:
        The ``data`` dict that was passed in.

    Raises:
        KeyError: If the row has no ``organization.dataset_id`` or if that id
            has no entry in ``data``.
    """
    df_tmp = {}
    df_id = None
    ds_id = None
    if not pd.isnull(datafile_row['dataverse.description']):
        df_tmp['description'] = datafile_row['dataverse.description']
    if not pd.isnull(datafile_row['organization.filename']):
        df_tmp['filename'] = datafile_row['organization.filename']
    if not pd.isnull(datafile_row['organization.datafile_id']):
        df_id = datafile_row['organization.datafile_id']
        df_tmp['datafile_id'] = df_id
    if not pd.isnull(datafile_row['organization.dataset_id']):
        ds_id = datafile_row['organization.dataset_id']
        df_tmp['dataset_id'] = ds_id
    if not pd.isnull(datafile_row['dataverse.categories']):
        df_tmp['categories'] = json.loads(datafile_row['dataverse.categories'])

    # Fail with an explanatory message instead of a bare ``KeyError: None``
    # or ``KeyError: <id>`` from the dictionary lookup below.
    if ds_id is None:
        raise KeyError(
            'datafile row (datafile id {0!r}) has no organization.dataset_id'
            .format(df_id))
    if ds_id not in data:
        raise KeyError(
            'datafile {0!r} references unknown dataset id {1!r}'
            .format(df_id, ds_id))

    datafiles = data[ds_id].setdefault('datafiles', {})
    datafiles.setdefault(df_id, {})['metadata'] = df_tmp
    return data

In [4]:
def create_dataset(api, ds, dv_alias, mapping_dsid2pid, ds_id, base_url):
    """Create a dataset in a Dataverse collection and record its persistent id.

    On success the persistent id read from the response is stored in
    ``mapping_dsid2pid[ds_id]`` and the URL of the draft dataset is printed.
    If the response does not contain a persistent id, the response content is
    printed and the mapping is left untouched.

    Args:
        api: Client object exposing ``create_dataset(alias, metadata_json)``.
        ds: Object whose ``json()`` returns the dataset metadata to send.
        dv_alias: Alias of the Dataverse collection to create the dataset in.
        mapping_dsid2pid: Dict mapping local dataset ids to persistent ids;
            updated in place on success.
        ds_id: Local id of the dataset being created.
        base_url: Base URL used to build the printed dataset link.

    Returns:
        Tuple ``(resp, mapping_dsid2pid)``.

    Raises:
        Any exception raised by ``ds.json()`` or ``api.create_dataset`` is
        propagated unchanged: in that case there is no response to report.
    """
    # Keep the request outside the ``try``: if it raised inside, the handler
    # below would reference an unbound ``resp`` and hide the real error.
    resp = api.create_dataset(dv_alias, ds.json())
    try:
        pid = resp.json()['data']['persistentId']
    except (KeyError, TypeError, ValueError):
        # Body is not JSON, or has no data/persistentId entry.
        print(resp.content)
        return resp, mapping_dsid2pid

    mapping_dsid2pid[ds_id] = pid
    # Short pause after each creation (presumably to go easy on the server).
    time.sleep(1)
    print('{0}/dataset.xhtml?persistentId={1}&version=DRAFT'.format(base_url,
                                                                    pid))
    return resp, mapping_dsid2pid

In [5]:
def upload_datafile(api, pid, filename, df, tabular_wait=20, default_wait=2):
    """Upload a file and its metadata to a dataset by invoking ``curl``.

    The command is passed to ``subprocess.run`` as an argument list, so no
    shell is involved: spaces in ``filename``, quotes in the JSON metadata or
    shell metacharacters in any value cannot break or alter the command.

    Args:
        api: Object exposing ``base_url`` and ``api_token``.
        pid: Persistent id of the target dataset.
        filename: Path of the file to upload.
        df: Object whose ``json()`` returns the datafile metadata; sent as
            the ``jsonData`` form field.
        tabular_wait: Seconds to sleep after uploading a ``.sav`` or ``.dta``
            file (presumably to leave time for server-side processing).
        default_wait: Seconds to sleep after uploading any other file.

    Returns:
        The ``subprocess.CompletedProcess`` of the curl call, with its
        standard output captured.
    """
    # NOTE(review): the URL is built from ``api.base_url`` without an ``/api``
    # segment -- confirm this matches the configured base URL.
    url = '{0}/datasets/:persistentId/add?persistentId={1}'.format(
        api.base_url, pid)
    command = [
        'curl',
        '-H', 'X-Dataverse-key: {0}'.format(api.api_token),
        '-X', 'POST', url,
        '-F', 'file=@{0}'.format(filename),
        # --form-string sends the value literally: curl does not interpret
        # a leading '@'/'<' or a ';type=' sequence inside the JSON.
        '--form-string', 'jsonData={0}'.format(df.json()),
    ]
    result = sp.run(command, stdout=sp.PIPE)
    if filename.endswith(('.sav', '.dta')):
        time.sleep(tabular_wait)
    else:
        time.sleep(default_wait)
    return result

In [6]:
def delete_dataset(pid, api):
    """Delete the dataset identified by ``pid``, then pause for one second.

    Args:
        pid: Persistent id of the dataset to delete.
        api: Client object exposing ``delete_dataset(pid)``.

    Returns:
        Whatever ``api.delete_dataset`` returned.
    """
    response = api.delete_dataset(pid)
    time.sleep(1)
    return response

In [7]:
def publish_dataset(pid, api):
    """Publish the dataset identified by ``pid`` as a ``'major'`` release.

    The decoded JSON body of the response is printed.

    Args:
        pid: Persistent id of the dataset to publish.
        api: Client object exposing ``publish_dataset(pid, release_type)``.

    Returns:
        The response object returned by ``api.publish_dataset``.
    """
    response = api.publish_dataset(pid, 'major')
    body = response.json()
    print(body)
    return response

In [9]:
# Paths of the input files, all located in the local resources folder.
ds_filename = os.path.join(LOCAL_RESSOURCES_FOLDER, 'datasets.csv')
license_filename = os.path.join(LOCAL_RESSOURCES_FOLDER, 'license.html')
terms_filename = os.path.join(LOCAL_RESSOURCES_FOLDER, 'terms-of-access.html')

# Container for all dataset/datafile metadata, keyed by dataset id.
data = {}
# NOTE(review): `license_default` and `datasets_csv` are not referenced by
# any of the cells shown in this notebook -- confirm they are still needed.
license_default = read_file(license_filename)
datasets_csv = read_file_csv_to_dict(ds_filename)

In [10]:
# Read the datasets CSV and collect the metadata of every row into `data`,
# a dictionary keyed by dataset id.
datasets_df = pd.read_csv(ds_filename)
data = {}
for row_label, dataset_row in datasets_df.iterrows():
    data = parse_dataset_keys(dataset_row, data, terms_filename)

In [11]:
# API client used by all cells below; URL and token come from config.py.
native_api = NativeApi(BASE_URL, API_TOKEN)

In [12]:
# Create one dataset per entry of `data` and remember the persistent id
# assigned to each local dataset id.
mapping_dsid2pid = {}

for ds_id, dataset in data.items():
    ds_metadata = dataset['metadata']

    ds = Dataset()
    ds.set(ds_metadata)
    ds.displayName = ds_metadata['title']

    resp, mapping_dsid2pid = create_dataset(
        native_api, ds, DV_ALIAS, mapping_dsid2pid, ds_id, BASE_URL
    )

Dataset with pid 'doi:10.5072/FK2/M92KZV' created.
http://datasets.coronawhy.org/dataset.xhtml?persistentId=doi:10.5072/FK2/M92KZV&version=DRAFT
Dataset with pid 'doi:10.5072/FK2/SIQOBX' created.
http://datasets.coronawhy.org/dataset.xhtml?persistentId=doi:10.5072/FK2/SIQOBX&version=DRAFT


In [13]:
# Load the datafiles CSV from the local resources folder.
df_filename = os.path.join(LOCAL_RESSOURCES_FOLDER, 'datafiles.csv')
datafiles_df = pd.read_csv(df_filename)

In [14]:
# Attach every datafile row to its dataset entry in `data`.
for row_label, datafile_row in datafiles_df.iterrows():
    data = import_datafile(datafile_row, data)

In [15]:
# Upload Datafile metadata and data via API.

for ds_id, dataset in data.items():
    pid = mapping_dsid2pid.get(ds_id)
    if pid is None:
        # create_dataset() does not add an entry to the mapping when the
        # creation fails, so there is nothing to upload to.
        print('Skipping datafiles of dataset {0!r}: no persistent id'.format(ds_id))
        continue
    # A dataset without rows in the datafiles CSV has no 'datafiles' key.
    for df_id, datafile in dataset.get('datafiles', {}).items():
        data_tmp = datafile['metadata']
        data_tmp['pid'] = pid
        df = Datafile()
        df.set(data_tmp)
        # NOTE(review): files are read from ./dataverse/files relative to the
        # working directory, not from LOCAL_RESSOURCES_FOLDER -- confirm.
        filename = os.path.abspath(os.path.join('dataverse', 'files', datafile['metadata']['filename']))
        resp = upload_datafile(native_api, pid, filename, df)

In [16]:
# Delete the Datasets at the End (OPTIONAL)
# Cleanup only runs when working against the 'demo' Dataverse alias.
DELETE_DATASETS = DV_ALIAS == 'demo'

if DELETE_DATASETS:
    # Iterate over the persistent ids that were actually recorded: a dataset
    # whose creation failed has no entry in the mapping and nothing to delete.
    for pid in list(mapping_dsid2pid.values()):
        resp = delete_dataset(pid, native_api)

Dataset 'doi:10.5072/FK2/M92KZV' deleted.
Dataset 'doi:10.5072/FK2/SIQOBX' deleted.
