# Upload a data package

This notebook will help you understand the basics of creating a data package with a few files and simple annotation using the DataONE Python client.

In [None]:
import datetime
import hashlib
import io
import itertools
import os
import uuid
import xml.dom.minidom
from pathlib import Path
from typing import Optional

import d1_common.const
from d1_client.mnclient_2_0 import MemberNodeClient_2_0
from d1_common.types import dataoneTypes
from d1_common.resource_map import createSimpleResourceMap

In [2]:
def flatten_filepath(path: os.PathLike):
    """Collapse a path into one flat name.

    Each platform directory separator in the string form of ``path`` is
    swapped for a double underscore.
    """
    path_parts = str(path).split(os.path.sep)
    return "__".join(path_parts)


def pretty_format_xml(x):
    """Parse the XML document ``x`` and return it re-serialized with indentation."""
    return xml.dom.minidom.parseString(x).toprettyxml()


def generate_system_metadata(
    pid: str, format_id: str, science_object: bytes, orcid: str, filename: Optional[str] = None
):
    """
    Generates a system metadata document.

    :param pid: The pid that the object will have
    :param format_id: The format of the object (e.g text/csv)
    :param science_object: The object that is being described, as bytes. A str is
        also accepted and is encoded as UTF-8 before the size and checksum are computed.
    :param orcid: The subject recorded as the rights holder of the object
    :param filename: Optional file name to record in the system metadata
    :return: The populated system metadata object, carrying a public-read access policy
    :raises ValueError: If science_object is neither bytes nor str
    """

    # Size and checksum are computed over raw bytes, so encode a str first
    if isinstance(science_object, str):
        science_object = science_object.encode("utf-8")
    elif not isinstance(science_object, bytes):
        raise ValueError("Supplied science_object must be bytes or str")

    checksum_hex = hashlib.md5(science_object).hexdigest()
    # Timezone-aware UTC, so the timestamp does not depend on the machine's local timezone
    now = datetime.datetime.now(datetime.timezone.utc)

    sys_meta = dataoneTypes.systemMetadata()
    sys_meta.identifier = str(pid)
    sys_meta.formatId = format_id
    sys_meta.size = len(science_object)
    sys_meta.rightsHolder = orcid
    if filename:
        sys_meta.fileName = filename

    sys_meta.checksum = dataoneTypes.checksum(checksum_hex)
    sys_meta.checksum.algorithm = "MD5"
    # The same instant is used for both dates because the object is brand new
    sys_meta.dateUploaded = now
    sys_meta.dateSysMetadataModified = now
    sys_meta.accessPolicy = generate_public_access_policy()

    return sys_meta


def generate_public_access_policy():
    """
    Builds the access policy for an object: a single rule that gives the
    public subject the 'read' permission.

    :return: The access policy containing that one rule
    """

    # One rule pairing the public subject with the 'read' permission
    public_read_rule = dataoneTypes.AccessRule()
    public_read_rule.subject.append(d1_common.const.SUBJECT_PUBLIC)
    public_read_rule.permission.append(dataoneTypes.Permission("read"))

    policy = dataoneTypes.accessPolicy()
    policy.append(public_read_rule)
    return policy


def create_minimum_eml(title: str) -> str:
    """
    Ugly method that creates a bare minimum EML record for a package.
    This includes the title, creator, and contact. Ideally the EML shouldn't need
    to be generated in python.

    :param title: The title of the data package. It is XML-escaped before being
        embedded, so characters such as '&' and '<' cannot break the document.
    :return: The EML document as a str; encode it (e.g. as UTF-8) before uploading
    """
    # Imported locally: only needed here to escape the title
    from xml.sax.saxutils import escape

    # XML declaration
    top = '<?xml version="1.0" encoding="UTF-8"?>'
    namespace = '<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:stmml="http://www.xml-cml.org/schema/stmml-1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" packageId="test_pkg" system="test_system" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 eml.xsd">'

    # The uploader's surname
    individual_name = "<individualName><surName>Test user</surName></individualName>"

    # The same individual is used for both the creator and the contact record
    eml_parts = [
        top,
        namespace,
        "<dataset>",
        f"<title>{escape(title)}</title>",
        f"<creator>{individual_name}</creator>",
        f"<contact>{individual_name}</contact>",
        "</dataset>",
        "</eml:eml>",
    ]

    # Join the fragments together to form the EML document
    return "".join(eml_parts)

# Create test assets

Generate a few small data files to include in your data package.


```
tmp1
├── file1.txt
└── file2.txt
tmp2
├── file1.txt
└── file2.txt
```

Note that since a data package cannot represent a directory hierarchy, we will use `flatten_filepath` to "flatten" each path, i.e., `tmp1/file1.txt` becomes `tmp1__file1.txt`.

We also include the format of each file. In this case, the data are all text files with the format `text/plain`.

In [3]:
# Write two small text files into each of the two directories
for directory_name in ("tmp1", "tmp2"):
    directory = Path(directory_name)
    directory.mkdir(exist_ok=True)
    for filename in ("file1.txt", "file2.txt"):
        (directory / filename).write_text(
            f"This is {filename} in directory {directory_name}", encoding="utf-8"
        )

# Collect every regular file under the two directories. rglob('*') also yields
# sub-directories, which cannot be read and uploaded, so they are filtered out.
files = [
    {'path': path, 'format': 'text/plain'}
    for path in sorted(itertools.chain(Path('tmp1').rglob('*'), Path('tmp2').rglob('*')))
    if path.is_file()
]

files

[{'path': PosixPath('tmp1/file1.txt'), 'format': 'text/plain'},
 {'path': PosixPath('tmp1/file2.txt'), 'format': 'text/plain'},
 {'path': PosixPath('tmp2/file1.txt'), 'format': 'text/plain'},
 {'path': PosixPath('tmp2/file2.txt'), 'format': 'text/plain'}]

# Create the data package

First, replace the following variable values:

- `orcid`
- `auth_token`: Your [JWT](https://jwt.io/) that you get from your DataONE profile page

By default, this notebook generates a data package titled "Test data package \<abc\>", where `<abc>` is a random suffix.

In [4]:
# Set your ORCID
orcid: str = "http://orcid.org/0000-0002-1756-2128"

# Provide an authentication token. Preferably export it as the DATAONE_AUTH_TOKEN
# environment variable so the secret is never saved in the notebook; otherwise
# replace the empty-string fallback below with your token.
auth_token: str = os.environ.get("DATAONE_AUTH_TOKEN", "")

# Set the token in the request header
options: dict = {"headers": {"Authorization": "Bearer " + auth_token}}

# Create the Member Node Client
client: MemberNodeClient_2_0 = MemberNodeClient_2_0("https://dev.nceas.ucsb.edu/knb/d1/mn", **options)

# Random 8-character hex suffix so packages from repeated runs can be told apart
title: str = f"Test data package {uuid.uuid4().hex[:8]}"

## Create and upload the EML

In [5]:
# Mint a random identifier for the science metadata and build the EML document
eml_pid = str(uuid.uuid4())
eml = create_minimum_eml(title)
# The same encoded bytes are used for the system metadata and for the upload
eml_bytes = eml.encode("utf-8")
# System metadata describing the EML document
meta_sm = generate_system_metadata(
    pid=eml_pid,
    format_id="eml://ecoinformatics.org/eml-2.1.1",
    science_object=eml_bytes,
    orcid=orcid,
)
print(f"Creating EML with PID {eml_pid}")
# Upload the EML to the Member Node; the value returned by create() is the cell's output
client.create(eml_pid, eml_bytes, meta_sm)

Creating EML with PID 1efeefaa-5c67-4b1e-a384-639ceac512a9


<d1_common.types.generated.dataoneTypes_v1.Identifier at 0x7fdb01971450>

Here's the science metadata:

In [6]:
# Show the EML document in indented form
print(pretty_format_xml(eml))

<?xml version="1.0" ?>
<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:stmml="http://www.xml-cml.org/schema/stmml-1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" packageId="test_pkg" system="test_system" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 eml.xsd">
	<dataset>
		<title>Test data package ba2943fb</title>
		<creator>
			<individualName>
				<surName>Test user</surName>
			</individualName>
		</creator>
		<contact>
			<individualName>
				<surName>Test user</surName>
			</individualName>
		</contact>
	</dataset>
</eml:eml>



And the system metadata for that science metadata:

In [7]:
# Serialize the EML's system metadata to XML and show it in indented form
print(pretty_format_xml(meta_sm.toxml()))

<?xml version="1.0" ?>
<ns1:systemMetadata xmlns:ns1="http://ns.dataone.org/service/types/v2.0">
	<identifier>1efeefaa-5c67-4b1e-a384-639ceac512a9</identifier>
	<formatId>eml://ecoinformatics.org/eml-2.1.1</formatId>
	<size>538</size>
	<checksum algorithm="MD5">769f24d6149c3f22316522c437dd5fd6</checksum>
	<rightsHolder>http://orcid.org/0000-0002-2661-8912</rightsHolder>
	<accessPolicy>
		<allow>
			<subject>public</subject>
			<permission>read</permission>
		</allow>
	</accessPolicy>
	<dateUploaded>2025-01-24T17:06:09.677176</dateUploaded>
	<dateSysMetadataModified>2025-01-24T17:06:09.677176</dateSysMetadataModified>
</ns1:systemMetadata>



## Create and upload the data

In [8]:
# Identifiers of the uploaded data objects, in upload order
data_pids = []
for data in files:
    # Each data object gets its own random identifier
    data_pid = uuid.uuid4().hex
    data_pids.append(data_pid)
    # read_bytes() opens the file, reads all of it and closes it again
    data_bytes = Path(data["path"]).read_bytes()
    data_sm = generate_system_metadata(
        pid=data_pid,
        format_id=data["format"],
        science_object=data_bytes,
        orcid=orcid,
        # The flattened relative path is recorded as the object's file name
        filename=flatten_filepath(data["path"])
    )
    print(f"""Creating data {data["path"]} with PID {data_pid}""")
    client.create(data_pid, data_bytes, data_sm)

Creating data tmp1/file1.txt with PID 66d34b2dfd79419bbf2fc6f7b2e77f7b
Creating data tmp1/file2.txt with PID b9bcab58d28445618a4f67af8dcc16e3
Creating data tmp2/file1.txt with PID 722b7d2245bd49459e18b8f56ce63f91
Creating data tmp2/file2.txt with PID ec091619cd82476b96a61f4431046754


Let's look at the last file's system metadata as an example:

In [9]:
# data_sm still holds the system metadata of the last file handled by the upload loop
print(pretty_format_xml(data_sm.toxml()))

<?xml version="1.0" ?>
<ns1:systemMetadata xmlns:ns1="http://ns.dataone.org/service/types/v2.0">
	<identifier>ec091619cd82476b96a61f4431046754</identifier>
	<formatId>text/plain</formatId>
	<size>35</size>
	<checksum algorithm="MD5">ca6101b8d25b3266271b9fa2d1892a3d</checksum>
	<rightsHolder>http://orcid.org/0000-0002-2661-8912</rightsHolder>
	<accessPolicy>
		<allow>
			<subject>public</subject>
			<permission>read</permission>
		</allow>
	</accessPolicy>
	<dateUploaded>2025-01-24T17:06:13.536165</dateUploaded>
	<dateSysMetadataModified>2025-01-24T17:06:13.536165</dateSysMetadataModified>
	<fileName>tmp2__file2.txt</fileName>
</ns1:systemMetadata>



## Create and upload the resource map

In [10]:
ore_pid = str(uuid.uuid4())
# The resource map ties the science metadata (eml_pid) to the data objects (data_pids)
ore = createSimpleResourceMap(ore_pid, eml_pid, data_pids)

# Serialize exactly once so the size and checksum recorded in the system metadata
# describe the very same bytes that are uploaded. serialize() may return either
# bytes or str depending on the installed rdflib version, so normalize to UTF-8 bytes.
ore_xml = ore.serialize(format="xml")
ore_bytes = ore_xml.encode("utf-8") if isinstance(ore_xml, str) else ore_xml

ore_meta = generate_system_metadata(
    pid=ore_pid,
    format_id="http://www.openarchives.org/ore/terms",
    science_object=ore_bytes,
    orcid=orcid,
)
print(f"Creating resource map with PID {ore_pid}")
client.create(ore_pid, ore_bytes, ore_meta)

Creating resource map with PID dac060d6-cb9e-48bd-b587-d84c0fe5758d


<d1_common.types.generated.dataoneTypes_v1.Identifier at 0x7fdb017eae40>

Here's the resource map:

In [12]:
# Serialize the resource map to RDF/XML and show it in indented form
print(pretty_format_xml(ore.serialize(format="xml")))

<?xml version="1.0" ?>
<rdf:RDF xmlns:cito="http://purl.org/spar/cito/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:ore="http://www.openarchives.org/ore/terms/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
	
  
	<rdf:Description rdf:about="https://cn.dataone.org/cn/v2/resolve/1efeefaa-5c67-4b1e-a384-639ceac512a9">
		
    
		<ore:isAggregatedBy rdf:resource="https://cn.dataone.org/cn/v2/resolve/dac060d6-cb9e-48bd-b587-d84c0fe5758d#aggregation"/>
		
    
		<dcterms:identifier>1efeefaa-5c67-4b1e-a384-639ceac512a9</dcterms:identifier>
		
    
		<cito:documents rdf:resource="https://cn.dataone.org/cn/v2/resolve/66d34b2dfd79419bbf2fc6f7b2e77f7b"/>
		
    
		<cito:documents rdf:resource="https://cn.dataone.org/cn/v2/resolve/b9bcab58d28445618a4f67af8dcc16e3"/>
		
    
		<cito:documents rdf:resource="https://cn.dataone.org/cn/v2/resolve/722b7d2245bd49459e18b8f56ce63f91"/>
		
    
		<cito:documents rdf:resource="https://cn.dataon

And the resource map system metadata:

In [13]:
# Serialize the resource map's system metadata to XML and show it in indented form
print(pretty_format_xml(ore_meta.toxml()))

<?xml version="1.0" ?>
<ns1:systemMetadata xmlns:ns1="http://ns.dataone.org/service/types/v2.0">
	<identifier>dac060d6-cb9e-48bd-b587-d84c0fe5758d</identifier>
	<formatId>http://www.openarchives.org/ore/terms</formatId>
	<size>4317</size>
	<checksum algorithm="MD5">0fcf2a4759669df3f9739c0d78e9c1ff</checksum>
	<rightsHolder>http://orcid.org/0000-0002-2661-8912</rightsHolder>
	<accessPolicy>
		<allow>
			<subject>public</subject>
			<permission>read</permission>
		</allow>
	</accessPolicy>
	<dateUploaded>2025-01-24T17:06:14.700613</dateUploaded>
	<dateSysMetadataModified>2025-01-24T17:06:14.700613</dateSysMetadataModified>
</ns1:systemMetadata>

