## A workflow for automating the creation of a ScienceBase Data Release from the content of a CSDGM metadata record

#### Created by the Data Management Team of the Fort Collins Science Center with help from the SB team

In [1]:
import requests
import pysb
import random
import smtplib
import json
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
from requests_ntlm import HttpNtlmAuth

import getpass

from pymdwizard.core.xml_utils import XMLRecord, XMLNode
from pymdwizard.core import utils

### identify the items we'll need for this process:

#### The CSDGM metadata record, a list of data files to include in the release, and the IPDS number

In [2]:
# Folder and base file name shared by the metadata record and the data file.
release_dir = r"Z:\FORT_DataManagement\Falk_2017_PythonBodyCondition\FinalData"
data_basename = "Falk 2017 python body condition data.csv"

# CSDGM metadata record: the data file name with ".xml" appended.
md_fname = release_dir + "\\" + data_basename + ".xml"
# Data files to attach to the release item.
data_files = [release_dir + "\\" + data_basename]
# IPDS number of the release.
ipds_number = 'IP-087269'

#### get the username and password we'll be using

In [3]:
# Account used for the ScienceBase login and the DOI tool login; the part
# before the "@" is also used as the Active Directory name for IPDS.
username = "talbertc@usgs.gov"
# Prompt for the password interactively so it is never stored in the notebook.
password = getpass.getpass()

········


#### extract the information needed from the MD record

In [4]:
# Load the CSDGM XML file into a pymdwizard XMLRecord; its elements are read
# (and later edited) through attribute-style access such as md.metadata.idinfo.
md = XMLRecord(md_fname)

In [5]:
# Dataset title from the citation section; sent later as the DOI title.
title = md.metadata.idinfo.citation.citeinfo.title.text
title  # display the value

'Sex, length, total mass, fat mass, and specimen condition data for 248 Burmese pythons (Python bivittatus) collected in the Florida Everglades'

In [6]:
# Abstract text from the description section; sent later as the DOI
# 'abstract_type_description' field.
abstract = md.metadata.idinfo.descript.abstract.text
abstract  # display the value

"These data were collected from Burmese pythons removed from the Florida everglades as part of invasive-species management. After euthanasia, we sexed (male or female) and measured the snout-vent length (SVL; cm) and total body mass (g) for each python. We also measured total fat mass (g) by removing all visible fat bodies from the coelomic cavity and weighing this mass. For a subset of specimens, we recorded whether the pythons were put on ice after euthanasia and measured within 24 hours ('fresh') or whether the pythons were frozen after euthanasia, thawed, and then measured ('frozen'). These data were used to validate several body condition indices in Burmese pythons."

In [7]:
# Publication date exactly as written in the metadata record.
pubdate = md.metadata.idinfo.citation.citeinfo.pubdate.text
# Keep the first four characters as the publication year
# (assumes the date starts with YYYY — TODO confirm for other date layouts).
pubyear = pubdate[:4]
pubdate, pubyear  # display both values

('2017', '2017')

### Pull the authors and ORCIDs from IPDS

In [8]:
# IPDS is accessed with NTLM authentication. The account name is the part of
# `username` before the "@", prefixed with "GS\"; the password entered above
# is reused.
ad_account = username.split('@')[0]
ntlm_login = 'GS\\{}'.format(ad_account)

ipds_session = requests.Session()
ipds_session.auth = HttpNtlmAuth(ntlm_login, password, ipds_session)

In [9]:
# Ask the IPDS list service for the author rows whose IPNumber equals ours.
authors_url = "https://ipds.usgs.gov/_vti_bin/listdata.svc/IPDSAuthors()?$filter=IPNumber%20eq%20%27{}%27".format(ipds_number)

content = ipds_session.get(authors_url)
soup = BeautifulSoup(content.text, "lxml-xml")

# One dict per <entry> element. str() is applied to the ORCID only, so an
# entry with no ORCID value is stored as the string 'None', not as None.
author_list = [
    {
        'author_name': entry.find('AuthorNameText').string,
        'ORCID': str(entry.find('ORCID').string),
    }
    for entry in soup.find_all('entry')
]

author_list

[{'ORCID': '0000-0002-9690-5626', 'author_name': 'Falk, Bryan G.'},
 {'ORCID': 'None', 'author_name': 'Snow, Ray W'},
 {'ORCID': '0000-0001-8349-6168', 'author_name': 'Reed, Robert'}]

### Make the SB item from our metadata and upload our data to it

In [14]:
# Open a ScienceBase session and authenticate with the credentials entered above.
sb = pysb.SbSession()
sb.login(username, password)

<pysb.SbSession.SbSession at 0xbddb978>

In [15]:
# Parent item under which the new release item is created.
# The second assignment overrides the first, so as written the item is
# created under the testing parent; remove that line to use the first id
# (presumably the production data-release folder — confirm before publishing).
root_release_id = '552d79bee4b0b22a157f59a3'
root_release_id = '4ff5c8f9e4b03b1a74b5782d' #testing
# Upload the metadata file and create a new child item from it; the call
# returns the new item's JSON.
item_json = sb.upload_file_and_create_item(root_release_id, md_fname)

In [16]:
# ScienceBase id of the newly created item; used for its URL and later updates.
item_id = item_json['id']
item_id  # display the value

'594189dee4b0764e6c64a623'

In [17]:
# Attach the data files to the item and keep the item JSON returned by the call.
item_json = sb.upload_files_and_update_item(item_json, data_files)

### Create the DOI

In [18]:
# Catalog URL of the new ScienceBase item; submitted as the DOI 'resourceURL'.
rsc_url = 'https://www.sciencebase.gov/catalog/item/' + item_id

# One HTTP session for the whole exchange with the DOI tool.
r = requests.Session()

# Switch between the two lines below to target production or the staging server.
# doi_tool_url = 'https://www1.usgs.gov/csas/doi/' # production
doi_tool_url = 'https://www1-staging.snafu.cr.usgs.gov/csas/doi/' #testing

# NOTE(review): verify=False turns off TLS certificate checking on every call
# to the DOI tool, including the POST that carries the password — confirm this
# is only needed for the staging host and not carried over to production.
req = r.get(doi_tool_url, verify=False) # fetch the login page
# Pull the CSRF token out of the page's `_csrf` input by string splitting;
# raises IndexError if the page does not contain that exact markup.
form_csrf  = req.content.decode('utf-8').split('name="_csrf" value="')[1].split('" />\n')[0]
# Log in through the form endpoint. NOTE(review): `response` is never
# inspected, so a failed login only surfaces later when the DOI is parsed.
cred = {'j_username': username, 'j_password': password, '_csrf': form_csrf}
response = r.post(doi_tool_url + 'j_spring_security_check', cookies=req.cookies, data=cred, verify=False)


# Form fields of the shape usersAndTypes[<email>] = 'PRIMARY'
# (presumably the accounts given primary access to the DOI record — confirm
# against the DOI tool).
key_username = 'sciencebase@ornl.gov'
key5 = 'usersAndTypes[' + key_username + ']'
value5 = 'PRIMARY'
# specific user credentials, will dup if above already
# NOTE(review): these addresses are hard-coded rather than taken from `username`.
key6 = 'usersAndTypes[talbertc@usgs.gov]'
value6 = 'PRIMARY'
key7 = 'usersAndTypes[afreeman@usgs.gov]'
value7 = 'PRIMARY'


# Form payload for the new DOI. Title, abstract and publication year come from
# the metadata record; the remaining optional fields are submitted empty, and
# the DOI is requested with status 'reserved'.
d = {'identifier':'',
    'title':title,
    'resourceURL': rsc_url,
    key5: value5, key6: value6, key7: value7,
    'addNewCreatorAuthor': '', 
    'addNewCreatorAuthorOrcid': '',
    'authorValidity':'valid',
    'abstract_type_description' : abstract,
    'subject' : '',
    'project_year' : '',
    'project_date' : '',
    'project_start_year' : '',
    'project_end_year' : '',
    'project_start_date' : '',
    'project_end_date' : '',
    'date_type' : '',
    'date' : '',
    'pubYear':str(pubyear), # year taken from the metadata pubdate
    'resourceType': 'Dataset',
    'publisher': 'U.S. Geological Survey',
    'status': 'reserved',
    '_csrf': form_csrf,
    'save': 'Submit'}

# Add one indexed group of fields per IPDS author, in list order.
# NOTE(review): an author without an ORCID carries the string 'None' here
# (see how author_list is built) — confirm the DOI tool accepts that value.
for i, author in enumerate(author_list):
    d['authors[{}].authorName'.format(i)] = author['author_name']
    d['authors[{}].orcId'.format(i)] = author['ORCID']
    d['authors[{}].position'.format(i)] = i
    
# Submit the form; the response HTML is parsed for the DOI in the next cell.
create_new_doi = r.post(doi_tool_url + 'result.htm', cookies = req.cookies, data = d, verify = False)
create_new_doi  # display the response status



<Response [200]>

In [19]:
# The result page reports the DOI as "Your DOI has been saved: <doi></div>".
# Take the text between that marker and the closing tag, with spaces removed.
result_html = create_new_doi.content.decode('utf-8')
after_marker = result_html.split('Your DOI has been saved: ')[1]
doi_text = after_marker.split('</div>')[0]
new_doi = doi_text.replace(" ", "").strip()
new_doi

'doi:10.5072/FK22B92D90'

### Update our SB item to include the IPDS link and DOI 

In [20]:
# Both identifiers are filed under the same ScienceBase identifier scheme.
identifier_scheme = "https://www.sciencebase.gov/vocab/category/item/identifier"
ipds_format = {'type': "IPDS", "scheme": identifier_scheme, 'key': ipds_number}
doi_format = {'type': "DOI", "scheme": identifier_scheme, 'key': new_doi}
# Send only the item id plus the field being changed. Uses the snake_case
# update_item, matching the other pysb calls in this notebook, instead of
# the camelCase updateSbItem spelling.
item_json = sb.update_item({'id': item_id, 'identifiers': [ipds_format, doi_format]})

### update the SB citation with our DOI

In [21]:
# Display the citation string carried by the item JSON returned from the update.
item_json['citation']

'Bryan G. Falk, Invasive Species Science, U.S. Geological Survey, 2017, Sex, length, total mass, fat mass, and specimen condition data for 248 Burmese pythons (Python bivittatus) collected in the Florida Everglades: .'

In [23]:
# Re-fetch the full item JSON from ScienceBase and pull out its citation string.
item_json = sb.get_item(item_id)
citation = item_json['citation']
citation  # display the value

'Bryan G. Falk, Invasive Species Science, U.S. Geological Survey, 2017, Sex, length, total mass, fat mass, and specimen condition data for 248 Burmese pythons (Python bivittatus) collected in the Florida Everglades: .'

In [27]:
# Replace whatever follows the citation's final colon with the data-release
# statement and DOI link. rsplit() cuts at the last colon only, so colons
# inside the title are preserved (split + "".join silently dropped them) and
# a citation with no colon at all is kept whole instead of being emptied.
citation_stem = citation.rsplit(':', 1)[0]
# new_doi has the form 'doi:10.xxxx/yyyy'; [4:] strips the 'doi:' prefix.
new_citation = citation_stem + ": U.S. Geological Survey data release, https://doi.org/{}.".format(new_doi[4:])
item_json['citation'] = new_citation
new_citation

'Bryan G. Falk, Invasive Species Science, U.S. Geological Survey, 2017, Sex, length, total mass, fat mass, and specimen condition data for 248 Burmese pythons (Python bivittatus) collected in the Florida Everglades: U.S. Geological Survey data release, https://doi.org/10.5072/FK22B92D90.'

In [28]:
# Display the contacts ScienceBase derived for the item, for a visual check.
item_json['contacts']

[{'contactType': 'person',
  'email': 'bfalk@usgs.gov',
  'jobTitle': 'Research Fellow',
  'name': 'Bryan G Falk',
  'organization': {'displayText': 'U.S. Geological Survey, Southwest Region'},
  'primaryLocation': {'mailAddress': {'city': 'Homestead',
    'country': 'US',
    'line1': 'Daniel Beard Center, Everglades National Park',
    'line2': '40001 SR 9336',
    'state': 'FL',
    'zip': '33034'},
   'officePhone': '305-242-7820'},
  'type': 'Point of Contact'},
 {'name': 'Bryan G. Falk, Invasive Species Science, U.S. Geological Survey',
  'type': 'Originator'},
 {'contactType': 'person',
  'email': 'bfalk@usgs.gov',
  'jobTitle': 'Research Fellow',
  'name': 'Bryan G Falk',
  'organization': {'displayText': 'U.S. Geological Survey, Southwest Region'},
  'primaryLocation': {'mailAddress': {'city': 'Homestead',
    'country': 'US',
    'line1': '40001 SR 9336',
    'state': 'FL',
    'zip': '33034'},
   'officePhone': '305-242-7820'},
  'type': 'Metadata Contact'},
 {'contactType':

In [29]:
# Tag the item as an in-progress data release. setdefault() creates the
# 'tags' list first when the item JSON has none, instead of raising KeyError.
# NOTE(review): the scheme 'Common geographic areas' looks unrelated to a
# release-status tag — confirm it is the intended value.
in_progress_tag = {'name': 'Data Release - In Progress', 'scheme': 'Common geographic areas', 'type': 'Browse Categories'}
item_json.setdefault('tags', []).append(in_progress_tag)

In [30]:
# Empty the item's web links before the JSON is sent back to ScienceBase.
# NOTE(review): the reason is not stated here — presumably to drop links
# generated from the metadata record; confirm.
item_json['webLinks'] = []

In [61]:
# Push the edited item JSON (citation, tags, webLinks) to ScienceBase and keep
# the item JSON returned by the call.
item_json = sb.update_item(item_json)

{'body': "These data were collected from Burmese pythons removed from the Florida everglades as part of invasive-species management. After euthanasia, we sexed (male or female) and measured the snout-vent length (SVL; cm) and total body mass (g) for each python. We also measured total fat mass (g) by removing all visible fat bodies from the coelomic cavity and weighing this mass. For a subset of specimens, we recorded whether the pythons were put on ice after euthanasia and measured within 24 hours ('fresh') or whether the pythons were frozen after euthanasia, thawed, and then measured ('frozen'). These data were used to validate several body condition indices in Burmese pythons.",
 'browseTypes': ['Citation'],
 'citation': 'Bryan G. Falk, Invasive Species Science, U.S. Geological Survey, 2017, Sex, length, total mass, fat mass, and specimen condition data for 248 Burmese pythons (Python bivittatus) collected in the Florida Everglades: U.S. Geological Survey data release, https://doi.o

### Replace the onlink in our MD with the DOI, both on the local copy and the SB copy

In [35]:
# Resolver URL for the DOI: new_doi without its leading 'doi:' (first four
# characters) appended to https://doi.org/.
doi_suffix = new_doi[4:]
doi_url = "https://doi.org/{}".format(doi_suffix)

# Point the record's online linkage (onlink) at the DOI.
citeinfo = md.metadata.idinfo.citation.citeinfo
citeinfo.onlink.text = doi_url

In [36]:
# Save the edited record (the local copy of the metadata;
# presumably written back to md_fname — confirm).
md.save()

In [37]:
# Display the path of the metadata file that is about to be re-uploaded.
md_fname

'Z:\\FORT_DataManagement\\Falk_2017_PythonBodyCondition\\FinalData\\Falk 2017 python body condition data.csv.xml'

In [38]:
# Replace the metadata file on the ScienceBase item with the updated local copy.
sb.replace_file(md_fname, item_json)