## A workflow for automating the creation of a ScienceBase Data Release from the content of a CSDGM metadata record

#### Created by the Data Management Team of the Fort Collins Science Center with help from the ScienceBase (SB) team

In [1]:
import requests
import pysb
import random
import smtplib
import json
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
from requests_ntlm import HttpNtlmAuth

import getpass

from pymdwizard.core.xml_utils import XMLRecord, XMLNode
from pymdwizard.core import utils

### identify the items we'll need for this process:

#### The CSDGM metadata record, a list of data files to include in the release, and the IPDS number

In [2]:
md_fname = r"Z:\FORT_DataManagement\Ruth_2017_GRSPHabitat\Habitat Data Arizona Grasshopper Sparrow Territory Nest and Random 2009 to 2013.xml"
# data_files = [r"Z:\FORT_DataManagement\Falk_2017_PythonBodyCondition\FinalData\Falk 2017 python body condition data.csv", ]
ipds_number = 'IP-090592'

#### get the username and password we'll be using

In [3]:
username = "talbertc@usgs.gov"
password = getpass.getpass()

········


#### extract the information needed from the MD record

In [4]:
md = XMLRecord(md_fname)

In [5]:
title = md.metadata.idinfo.citation.citeinfo.title.text
title

'Habitat Data for Arizona Grasshopper Sparrow Territories, Nest Plots, and Random Transects, 2009 to 2013'

In [6]:
abstract = md.metadata.idinfo.descript.abstract.text
abstract

'These data provide information about vegetation structure and composition associated with Arizona Grasshopper Sparrow territories, nest plots, and random transects on two study sites - Audubon Appleton-Whittell Research Ranch and BLM Las Cienegas National Conservation Area (NCA) - Davis Pasture - in southeastern Arizona.  Data were collected from 2009 through 2013.  These are data associated with Ruth and Skagen (2017) cited above.'

In [7]:
pubdate = md.metadata.idinfo.citation.citeinfo.pubdate.text
pubyear = pubdate[:4]
pubdate, pubyear

('201707', '2017')

### Pull the authors and ORCIDs from IPDS

In [8]:
# Open a requests session for IPDS that authenticates with NTLM.
ipds_session = requests.Session()
# You will need to change this to your AD username. When prompted enter AD password.
# The NTLM account name is built as 'GS\' plus the part of `username` before the '@'.
ipds_session.auth = HttpNtlmAuth('GS\\{}'.format(username.split('@')[0]), password, ipds_session)

In [9]:
# Query the IPDS authors list for every author attached to this IP number.
authors_url = "https://ipds.usgs.gov/_vti_bin/listdata.svc/IPDSAuthors()?$filter=IPNumber%20eq%20%27{}%27".format(ipds_number)

content = ipds_session.get(authors_url)
# Fail loudly on an authentication or server error instead of quietly
# parsing an error page into an empty author list.
content.raise_for_status()
soup = BeautifulSoup(content.text, "lxml-xml")


def _entry_text(entry, tag_name):
    """Return the text of the first `tag_name` element in `entry`, or '' if it is absent or empty."""
    node = entry.find(tag_name)
    if node is None or node.string is None:
        return ''
    return str(node.string)


# One record per <entry>. A missing or empty ORCID is stored as '' rather than
# the literal string 'None' that str(None) would produce, so a bogus ORCID is
# never forwarded to the DOI form later on.
author_list = [
    {'author_name': _entry_text(entry, 'AuthorNameText'),
     'ORCID': _entry_text(entry, 'ORCID')}
    for entry in soup.find_all('entry')
]

author_list

[{'ORCID': '0000-0003-1576-5957', 'author_name': 'Ruth, Janet M.'}]

### Make the SB item from our metadata and upload our data to it

In [11]:
sb = pysb.SbSession()
sb.login(username, password)

<pysb.SbSession.SbSession at 0x77485c0>

In [12]:
# ScienceBase item id passed as the first argument of the upload call below.
# The commented-out alternative is the id used for testing.
root_release_id = '552d79bee4b0b22a157f59a3'
# root_release_id = '4ff5c8f9e4b03b1a74b5782d' #testing
# Upload the metadata file and create the new item from it (per the pysb method
# name; presumably as a child of `root_release_id` -- confirm against pysb docs).
# The returned item JSON is reused by the following cells.
item_json = sb.upload_file_and_create_item(root_release_id, md_fname)

In [13]:
item_id = item_json['id']
item_id

'59b196a4e4b020cdf7d957e6'

### Create the DOI

In [18]:
# Reserve a DOI for the new ScienceBase item by driving the USGS DOI tool's web
# form: fetch the login page, log in, then post the DOI request form.

# Landing page the DOI will resolve to: the catalog page of the new SB item.
rsc_url = 'https://www.sciencebase.gov/catalog/item/' + item_id

# Separate session for the DOI tool (despite the short name, `r` is a Session).
r = requests.Session()

doi_tool_url = 'https://www1.usgs.gov/csas/doi/' # production
# doi_tool_url = 'https://www1-staging.snafu.cr.usgs.gov/csas/doi/' #testing

# NOTE(review): verify=False turns off TLS certificate verification on every
# request in this cell, including the one that posts the username/password.
# Confirm whether that is still needed for the production host.
req = r.get(doi_tool_url, verify=False) # fetch the login page to obtain cookies and the CSRF token
# Scrape the CSRF token out of the login form's hidden `_csrf` input.
# NOTE(review): this string split raises IndexError if the page markup changes.
form_csrf  = req.content.decode('utf-8').split('name="_csrf" value="')[1].split('" />\n')[0]
cred = {'j_username': username, 'j_password': password, '_csrf': form_csrf}
# Log in; `response` is not inspected afterwards, so a failed login is not detected here.
response = r.post(doi_tool_url + 'j_spring_security_check', cookies=req.cookies, data=cred, verify=False)


# Accounts submitted in the form's `usersAndTypes[...]` fields, each with the value 'PRIMARY'.
key_username = 'sciencebase@ornl.gov'
key5 = 'usersAndTypes[' + key_username + ']'
value5 = 'PRIMARY'
# specific user credentials, will dup if above already
key6 = 'usersAndTypes[talbertc@usgs.gov]'
value6 = 'PRIMARY'
key7 = 'usersAndTypes[afreeman@usgs.gov]'
value7 = 'PRIMARY'


# Form payload for the DOI request. Title, abstract and publication year come
# from the metadata record; the remaining fields are submitted blank or with
# fixed values. 'status': 'reserved' is sent as the requested DOI status.
d = {'identifier':'',
    'title':title,
    'resourceURL': rsc_url,
    key5: value5, key6: value6, key7: value7,
    'addNewCreatorAuthor': '', 
    'addNewCreatorAuthorOrcid': '',
    'authorValidity':'valid',
    'abstract_type_description' : abstract,
    'datasource_id': 17368,  #This number is for the Fort Collins Science Center
    'subject' : '',
    'project_year' : '',
    'project_date' : '',
    'project_start_year' : '',
    'project_end_year' : '',
    'project_start_date' : '',
    'project_end_date' : '',
    'date_type' : '',
    'date' : '',
    'pubYear':str(pubyear), # first four characters of the metadata pubdate
    'resourceType': 'Dataset',
    'publisher': 'U.S. Geological Survey',
    'status': 'reserved',
    '_csrf': form_csrf,
    'save': 'Submit'}

# Add one indexed group of form fields per IPDS author, keeping the IPDS order
# as the author position.
for i, author in enumerate(author_list):
    d['authors[{}].authorName'.format(i)] = author['author_name']
    d['authors[{}].orcId'.format(i)] = author['ORCID']
    d['authors[{}].position'.format(i)] = i
    
# Submit the form. The next cell parses the DOI out of this response body.
create_new_doi = r.post(doi_tool_url + 'result.htm', cookies = req.cookies, data = d, verify = False)
create_new_doi



<Response [200]>

In [19]:
# Pull the DOI out of the confirmation page returned by the DOI tool.
doi_page = create_new_doi.content.decode('utf-8')
doi_marker = 'Your DOI has been saved: '
if doi_marker not in doi_page:
    # A <Response [200]> alone does not prove a DOI was saved; without this
    # check a missing confirmation message surfaces as an opaque IndexError.
    raise RuntimeError(
        "DOI tool response did not contain '{}'; no DOI could be read from it.".format(doi_marker.strip()))
# Take the text between the marker and the closing </div>, with all spaces removed.
new_doi = doi_page.split(doi_marker)[1].split('</div>')[0].replace(" ","").strip()
new_doi

'doi:10.5072/FK22B92D90'

### Update our SB item to include the IPDS link and DOI 

In [15]:
# Both identifiers use the same ScienceBase identifier vocabulary scheme.
identifier_scheme = "https://www.sciencebase.gov/vocab/category/item/identifier"
ipds_format = dict(type="IPDS", scheme=identifier_scheme, key=ipds_number)
doi_format = dict(type="DOI", scheme=identifier_scheme, key=new_doi)

# Send only the item id plus the identifiers list, and keep the item JSON that
# ScienceBase returns for the following cells.
identifier_update = {'id': item_id, 'identifiers': [ipds_format, doi_format]}
item_json = sb.updateSbItem(identifier_update)

### update the SB citation with our DOI

In [16]:
item_json['citation']

'Janet M Ruth, 2017, Life history attributes data for Arizona Grasshopper Sparrow (Ammodramus savannarum ammolegus) in Arizona 2013: .'

In [17]:
item_json = sb.get_item(item_id)
citation = item_json['citation']
citation

'Janet M Ruth, 2017, Life history attributes data for Arizona Grasshopper Sparrow (Ammodramus savannarum ammolegus) in Arizona 2013: .'

In [18]:
# Replace whatever follows the final ':' of the SB-generated citation (': .' in
# the citation displayed above) with the publisher text and the DOI link.
# rsplit(':', 1) cuts only at the last colon, so colons inside the title are
# kept; splitting on every ':' and re-joining with '' would delete them.
citation_stem = citation.rsplit(':', 1)[0]
# `new_doi` normally carries a 'doi:' prefix; strip it only when it is present.
bare_doi = new_doi[4:] if new_doi.lower().startswith('doi:') else new_doi
new_citation = citation_stem + ": U.S. Geological Survey data release, https://doi.org/{}.".format(bare_doi)
item_json['citation'] = new_citation
new_citation

'Janet M Ruth, 2017, Life history attributes data for Arizona Grasshopper Sparrow (Ammodramus savannarum ammolegus) in Arizona 2013: U.S. Geological Survey data release, https://doi.org/10.5066/F7C53JCF.'

In [20]:
# item_json['tags'].append({'name': 'Data Release - In Progress', 'scheme': 'Common geographic areas', 'type': 'Browse Categories'})
item_json['browseCategories'] = ['Data Release - In Progress']

In [21]:
item_json['webLinks'] = []

In [22]:
item_json = sb.update_item(item_json)

### Replace the onlink in our MD with the DOI, both on the local copy and the SB copy

In [28]:
# Point both online-linkage elements of the metadata record (citation onlink
# and distribution networkr) at the DOI URL.
# `new_doi` looks like 'doi:10.xxxx/yyyy'; [4:] drops the 'doi:' prefix.
doi_url = "https://doi.org/{}".format(new_doi[4:])
md.metadata.idinfo.citation.citeinfo.onlink.text = doi_url
# Fixed: this previously read `doi.url`, which is not the variable defined above.
md.metadata.distinfo.stdorder.digform.digtopt.onlinopt.computer.networka.networkr.text = doi_url

In [29]:
md.save()

In [32]:
md_fname

'C:\\Users\\talbertc\\Downloads\\Arizona_GRSP_lifehistory_data.xml'

In [34]:
sb.replace_file(md_fname, item_json)

In [35]:
md_fname

'C:\\Users\\talbertc\\Downloads\\Arizona_GRSP_lifehistory_data.xml'

## Remove items, replace onlink and networkr with the DOI, and upload matched csv/xml as child items

In [22]:
sb.get_item_file_info(item_json)

[{'name': 'Arizona_GRSP_lifehistory_data.xml',
  'size': 12331,
  'url': 'https://www.sciencebase.gov/catalog/file/get/592727dfe4b0b7ff9fb5dca1?f=__disk__6d%2F29%2F3b%2F6d293b570148818f447a232bf39bd21788c7dae0'},
 {'name': 'ARCHIVE 2013 AZ GRSP Egg Measurements.csv',
  'size': 2242,
  'url': 'https://www.sciencebase.gov/catalog/file/get/592727dfe4b0b7ff9fb5dca1?f=__disk__dd%2Fda%2F89%2Fddda89c28d79aa70a1fb2e390d0a8e7842123087'},
 {'name': 'ARCHIVE 2013 AZ GRSP Egg Measurements.csv.xml',
  'size': 15185,
  'url': 'https://www.sciencebase.gov/catalog/file/get/592727dfe4b0b7ff9fb5dca1?f=__disk__65%2Fb0%2Ff5%2F65b0f50d0295bcbc0e06aa78f87c76edcb6c5da9'},
 {'name': 'ARCHIVE AZ GRSP adult male return rates.csv',
  'size': 10226,
  'url': 'https://www.sciencebase.gov/catalog/file/get/592727dfe4b0b7ff9fb5dca1?f=__disk__18%2Fdb%2F42%2F18db42d42324ddba5eb88cd87c9cf6af4590be04'},
 {'name': 'ARCHIVE AZ GRSP adult male return rates.csv.xml',
  'size': 22165,
  'url': 'https://www.sciencebase.gov/cat

In [24]:
doi = md.metadata.idinfo.citation.citeinfo.onlink.text
doi

'https://doi.org/10.5066/F7C53JCF'

In [33]:
import os
dname = r'C:\Users\talbertc\Downloads\Lifehistoryattr (1)'
files = os.listdir(dname)

In [30]:
csvs = [f for f in files if f.endswith('.csv')]
csvs

['ARCHIVE 2013 AZ GRSP Egg Measurements.csv',
 'ARCHIVE AZ GRSP adult male return rates.csv',
 'ARCHIVE AZ GRSP bird measurements.csv',
 'ARCHIVE AZ GRSP Clutch Size.csv',
 'ARCHIVE AZ GRSP Nest Dimensions.csv',
 'ARCHIVE AZ GRSP Nest Plant.csv',
 'ARCHIVE AZ GRSP Territory Size KDE.csv',
 'ARCHIVE How AZ GRSP Nest Found.csv']

#### This is where I added ....

In [56]:
# For each CSV: stamp the DOI into its sidecar '<csv>.xml' metadata record,
# create a child item under `item_id` from that XML, attach the CSV to the
# child, then tidy the child's citation and title.
for csv_name in csvs:
    print(csv_name)
    data_path = os.path.join(dname, csv_name)
    sidecar_path = data_path + '.xml'

    # Write the DOI into both online-linkage elements and save the XML in place.
    sidecar_md = XMLRecord(sidecar_path)
    sidecar_md.metadata.idinfo.citation.citeinfo.onlink.text = doi
    sidecar_md.metadata.distinfo.stdorder.digform.digtopt.onlinopt.computer.networka.networkr.text = doi
    sidecar_md.save()

    # Create the child item from the metadata file, then add the data file to it.
    child_item = sb.upload_file_and_create_item(item_id, sidecar_path)
    child_id = child_item['id']
    sb.upload_file_to_item(child_item, data_path)

    # Fill the empty publisher slot that precedes the DOI link in the citation.
    citation = child_item['citation']
    child_item['citation'] = citation.replace(
        ': , https://doi.org',
        ': U.S. Geological Survey data release, https://doi.org')

    # Drop the species name from the title. Note that capitalize() also
    # lower-cases every character after the first.
    shortened_title = child_item['title'].replace('Grasshopper Sparrow (Ammodramus savannarum ammolegus) ', '')
    child_item['title'] = shortened_title.capitalize()
    sb.update_item(child_item)

ARCHIVE 2013 AZ GRSP Egg Measurements.csv
ARCHIVE AZ GRSP adult male return rates.csv
ARCHIVE AZ GRSP bird measurements.csv
ARCHIVE AZ GRSP Clutch Size.csv
ARCHIVE AZ GRSP Nest Dimensions.csv
ARCHIVE AZ GRSP Nest Plant.csv
ARCHIVE AZ GRSP Territory Size KDE.csv
ARCHIVE How AZ GRSP Nest Found.csv


In [40]:
child_id

'59446576e4b062508e323325'

In [47]:
citation = child_item['citation']
citation = citation.replace(': , https://doi.org', ': U.S. Geological Survey data release, https://doi.org')

'Janet M Ruth, 2017, Grasshopper Sparrow (Ammodramus savannarum ammolegus) how nests found Arizona 2011-2013: U.S. Geological Survey data release, https://doi.org/10.5066/F7C53JCF.'

In [53]:

new_title.capitalize()

'How nests found arizona 2011-2013'

In [55]:
# WARNING: destructive. Fetches every child id that ScienceBase reports for
# `item_id` and deletes each of those items in turn.
for child_id in sb.get_child_ids(item_id):
    # delete_item is given the full item JSON, so fetch it first.
    child_item = sb.get_item(child_id)
    sb.delete_item(child_item)

# Final update of publication DOI and citation in MD and SB item (optional)

#### The Author provides a DOI of the publication associated with this data release

In [1]:
pub_doi = '10.1642/AUK-16-195.1'  #insert DOI here!

In [2]:
from habanero import cn

In [9]:
# NOTE(review): this reassigns `pub_doi` to a different DOI than the one entered
# in the "insert DOI here!" cell; remove this line to use the author-supplied DOI.
pub_doi = u'10.1126/science.169.3946.635'
# Request an APA-style plain-text citation for the DOI via habanero's content
# negotiation. This also rebinds `citation`, which earlier cells used for the
# ScienceBase item citation.
# NOTE(review): verify=False presumably disables TLS certificate checks -- confirm it is needed.
citation = cn.content_negotiation(ids = pub_doi, format = "text", style="apa", verify=False)
# Re-encode the text as Latin-1 bytes and decode them again (bytearray.decode()
# defaults to UTF-8) -- presumably to repair text that was mis-decoded upstream; confirm.
print(bytearray(citation, 'Latin-1').decode())



Frank, H. S. (1970). The Structure of Ordinary Water: New data and interpretations are yielding new insights into this fascinating substance. Science, 169(3946), 635–641. doi:10.1126/science.169.3946.635





In [10]:
citation2 = cn.content_negotiation(ids = pub_doi, format = "text", style="ieee", verify=False)
print(bytearray(citation2, 'Latin-1').decode())



[1]H. S. Frank, “The Structure of Ordinary Water: New data and interpretations are yielding new insights into this fascinating substance,” Science, vol. 169, no. 3946, pp. 635–641, Aug. 1970.





In [11]:
citation != citation2

True

In [107]:
print(cn.content_negotiation(ids = pub_doi, format = "bibtex", style="ieee", verify=False))

@article{Cade_2017,
	doi = {10.1642/auk-16-195.1},
	url = {https://doi.org/10.1642%2Fauk-16-195.1},
	year = 2017,
	month = {jul},
	publisher = {American Ornithologists{\textquotesingle} Union},
	volume = {134},
	number = {4},
	pages = {783--801},
	author = {Brian S. Cade and Barry R. Noon and Rick D. Scherer and John J. Keane},
	title = {Logistic quantile regression provides improved estimates for bounded avian counts: A case study of California Spotted Owl fledgling production},
	journal = {The Auk}
}




In [12]:
cn.content_negotiation?

In [7]:
c = cn.content_negotiation?

In [None]:
c = cn.content_negotiation

In [None]:
c = cn.content_negotiation

In [5]:
c?

Object `c` not found.


In [None]:
c = content_negotiation

In [90]:
import json
cite_data = json.loads(cn.content_negotiation(ids = pub_doi, format = "citeproc-json"))

In [91]:
cite_data.keys()

dict_keys(['relation', 'container-title', 'member', 'issued', 'subtitle', 'publisher', 'reference-count', 'issn-type', 'link', 'source', 'volume', 'original-title', 'author', 'published-online', 'type', 'prefix', 'short-title', 'title', 'score', 'issue', 'subject', 'license', 'short-container-title', 'alternative-id', 'indexed', 'published-print', 'DOI', 'deposited', 'created', 'URL', 'is-referenced-by-count', 'content-domain', 'page', 'ISSN', 'references-count'])

In [98]:
from pymdwizard.core.xml_utils import XMLRecord, XMLNode

def get_doi_citation(doi):
    """Build a CSDGM-style <citation> node for a publication DOI.

    Fetches the DOI's citeproc-json record through habanero content
    negotiation and returns an XMLNode tree of the form
    citation > citeinfo > title, where the title text is taken from the
    record's 'title' field.

    NOTE(review): the output captured in this notebook shows <citeinfo>
    without a <title> child; the cause is not visible from this function
    and should be checked against XMLNode's constructor.
    """
    raw_record = cn.content_negotiation(ids=doi, format="citeproc-json")
    cite_data = json.loads(raw_record)

    citation_node = XMLNode(tag='citation')
    citeinfo_node = XMLNode(tag='citeinfo', parent_node=citation_node)
    # Constructed only for its parent_node argument; the node itself is not used again.
    XMLNode(tag='title', parent_node=citeinfo_node, text=cite_data['title'])

    return citation_node
    
    

In [99]:
 get_doi_citation(pub_doi)

<citation>
  <citeinfo>

  </citeinfo>
</citation>

In [76]:
cn.content_negotiation?

In [5]:
sb_item = sb.get_item('5910958ce4b0e541a03a85c1')

In [6]:
# Print the name of every CSL citation style that habanero reports.
# (An earlier experiment, since disabled, rendered `pub_doi` in each style and
# printed only the styles whose output differed from `citation`.)
for style_name in cn.csl_styles(verify=False):
    print(style_name)



academy-of-management-review
accident-analysis-and-prevention
acm-sig-proceedings-long-author-list
acm-sig-proceedings
acm-sigchi-proceedings-extended-abstract-format
acm-sigchi-proceedings
acm-siggraph
acs-nano
acta-anaesthesiologica-scandinavica
acta-anaesthesiologica-taiwanica
acta-chirurgiae-orthopaedicae-et-traumatologiae-cechoslovaca
acta-naturae
acta-neurochirurgica
acta-ophthalmologica
acta-orthopaedica-belgica
acta-palaeontologica-polonica
acta-pharmaceutica-sinica-b
acta-pharmaceutica
acta-philosophica
acta-polytechnica
acta-psychiatrica-scandinavica
acta-scientiae-veterinariae
acta-societatis-botanicorum-poloniae
acta-universitatis-agriculturae-et-silviculturae-mendelianae-brunensis
acta-universitatis-agriculturae-sueciae
addiction-biology
administrative-science-quarterly
advanced-engineering-materials
advanced-functional-materials
advanced-materials
advanced-optical-materials
advances-in-alzheimers-disease
advances-in-complex-systems
aerosol-science-and-technology
african-j

presses-universitaires-de-paris-nanterre
presses-universitaires-de-rennes-archeologie-et-culture
presses-universitaires-de-rennes
primary-care-clinics-in-office-practice
proceedings-of-the-royal-society-b
proinflow
protein-engineering-design-and-selection
protein-science
proteomics
psychiatric-clinics-of-north-america
psychiatric-services
psychiatry-and-clinical-neurosciences
psychological-medicine
psychosomatic-medicine
psychosomatics
public-health-nutrition
quaderni-degli-avogadro-colloquia
quaternary-international
r-and-d-management
radiochimica-acta
radiographics
radiography
radiologic-clinics-of-north-america
radiology
radiopaedia
rapid-communications-in-mass-spectrometry
recent-patents-on-drug-delivery-and-formulation
recherches-en-sciences-de-gestion
renewable-agriculture-and-food-systems
reports-of-practical-oncology-and-radiotherapy
reproduction-in-domestic-animals
reproduction
research-on-biomedical-engineering
restoration-ecology
reviews-of-modern-physics-with-titles
revista