Skip to content

Commit

Permalink
Merge pull request #90 from mpsonntag/warningUpdates
Browse files Browse the repository at this point in the history
Additional admin warnings
  • Loading branch information
achilleas-k committed Apr 1, 2021
2 parents 09f5388 + ecf6830 commit 4a16e7d
Show file tree
Hide file tree
Showing 10 changed files with 379 additions and 36 deletions.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
[![Tests](https://github.com/g-node/gin-doi/workflows/run-tests/badge.svg?branch=master)](https://github.com/G-Node/gin-doi/actions)
[![Coverage Status](https://coveralls.io/repos/github/G-Node/gin-doi/badge.svg?branch=master)](https://coveralls.io/github/G-Node/gin-doi?branch=master)
[![Go Report Card](https://goreportcard.com/badge/github.com/g-node/gin-doi)](https://goreportcard.com/report/github.com/g-node/gin-doi)
[![PkgGoDev](https://pkg.go.dev/badge/github.com/g-node/gin-doi)](https://pkg.go.dev/github.com/G-Node/gin-doi)

# GIN DOI

GIN-DOI is the G-Node Infrastructure DOI service.
The service can, at the request of a repository owner, copy a public repository, pack everything into an archive file, store it in a safe location, and provide a DOI (digital object identifier) with which the archive can be cited.

Expand All @@ -10,3 +16,13 @@ GIN-DOI fulfills the [DataCite](https://www.datacite.org/) standard which (accor
* Establish easier access to research data on the Internet.
* Increase acceptance of research data as legitimate, citable contributions to the scholarly record.
* Support data archiving that will permit results to be verified and re-purposed for future study.

## Dependencies

gin-doi depends on [G-Node/libgin](https://github.com/G-Node/libgin) and [G-Node/gin-cli](https://github.com/G-Node/gin-cli).

When building gin-doi from source and using a different version of `libgin` or `gin-cli` than specified in the `go.mod` file, use `go get` to fetch the latest `libgin` or `gin-cli` release or point to a specific commit in master.

As an example:
- `go get github.com/G-Node/libgin` to include the latest release
- `go get github.com/G-Node/libgin@[commit hash]` for a specific commit in the master branch of G-Node/libgin
11 changes: 7 additions & 4 deletions cmd/gindoid/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,9 @@ func readRepoYAML(infoyml []byte) (*libgin.RepositoryYAML, error) {
return nil, fmt.Errorf("error while reading DOI info: %s", err.Error())
}
if missing := checkMissingValues(yamlInfo); len(missing) > 0 {
missing = deduplicateValues(missing)
log.Print("DOI file is missing entries")
return nil, fmt.Errorf(strings.Join(missing, " "))
return nil, fmt.Errorf(strings.Join(missing, "; "))
}
return yamlInfo, nil
}
Expand Down Expand Up @@ -382,20 +383,22 @@ func readAndValidate(conf *Configuration, repository string) (*libgin.Repository
repoMetadata, err := readRepoYAML(dataciteText)
if err != nil {
log.Print("DOI file invalid")
err := fmt.Errorf("%s<p><i>%s</i></p>", msgInvalidDOI, err.Error())
// reformat error messages
msgs := strings.Split(err.Error(), "; ")
err := fmt.Errorf("%s<div align='left' style='padding-left: 50px;'><i><ul><li>%s</li></ul></i></div>", msgInvalidDOI, strings.Join(msgs, "</li><li>"))
return nil, err
}

// fail registration on missing LICENSE file
_, err = readFileAtURL(repoFileURL(conf, repository, "LICENSE"))
if err != nil {
log.Printf("Failed to fetch LICENSE: %s", err.Error())
return nil, fmt.Errorf(msgNoLicenseFile)
return nil, fmt.Errorf("<p>%s</p>", msgNoLicenseFile)
}

// fail registration if unsupported values have been used
if msgs := validateDataCiteValues(repoMetadata); len(msgs) > 0 {
err := fmt.Errorf("%s<i><p>%s</p></i>", msgInvalidDOI, strings.Join(msgs, "</p><p>"))
err := fmt.Errorf("%s<div align='left' style='padding-left: 50px;'><i><ul><li>%s</li></ul></i></div>", msgInvalidDOI, strings.Join(msgs, "</li><li>"))
return nil, err
}

Expand Down
3 changes: 2 additions & 1 deletion cmd/gindoid/licenses.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ const defaultLicensesJSON = `[
"CC0 1.0 Universal",
"CC0 1.0 Universal (CC0 1.0) Public Domain Dedication",
"Creative Commons CC0 1.0 Public Domain Dedication",
"CC0"
"CC0",
"Creative Commons CC0 1.0 Universal"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion cmd/gindoid/messages.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ If you would like to make any changes to the dataset before it is published, or
msgNoLicense = "No valid license provided. Please specify a license URL and name and make sure it matches the license file in the repository."
msgNoLicenseFile = `The LICENSE file is missing. The full text of the license is required to be in the repository when publishing. See the <a href="https://gin.g-node.org/G-Node/Info/wiki/Licensing">Licensing</a> help page for details and links to recommended data licenses.`
msgLicenseMismatch = `The LICENSE file does not match the license specified in the metadata. See the <a href="https://gin.g-node.org/G-Node/Info/wiki/Licensing">Licensing</a> help page for links to full text for available licenses.`
msgInvalidReference = "One of the Reference entries is not valid. Please provide the full citation and type of the reference."
msgInvalidReference = "Not all Reference entries are valid. Please provide the full citation and type of the reference."
msgBadEncoding = `There was an issue with the content of the DOI file (datacite.yml). This might mean that the encoding is wrong. Please see <a href="https://gin.g-node.org/G-Node/Info/wiki/DOIfile">the DOI guide</a> for detailed instructions or contact gin@g-node.org for assistance.`

msgSubmitError = "An internal error occurred while we were processing your request. The G-Node team has been notified of the problem and will attempt to repair it and process your request. We may contact you for further information regarding your request. Feel free to <a href=mailto:gin@g-node.org>contact us</a> if you would like to provide more information or ask about the status of your request."
Expand Down
19 changes: 18 additions & 1 deletion cmd/gindoid/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@ func makeUUID(URI string) string {
return hex.EncodeToString(currMd5[:])
}

// deduplicateValues returns a new slice that holds the first
// occurrence of every string in dupvals, keeping the original order.
// Entries are compared exactly, so strings that differ only in case
// are all kept. The returned slice is never nil, even for empty input.
func deduplicateValues(dupvals []string) []string {
	seen := make(map[string]struct{}, len(dupvals))
	unique := make([]string, 0, len(dupvals))
	for _, entry := range dupvals {
		if _, dup := seen[entry]; dup {
			continue
		}
		seen[entry] = struct{}{}
		unique = append(unique, entry)
	}
	return unique
}

// EscXML runs a string through xml.EscapeText.
// This is a utility function for the doi.xml template.
func EscXML(txt string) string {
Expand Down Expand Up @@ -184,7 +199,9 @@ func AuthorBlock(authors []libgin.Creator) template.HTML {
var url, id, affiliationSup string
if author.Identifier != nil {
id = author.Identifier.ID
url = author.Identifier.SchemeURI + id
if author.Identifier.SchemeURI != "" {
url = author.Identifier.SchemeURI + id
}
}

// Author names are LastName, FirstName
Expand Down
36 changes: 34 additions & 2 deletions cmd/gindoid/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,41 @@ package main

import (
"fmt"
"reflect"
"testing"
)

// TestDeduplicateValues verifies that deduplicateValues handles empty
// input, leaves duplicate-free input untouched, drops repeated entries
// while keeping the order, and compares entries case-sensitively.
func TestDeduplicateValues(t *testing.T) {
	// An empty slice comes back as an equal, empty slice.
	empty := []string{}
	if got := deduplicateValues(empty); !reflect.DeepEqual(empty, got) {
		t.Fatalf("Slices (empty) are not equal: %v | %v", empty, got)
	}

	// Input without duplicates is returned unchanged.
	distinct := []string{"a", "b", "c"}
	if got := deduplicateValues(distinct); !reflect.DeepEqual(distinct, got) {
		t.Fatalf("Slices (no duplicates) are not equal: %v | %v", distinct, got)
	}

	// A repeated entry is dropped; only its first occurrence remains.
	repeated := []string{"a", "b", "a", "c"}
	want := []string{"a", "b", "c"}
	if got := deduplicateValues(repeated); !reflect.DeepEqual(want, got) {
		t.Fatalf("Slices (duplicates) are not equal: %v | %v", want, got)
	}

	// Entries that differ only in case are not treated as duplicates.
	mixedCase := []string{"A", "b", "a", "B", "c"}
	if got := deduplicateValues(mixedCase); !reflect.DeepEqual(mixedCase, got) {
		t.Fatalf("Slices (no case duplicates) are not equal: %v | %v", mixedCase, got)
	}
}

// TestAwardNumber checks proper AwardNumber split and return in util.AwardNumber.
func TestAwardNumber(t *testing.T) {
subname := "funder name"
Expand Down Expand Up @@ -34,7 +66,7 @@ func TestAwardNumber(t *testing.T) {
}

// Test no issue on empty string
outstr = AwardNumber("")
_ = AwardNumber("")

// Test proper split on comma with semi-colon and surrounding whitespaces
subnumissue := " award, num "
Expand Down Expand Up @@ -73,7 +105,7 @@ func TestFunderName(t *testing.T) {
}

// Test no issue on empty string
outstr = FunderName("")
_ = FunderName("")

// Test proper split on comma with semi-colon and surrounding whitespaces
subnameissue := " funder, name "
Expand Down
118 changes: 95 additions & 23 deletions cmd/gindoid/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"log"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/G-Node/libgin/libgin"
Expand All @@ -22,28 +23,6 @@ var allowedValues = map[string][]string{
// may need admin attention. These should be sent with the followup
// notification email.
func collectWarnings(job *RegistrationJob) (warnings []string) {
// Check if any funder IDs are missing
if job.Metadata.FundingReferences != nil {
for _, funder := range *job.Metadata.FundingReferences {
if funder.Identifier == nil || funder.Identifier.ID == "" {
warnings = append(warnings, fmt.Sprintf("Couldn't find funder ID for funder %q", funder.Funder))
}
}
}

// Check if a reference from the YAML file uses the old "Name" field instead of "Citation"
// This shouldn't be an issue, but it can cause formatting issues
for idx, ref := range job.Metadata.YAMLData.References {
if ref.Name != "" {
warnings = append(warnings, fmt.Sprintf("Reference %d uses old 'Name' field instead of 'Citation'", idx))
}
}

// The 80 character limit is arbitrary, but if the abstract is very short, it's worth a check
if absLen := len(job.Metadata.YAMLData.Description); absLen < 80 {
warnings = append(warnings, fmt.Sprintf("Abstract may be too short: %d characters", absLen))
}

// NOTE: This is a workaround for the current inability to check a
// potential DOI fork for previous releases. If the repository has a DOI
// fork, a notice is added to the admin email to check for previous
Expand All @@ -57,12 +36,105 @@ func collectWarnings(job *RegistrationJob) (warnings []string) {
}
}

// Check authors
warnings = authorWarnings(job.Metadata.YAMLData, warnings)

// The 80 character limit is arbitrary, but if the abstract is very short, it's worth a check
if absLen := len(job.Metadata.YAMLData.Description); absLen < 80 {
warnings = append(warnings, fmt.Sprintf("Abstract may be too short: %d characters", absLen))
}

// Check licenses
repoLicURL := repoFileURL(job.Config, job.Metadata.SourceRepository, "LICENSE")
warnings = licenseWarnings(job.Metadata.YAMLData, repoLicURL, warnings)

// Check if any funder IDs are missing
if job.Metadata.FundingReferences != nil {
for _, funder := range *job.Metadata.FundingReferences {
if funder.Identifier == nil || funder.Identifier.ID == "" {
warnings = append(warnings, fmt.Sprintf("Couldn't find funder ID for funder %q", funder.Funder))
}
}
}

// Check references
warnings = referenceWarnings(job.Metadata.YAMLData, warnings)

// Warn if resourceType is not 'Dataset'
if !strings.EqualFold(job.Metadata.YAMLData.ResourceType, "dataset") {
warnings = append(warnings, fmt.Sprintf("ResourceType is %q (expected Dataset)", job.Metadata.YAMLData.ResourceType))
}

return
}

// orcidLikeRE matches a group of four blocks of four characters joined
// by dashes (digits, with an optional trailing X) anywhere in a string.
// It is compiled once at package scope instead of on every call.
var orcidLikeRE = regexp.MustCompile(`([[:digit:]]{4}-){3}[[:digit:]]{3}[[:digit:]X]`)

// authorWarnings checks datacite authors for validity and returns
// corresponding warnings if required.
//
// Authors without an ID are skipped. For every other author a warning
// is appended to warnings when
//   - the ID starts with neither "orcid" nor "researcherid"
//     (case-insensitive); the message tells apart IDs that contain an
//     ORCID-like number from all other unknown IDs,
//   - the ID consists only of the prefix "orcid:" or "researcherid:"
//     (ignoring case and surrounding whitespace), or
//   - the lower-cased ID was already used by an earlier author.
//
// The extended warnings slice is returned.
func authorWarnings(yada *libgin.RepositoryYAML, warnings []string) []string {
	// seenIDs maps a lower-cased author ID to the "index (LastName)"
	// label of the first author that used it.
	seenIDs := make(map[string]string)

	for idx, auth := range yada.Authors {
		if auth.ID == "" {
			continue
		}
		lowerID := strings.ToLower(auth.ID)

		// Warn when not able to identify ID type
		if !strings.HasPrefix(lowerID, "orcid") && !strings.HasPrefix(lowerID, "researcherid") {
			if orcidLikeRE.MatchString(auth.ID) {
				warnings = append(warnings, fmt.Sprintf("Author %d (%s) has ORCID-like unspecified ID: %s", idx, auth.LastName, auth.ID))
			} else {
				warnings = append(warnings, fmt.Sprintf("Author %d (%s) has unknown ID: %s", idx, auth.LastName, auth.ID))
			}
		}

		// Warn on known ID type but missing value
		switch strings.TrimSpace(lowerID) {
		case "orcid:", "researcherid:":
			warnings = append(warnings, fmt.Sprintf("Author %d (%s) has empty ID value: %s", idx, auth.LastName, auth.ID))
		}

		// Warn on duplicate ID entries
		curr := fmt.Sprintf("%d (%s)", idx, auth.LastName)
		if first, isDuplicate := seenIDs[lowerID]; isDuplicate {
			warnings = append(warnings, fmt.Sprintf("Authors %s and %s have the same ID: %s", first, curr, auth.ID))
		} else {
			seenIDs[lowerID] = curr
		}
	}

	return warnings
}

// referenceWarnings walks all references of the datacite YAML data
// and appends a warning to warnings for each reference that fills the
// old "Name" field, that has a refType other than "IsSupplementTo"
// (compared after lower-casing), or whose ID has no non-blank type
// prefix in front of a ":". The extended warnings slice is returned.
func referenceWarnings(yada *libgin.RepositoryYAML, warnings []string) []string {
	for idx, ref := range yada.References {
		// The old "Name" field still works in place of "Citation",
		// but it can cause formatting issues.
		if ref.Name != "" {
			warnings = append(warnings, fmt.Sprintf("Reference %d uses old 'Name' field instead of 'Citation'", idx))
		}

		// Any refType other than "IsSupplementTo" is worth a look.
		if refType := strings.ToLower(ref.RefType); refType != "issupplementto" {
			warnings = append(warnings, fmt.Sprintf("Reference %d uses refType '%s'", idx, ref.RefType))
		}

		// The related ID type is whatever precedes the first ":" of
		// the ID; without a ":" or with a blank prefix it is missing.
		relIDType := ""
		if sep := strings.Index(ref.ID, ":"); sep >= 0 {
			relIDType = strings.TrimSpace(ref.ID[:sep])
		}
		if relIDType == "" {
			warnings = append(warnings, fmt.Sprintf("Reference %d has no related ID type: '%s'; excluded from XML file", idx, ref.ID))
		}
	}
	return warnings
}

// DOILicense holds Name (official license title), URL (license online reference)
// and Alias names for a license used for a DOI registration.
type DOILicense struct {
Expand Down Expand Up @@ -93,7 +165,7 @@ func licenseWarnings(yada *libgin.RepositoryYAML, repoLicenseURL string, warning
var licenseHeader DOILicense
content, err := readFileAtURL(repoLicenseURL)
if err != nil {
warnings = append(warnings, fmt.Sprintf("Could not access license file"))
warnings = append(warnings, "Could not access license file")
} else {
headstr := string(content)
fileHeader := strings.Split(strings.Replace(headstr, "\r\n", "\n", -1), "\n")
Expand Down
Loading

0 comments on commit 4a16e7d

Please sign in to comment.