Skip to content

Commit

Permalink
Support any URL with doi marker
Browse files Browse the repository at this point in the history
  • Loading branch information
DJRHails committed Sep 8, 2022
1 parent f66129c commit 5af1462
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 9 deletions.
6 changes: 3 additions & 3 deletions pdf2doi/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ def standardise_doi(identifier):
'(10\.\d{4}[\d\:\.\-\/a-z]+)(?:[\s\n\"<]|$)', # in version 1 the requirement of having "DOI : " in the beginning is removed
'(10\.\d{4}[\:\.\-\/a-z]+[\:\.\-\d]+)(?:[\s\na-z\"<]|$)', # version 2 is useful for cases in which, in plain texts, the DOI is not followed by a space, newline or special characters,
#but is instead followed by other letters. In this case we can still isolate the DOI if we assume that the DOI always ends with numbers
'http[s]?://doi.org/(10\.\d{4,9}/[-._;()/:A-Z0-9]+)(?:[\s\n\"<]|$)', # version 3 is useful when the DOI can be found in a google result as an URL of the form https://doi.org/[DOI]
#The regex for [DOI] is 10\.\d{4,9}/[-._;()/:A-Z0-9]+ (taken from here https://www.crossref.org/blog/dois-and-matching-regular-expressions/)
'https?://[ -~]*doi[ -~]*/(10\.\d{4,9}/[-._;()/:a-z0-9]+)(?:[\s\n\"<]|$)', # version 3 is useful when the DOI can be found in a google result as an URL of the form https://doi.org/[DOI]
#The regex for [DOI] is 10\.\d{4,9}/[-._;()/:a-z0-9]+ (taken from here https://www.crossref.org/blog/dois-and-matching-regular-expressions/)
#and it must be followed by a valid ending character: either a space, a new line, a ", a <, or end of string.
'^(10\.\d{4,9}/[-._;()/:A-Z0-9]+)$'] # version 4 is like version 3, but without the requirement of the url https://doi.org/ in front of it.
'^(10\.\d{4,9}/[-._;()/:a-z0-9]+)$'] # version 4 is like version 3, but without the requirement of the url https://doi.org/ in front of it.
#However, it requires that the string contains ONLY the doi and nothing else. This is useful for when the DOI is stored in metadata


Expand Down
15 changes: 9 additions & 6 deletions pdf2doi/test_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,19 @@ def test_standardise_doi(suspected, expected):

@pytest.mark.parametrize(["suspected", "expected"], [
["10.1109/sp.2011.40"] * 2,
["doi10.1177:0146167297234003", "doi10.1177:0146167297234003"],
["10.1177:0146167297234003.pdf", "10.1177:0146167297234003.pdf"],
["doi10.1177:0146167297234003", "10.1177/0146167297234003"],
["10.1177:0146167297234003.pdf", "10.1177/0146167297234003.pdf"],
["https://journals.sagepub.com/doi/pdf/10.1177/0146167297234003", "10.1177/0146167297234003"],
["https://doi.org/10.1109/sp.2011.40", "10.1109/sp.2011.40"]
])
def test_is_loose_doi_match(suspected, expected):
print(suspected)
for ver, regex in enumerate(doi_regexp):
match = re.match(regex, suspected,re.I)
if match is not None:
assert match.group() == expected
print(f"{ver} matched.")

identifiers = re.findall(regex, suspected.lower())
if identifiers:
print(f"Matched with {ver} - {identifiers}")
assert standardise_doi(identifiers[0]) == expected
return
print(f"{ver} failed.")
assert False

0 comments on commit 5af1462

Please sign in to comment.