Support any URL with doi marker

DJRHails · Sep 8, 2022 · 5af1462 · 5af1462
1 parent f66129c
commit 5af1462
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 9 deletions.
diff --git a/pdf2doi/patterns.py b/pdf2doi/patterns.py
@@ -48,10 +48,10 @@ def standardise_doi(identifier):
               '(10\.\d{4}[\d\:\.\-\/a-z]+)(?:[\s\n\"<]|$)',                 # in version 1 the requirement of having "DOI : " in the beginning is removed
               '(10\.\d{4}[\:\.\-\/a-z]+[\:\.\-\d]+)(?:[\s\na-z\"<]|$)',     # version 2 is useful for cases in which, in plain texts, the DOI is not followed by a space, newline or special characters,
                                                                             #but is instead followed by other letters. In this case we can still isolate the DOI if we assume that the DOI always ends with numbers
-              'http[s]?://doi.org/(10\.\d{4,9}/[-._;()/:A-Z0-9]+)(?:[\s\n\"<]|$)', # version 3 is useful when the DOI can be found in a google result as an URL of the form https://doi.org/[DOI]
-                                                                            #The regex for [DOI] is 10\.\d{4,9}/[-._;()/:A-Z0-9]+ (taken from here https://www.crossref.org/blog/dois-and-matching-regular-expressions/)
+              'https?://[ -~]*doi[ -~]*/(10\.\d{4,9}/[-._;()/:a-z0-9]+)(?:[\s\n\"<]|$)', # version 3 is useful when the DOI can be found in a google result as an URL of the form https://doi.org/[DOI]
+                                                                            #The regex for [DOI] is 10\.\d{4,9}/[-._;()/:a-z0-9]+ (taken from here https://www.crossref.org/blog/dois-and-matching-regular-expressions/)
                                                                             #and it must be followed by a valid ending character: either a space, a new line, a ", a <, or end of string.
-              '^(10\.\d{4,9}/[-._;()/:A-Z0-9]+)$']                         # version 4 is like version 3, but without the requirement of the url https://doi.org/ in front of it.
+              '^(10\.\d{4,9}/[-._;()/:a-z0-9]+)$']                         # version 4 is like version 3, but without the requirement of the url https://doi.org/ in front of it.
                                                                             #However, it requires that the string contains ONLY the doi and nothing else. This is useful for when the DOI is stored in metadata
 
 

diff --git a/pdf2doi/test_patterns.py b/pdf2doi/test_patterns.py
@@ -54,16 +54,19 @@ def test_standardise_doi(suspected, expected):
 
 @pytest.mark.parametrize(["suspected", "expected"], [
     ["10.1109/sp.2011.40"] * 2,
-    ["doi10.1177:0146167297234003", "doi10.1177:0146167297234003"],
-    ["10.1177:0146167297234003.pdf", "10.1177:0146167297234003.pdf"],
+    ["doi10.1177:0146167297234003", "10.1177/0146167297234003"],
+    ["10.1177:0146167297234003.pdf", "10.1177/0146167297234003.pdf"],
+    ["https://journals.sagepub.com/doi/pdf/10.1177/0146167297234003", "10.1177/0146167297234003"],
+    ["https://doi.org/10.1109/sp.2011.40", "10.1109/sp.2011.40"]
 ])
 def test_is_loose_doi_match(suspected, expected):
     print(suspected)
     for ver, regex in enumerate(doi_regexp):
-        match = re.match(regex, suspected,re.I)
-        if match is not None:
-            assert match.group() == expected
-            print(f"{ver} matched.")
+
+        identifiers = re.findall(regex, suspected.lower())
+        if identifiers:
+            print(f"Matched with {ver} - {identifiers}")
+            assert standardise_doi(identifiers[0]) == expected
             return
         print(f"{ver} failed.")
     assert False