Skip to content

Commit 689b760

Browse files
committed
update endpoint handling
1 parent e5e380e commit 689b760

File tree

63 files changed

+6922
-5941
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+6922
-5941
lines changed

colrev_core/built_in/data.py

Lines changed: 944 additions & 0 deletions
Large diffs are not rendered by default.

colrev_core/built_in/database_connectors.py

Lines changed: 711 additions & 0 deletions
Large diffs are not rendered by default.

colrev_core/built_in/pdf_get.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#! /usr/bin/env python
2+
import json
3+
import os
4+
from pathlib import Path
5+
6+
import requests
7+
import zope.interface
8+
from pdfminer.high_level import extract_text
9+
10+
from colrev_core.process import PDFRetrievalEndpoint
11+
from colrev_core.record import RecordState
12+
13+
14+
@zope.interface.implementer(PDFRetrievalEndpoint)
class UnpaywallEndpoint:
    """PDF retrieval endpoint that asks the Unpaywall API for a PDF link.

    ``get_pdf`` resolves the record's DOI through Unpaywall, downloads the
    linked file into the PDF directory and keeps it only if pdfminer can
    extract text from it.
    """

    # Seconds to wait on a remote server. Without a timeout ``requests`` can
    # block indefinitely on an unresponsive host.
    _REQUEST_TIMEOUT = 30

    @classmethod
    def __unpaywall(
        cls, *, REVIEW_MANAGER, doi: str, retry: int = 0, pdfonly: bool = True
    ) -> str:
        """Return the Unpaywall PDF URL for *doi*, or ``"NA"`` if unavailable.

        Args:
            REVIEW_MANAGER: Provides ``EMAIL``, sent as the ``email`` query
                parameter.
            doi: DOI to look up.
            retry: Number of retries already performed. HTTP 500 responses
                are retried while this is below 3.
            pdfonly: Retained for interface compatibility. Only the
                ``url_for_pdf`` field is ever returned; when it is missing
                the result is ``"NA"`` for either value, so the declared
                ``str`` return type always holds.

        Returns:
            The ``url_for_pdf`` of the best open-access location, or the
            sentinel string ``"NA"`` on 404, repeated 500, network errors,
            malformed responses, or when the entry is not open access.
        """
        # f-string is required here: a plain string would request the
        # literal path "{doi}" instead of the record's DOI.
        url = f"https://api.unpaywall.org/v2/{doi}"

        try:
            r = requests.get(
                url,
                params={"email": REVIEW_MANAGER.EMAIL},
                timeout=cls._REQUEST_TIMEOUT,
            )

            if r.status_code == 404:
                return "NA"

            if r.status_code == 500:
                if retry < 3:
                    # Forward pdfonly so a retry honours the caller's choice.
                    return cls.__unpaywall(
                        REVIEW_MANAGER=REVIEW_MANAGER,
                        doi=doi,
                        retry=retry + 1,
                        pdfonly=pdfonly,
                    )
                return "NA"

            # Parse once and read both fields inside the try so that a
            # missing key yields "NA" instead of an uncaught KeyError.
            payload = r.json()
            best_loc = payload["best_oa_location"]
            is_oa = payload["is_oa"]
        except (
            json.decoder.JSONDecodeError,
            KeyError,
            requests.exceptions.RequestException,
        ):
            return "NA"

        if not is_oa or best_loc is None:
            return "NA"

        pdf_url = best_loc.get("url_for_pdf")
        if pdf_url is None:
            # Never let None escape: get_pdf runs a substring test on the
            # returned value, which would raise TypeError on None.
            return "NA"
        return pdf_url

    @classmethod
    def __is_pdf(cls, *, path_to_file: Path) -> bool:
        """Return True if pdfminer can extract text from *path_to_file*."""
        try:
            extract_text(path_to_file)
        except Exception:  # any extraction failure means "not a usable PDF"
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate.
            return False
        return True

    @classmethod
    def get_pdf(cls, REVIEW_MANAGER, RECORD):
        """Try to download the PDF for *RECORD* via Unpaywall.

        Records without a ``doi`` field are returned unchanged. On success
        the file is written to ``PDF_DIRECTORY_RELATIVE/<ID>.pdf`` and the
        record's ``file`` and ``colrev_status`` fields are updated. A
        download that pdfminer cannot read is deleted again.

        Args:
            REVIEW_MANAGER: Provides ``paths``, ``EMAIL``, ``logger`` and
                ``report_logger``.
            RECORD: Record whose ``data`` dict is read and updated in place.

        Returns:
            The same *RECORD* object, updated if a PDF was retrieved.
        """
        if "doi" not in RECORD.data:
            return RECORD

        pdf_filepath = REVIEW_MANAGER.paths["PDF_DIRECTORY_RELATIVE"] / Path(
            f"{RECORD.data['ID']}.pdf"
        )
        url = cls.__unpaywall(REVIEW_MANAGER=REVIEW_MANAGER, doi=RECORD.data["doi"])
        if "NA" == url or "Invalid/unknown DOI" in url:
            return RECORD

        try:
            res = requests.get(
                url,
                headers={
                    "User-Agent": "Chrome/51.0.2704.103",
                    "referer": "https://www.doi.org",
                },
                timeout=cls._REQUEST_TIMEOUT,
            )
        except requests.exceptions.RequestException as exc:
            # Retrieval is best-effort: an unreachable host must not abort
            # processing of the record.
            REVIEW_MANAGER.logger.info(
                f"Unpaywall retrieval error {type(exc).__name__}/{url}"
            )
            return RECORD

        if 200 != res.status_code:
            REVIEW_MANAGER.logger.info(
                f"Unpaywall retrieval error {res.status_code}/{url}"
            )
            return RECORD

        with open(pdf_filepath, "wb") as f:
            f.write(res.content)

        if not cls.__is_pdf(path_to_file=pdf_filepath):
            # Servers sometimes answer 200 with an HTML page; discard it.
            os.remove(pdf_filepath)
            return RECORD

        REVIEW_MANAGER.report_logger.info(
            f"Retrieved pdf (unpaywall): {pdf_filepath.name}"
        )
        REVIEW_MANAGER.logger.info(f"Retrieved pdf (unpaywall): {pdf_filepath.name}")
        RECORD.data.update(file=str(pdf_filepath))
        # NOTE(review): status is set to rev_prescreen_included after a
        # successful PDF download - confirm this is the intended state.
        RECORD.data.update(colrev_status=RecordState.rev_prescreen_included)

        return RECORD
103+
104+
105+
@zope.interface.implementer(PDFRetrievalEndpoint)
class LocalIndexEndpoint:
    """PDF retrieval endpoint that takes the file path from ``LocalIndex``."""

    @classmethod
    def get_pdf(cls, REVIEW_MANAGER, RECORD):
        """Copy the ``file`` entry of the matching index record onto *RECORD*.

        The record is looked up with ``LocalIndex.retrieve(...,
        include_file=True)``. If the index has no such record, or the
        retrieved record has no ``file`` key, *RECORD* is returned unchanged.
        Otherwise ``RECORD.data["file"]`` is set and the record is passed to
        ``REVIEW_MANAGER.REVIEW_DATASET.import_file``.

        Returns:
            The same *RECORD* object.
        """
        from colrev_core.environment import LocalIndex, RecordNotInIndexException

        index = LocalIndex()
        try:
            match = index.retrieve(record=RECORD.data, include_file=True)
        except RecordNotInIndexException:
            # Not indexed locally: nothing to retrieve.
            return RECORD

        if "file" not in match:
            return RECORD

        RECORD.data["file"] = match["file"]
        REVIEW_MANAGER.REVIEW_DATASET.import_file(record=RECORD.data)
        return RECORD
126+
127+
128+
@zope.interface.implementer(PDFRetrievalEndpoint)
class WebsiteScreenshotEndpoint:
    """PDF retrieval endpoint that uses ``ScreenshotService`` for online records."""

    @classmethod
    def get_pdf(cls, REVIEW_MANAGER, RECORD):
        """Request a screenshot PDF for records whose ENTRYTYPE is ``online``.

        Records of any other entry type are returned unchanged. For online
        records the screenshot service is started and asked to add a
        screenshot at ``PDF_DIRECTORY_RELATIVE/<ID>.pdf``; if the returned
        record has a ``file`` key it is passed to
        ``REVIEW_MANAGER.REVIEW_DATASET.import_file``.

        Returns:
            The record returned by ``add_screenshot`` for online records,
            otherwise the *RECORD* that was passed in.
        """
        from colrev_core.environment import ScreenshotService

        # NOTE(review): block nesting was reconstructed from a flattened
        # diff; the "file" check is assumed to apply to online records only.
        if "online" != RECORD.data["ENTRYTYPE"]:
            return RECORD

        service = ScreenshotService()
        service.start_screenshot_service()

        target = REVIEW_MANAGER.paths["PDF_DIRECTORY_RELATIVE"] / Path(
            f"{RECORD.data['ID']}.pdf"
        )
        RECORD = service.add_screenshot(RECORD=RECORD, pdf_filepath=target)

        if "file" in RECORD.data:
            REVIEW_MANAGER.REVIEW_DATASET.import_file(record=RECORD.data)

        return RECORD

0 commit comments

Comments
 (0)