Skip to content

Commit

Permalink
refactor load
Browse files Browse the repository at this point in the history
  • Loading branch information
geritwagner committed Jul 7, 2022
1 parent 038f3a5 commit 709cf61
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 118 deletions.
104 changes: 104 additions & 0 deletions colrev_core/built_in/database_connectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,35 @@
from colrev_core.record import RecordState


class OpenLibraryConnector:
    """Connector for openlibrary.org."""

    @classmethod
    def check_status(cls, *, PREPARATION) -> None:
        """Check that the OpenLibrary ISBN endpoint answers with HTTP 200.

        Requests the JSON record of a fixed, known ISBN.

        Args:
            PREPARATION: object providing ``requests_headers``, ``TIMEOUT``
                and ``force_mode``.

        Raises:
            ServiceNotAvailableException: if the request fails or the status
                code is not 200, unless ``PREPARATION.force_mode`` is set.
        """
        from colrev_core.prep import ServiceNotAvailableException

        test_rec = {
            "ENTRYTYPE": "book",
            "isbn": "9781446201435",
            # 'author': 'Ridley, Diana',
            "title": "The Literature Review A Stepbystep Guide For Students",
            "ID": "Ridley2012",
            "year": "2012",
        }
        url = f"https://openlibrary.org/isbn/{test_rec['isbn']}.json"

        # Only the network call can raise a RequestException, so keep the
        # try body limited to it.
        try:
            ret = requests.get(
                url, headers=PREPARATION.requests_headers, timeout=PREPARATION.TIMEOUT
            )
        except requests.exceptions.RequestException as exc:
            if not PREPARATION.force_mode:
                # Chain the cause so the underlying network error is not lost.
                raise ServiceNotAvailableException("OPENLIBRARY") from exc
            return

        if ret.status_code != 200 and not PREPARATION.force_mode:
            raise ServiceNotAvailableException("OPENLIBRARY")


class URLConnector:
@classmethod
def retrieve_md_from_url(cls, *, RECORD, PREPARATION) -> None:
Expand Down Expand Up @@ -249,6 +278,41 @@ def meta_redirect(content: str):


class CrossrefConnector:
@classmethod
def check_status(cls, *, PREPARATION) -> None:
    """Check that Crossref returns the expected data for a fixed test record.

    Runs ``cls.crossref_query`` for a known record and inspects the first
    returned item.

    Args:
        PREPARATION: object providing ``REVIEW_MANAGER``, ``session``,
            ``TIMEOUT`` and ``force_mode``.

    Raises:
        ServiceNotAvailableException: if the request fails or the returned
            record is empty, unless ``PREPARATION.force_mode`` is set.
        AssertionError: if title or author of the returned record differ
            from the test record.
    """
    from colrev_core.prep import ServiceNotAvailableException

    test_rec = {
        "doi": "10.17705/1cais.04607",
        "author": "Schryen, Guido and Wagner, Gerit and Benlian, Alexander "
        "and Paré, Guy",
        "title": "A Knowledge Development Perspective on Literature Reviews: "
        "Validation of a new Typology in the IS Field",
        "ID": "SchryenEtAl2021",
        "journal": "Communications of the Association for Information Systems",
    }

    # Only the query can raise a RequestException, so keep the try body
    # limited to it.
    try:
        RETURNED_REC = cls.crossref_query(
            REVIEW_MANAGER=PREPARATION.REVIEW_MANAGER,
            RECORD_INPUT=PrepRecord(data=test_rec),
            jour_vol_iss_list=False,
            session=PREPARATION.session,
            TIMEOUT=PREPARATION.TIMEOUT,
        )[0]
    except requests.exceptions.RequestException as e:
        # Keep the message visible even when force_mode suppresses the raise.
        print(e)
        if not PREPARATION.force_mode:
            # Chain the cause so the underlying network error is not lost.
            raise ServiceNotAvailableException("CROSSREF") from e
        return

    if len(RETURNED_REC.data) == 0:
        if not PREPARATION.force_mode:
            raise ServiceNotAvailableException("CROSSREF")
        return

    # NOTE(review): these asserts are stripped when Python runs with -O;
    # kept as asserts so the raised exception type stays unchanged.
    assert RETURNED_REC.data["title"] == test_rec["title"]
    assert RETURNED_REC.data["author"] == test_rec["author"]

@classmethod
def crossref_json_to_record(cls, *, item: dict) -> dict:
# Note: the format differs between crossref and doi.org
Expand Down Expand Up @@ -582,6 +646,46 @@ def get_masterdata_from_crossref(


class DBLPConnector:
@classmethod
def check_status(cls, *, PREPARATION) -> None:
    """Check that DBLP returns the expected data for a fixed test record.

    Queries ``DBLPConnector.retrieve_dblp_records`` with the title of a known
    record (hyphens replaced by underscores) and inspects the first item.

    Args:
        PREPARATION: object providing ``REVIEW_MANAGER``, ``session`` and
            ``force_mode``.

    Raises:
        ServiceNotAvailableException: if the request fails or the returned
            record is empty, unless ``PREPARATION.force_mode`` is set.
        AssertionError: if title or author of the returned record differ
            from the test record.
    """
    from colrev_core.prep import ServiceNotAvailableException

    test_rec = {
        "ENTRYTYPE": "article",
        "doi": "10.17705/1cais.04607",
        "author": "Schryen, Guido and Wagner, Gerit and Benlian, Alexander "
        "and Paré, Guy",
        "title": "A Knowledge Development Perspective on Literature Reviews: "
        "Validation of a new Typology in the IS Field",
        "ID": "SchryenEtAl2021",
        "journal": "Communications of the Association for Information Systems",
        "volume": "46",
        "year": "2020",
        "colrev_status": RecordState.md_prepared,  # type: ignore
    }

    query = str(test_rec.get("title", "")).replace("-", "_")

    # Only the retrieval can raise a RequestException, so keep the try body
    # limited to it.
    try:
        DBLP_REC = DBLPConnector.retrieve_dblp_records(
            REVIEW_MANAGER=PREPARATION.REVIEW_MANAGER,
            query=query,
            session=PREPARATION.session,
        )[0]
    except requests.exceptions.RequestException as exc:
        if not PREPARATION.force_mode:
            # Chain the cause so the underlying network error is not lost.
            raise ServiceNotAvailableException("DBLP") from exc
        return

    if len(DBLP_REC.data) == 0:
        if not PREPARATION.force_mode:
            raise ServiceNotAvailableException("DBLP")
        return

    # NOTE(review): these asserts are stripped when Python runs with -O;
    # kept as asserts so the raised exception type stays unchanged.
    assert DBLP_REC.data["title"] == test_rec["title"]
    assert DBLP_REC.data["author"] == test_rec["author"]

@classmethod
def retrieve_dblp_records(
cls,
Expand Down
6 changes: 4 additions & 2 deletions colrev_core/built_in/prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ class LoadFixesPrep:

def prepare(self, PREPARATION, RECORD):
# TODO : may need to rerun import_provenance

# TODO : store custom load-prep script as source attribute
origin_source = RECORD.data["colrev_origin"].split("/")[0]
origin_source_name = [
s.source_name
for s in PREPARATION.REVIEW_MANAGER.settings.search.sources
if s.filename == Path(origin_source)
if s.filename == Path("search") / Path(origin_source)
][0]

if origin_source_name in [
Expand Down Expand Up @@ -220,6 +220,8 @@ class RemoveBrokenIDPrep:

source_correction_hint = "check with the developer"

# check_status: relies on crossref / openlibrary connectors!

def prepare(self, PREPARATION, RECORD):

if "doi" in RECORD.data:
Expand Down
65 changes: 27 additions & 38 deletions colrev_core/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,23 +206,19 @@ def getbib(*, file: Path) -> typing.List[dict]:
"Replace Early Access Date in bibfile before loading! "
f"{file.name}"
)

with open(file, encoding="utf8") as bibtex_file:
search_records_dict = (
self.REVIEW_MANAGER.REVIEW_DATASET.load_records_dict(
load_str=bibtex_file.read()
)
)

return search_records_dict.values()

def import_record(*, record: dict) -> dict:

self.REVIEW_MANAGER.logger.debug(
f'import_record {record["ID"]}: '
f"\n{self.REVIEW_MANAGER.pp.pformat(record)}\n\n"
)

if RecordState.md_retrieved != record["colrev_status"]:
return record

Expand Down Expand Up @@ -268,14 +264,11 @@ def import_record(*, record: dict) -> dict:

RECORD = LoadRecord(data=record)
RECORD.import_provenance()
record = RECORD.get_data()

record.update(colrev_status=RecordState.md_imported)
RECORD.set_status(target_state=RecordState.md_imported)

return record
return RECORD.get_data()

def get_nr_in_bib(*, file_path: Path) -> int:

number_in_bib = 0
with open(file_path, encoding="utf8") as f:
line = f.readline()
Expand All @@ -286,7 +279,6 @@ def get_nr_in_bib(*, file_path: Path) -> int:
if "@comment" not in line[:10].lower():
number_in_bib += 1
line = f.readline()

return number_in_bib

if SOURCE.corresponding_bib_file.is_file():
Expand Down Expand Up @@ -327,9 +319,7 @@ def get_nr_in_bib(*, file_path: Path) -> int:
# Drop empty fields
record = {k: v for k, v in record.items() if v}

if "colrev_status" not in record:
record.update(colrev_status=RecordState.md_retrieved)
elif record["colrev_status"] in [
if record.get("colrev_status", "") in [
str(RecordState.md_processed),
str(RecordState.rev_prescreen_included),
str(RecordState.rev_prescreen_excluded),
Expand All @@ -343,7 +333,9 @@ def get_nr_in_bib(*, file_path: Path) -> int:
]:
# Note : when importing a record, it always needs to be
# deduplicated against the other records in the repository
record["colrev_status"] = RecordState.md_prepared
record.update(colrev_status=RecordState.md_prepared)
else:
record.update(colrev_status=RecordState.md_retrieved)

if "doi" in record:
record.update(
Expand Down Expand Up @@ -383,7 +375,6 @@ def get_nr_in_bib(*, file_path: Path) -> int:
sr["ID"] = next_unique_ID
records[sr["ID"]] = sr

self.REVIEW_MANAGER.logger.info("Save records to references.bib")
self.REVIEW_MANAGER.REVIEW_DATASET.save_records_dict(records=records)

if not keep_ids:
Expand All @@ -398,6 +389,7 @@ def get_nr_in_bib(*, file_path: Path) -> int:
path=str(SOURCE.corresponding_bib_file)
)
self.REVIEW_MANAGER.REVIEW_DATASET.add_changes(path=str(SOURCE.filename))
self.REVIEW_MANAGER.REVIEW_DATASET.add_record_changes()

return

Expand All @@ -410,15 +402,10 @@ def validate_load(self, *, SOURCE) -> None:
imported = len_after - SOURCE.len_before

if imported != SOURCE.to_import:

origins_to_import = [o["colrev_origin"] for o in SOURCE.source_records_list]

# self.REVIEW_MANAGER.pp.pprint(source_records_list)
# print(origins_to_import)
# self.REVIEW_MANAGER.pp.pprint(imported_origins)

self.REVIEW_MANAGER.logger.error(f"len_before: {SOURCE.len_before}")
self.REVIEW_MANAGER.logger.error(f"len_after: {len_after}")

origins_to_import = [o["colrev_origin"] for o in SOURCE.source_records_list]
if SOURCE.to_import - imported > 0:
self.REVIEW_MANAGER.logger.error(
f"PROBLEM: delta: {SOURCE.to_import - imported} records missing"
Expand Down Expand Up @@ -500,22 +487,26 @@ def main(self, *, keep_ids: bool = False, combine_commits=False) -> None:
# TODO : keep_ids as a potential parameter for the source/settings?
del saved_args["keep_ids"]

self.REVIEW_MANAGER.REVIEW_DATASET.check_sources()

for SOURCE in self.REVIEW_MANAGER.settings.search.sources:

if SOURCE.script["endpoint"] not in list(self.load_scripts.keys()):
if self.verbose:
print(f"Error: endpoint not available: {SOURCE.script}")
continue
REVIEW_DATASET = self.REVIEW_MANAGER.REVIEW_DATASET

def load_active_sources() -> list:
    """Return the configured sources whose load endpoint is available.

    Runs ``REVIEW_DATASET.check_sources()`` first. Sources whose
    ``script["endpoint"]`` is not a key of ``self.load_scripts`` are skipped
    (reported via ``print`` when ``self.verbose`` is set). Each remaining
    source gets ``corresponding_bib_file``, ``imported_origins`` and
    ``len_before`` set before it is collected.
    """
    REVIEW_DATASET.check_sources()
    active_sources = []
    for SOURCE in self.REVIEW_MANAGER.settings.search.sources:
        # Membership test directly on the mapping; no temporary key list.
        if SOURCE.script["endpoint"] not in self.load_scripts:
            if self.verbose:
                print(f"Error: endpoint not available: {SOURCE.script}")
            continue
        SOURCE.corresponding_bib_file = SOURCE.filename.with_suffix(".bib")
        SOURCE.imported_origins = REVIEW_DATASET.get_currently_imported_origin_list()
        SOURCE.len_before = len(SOURCE.imported_origins)
        active_sources.append(SOURCE)
    return active_sources

for SOURCE in load_active_sources():
self.REVIEW_MANAGER.logger.info(f"Loading {SOURCE}")
saved_args["file"] = SOURCE.filename.name
SOURCE.corresponding_bib_file = SOURCE.filename.with_suffix(".bib")
SOURCE.imported_origins = (
self.REVIEW_MANAGER.REVIEW_DATASET.get_currently_imported_origin_list()
)
SOURCE.len_before = len(SOURCE.imported_origins)

# 1. convert to bib (if necessary)
ENDPOINT = self.load_scripts[SOURCE.script["endpoint"]]["endpoint"]
Expand All @@ -533,15 +524,13 @@ def main(self, *, keep_ids: bool = False, combine_commits=False) -> None:
self.validate_load(SOURCE=SOURCE)

if not combine_commits:
self.REVIEW_MANAGER.REVIEW_DATASET.add_record_changes()
self.REVIEW_MANAGER.create_commit(
msg=f"Load {saved_args['file']}",
script_call="colrev load",
saved_args=saved_args,
)

if combine_commits and self.REVIEW_MANAGER.REVIEW_DATASET.has_changes():
self.REVIEW_MANAGER.REVIEW_DATASET.add_record_changes()
if combine_commits and REVIEW_DATASET.has_changes():
self.REVIEW_MANAGER.create_commit(
msg="Load (multiple)", script_call="colrev load", saved_args=saved_args
)
Expand Down
Loading

0 comments on commit 709cf61

Please sign in to comment.