
Commit

Merge branch 'master' into alignment-data-export
fsimonjetz committed Jun 2, 2023
2 parents 41b6c36 + 48fc0b5 commit 044cf3b
Showing 6 changed files with 555 additions and 390 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -119,6 +119,7 @@ run_backend.sh
# Migration script output
invalid_fragments.tsv
invalid_texts.tsv
fragment_import_summary_*.tsv

# Line To Vec Scores
line_to_vec_ranking_results
14 changes: 11 additions & 3 deletions Taskfile.dist.yml
@@ -38,10 +38,18 @@ tasks:
cmds:
- poetry run waitress-serve --port=8000 --call ebl.app:get_app

import:
validate-fragments:
cmds:
- poetry run python -m ebl.io.fragments.importer {{.CLI_ARGS}}

- poetry run python -m ebl.io.fragments.importer validate {{.CLI_ARGS}}

reindex-fragments:
cmds:
- poetry run python -m ebl.io.fragments.importer reindex {{.CLI_ARGS}}

import-fragments:
cmds:
- poetry run python -m ebl.io.fragments.importer import {{.CLI_ARGS}}

export-signs:
cmds:
- poetry run python -m ebl.io.alignment.sign_export {{.CLI_ARGS}}
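
For orientation, the new tasks could be driven roughly like this with go-task; this is a sketch only, assuming arguments after the -- separator are passed through as {{.CLI_ARGS}}, and the JSON paths are placeholders:

  task validate-fragments -- path/to/fragment_a.json path/to/fragment_b.json
  task import-fragments -- path/to/collection.json --strict
  task reindex-fragments
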
180 changes: 141 additions & 39 deletions ebl/io/fragments/importer.py
@@ -13,20 +13,43 @@
from ebl.mongo_collection import MongoCollection
from ebl.transliteration.domain.museum_number import MuseumNumber
from ebl.transliteration.infrastructure.collections import FRAGMENTS_COLLECTION
import pandas as pd
import datetime


def _load_json(path: str):
with open(path) as jsonfile:
try:
return json.load(jsonfile)
except json.JSONDecodeError as error:
raise ValueError(f"Invalid JSON: {path}") from error


def assert_type(obj, expected_type, prefix=""):
if not isinstance(obj, expected_type):
prefix = prefix + " " if prefix else ""
raise ValueError(
f"{prefix}Expected {expected_type} but got {type(obj)} instead"
)


def load_data(paths: Sequence[str]) -> dict:
return {file: _load_json(file) for file in paths if file.endswith(".json")}


def load_collection(path: str) -> dict:
fragments = {}

for file in paths:
if not file.endswith(".json"):
continue
if not path.endswith(".json"):
return fragments

with open(file) as jsonfile:
try:
fragments[file] = json.load(jsonfile)
except json.JSONDecodeError as error:
raise ValueError(f"Invalid JSON: {file}") from error
collection = _load_json(path)

assert_type(collection, list)

for index, data in enumerate(collection):
assert_type(data, dict)
fragments[f"{path}[{index}]"] = data

return fragments

@@ -112,39 +135,60 @@ def write_to_db(
)
subparsers = parser.add_subparsers(dest="subcommand", required=True)

_input_parser = argparse.ArgumentParser(
description="Parser with input", add_help=False
)
_input_parser.add_argument(
"fragments",
nargs="+",
help="Paths to JSON input files OR a single file with an array of dictionaries",
)
_input_parser.add_argument(
"--strict",
action="store_true",
help="Abort the import if invalid documents or duplicate IDs are found",
)
_output_parser = argparse.ArgumentParser(
description="Parser with output", add_help=False
)
_output_parser.add_argument(
"-db",
"--database",
choices=[DEV_DB, PROD_DB],
default=DEV_DB,
help="Toggle between development (default) and production db",
)

import_info = "Validate input files, write to the database, and reindex"
import_parser = subparsers.add_parser(
IMPORT_CMD, help=import_info, description=import_info
IMPORT_CMD,
help=import_info,
description=import_info,
parents=[_input_parser, _output_parser],
)

validation_info = "Validate input files without writing to the database"
validation_parser = subparsers.add_parser(
VALIDATION_CMD, help=validation_info, description=validation_info
VALIDATION_CMD,
help=validation_info,
description=validation_info,
parents=[_input_parser, _output_parser],
)

for parser_with_input in [import_parser, validation_parser]:
parser_with_input.add_argument(
"fragments", nargs="+", help="Paths to JSON input files"
)

index_info = "Rebuild the sort index"
index_parser = subparsers.add_parser(
INDEX_CMD, help=index_info, description=index_info
)

parser.add_argument(
"-db",
"--database",
choices=[DEV_DB, PROD_DB],
default=DEV_DB,
help="Toggle between development (default) and production db",
INDEX_CMD,
help=index_info,
description=index_info,
parents=[_output_parser],
)

def reindex_database(collection):
def _reindex_database(collection, db):
print(f"Reindexing 'fragments' collection in the {db!r} database...")
print("Calculating _sortKeys (this may take a while)...")
update_sort_keys(collection)

print("Reindexing database...")
print("Creating the sort index...")
create_sort_index(collection)

print("Sort index built successfully.")
@@ -153,30 +197,76 @@ def reindex_database(collection):

CLIENT = MongoClient(os.environ["MONGODB_URI"])
TARGET_DB = CLIENT.get_database(args.database)

COLLECTION = MongoCollection(TARGET_DB, FRAGMENTS_COLLECTION)
INVALID = set()
DUPLICATES = set()
FAILS = []
IS_COLLECTION = False

error_file = "invalid_texts.tsv"
summary_file = f"fragment_import_summary_{datetime.date.today()}.tsv"

if args.subcommand == INDEX_CMD:
reindex_database(COLLECTION)
_reindex_database(COLLECTION, args.database)
sys.exit()

print("Loading data...")
fragments = load_data(args.fragments)

if len(args.fragments) == 1:
try:
fragments = load_collection(args.fragments[0])
IS_COLLECTION = True
except Exception:
fragments = load_data(args.fragments)

else:
fragments = load_data(args.fragments)

fragment_count = len(fragments)

if fragments:
print(f"Found {len(fragments)} files.")
print(f"Found {'collection of ' * IS_COLLECTION}{fragment_count} documents.")
else:
print("No fragments found.")
sys.exit()

print("Validating...")

for filename, data in fragments.items():
validate(data, filename)
validate_id(data, filename)
ensure_unique(data, COLLECTION, filename)
if args.strict:
validate(data, filename)
validate_id(data, filename)
ensure_unique(data, COLLECTION, filename)
else:
try:
validate(data, filename)
validate_id(data, filename)
except Exception as error:
FAILS.append([filename, str(error)])
continue
try:
ensure_unique(data, COLLECTION, filename)
except Exception as error:
FAILS.append([filename, str(error)])

fail_count = len(FAILS)
df = pd.DataFrame.from_records(FAILS, columns=["File", "Error"])

if FAILS:
print(
f"Skipping {fail_count} document(s), see {os.path.abspath(error_file)} for details"
)
df.to_csv(error_file, sep="\t", index=False)

valid_count = fragment_count - fail_count

print("Validation successful.")
print(
f"Validation of {valid_count} out of {fragment_count} "
"document(s) successful."
)

if not valid_count:
sys.exit("No data to import.")

if args.subcommand == VALIDATION_CMD:
sys.exit()
@@ -194,13 +284,25 @@ def reindex_database(collection):
if input(prompt) != passphrase:
sys.exit("Aborting.")

result = write_to_db(list(fragments.values()), COLLECTION)
fragments = {
filename: data
for filename, data in fragments.items()
if filename not in set(df.File.to_list())
}
result = write_to_db(
list(fragments.values()),
COLLECTION,
)

print("Result:")
print(result)

reindex_database(COLLECTION)
_reindex_database(COLLECTION, args.database)

print("Done! Summary of added fragments:")
for filename, data in fragments.items():
print(data["_id"], filename, sep="\t")
pd.DataFrame.from_records(
[{"id": data["_id"], "file": filename} for filename, data in fragments.items()]
).to_csv(summary_file, sep="\t", index=False)

print(
f"Done! See {os.path.abspath(summary_file)} for a summary of added documents",
)
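
Taken together, the reworked importer CLI would be invoked roughly as follows; a sketch with placeholder paths, assuming MONGODB_URI is set in the environment (the -db flag is omitted here because the concrete DEV_DB/PROD_DB names are not shown in this diff, so the development default applies):

  poetry run python -m ebl.io.fragments.importer validate fragments/*.json
  poetry run python -m ebl.io.fragments.importer import --strict path/to/collection.json
  poetry run python -m ebl.io.fragments.importer reindex

With --strict, invalid documents or duplicate IDs abort the run; without it, failing documents are skipped and listed in invalid_texts.tsv, and a successful import writes a fragment_import_summary_<date>.tsv recording the IDs of the added documents.
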
9 changes: 9 additions & 0 deletions ebl/tests/io/test_fragment_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ebl.fragmentarium.domain.fragment import Fragment
from ebl.io.fragments.importer import (
create_sort_index,
load_collection,
validate,
load_data,
validate_id,
@@ -58,6 +59,14 @@ def test_load_json(tmp_path):
assert load_data([path]) == {path: contents}


def test_load_collection(tmp_path):
contents = [{"test": "data"}, {"foo": "bar"}]
path = mock_json_file(json.dumps(contents), tmp_path)
assert load_collection(path) == {
f"{path}[{index}]": content for index, content in enumerate(contents)
}


def test_load_invalid_json(tmp_path):
contents = "{'broken_file': None}"
path = mock_json_file(contents, tmp_path)
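To exercise the new load_collection test on its own, something like the following should work, assuming the repository's standard pytest setup:

  poetry run pytest ebl/tests/io/test_fragment_importer.py -k load_collection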
