Skip to content

Commit

Permalink
✨ Add NormalizeFieldKeys middleware (#473)
Browse files: browse the repository at this point in the history
  • Loading branch information
Technologicat committed Feb 29, 2024
1 parent 53843c0 commit 09f8bc1
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 0 deletions.
1 change: 1 addition & 0 deletions bibtexparser/middlewares/__init__.py
@@ -1,5 +1,6 @@
from bibtexparser.middlewares.enclosing import AddEnclosingMiddleware
from bibtexparser.middlewares.enclosing import RemoveEnclosingMiddleware
from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys
from bibtexparser.middlewares.interpolate import ResolveStringReferencesMiddleware
from bibtexparser.middlewares.latex_encoding import LatexDecodingMiddleware
from bibtexparser.middlewares.latex_encoding import LatexEncodingMiddleware
Expand Down
52 changes: 52 additions & 0 deletions bibtexparser/middlewares/fieldkeys.py
@@ -0,0 +1,52 @@
import logging
from typing import Dict
from typing import List
from typing import Set

from bibtexparser.library import Library
from bibtexparser.model import Entry
from bibtexparser.model import Field

from .middleware import BlockMiddleware


class NormalizeFieldKeys(BlockMiddleware):
    """Lowercase the key of every field in each entry.

    If several fields of one entry collapse onto the same lowercase key
    (e.g. both 'author' and 'Author' are present), a warning is logged and
    the field that appears last replaces the earlier one.

    Some other middlewares, such as `SeparateCoAuthors`, assume lowercase key names.
    """

    def __init__(self, allow_inplace_modification: bool = True):
        super().__init__(
            allow_parallel_execution=True,
            allow_inplace_modification=allow_inplace_modification,
        )

    # docstr-coverage: inherited
    def transform_entry(self, entry: Entry, library: "Library") -> Entry:
        # Maps each lowercase key to the field that currently owns it.
        # Re-assigning an existing dict key keeps its original position, so a
        # colliding field takes over the slot of the first occurrence while
        # all other fields stay in insertion order.
        new_fields_dict: Dict[str, Field] = {}

        for field in entry.fields:
            normalized_key: str = field.key.lower()

            # The dict itself tells us whether this lowercase key was already
            # produced by an earlier field of the same entry.
            if normalized_key in new_fields_dict:
                # Log before rewriting field.key so the message still shows
                # the original spelling of the key.
                logging.warning(
                    f"NormalizeFieldKeys: in entry '{entry.key}': "
                    f"duplicate normalized key '{normalized_key}' "
                    f"(original '{field.key}'); overriding previous value"
                )

            field.key = normalized_key
            # "Last one wins": a later field overwrites an earlier one.
            new_fields_dict[normalized_key] = field

        entry.fields = list(new_fields_dict.values())
        return entry
68 changes: 68 additions & 0 deletions tests/middleware_tests/test_fieldkeys.py
@@ -0,0 +1,68 @@
import re

from bibtexparser import Library
from bibtexparser.middlewares.fieldkeys import NormalizeFieldKeys
from bibtexparser.model import Entry
from bibtexparser.model import Field

# Fixture data: entry type -> {field key: raw field value}.
# All field keys are already lowercase; some values carry their enclosing
# double quotes, others (the "month" of the last two entries) do not.
entries = {
    "article": {
        "author": '"Smith, J."',
        "title": '"A Test Article"',
        "journal": '"J. of Testing"',
        "month": '"jan"',
        "year": '"2022"',
    },
    "book": {
        "author": '"Doe, J."',
        "title": '"A Test Book"',
        "publisher": '"Test Pub."',
        "year": '"2021"',
        "month": "apr",
    },
    "inproceedings": {
        "author": '"Jones, R."',
        "title": '"A Test Conf. Paper"',
        "booktitle": '"Proc. of the Intl. Test Conf."',
        "year": '"2023"',
        "month": "8",
    },
}

# Reference library the tests compare against: one entry per fixture item,
# keyed "entry0", "entry1", ... in the order the fixture defines them.
ref = Library()
for i, (ref_type, ref_fields) in enumerate(entries.items()):
    ref.add(
        Entry(
            entry_type=ref_type,
            key=f"entry{i}",
            fields=[Field(key=name, value=raw) for name, raw in ref_fields.items()],
        )
    )


def test_normalize_fieldkeys():
    """
    Field keys that are already lowercase must come out unchanged.
    """
    # Build a library from the same fixture data the reference library uses.
    source = Library()
    for i, (entry_type, fields) in enumerate(entries.items()):
        source.add(
            Entry(
                entry_type=entry_type,
                key=f"entry{i}",
                fields=[Field(key=k, value=v) for k, v in fields.items()],
            )
        )

    normalized = NormalizeFieldKeys().transform(source)

    # Every entry of the transformed library must equal its reference twin.
    for key in normalized.entries_dict:
        assert normalized.entries_dict[key] == ref.entries_dict[key]


def test_normalize_fieldkeys_force_last(caplog):
    """
    Colliding keys: the later (uppercase) field must override the earlier one.

    Each entry first receives lowercase-key fields holding a dummy value,
    then uppercase-key fields holding the real values. After normalization
    a warning must have been logged and the entries must match the reference.
    """
    source = Library()
    for i, (entry_type, fields) in enumerate(entries.items()):
        # Placeholders come first so the real values are the "last" ones.
        placeholders = [Field(key=k.lower(), value="dummyvalue") for k in fields]
        real_values = [Field(key=k.upper(), value=v) for k, v in fields.items()]
        source.add(
            Entry(
                entry_type=entry_type,
                key=f"entry{i}",
                fields=placeholders + real_values,
            )
        )

    normalized = NormalizeFieldKeys().transform(source)

    # The captured log must start with a WARNING record of the form
    # "<logger>:<file>.py:<lineno> NormalizeFieldKeys...".
    assert re.match(r"(WARNING\s*)(\w*\:\w*\.py\:[0-9]*\s*)(NormalizeFieldKeys)(.*)", caplog.text)

    for key in normalized.entries_dict:
        assert normalized.entries_dict[key] == ref.entries_dict[key]

0 comments on commit 09f8bc1

Please sign in to comment.