Skip to content

Commit

Permalink
fix(verse_stats): Support <speakers> with markup inside
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Apr 15, 2022
1 parent ac0f4d2 commit 124ad29
Showing 1 changed file with 48 additions and 39 deletions.
87 changes: 48 additions & 39 deletions utils/verse_stats.py
@@ -1,5 +1,16 @@
#!/usr/bin/env python3

"""
Extracts per-verse information from the edition and writes it to a CSV file.
This script creates a CSV file with a row for each 'line' of the edition. A 'line' is, essentially,
anything that may have an apparatus: A verse, a part of an antilabial verse, a stage direction,
a speaker name etc. The CSV file contains the number of variants and witnesses (from the text view),
the number of relevant paralipomena (from the genetic bargraph) and a few data directly extracted from
the TEI file. See the source code of the class Verse for details.
"""


from __future__ import annotations

Expand All @@ -22,43 +33,6 @@
'xh': 'http://www.w3.org/1999/html'}


def first(it: Iterable, default=None):
    """
    Return the first item of an iterable, or a fallback if it is empty.

    Parameters
    ----------
    it: Any iterable; at most one item is pulled from it.
    default: Value handed back when the iterable yields nothing.

    Returns
    -------
    The first item produced by ``it``, or ``default`` if there is none.
    """
    # The two-argument form of next() returns the fallback instead of raising StopIteration.
    return next(iter(it), default)


def normalize_space(s: str, ignore_missing=True):
    """
    Collapse each run of whitespace in a string to a single blank and trim both ends.

    Parameters
    ----------
    s: The string to normalize. May be None if ``ignore_missing`` is true.
    ignore_missing: If true, a None input is passed through as None.

    Returns
    -------
    The normalized string, or None for a None input with ``ignore_missing`` set.

    Raises
    ------
    AttributeError: if ``s`` is None and ``ignore_missing`` is false.
    """
    if s is None and ignore_missing:
        return None
    # split() without arguments splits on any whitespace run and drops leading/trailing blanks
    words = s.split()
    return " ".join(words)


def parse_bargraph_info(data) -> dict[str, dict[str, set[str]]]:
    """
    Reorders the (already parsed) bargraph JSON so it can be looked up by verse.

    Parameters
    ----------
    data: Iterable of document records. Each record is a mapping with a 'sigil' entry
        and an 'intervals' entry; each interval is a mapping with 'type', 'start' and
        'end', where start and end are integers and the range is inclusive.

    Returns
    -------
    Dictionary verse no (as string) -> type -> set of sigils. Both levels are
    defaultdicts, so looking up an unknown verse or type yields an empty set
    (and inserts that key as a side effect).
    """
    verses = defaultdict(lambda: defaultdict(set))
    for doc in data:
        sigil = doc['sigil']
        for interval in doc['intervals']:
            kind = interval['type']
            # 'end' is inclusive, hence the + 1
            for n in range(interval['start'], interval['end'] + 1):
                # Verse numbers are stored as strings; presumably so they match
                # line ids read from the XML/HTML -- confirm against the caller.
                verses[str(n)][kind].add(sigil)

    return verses


@dataclass
class Verse:
"""Represents a single line. Directly maps to the CSV file."""
Expand Down Expand Up @@ -129,8 +103,7 @@ def lines(self):
el_h = self.html.xpath(f'//*[@data-n="{n_h}"]', namespaces=_ns)[0]
variants = int(el_h.get('data-variants'))
witnesses = int(el_h.get('data-varcount'))
speaker = normalize_space(
first(el_t.xpath('ancestor::tei:sp//tei:speaker/text()', namespaces=_ns), default=None))
speaker = normalize_space(''.join(el_t.xpath('ancestor::tei:sp//tei:speaker//text()', namespaces=_ns)))
v = Verse(n, variants, witnesses,
paralipomena=len(self.bargraph[n]['paralipomena']),
paralipomena_uncertain=len(self.bargraph[n]['paralipomena_uncertain']),
Expand All @@ -144,6 +117,42 @@ def lines(self):
yield v


def first(it: Iterable, default=None):
    """
    Return the first item of an iterable, or a fallback if it is empty.

    Parameters
    ----------
    it: Any iterable; at most one item is pulled from it.
    default: Value handed back when the iterable yields nothing.

    Returns
    -------
    The first item produced by ``it``, or ``default`` if there is none.
    """
    # The two-argument form of next() returns the fallback instead of raising StopIteration.
    return next(iter(it), default)


def normalize_space(s: str, ignore_missing=True):
    """
    Collapse each run of whitespace in a string to a single blank and trim both ends.

    Parameters
    ----------
    s: The string to normalize. May be None if ``ignore_missing`` is true.
    ignore_missing: If true, a None input is passed through as None.

    Returns
    -------
    The normalized string, or None for a None input with ``ignore_missing`` set.

    Raises
    ------
    AttributeError: if ``s`` is None and ``ignore_missing`` is false.
    """
    if s is None and ignore_missing:
        return None
    # split() without arguments splits on any whitespace run and drops leading/trailing blanks
    words = s.split()
    return " ".join(words)


def parse_bargraph_info(data) -> dict[str, dict[str, set[str]]]:
    """
    Reorders the (already parsed) bargraph JSON so it can be looked up by verse.

    Parameters
    ----------
    data: Iterable of document records. Each record is a mapping with a 'sigil' entry
        and an 'intervals' entry; each interval is a mapping with 'type', 'start' and
        'end', where start and end are integers and the range is inclusive.

    Returns
    -------
    Dictionary verse no (as string) -> type -> set of sigils. Both levels are
    defaultdicts, so looking up an unknown verse or type yields an empty set
    (and inserts that key as a side effect).
    """
    verses = defaultdict(lambda: defaultdict(set))
    for doc in data:
        sigil = doc['sigil']
        for interval in doc['intervals']:
            kind = interval['type']
            # 'end' is inclusive, hence the + 1
            for n in range(interval['start'], interval['end'] + 1):
                # Verse numbers are stored as strings; presumably so they match
                # line ids read from the XML/HTML -- confirm against the caller.
                verses[str(n)][kind].add(sigil)

    return verses

def getargparser():
p = ArgumentParser(description=__doc__)
p.add_argument('edition', nargs='?',
Expand Down

0 comments on commit 124ad29

Please sign in to comment.