Skip to content

Commit

Permalink
chore(verse_stats): radically speed up processing by caching the HTML lines
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Apr 16, 2022
1 parent 205c9d1 commit 0137e7d
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 3 deletions.
1 change: 1 addition & 0 deletions utils/pyproject.toml
Expand Up @@ -22,6 +22,7 @@ openpyxl = "^3.0.7"
black = "^21.7b0"
rope = "^0.19.0"
lxml-stubs = "^0.4.0"
line-profiler-pycharm = "^1.1.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
12 changes: 9 additions & 3 deletions utils/verse_stats.py
Expand Up @@ -10,8 +10,6 @@
the TEI file. See the source code of the class Verse for details.
"""


from __future__ import annotations

import csv
Expand Down Expand Up @@ -92,6 +90,14 @@ def load(self):
for note in self.tei.xpath('//tei:note[@type="textcrit"]', namespaces=_ns):
note.getparent().remove(note)

# build HTML cache for speedup
html_lines: dict[str, etree._Element] = {}
for el in self.html.xpath('//*[@data-n]', namespaces=_ns):
n = el.get('data-n')
if n not in html_lines:
html_lines[n] = el
self.html_lines = html_lines

self.loaded = True

def lines(self):
Expand All @@ -100,7 +106,7 @@ def lines(self):
for el_t in self.tei.xpath('//*[@n][not(self::tei:div)]', namespaces=_ns):
n = el_t.get('n')
n_h = n[:-1] if n[-1] in 'imf' and n[-2] != '_' else n # antilabial n's are contracted in html
el_h = self.html.xpath(f'//*[@data-n="{n_h}"]', namespaces=_ns)[0]
el_h = self.html_lines[n_h] #self.html.xpath(f'//*[@data-n="{n_h}"]', namespaces=_ns)[0]
variants = int(el_h.get('data-variants'))
witnesses = int(el_h.get('data-varcount'))
speaker = normalize_space(''.join(el_t.xpath('ancestor::tei:sp//tei:speaker//text()', namespaces=_ns)))
Expand Down

0 comments on commit 0137e7d

Please sign in to comment.