In [31]:
import requests
import re
import pandas as pd
import xml.etree.ElementTree as ET
import json
import os
import sys
import matplotlib.pyplot as plt
from collections import Counter
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional
import duckdb

In [53]:
# Read the Ruland dictionary table from its Parquet export.
RULAND_PARQUET = "../data/ruland-dictionaries.parquet"
lexeme_df = pd.read_parquet(RULAND_PARQUET)

In [54]:
# Number of whitespace-separated words in each canonical target.
canonical_words = lexeme_df["target_canonical"].str.split()
lexeme_df["target_len"] = canonical_words.str.len()

In [61]:
# Keep only rows whose relemmatized target has fewer than four words.
relemmatized_word_count = lexeme_df["target_relemmatized"].str.split().str.len()
lexeme_df = lexeme_df.loc[relemmatized_word_count < 4]

In [55]:
# Longest canonical target (in words) that is kept for matching.
MAX_N = 3
within_max_n = lexeme_df["target_len"].le(MAX_N)
lexeme_df = lexeme_df.loc[within_max_n].copy()
lexeme_df.shape[0]

2850

In [33]:
# Open the GreLa DuckDB file read-only; temp views/tables created later live in the session only.
GRELA_DB_PATH = '/srv/data/grela/grela_v0-4.duckdb'
conn = duckdb.connect(GRELA_DB_PATH, read_only=True)

In [34]:
# Session resource limits. The host has 128 GB RAM, so DuckDB gets a large share
# while Python, the OS and the file cache keep some headroom.
duckdb_resource_sql = """
  SET memory_limit = '96GB';              -- or '80GB' if you run multiple jobs
  SET threads = 8;                        -- raise gradually (16/24/32) if stable
  SET preserve_insertion_order = false;
  PRAGMA temp_directory='/srv/data/duckdb_tmp';   -- fast SSD/NVMe
  PRAGMA max_temp_directory_size='2TB';          -- whatever your disk allows
"""
conn.execute(duckdb_resource_sql)

<_duckdb.DuckDBPyConnection at 0x7b1ee9d2d970>

In [35]:
# Spill location for DuckDB: make sure the directory exists, then point the
# session at it and cap how much disk it may use (adjust path/size to the host).
duckdb_tmp_dir = "/srv/data/duckdb_tmp"
os.makedirs(duckdb_tmp_dir, exist_ok=True)
duckdb_tmp_sql = """
  PRAGMA temp_directory='/srv/data/duckdb_tmp';
  PRAGMA max_temp_directory_size='500GB';   -- fit your disk
"""
conn.execute(duckdb_tmp_sql)

<_duckdb.DuckDBPyConnection at 0x7b1ee9d2d970>

In [36]:
# View 1/2: every token of the works whose grela_id starts with 'emlap',
# punctuation included. `global_token_pos` numbers the tokens per work,
# ordered by sentence position and then by char_start.
# NOTE(review): the ordering assumes (sentence position, char_start) is unique
# within a work; ties would make the numbering arbitrary - confirm.
conn.execute("""
CREATE OR REPLACE TEMP VIEW emlap_all_tokens AS
SELECT
  t.grela_id,
  t.sentence_id,
  s.position        AS sentence_position,
  t.token_id,
  t.token_text,
  LOWER(t.lemma)    AS lemma_lower,
  t.pos,
  t.char_start,
  t.char_end,
  ROW_NUMBER() OVER (
    PARTITION BY t.grela_id
    ORDER BY s.position, t.char_start
  ) AS global_token_pos
FROM tokens t
JOIN works w  ON t.grela_id = w.grela_id
JOIN sentences s USING (sentence_id)
WHERE w.grela_id LIKE 'emlap%';
""")

# View 2/2: the same tokens restricted to those with a non-NULL lemma and a
# pos other than 'PUNCT'. `global_content_pos` renumbers the remaining tokens
# per work; `global_token_pos` from the view above is carried along via a.*.
# Rows whose pos is NULL are also excluded, because `pos <> 'PUNCT'` is then NULL.
conn.execute("""
CREATE OR REPLACE TEMP VIEW emlap_content_tokens AS
SELECT
  a.*,
  ROW_NUMBER() OVER (
    PARTITION BY a.grela_id
    ORDER BY a.sentence_position, a.char_start
  ) AS global_content_pos
FROM emlap_all_tokens a
WHERE a.lemma_lower IS NOT NULL
  AND a.pos <> 'PUNCT';
""")

# Temp TABLE counterparts of the two views above. These are the relations that
# concordance_for_target_across_sentences (defined in a later cell) queries:
#   * emlap_full_stream    - all 'emlap%' tokens with a per-work sequence `seq_full`.
#   * emlap_content_stream - tokens with a lemma and pos <> 'PUNCT', numbered by
#     `seq_content`, plus look-ahead columns taken from the next one/two content
#     tokens of the same work: their lemmas (l2, l3), their positions in the full
#     stream (next1_seq_full, next2_seq_full) and the space-joined lemma
#     bigram/trigram starting at the row (n2, n3; NULL when the work has no
#     further content token).
# Both statements are sent in a single execute() call.
conn.execute("""
-- Full stream with a stable per-work order
CREATE OR REPLACE TEMP TABLE emlap_full_stream AS
SELECT
  t.grela_id,
  t.sentence_id,
  s.position AS sentence_position,
  t.token_id,
  t.token_text,
  LOWER(t.lemma) AS lemma_lower,
  t.pos,
  t.char_start,
  t.char_end,
  ROW_NUMBER() OVER (
    PARTITION BY t.grela_id
    ORDER BY s.position, t.char_start
  ) AS seq_full
FROM tokens t
JOIN works w  ON t.grela_id = w.grela_id
JOIN sentences s USING (sentence_id)
WHERE w.grela_id LIKE 'emlap%';

-- Content-only stream aligned to full stream with precomputed next hops/grams
CREATE OR REPLACE TEMP TABLE emlap_content_stream AS
WITH c AS (
  SELECT
    f.*,
    ROW_NUMBER() OVER (
      PARTITION BY f.grela_id
      ORDER BY f.sentence_position, f.char_start
    ) AS seq_content
  FROM emlap_full_stream f
  WHERE f.lemma_lower IS NOT NULL AND f.pos <> 'PUNCT'
)
SELECT
  c.*,
  LEAD(c.lemma_lower, 1) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content) AS l2,
  LEAD(c.lemma_lower, 2) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content) AS l3,
  LEAD(c.seq_full,     1) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content) AS next1_seq_full,
  LEAD(c.seq_full,     2) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content) AS next2_seq_full,
  -- optional prejoined strings to avoid CONCAT in the hot path
  CASE WHEN LEAD(c.lemma_lower,1) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content) IS NOT NULL
       THEN c.lemma_lower || ' ' || LEAD(c.lemma_lower,1) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content)
  END AS n2,
  CASE WHEN LEAD(c.lemma_lower,2) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content) IS NOT NULL
       THEN c.lemma_lower || ' ' || LEAD(c.lemma_lower,1) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content) || ' ' ||
            LEAD(c.lemma_lower,2) OVER (PARTITION BY c.grela_id ORDER BY c.seq_content)
  END AS n3
FROM c;
""")

<_duckdb.DuckDBPyConnection at 0x7b1ee9d2d970>

In [37]:
import re
import unicodedata
import pandas as pd
from typing import Optional

def concordance_for_target_across_sentences(
    conn,
    target_canonical: str | None,
    target_relemmatized: str | None,
    window: int = 10,
    include_tokens: bool = True,          # NEW: build/return kwic_tokens & target_sentence_tokens
    max_hits: Optional[int] = None,       # NEW: LIMIT for top rows (post-order)
    out_path: Optional[str] = None,       # NEW: stream directly to Parquet if set
):
    """
    Cross-sentence KWIC in EMLAP with strict adjacency, searching BOTH lemma and token_text
    for BOTH target_canonical and target_relemmatized. Normalizes Latin: lowercase, strip
    diacritics, æ→ae, œ→oe, j→i, v→u, condenses whitespace.

    De-duplicates hits so each (grela_id, target_sentence_id, start_seq_full) appears once,
    preferring lemma>token and canonical>relemmatized.

    Returns (per row):
      - target_phrase         : list[str]  (token_texts of the matched span)
      - target_from           : 'canonical' | 'relemmatized'
      - matched_by            : 'lemma' | 'token'
      - target_lemmata        : list[str]  (lemmata of the matched span)
      - target_token_ids      : list[int]  (token_ids of the matched span)
      - target_kwic_idx       : list[int]  (0-based positions within kwic_tokens)
      - target_sent_idx       : list[int]  (0-based positions within target_sentence_tokens)
      - grela_id              : str
      - target_sentence_id    : str
      - start_sentence_id     : str
      - end_sentence_id       : str
      - kwic_text             : str
      - kwic_tokens           : list[struct] (only if include_tokens=True; else NULL)
      - target_sentence_text  : str
      - target_sentence_tokens: list[struct] (only if include_tokens=True; else NULL)

    If out_path is provided, writes Parquet via DuckDB COPY and returns None.
    Otherwise, returns a pandas DataFrame.
    """

    # ---------- Normalization helpers ----------
    def _strip_diacritics(s: str) -> str:
        return "".join(ch for ch in unicodedata.normalize("NFKD", s) if not unicodedata.combining(ch))

    def _latin_norm(s: str | None) -> str | None:
        if not s or not isinstance(s, str):
            return None
        s = s.strip().lower()
        s = _strip_diacritics(s)
        s = (s.replace("æ", "ae").replace("œ", "oe").replace("j", "i").replace("v", "u"))
        s = s.replace("_", " ")
        s = re.sub(r"\s+", " ", s).strip()
        return s or None

    def _prep(t: str | None):
        if not t:
            return 0, ("", "", ""), ""
        words = t.split()
        if not (1 <= len(words) <= 3):
            raise ValueError("Only 1–3-word targets supported (MAX_N=3).")
        w = tuple(words + ["", "", ""])[:3]
        return len(words), w, " ".join(words)

    tc = _latin_norm(target_canonical)
    tr = _latin_norm(target_relemmatized)
    if not tc and not tr:
        raise ValueError("Provide at least one of target_canonical or target_relemmatized.")

    tc_len, (tc_w1, tc_w2, tc_w3), tc_phrase = _prep(tc)
    tr_len, (tr_w1, tr_w2, tr_w3), tr_phrase = _prep(tr)

    # ---------- SQL-side normalizer ----------
    def NORM(expr: str) -> str:
        # lower + æ/œ/j/v normalization
        return ("replace(replace(replace(replace(lower({x}), 'æ', 'ae'), 'œ', 'oe'), 'j', 'i'), 'v', 'u')"
                .format(x=expr))

    # lemma→token fallback when lemma_lower is empty
    LEMMA_OR_TOKEN_NORM = NORM("coalesce(nullif(cs.lemma_lower, ''), cs.token_text)")

    # ---------- SQL templates (heavy vs light) ----------
    # Common prelude through context + target_enrich (always needed)
    sql_core = f"""
WITH raw_matches AS (
  -- 1) lemma matches: canonical
  SELECT cs.grela_id, cs.sentence_id AS target_sentence_id, cs.seq_full AS start_seq_full,
         ?::INT AS target_len, 'lemma' AS matched_by, 'canonical' AS target_from, ?::VARCHAR AS target_phrase
  FROM emlap_content_stream cs
  WHERE ? AND (
    (? = 1 AND {LEMMA_OR_TOKEN_NORM} = ?)
    OR (? = 2 AND {NORM('cs.n2')} = ?)
    OR (? = 3 AND {NORM('cs.n3')} = ?)
  )

  UNION ALL

  -- 2) lemma matches: relemmatized
  SELECT cs.grela_id, cs.sentence_id, cs.seq_full,
         ?::INT, 'lemma', 'relemmatized', ?::VARCHAR
  FROM emlap_content_stream cs
  WHERE ? AND (
    (? = 1 AND {LEMMA_OR_TOKEN_NORM} = ?)
    OR (? = 2 AND {NORM('cs.n2')} = ?)
    OR (? = 3 AND {NORM('cs.n3')} = ?)
  )

  UNION ALL

  -- 3) token_text matches: canonical (strict adjacency)
  SELECT f1.grela_id, f1.sentence_id, f1.seq_full,
         ?::INT, 'token', 'canonical', ?::VARCHAR
  FROM emlap_full_stream f1
  LEFT JOIN emlap_full_stream f2
    ON f2.grela_id = f1.grela_id AND f2.seq_full = f1.seq_full + 1
  LEFT JOIN emlap_full_stream f3
    ON f3.grela_id = f1.grela_id AND f3.seq_full = f1.seq_full + 2
  WHERE ? AND (
    (? = 1 AND {NORM('f1.token_text')} = ?)
    OR (? = 2 AND {NORM('f1.token_text')} = ? AND {NORM('f2.token_text')} = ?)
    OR (? = 3 AND {NORM('f1.token_text')} = ? AND {NORM('f2.token_text')} = ? AND {NORM('f3.token_text')} = ?)
  )

  UNION ALL

  -- 4) token_text matches: relemmatized (strict adjacency)
  SELECT f1.grela_id, f1.sentence_id, f1.seq_full,
         ?::INT, 'token', 'relemmatized', ?::VARCHAR
  FROM emlap_full_stream f1
  LEFT JOIN emlap_full_stream f2
    ON f2.grela_id = f1.grela_id AND f2.seq_full = f1.seq_full + 1
  LEFT JOIN emlap_full_stream f3
    ON f3.grela_id = f1.grela_id AND f3.seq_full = f1.seq_full + 2
  WHERE ? AND (
    (? = 1 AND {NORM('f1.token_text')} = ?)
    OR (? = 2 AND {NORM('f1.token_text')} = ? AND {NORM('f2.token_text')} = ?)
    OR (? = 3 AND {NORM('f1.token_text')} = ? AND {NORM('f2.token_text')} = ? AND {NORM('f3.token_text')} = ?)
  )
),
ranked AS (
  SELECT *, ROW_NUMBER() OVER (
    PARTITION BY grela_id, target_sentence_id, start_seq_full
    ORDER BY
      CASE matched_by WHEN 'lemma' THEN 0 ELSE 1 END,
      CASE target_from WHEN 'canonical' THEN 0 ELSE 1 END
  ) AS rn
  FROM raw_matches
),
uniq_matches AS (
  SELECT grela_id, target_sentence_id, start_seq_full, target_len, matched_by, target_from, target_phrase
  FROM ranked
  WHERE rn = 1
),
bounds AS (
  SELECT m.grela_id, m.target_sentence_id, m.start_seq_full, m.target_len, m.matched_by, m.target_from, m.target_phrase,
         CASE m.target_len WHEN 1 THEN m.start_seq_full WHEN 2 THEN cs.next1_seq_full WHEN 3 THEN cs.next2_seq_full END AS end_seq_full
  FROM uniq_matches m
  JOIN emlap_content_stream cs
    ON cs.grela_id = m.grela_id AND cs.seq_full = m.start_seq_full
  WHERE CASE m.target_len
          WHEN 1 THEN TRUE
          WHEN 2 THEN cs.next1_seq_full = m.start_seq_full + 1
          WHEN 3 THEN cs.next2_seq_full = m.start_seq_full + 2
        END
),
context AS (
  SELECT b.grela_id, b.target_sentence_id, b.start_seq_full, b.target_len, b.matched_by, b.target_from, b.target_phrase,
         f.sentence_id, f.token_id, f.token_text, f.lemma_lower, f.pos, f.char_start, f.char_end,
         ROW_NUMBER() OVER (
           PARTITION BY b.grela_id, b.target_sentence_id, b.start_seq_full
           ORDER BY f.seq_full
         ) AS ord,
         (f.seq_full BETWEEN b.start_seq_full AND b.end_seq_full) AS is_target
  FROM bounds b
  JOIN emlap_full_stream f
    ON f.grela_id = b.grela_id
   AND f.seq_full BETWEEN (b.start_seq_full - ?) AND (b.end_seq_full + ?)
),
needed_sentences AS (
  SELECT DISTINCT grela_id, target_sentence_id AS sentence_id
  FROM context
),
sentence_map AS (
  SELECT
    e.grela_id, e.sentence_id, e.token_id,
    ROW_NUMBER() OVER (PARTITION BY e.grela_id, e.sentence_id ORDER BY e.char_start) - 1 AS sent_idx0
  FROM emlap_full_stream e
  JOIN needed_sentences n
    ON n.grela_id = e.grela_id AND n.sentence_id = e.sentence_id
),
target_enrich AS (
  SELECT
    c.grela_id, c.target_sentence_id, c.start_seq_full,
    LIST(c.token_text)   FILTER (WHERE c.is_target) AS target_phrase,
    LIST(c.lemma_lower)  FILTER (WHERE c.is_target) AS target_lemmata,
    LIST(c.token_id)     FILTER (WHERE c.is_target) AS target_token_ids,
    LIST(c.ord - 1)      FILTER (WHERE c.is_target) AS target_kwic_idx,
    LIST(sm.sent_idx0)   FILTER (WHERE c.is_target) AS target_sent_idx
  FROM context c
  LEFT JOIN sentence_map sm
    ON sm.grela_id = c.grela_id
   AND sm.sentence_id = c.sentence_id
   AND sm.token_id = c.token_id
  GROUP BY c.grela_id, c.target_sentence_id, c.start_seq_full
)
"""

    # Heavy variant (includes kwic_tokens + target_sentence_tokens)
    sql_heavy_tail = """
,agg_kwic AS (
  SELECT
    grela_id, target_sentence_id, start_seq_full,
    ANY_VALUE(target_len)   AS target_len,
    ANY_VALUE(matched_by)   AS matched_by,
    ANY_VALUE(target_from)  AS target_from,
    LIST(sentence_id ORDER BY ord)           AS window_sentence_ids,
    STRING_AGG(token_text, ' ' ORDER BY ord) AS kwic_text,
    LIST(
      STRUCT_PACK(
        token_id := token_id,
        token_text := token_text,
        lemma := lemma_lower,
        pos := pos,
        sentence_id := sentence_id,
        char_start := char_start,
        char_end := char_end
      )
      ORDER BY ord
    ) AS kwic_tokens
  FROM context
  GROUP BY grela_id, target_sentence_id, start_seq_full
),
target_sentence_texts AS (
  SELECT e.grela_id, e.sentence_id,
         STRING_AGG(e.token_text, ' ' ORDER BY e.char_start) AS sentence_text
  FROM emlap_full_stream e
  JOIN needed_sentences n
    ON n.grela_id = e.grela_id AND n.sentence_id = e.sentence_id
  GROUP BY e.grela_id, e.sentence_id
),
target_sentence_tokens AS (
  SELECT e.grela_id, e.sentence_id,
         LIST(
           STRUCT_PACK(
             token_id := e.token_id,
             token_text := e.token_text,
             lemma := e.lemma_lower,
             pos := e.pos,
             char_start := e.char_start,
             char_end := e.char_end
           )
           ORDER BY e.char_start
         ) AS sentence_tokens
  FROM emlap_full_stream e
  JOIN needed_sentences n
    ON n.grela_id = e.grela_id AND n.sentence_id = e.sentence_id
  GROUP BY e.grela_id, e.sentence_id
)
SELECT
  te.target_phrase,
  a.target_from,
  a.matched_by,
  te.target_lemmata,
  te.target_token_ids,
  te.target_kwic_idx,
  te.target_sent_idx,
  a.grela_id,
  a.target_sentence_id,
  a.window_sentence_ids[1]  AS start_sentence_id,
  a.window_sentence_ids[-1] AS end_sentence_id,
  a.kwic_text,
  a.kwic_tokens,
  tst.sentence_text         AS target_sentence_text,
  tstok.sentence_tokens     AS target_sentence_tokens
FROM agg_kwic a
JOIN target_enrich te
  ON te.grela_id = a.grela_id
 AND te.target_sentence_id = a.target_sentence_id
 AND te.start_seq_full = a.start_seq_full
LEFT JOIN target_sentence_texts  tst
  ON tst.grela_id = a.grela_id AND tst.sentence_id = a.target_sentence_id
LEFT JOIN target_sentence_tokens tstok
  ON tstok.grela_id = a.grela_id AND tstok.sentence_id = a.target_sentence_id
ORDER BY a.grela_id, a.target_sentence_id, a.start_seq_full
"""

    # Light variant (no kwic_tokens / target_sentence_tokens)
    sql_light_tail = """
,agg_kwic AS (
  SELECT
    grela_id, target_sentence_id, start_seq_full,
    ANY_VALUE(target_len)   AS target_len,
    ANY_VALUE(matched_by)   AS matched_by,
    ANY_VALUE(target_from)  AS target_from,
    LIST(sentence_id ORDER BY ord)           AS window_sentence_ids,
    STRING_AGG(token_text, ' ' ORDER BY ord) AS kwic_text
  FROM context
  GROUP BY grela_id, target_sentence_id, start_seq_full
),
target_sentence_texts AS (
  SELECT e.grela_id, e.sentence_id,
         STRING_AGG(e.token_text, ' ' ORDER BY e.char_start) AS sentence_text
  FROM emlap_full_stream e
  JOIN needed_sentences n
    ON n.grela_id = e.grela_id AND n.sentence_id = e.sentence_id
  GROUP BY e.grela_id, e.sentence_id
)
SELECT
  te.target_phrase,
  a.target_from,
  a.matched_by,
  te.target_lemmata,
  te.target_token_ids,
  te.target_kwic_idx,
  te.target_sent_idx,
  a.grela_id,
  a.target_sentence_id,
  a.window_sentence_ids[1]  AS start_sentence_id,
  a.window_sentence_ids[-1] AS end_sentence_id,
  a.kwic_text,
  NULL                      AS kwic_tokens,
  tst.sentence_text         AS target_sentence_text,
  NULL                      AS target_sentence_tokens
FROM agg_kwic a
JOIN target_enrich te
  ON te.grela_id = a.grela_id
 AND te.target_sentence_id = a.target_sentence_id
 AND te.start_seq_full = a.start_seq_full
LEFT JOIN target_sentence_texts  tst
  ON tst.grela_id = a.grela_id AND tst.sentence_id = a.target_sentence_id
ORDER BY a.grela_id, a.target_sentence_id, a.start_seq_full
"""

    sql_tail = sql_heavy_tail if include_tokens else sql_light_tail
    sql = sql_core + sql_tail

    # Add LIMIT if requested
    if max_hits is not None:
        sql = sql + f"\nLIMIT {int(max_hits)}"

    # ---------- Bind parameters ----------
    params = [
        # 1) lemma canonical
        tc_len, tc_phrase or "", bool(tc),
        tc_len, (tc or ""), tc_len, (tc or ""), tc_len, (tc or ""),
        # 2) lemma relemmatized
        tr_len, tr_phrase or "", bool(tr),
        tr_len, (tr or ""), tr_len, (tr or ""), tr_len, (tr or ""),
        # 3) token canonical
        tc_len, tc_phrase or "", bool(tc),
        tc_len, tc_w1,
        tc_len, tc_w1, tc_w2,
        tc_len, tc_w1, tc_w2, tc_w3,
        # 4) token relemmatized
        tr_len, tr_phrase or "", bool(tr),
        tr_len, tr_w1,
        tr_len, tr_w1, tr_w2,
        tr_len, tr_w1, tr_w2, tr_w3,
        # window
        window, window,
    ]

    # ---------- Execute ----------
    if out_path:
        # COPY requires literal path; embed safely
        sql_nosemi = sql.rstrip().rstrip(';')
        out_quoted = "'" + out_path.replace("'", "''") + "'"
        conn.execute(f"COPY ({sql_nosemi}) TO {out_quoted} (FORMAT PARQUET);", params)
        return None
    else:
        return conn.execute(sql, params).fetch_df()

In [38]:
# Preview the first five multi-word targets.
is_multiword = lexeme_df["target_len"].gt(1)
lexeme_df.loc[is_multiword].head(5)

Unnamed: 0,Lemma,lemmas_all,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all,target_len,target_relemmatized
23,Lapis maior,[Lapis maior],lapis maior,"[Ruland1612-Lapis-maior, Ruland1612-Lapis-maio...",3,[L],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Lapis-m...",3,"[finde nach etlicher Meynung, die calcinirten ...","[None, None, id est, der Geist außgezogen auß ...","[finde nach etlicher Meynung, die calcinirten ...","[{'entry_id': 'Ruland1612-Lapis-maior', 'quote...",2,lapis magnus
24,Leo viridis,[Leo viridis],leo viridis,"[Ruland1612-Leo-viridis, Ruland1612-Leo-viridi...",3,[L],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Leo-vir...",3,"[ist Hermetis Erz Glaß ond vitriol, vnd das Bl...","[None, quorundam opinione aurum, vitriolum]","[ist Hermetis Erz Glaß ond vitriol, vnd das Bl...","[{'entry_id': 'Ruland1612-Leo-viridis', 'quote...",2,leo uiridis
25,Machina tractoria,[Machina tractoria],machina tractoria,"[Ruland1612-Machina-tractoria, Ruland1612-Mach...",3,[M],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Machina...",3,"[Haspel, Gezeugsso Derg ond Waͤsser heben., Sc...","[None, None, None]","[Haspel, Gezeugsso Derg ond Waͤsser heben., Sc...","[{'entry_id': 'Ruland1612-Machina-tractoria', ...",2,machina tractoria
30,Panum aereorum species,[Panum aereorum species],panum aereorum species,"[Ruland1612-Panum-aereorum-species-,-Vorpleits...",3,[P],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Panum-a...",3,"[Vorpleitstein, Vorpleitloch., Lech, zwir Vorp...","[None, None, None]","[Vorpleitstein, Vorpleitloch., Lech, zwir Vorp...",[{'entry_id': 'Ruland1612-Panum-aereorum-speci...,3,panis aereus species
32,Pyritae aurei coloris,[Pyritae aurei coloris],pyritae aurei coloris,[Ruland1612-Pyritae-aurei-coloris-.-1.-Pyrites...,3,[P],[],[],"[{'def': '1', 'entry_id': 'Ruland1612-Pyritae-...",3,"[1, è quibus ignis elicitur, è quibus ignis no...","[1, è quibus ignis elicitur, è quibus ignis no...",[1. Pyrites qui est solidus & colorem auri tot...,[{'entry_id': 'Ruland1612-Pyritae-aurei-colori...,3,pyritus aureus color


In [39]:
# All entries whose canonical target begins with "mercurius".
starts_with_mercurius = lexeme_df["target_canonical"].str.startswith("mercurius")
lexeme_df.loc[starts_with_mercurius]

Unnamed: 0,Lemma,lemmas_all,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all,target_len,target_relemmatized
27,Mercurius,[Mercurius],mercurius,"[Ruland1612-Mercurius, Ruland1612-Mercurius, R...",3,[M],[],[],"[{'def': 'id est, sulphur.', 'entry_id': 'Rula...",3,"[id est, sulphur., est principium materiale, v...","[id est, sulphur., est principium materiale, v...","[id est, sulphur. Mercurius. Mercurius ist in ...","[{'entry_id': 'Ruland1612-Mercurius', 'quote':...",1,mercurius
96,Mercurius metallorum,[Mercurius metallorum],mercurius metallorum,"[Ruland1612-Mercurius-metallorum, Ruland1612-M...",2,[M],[],[],[{'def': 'Ist darauß die Natur der Coͤrper gez...,2,[Ist darauß die Natur der Coͤrper gezogen wird...,[Ist darauß die Natur der Coͤrper gezogen wird...,[Ist darauß die Natur der Coͤrper gezogen wird...,[{'entry_id': 'Ruland1612-Mercurius-metallorum...,2,mercurius metallum
1881,Mercurius argentipigmentum,[Mercurius argentipigmentum],mercurius argentipigmentum,[Ruland1612-Mercurius-argentipigmentum],1,[M],[],[],"[{'def': 'ist Schwefel victriol, Alaun Saltz-d...",1,"[ist Schwefel victriol, Alaun Saltz-dieweils d...","[ist Schwefel victriol, Alaun Saltz-dieweils d...","[ist Schwefel victriol, Alaun Saltz-dieweils d...",[{'entry_id': 'Ruland1612-Mercurius-argentipig...,2,mercurius argentipigmentum
1882,Mercurius chambar,[Mercurius chambar],mercurius chambar,[Ruland1612-Mercurius-chambar],1,[M],[],[],"[{'def': 'ist magnesia, fuessend Coͤrper und W...",1,"[ist magnesia, fuessend Coͤrper und Wasser.]","[ist magnesia, fuessend Coͤrper und Wasser.]","[ist magnesia, fuessend Coͤrper und Wasser.]","[{'entry_id': 'Ruland1612-Mercurius-chambar', ...",2,mercurius chaambar
1883,Mercurius corallinus,[Mercurius corallinus],mercurius corallinus,[Ruland1612-Mercurius-corallinus],1,[M],[],[],[{'def': 'qui per oleum ouorum & aquas alias i...,1,[qui per oleum ouorum & aquas alias in rubedin...,[qui per oleum ouorum & aquas alias in rubedin...,[qui per oleum ouorum & aquas alias in rubedin...,[],2,mercurius corallinus
1884,Mercurius crudus,[Mercurius crudus],mercurius crudus,[Ruland1612-Mercurius-crudus],1,[M],[],[],"[{'def': 'est is, qui nondum separatus est a s...",1,"[est is, qui nondum separatus est a sua matric...","[est is, qui nondum separatus est a sua matric...","[est is, qui nondum separatus est a sua matric...","[{'entry_id': 'Ruland1612-Mercurius-crudus', '...",2,mercurius crudus
1885,Mercurius crystallinus,[Mercurius crystallinus],mercurius crystallinus,"[Ruland1612-Mercurius-crystallinus,]",1,[M],[],[],[{'def': 'qui saepe sublimatus est in formam c...,1,[qui saepe sublimatus est in formam crystalli ...,[qui saepe sublimatus est in formam crystalli ...,[qui saepe sublimatus est in formam crystalli ...,[],2,mercurius crystallinus
1886,Mercurius laxus,[Mercurius laxus],mercurius laxus,[Ruland1612-Mercurius-laxus],1,[M],[],[],"[{'def': 'est turbith minerale.', 'entry_id': ...",1,[est turbith minerale.],[est turbith minerale.],[est turbith minerale.],[],2,mercurius laxus
1887,Mercurius mineralium,[Mercurius mineralium],mercurius mineralium,[Ruland1612-Mercurius-mineralium],1,[M],[],[],"[{'def': 'est oleitas, vel vnctuositas de mine...",1,"[est oleitas, vel vnctuositas de mineris auri ...","[est oleitas, vel vnctuositas de mineris auri ...","[est oleitas, vel vnctuositas de mineris auri ...",[{'entry_id': 'Ruland1612-Mercurius-mineralium...,2,mercurius mineralis
1888,Mercurius regeneratus,[Mercurius regeneratus],mercurius regeneratus,[Ruland1612-Mercurius-regeneratus],1,[M],[],[],"[{'def': 'est primum ens Mercurii.', 'entry_id...",1,[est primum ens Mercurii.],[est primum ens Mercurii.],[est primum ens Mercurii.],[],2,mercurius regenero


In [67]:
# Example query for a single two-word lexeme, with 10 tokens of context per side.
target_canonical = "mercurius metallorum"     # alternative: "lapis philosophorum"
target_relemmatized = "mercurius metallum"    # alternative: "lapis philosophus"

df = concordance_for_target_across_sentences(
    conn,
    target_canonical=target_canonical,
    target_relemmatized=target_relemmatized,
    window=10,
)

In [68]:
df

Unnamed: 0,target_phrase,target_from,matched_by,target_lemmata,target_token_ids,target_kwic_idx,target_sent_idx,grela_id,target_sentence_id,start_sentence_id,end_sentence_id,kwic_text,kwic_tokens,target_sentence_text,target_sentence_tokens
0,"[mercurio, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[2096652, 2096653]","[10, 11]","[53, 54]",emlap_100007,emlap_100007_1188,emlap_100007_1188,emlap_100007_1189,"omnibus moribus predominatur ignis , & illud d...","[{'token_id': 2096642, 'token_text': 'omnibus'...",Uiuum causat metalla quamuis adhuc bene differ...,"[{'token_id': 2096599, 'token_text': 'Uiuum', ..."
1,"[mercurius, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[162392, 162393]","[10, 11]","[28, 29]",emlap_100010,emlap_100010_404,emlap_100010_404,emlap_100010_404,"similitudine uerae calcis communis , quia arge...","[{'token_id': 162382, 'token_text': 'similitud...",Compraehendis igitur ex supradictis rationibus...,"[{'token_id': 162364, 'token_text': 'Compraehe..."
2,"[mercurium, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[164244, 164245]","[10, 11]","[6, 7]",emlap_100010,emlap_100010_504,emlap_100010_503,emlap_100010_504,"sulphure uerba faciemus . Prius diximus , per ...","[{'token_id': 164234, 'token_text': 'sulphure'...","Prius diximus , per calcinationem , mercurium ...","[{'token_id': 164238, 'token_text': 'Prius', '..."
3,"[Mercurio, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[3471717, 3471718]","[10, 11]","[54, 55]",emlap_100011,emlap_100011_3739,emlap_100011_3739,emlap_100011_3740,"omnibus modis praedominatur ignis , & illud di...","[{'token_id': 3471707, 'token_text': 'omnibus'...","Uiuum causat metalla , quamuis unum differt ab...","[{'token_id': 3471663, 'token_text': 'Uiuum', ..."
4,"[Mercurii, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[3331131, 3331132]","[10, 11]","[3, 4]",emlap_100016,emlap_100016_5803,emlap_100016_5802,emlap_100016_5804,Quomodo argentum uiuum in sulphur conuertatur ...,"[{'token_id': 3331121, 'token_text': 'Quomodo'...",car . 137 Mercurii metallorum quomodo differant .,"[{'token_id': 3331128, 'token_text': 'car', 'l..."
5,"[Mercurio, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[3335498, 3335499]","[10, 11]","[57, 58]",emlap_100022,emlap_100022_129,emlap_100022_129,emlap_100022_130,"omnibus moribus praedominatur ignis , & illud ...","[{'token_id': 3335488, 'token_text': 'omnibus'...","Uiuum causat metalla , quamuis adhuc bene diff...","[{'token_id': 3335441, 'token_text': 'Uiuum', ..."
6,"[Mercurium, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[3339895, 3339896]","[10, 11]","[13, 14]",emlap_100022,emlap_100022_305,emlap_100022_305,emlap_100022_305,", tunc sequeretur , quod de nouo extra primam ...","[{'token_id': 3339885, 'token_text': ',', 'lem...","Si esset possibile , tunc sequeretur , quod de...","[{'token_id': 3339882, 'token_text': 'Si', 'le..."
7,"[mercurium, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[2007025, 2007026]","[10, 11]","[16, 17]",emlap_100029,emlap_100029_457,emlap_100029_457,emlap_100029_458,"eorum , quae isti minoralia uocant , quintam e...","[{'token_id': 2007015, 'token_text': 'eorum', ...",Obseruandum igitur in Elixiris insequentibus s...,"[{'token_id': 2007009, 'token_text': 'Obseruan..."
8,"[mercurius, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[1959359, 1959360]","[10, 11]","[24, 25]",emlap_100033,emlap_100033_433,emlap_100033_433,emlap_100033_433,mercurii praeparatio hec omnia in sese contine...,"[{'token_id': 1959349, 'token_text': 'mercurii...","nec alia aqua uitae metallica & mercurialis , ...","[{'token_id': 1959335, 'token_text': 'nec', 'l..."
9,"[mercurio, metallorum]",relemmatized,lemma,"[mercurius, metallum]","[1135208, 1135209]","[10, 11]","[2, 3]",emlap_100035,emlap_100035_841,emlap_100035_839,emlap_100035_842,"simul : prior , cum uulgi mercurio : secunda ,...","[{'token_id': 1135198, 'token_text': 'simul', ...","secunda , mercurio metallorum :","[{'token_id': 1135206, 'token_text': 'secunda'..."


In [42]:
def _instances_for_row(row):
    """Run the concordance query for one lexeme row and return its hits as a list of dicts."""
    hits = concordance_for_target_across_sentences(
        conn, row["target_canonical"], row["target_relemmatized"], window=10
    )
    return hits.to_dict("records")


# Smoke test on a fixed random sample of ten lexemes.
lexeme_sample = lexeme_df.sample(10, random_state=1)
lexeme_df_sample_emlap_instances = lexeme_sample.apply(_instances_for_row, axis=1)
lexeme_df_sample_emlap_instances

2813                                                   []
836     [{'target_phrase': ['capsis'], 'target_from': ...
1241    [{'target_phrase': ['Essarae'], 'target_from':...
1730                                                   []
1156    [{'target_phrase': ['dispoliare'], 'target_fro...
1164                                                   []
2625    [{'target_phrase': ['terra', 'foliata'], 'targ...
547                                                    []
535     [{'target_phrase': ['Arohot'], 'target_from': ...
2578    [{'target_phrase': ['Syphita', 'praua'], 'targ...
dtype: object

In [43]:
from pathlib import Path
import re
import pandas as pd
from pathlib import Path

# Output folder for the per-lexeme concordance files.
outdir = Path("../data/large_files/emlap_ruland_instances/")
# parents=True: also create ../data/large_files when it does not exist yet;
# without it mkdir raises FileNotFoundError on a fresh checkout.
outdir.mkdir(parents=True, exist_ok=True)

In [44]:

def safe_name(s: str | None) -> str:
    """Make a safe short filename component from a target phrase."""
    if not s or not isinstance(s, str):
        return "unknown"
    s = s.strip().lower().replace(" ", "_")
    s = re.sub(r"[^a-z0-9_]+", "", s)   # keep only safe chars
    return s or "unnamed"

In [69]:
# Derive each row's Parquet filename from its relemmatized target phrase.
lexeme_df["instance_fname"] = [
    safe_name(phrase) + ".parquet" for phrase in lexeme_df["target_relemmatized"]
]

In [70]:
# Show every row whose filename is shared with at least one other row.
shares_fname = lexeme_df.duplicated(subset="instance_fname", keep=False)
lexeme_df.loc[shares_fname]

Unnamed: 0,Lemma,lemmas_all,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all,target_len,target_relemmatized,instance_fname
83,Lapis Iudaicus,[Lapis Iudaicus],lapis iudaicus,"[Ruland1612-Lapis-Iudaicus, Ruland1612-Lapis-I...",2,[L],[],[],"[{'def': 'Thecolithos lapis Iudaicus dicitur, ...",2,"[Thecolithos lapis Iudaicus dicitur, quia in P...","[Thecolithos lapis Iudaicus dicitur, quia in P...","[Thecolithos lapis Iudaicus dicitur, quia in P...","[{'entry_id': 'Ruland1612-Lapis-Iudaicus', 'qu...",2,lapis iudaicus,lapis_iudaicus.parquet
640,Bacilla ferrea,[Bacilla ferrea],bacilla ferrea,[Ruland1612-Bacilla-ferrea],1,[B],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Bacilla...",1,[Stabeisen],[None],[Stabeisen],"[{'entry_id': 'Ruland1612-Bacilla-ferrea', 'qu...",2,bacillum ferreus,bacillum_ferreus.parquet
642,Bacillum ferreum,[Bacillum ferreum],bacillum ferreum,[Ruland1612-Bacillum-ferreum],1,[B],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Bacillu...",1,[Nagel/Nahel/ Brecheisen / Steckeisen.],[None],[Nagel/Nahel/ Brecheisen / Steckeisen.],"[{'entry_id': 'Ruland1612-Bacillum-ferreum', '...",2,bacillum ferreus,bacillum_ferreus.parquet
1127,Dens pili,[Dens pili],dens pili,[Ruland1612-Dens-pili],1,[D],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Dens-pi...",1,[Deuͤmling.],[None],[Deuͤmling.],"[{'entry_id': 'Ruland1612-Dens-pili', 'quote':...",2,dens pilum,dens_pilum.parquet
1130,Dentes pili,[Dentes pili],dentes pili,[Ruland1612-Dentes-pili],1,[D],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Dentes-...",1,[Zacken],[None],[Zacken],"[{'entry_id': 'Ruland1612-Dentes-pili', 'quote...",2,dens pilum,dens_pilum.parquet
1666,Lapides Iudaici,[Lapides Iudaici],lapides iudaici,[Ruland1612-Lapides-Iudaici],1,[L],[],[],"[{'def': '1', 'entry_id': 'Ruland1612-Lapides-...",1,[1],[1],"[1. nucleo oliuae similes, striati: qui lapide...","[{'entry_id': 'Ruland1612-Lapides-Iudaici', 'q...",2,lapis iudaicus,lapis_iudaicus.parquet
2071,Partes fodinae,[Partes fodinae],partes fodinae,[Ruland1612-Partes-fodinae],1,[P],[],[],"[{'def': 'vel cunili,', 'entry_id': 'Ruland161...",1,"[vel cunili,]","[vel cunili,]","[vel cunili, Tril.]","[{'entry_id': 'Ruland1612-Partes-fodinae', 'qu...",2,pars fodina,pars_fodina.parquet
2072,Partes fodinarum,[Partes fodinarum],partes fodinarum,[Ruland1612-Partes-fodinarum],1,[P],[],[],"[{'def': 'vel cuniculi,', 'entry_id': 'Ruland1...",1,"[vel cuniculi,]","[vel cuniculi,]","[vel cuniculi, Kuckuß.]","[{'entry_id': 'Ruland1612-Partes-fodinarum', '...",2,pars fodina,pars_fodina.parquet


In [51]:
# Filenames already exported by the loop below. Re-running this cell resets the
# list, so the loop will then regenerate every file.
done = []

In [56]:
# Number of filenames recorded as exported so far.
len(done)

1075

In [None]:
# One-off export for a single hand-picked target phrase.
# (The former `fname = "mercurius"` was overwritten before use and is removed.)
tc = "mercurius metallorum"   # passed as the canonical form
tr = "mercurius metallum"     # passed as the relemmatized form
name = safe_name(tc)
fname = f"{name}.parquet"
fpath = str(outdir / fname)
concordance_for_target_across_sentences(conn, tc, tr, window=10, out_path=fpath)

In [71]:
# Export one Parquet file of concordance hits per lexicon row.
# `done` holds the filenames already written, so re-running this cell after an
# interruption skips them instead of starting over.
for i, row in lexeme_df.iterrows():
    tc = row.get("target_canonical")
    tr = row.get("target_relemmatized")
    fname = f"{safe_name(tc)}.parquet"
    if fname in done:
        continue
    fpath = str(outdir / fname)
    # Streams directly to Parquet (no df = ..., no df.to_parquet).
    concordance_for_target_across_sentences(conn, tc, tr, window=10, out_path=fpath)
    print(f"[{i}] saved → {fpath}")
    # Recorded only after the call returned, so an interrupted target is retried.
    done.append(fname)

[23] saved → ../data/large_files/emlap_ruland_instances/lapis_maior.parquet
[24] saved → ../data/large_files/emlap_ruland_instances/leo_viridis.parquet
[30] saved → ../data/large_files/emlap_ruland_instances/panum_aereorum_species.parquet
[32] saved → ../data/large_files/emlap_ruland_instances/pyritae_aurei_coloris.parquet
[33] saved → ../data/large_files/emlap_ruland_instances/quinta_essentia.parquet
[37] saved → ../data/large_files/emlap_ruland_instances/vectis.parquet
[38] saved → ../data/large_files/emlap_ruland_instances/vrina.parquet
[47] saved → ../data/large_files/emlap_ruland_instances/balneum_mariae.parquet
[48] saved → ../data/large_files/emlap_ruland_instances/balneum_roris.parquet
[53] saved → ../data/large_files/emlap_ruland_instances/calx_peregrinorum.parquet
[68] saved → ../data/large_files/emlap_ruland_instances/filius_vnius_diei.parquet
[72] saved → ../data/large_files/emlap_ruland_instances/flos_aeris.parquet
[79] saved → ../data/large_files/emlap_ruland_instances/in

RuntimeError: Query interrupted

In [65]:
# Count the entries currently present in the output directory.
sum(1 for _entry in os.listdir(outdir))

4282

In [54]:
#lexeme_df["emlap_instances"] = lexeme_df.apply(lambda row: concordance_for_target_across_sentences(conn, row["target_canonical"], row["target_relemmatized"], window=10).to_dict("records"), axis=1)

In [36]:
def read_hits(fname):
    """Load the saved concordance hits for one target as a list of record dicts.

    Args:
        fname: name of a Parquet file inside ``outdir``.

    Returns:
        ``pd.read_parquet(path).to_dict("records")`` for the file, or ``[]``
        when the file is missing or cannot be read.
    """
    import warnings  # local import so the function does not rely on another cell

    try:
        path = os.path.join(outdir, fname)
        return pd.read_parquet(path).to_dict("records")
    except FileNotFoundError:
        # No file for this name: keep the previous behaviour of returning no hits.
        return []
    except Exception as exc:
        # Still best-effort, but no longer silent: an unreadable file would
        # otherwise be indistinguishable from a target with zero hits.
        # KeyboardInterrupt/SystemExit are not caught here and propagate.
        warnings.warn(
            f"read_hits: could not read {fname!r} ({exc!r}); treating it as no hits"
        )
        return []
# Attach the stored hits of every row, looked up through its "fname" value.
hits_per_row = lexeme_df["fname"].map(read_hits)
lexeme_df["emlap_instances"] = hits_per_row

In [37]:
# Number of stored instances per row.
lexeme_df["emlap_instances_N"] = lexeme_df["emlap_instances"].map(len)

In [38]:
# The 20 rows with the most instances.
by_instance_count = lexeme_df.sort_values("emlap_instances_N", ascending=False)
by_instance_count.head(20)

Unnamed: 0,Lemma,target_canonical,entry_ids,entry_count,types,variants,notes,senses_flat,sense_count,sense_def_all,sense_def_strict_all,sense_raw_all,translations_all,target_relemmatized,target_len,fname,emlap_instances,emlap_instances_N
479,Aqua,aqua,[Ruland1612-Aqua],1,[A],[],[1. Dieses Wasser reiniget/maschet/meitet/mach...,"[{'def': 'id est, liquor', 'entry_id': 'Ruland...",1,"[id est, liquor]","[id est, liquor]","[, id est, liquor, das Wasser ist auch immer d...","[{'entry_id': 'Ruland1612-Aqua', 'quote': 'das...",aqua,1,00479_aqua.parquet,"[{'target_phrase': 'aqua', 'target_from': 'can...",20140
40,A,a,"[Ruland1612-A, Ruland1612-A-,-Ein-Diamanttaffe...",2,[A],[],[],"[{'def': 'vide ana', 'entry_id': 'Ruland1612-A...",2,"[vide ana, , Ein Diamanttaffel.]","[vide ana, None]","[vide ana, , Ein Diamanttaffel.]","[{'entry_id': 'Ruland1612-A-,-Ein-Diamanttaffe...",a,1,00040_a.parquet,"[{'target_phrase': 'a', 'target_from': 'canoni...",13138
1082,Corpus,corpus,[Ruland1612-Corpus],1,[C],[],[],"[{'def': 'Clang', 'entry_id': 'Ruland1612-Corp...",1,[Clang],[Clang],[Clang. Buce. Der Coͤrper ist ein metallisch W...,"[{'entry_id': 'Ruland1612-Corpus', 'quote': 'D...",corpus,1,01082_corpus.parquet,"[{'target_phrase': 'corpus', 'target_from': 'c...",12498
76,Ignis,ignis,"[Ruland1612-Ignis, Ruland1612-Ignis]",2,[I],[],[],"[{'def': None, 'entry_id': 'Ruland1612-Ignis',...",2,[Ist nach etlicher Meinung das Oel sokauff der...,"[None, Ignis pro lapide philos]",[Ist nach etlicher Meinung das Oel sokauff der...,"[{'entry_id': 'Ruland1612-Ignis', 'quote': 'Is...",ignis,1,00076_ignis.parquet,"[{'target_phrase': 'ignis', 'target_from': 'ca...",11656
617,Aurum,aurum,[Ruland1612-Aurum],1,[A],[],"[Natiuum purum, quod a natura tale est, cuius ...","[{'def': 'à Germanis', 'entry_id': 'Ruland1612...",1,[à Germanis],[à Germanis],"[, à Germanis Goldtsa Chymistis dicitur Sol, &...","[{'entry_id': 'Ruland1612-Aurum', 'quote': 'Go...",aurum,1,00617_aurum.parquet,"[{'target_phrase': 'aurum', 'target_from': 'ca...",7927
537,Argentum,argentum,"[Ruland1612-Argentum-à-Chymistis-Luna,-cui-eti...",1,[A],[Argentum],[],"[{'def': 'à Chymistis Luna, cui etiam tribuitu...",1,"[à Chymistis Luna, cui etiam tribuitur]","[à Chymistis Luna, cui etiam tribuitur]","[à Chymistis Luna, cui etiam tribuitur: estque...",[{'entry_id': 'Ruland1612-Argentum-à-Chymistis...,argentum,1,00537_argentum.parquet,"[{'target_phrase': 'argentum', 'target_from': ...",7391
29,Oleum,oleum,"[Ruland1612-Oleum, Ruland1612-Oleum, Ruland161...",3,[O],[],[],"[{'def': 'id est, ignis, wirdt außgezogen von ...",3,"[id est, ignis, wirdt außgezogen von einer tro...","[id est, ignis, wirdt außgezogen von einer tro...","[id est, ignis, wirdt außgezogen von einer tro...","[{'entry_id': 'Ruland1612-Oleum', 'quote': 'wi...",oleum,1,00029_oleum.parquet,"[{'target_phrase': 'oleum', 'target_from': 'ca...",7005
2610,Spiritus,spiritus,[Ruland1612-Spiritus],1,[S],[],[],"[{'def': 'est aqua soluens è re simplici, & ac...",1,"[est aqua soluens è re simplici, & acri produc...","[est aqua soluens è re simplici, & acri produc...","[est aqua soluens è re simplici, & acri produc...","[{'entry_id': 'Ruland1612-Spiritus', 'quote': ...",spiritus,1,02610_spiritus.parquet,"[{'target_phrase': 'spiritus', 'target_from': ...",6559
97,Metallum,metallum,"[Ruland1612-Metallum, Ruland1612-Metallum]",2,[M],[],[],"[{'def': 'vena, Ertz.', 'entry_id': 'Ruland161...",2,"[vena, Ertz., id est, conflatum argentum.]","[vena, Ertz., id est, conflatum argentum.]","[vena, Ertz., id est, conflatum argentum.]","[{'entry_id': 'Ruland1612-Metallum', 'quote': ...",metallum,1,00097_metallum.parquet,"[{'target_phrase': 'metallum', 'target_from': ...",5676
10,Sulphur,sulphur,"[Ruland1612-Sulphur, Ruland1612-Sulphur, Rulan...",4,[S],[],[],"[{'def': 'chibur, vel, Albusao Arabice dicitur...",4,"[chibur, vel, Albusao Arabice dicitur, pars la...","[chibur, vel, Albusao Arabice dicitur, pars la...","[chibur, vel, Albusao Arabice dicitur, pars la...","[{'entry_id': 'Ruland1612-Sulphur', 'quote': '...",sulphur,1,00010_sulphur.parquet,"[{'target_phrase': 'sulphur', 'target_from': '...",5320


In [39]:
def _work_ids(instances):
    """Return, for each instance, its ``grela_id`` minus the first six characters."""
    return [hit["grela_id"][6:] for hit in instances]


lexeme_df["instances_ids"] = lexeme_df["emlap_instances"].map(_work_ids)

In [40]:
# Work-level metadata, read from the `master` branch of the EMLAP_ETL repository.
EMLAP_METADATA_URL = "https://raw.githubusercontent.com/CCS-ZCU/EMLAP_ETL/refs/heads/master/data/emlap_metadata.csv"
emlap_metadata = pd.read_csv(EMLAP_METADATA_URL, sep=";")
emlap_metadata.head(5)

Unnamed: 0,working_title,filenames,no.,is_done,is_noscemus,if_noscemus_id,AUTHORSHIP,is_one_author,#if more than 1 author skip section and choose compendium below,is_author_known,...,publisher_comments,CONTENTS,genre,subject,SOURCE OF FILE,link,source_of_file,origin_of_copy,other_notes,tokens_N
0,"Augurello, Chrysopoeia",100001_Augurello1515_Chrysopoeia_GB_Noscemus,100001,True,True,713324.0,,True,,True,...,,,didactic poem,alchemy,,https://wiki.uibk.ac.at/noscemus/Chrysopoeia,GB,Noscemus,,23718
1,"Pseudo-Lull, Secretis",100002_Pseudo-Lull1518_De secretis_naturae_MDZ...,100002,True,False,,,True,,True,...,,,treatise,"alchemy, medicine",,https://www.digitale-sammlungen.de/en/view/bsb...,MDZ,MBS,,24673
2,"Pantheus, Ars Transmutatione",100003_Pantheus1518_Ars_Transmutationis_Metall...,100003,True,False,,,True,,True,...,,,treatise,alchemy,,https://www.google.co.uk/books/edition/Ars_Tra...,GB,BL,,8646
3,"Anon, Vera alchemiae",100004_Anon1561_Verae_Alchemiae_MDZ_MBS,100004,True,False,,,True,,True,...,,,"compendium, florilegium",alchemy,,https://mdz-nbn-resolving.de/details:bsb10141168,MDZ,MBS,,3521
4,"Pantheus, Voarchadumia",100005_Pantheus1530_Voarchadumia_ONB,100005,True,False,,,True,,True,...,,,treatise,alchemy,,https://data.onb.ac.at/rep/10588E49,ONB,ONB,,20386


In [41]:
# Lookup tables keyed by the "no." column rendered as a string.
emlap_numbers = emlap_metadata["no."].astype(str)
emlap_id_title_dict = dict(zip(emlap_numbers, emlap_metadata["working_title"]))
emlap_id_date_dict = dict(zip(emlap_numbers, emlap_metadata["date_publication"]))

In [42]:
def add_emlap_metadata(instances_dict_list):
    """Return copies of the instance dicts extended with work-level metadata.

    For each instance, the text after the first six characters of its
    ``grela_id`` is looked up in ``emlap_id_title_dict`` and
    ``emlap_id_date_dict``; the results are stored under ``"title"`` and
    ``"year"``.

    Args:
        instances_dict_list: iterable of dicts, each with a ``"grela_id"`` string.

    Returns:
        A new list of new dicts. The input dicts are left unmodified, so
        re-running the enrichment does not alter the source data.

    Raises:
        KeyError: if an instance has no ``"grela_id"`` or its work id is
            missing from either lookup table.
    """
    enriched = []
    for instance in instances_dict_list:
        # e.g. "emlap_100029" -> "100029"; named work_id to avoid shadowing id().
        work_id = instance["grela_id"][6:]
        enriched.append(
            {
                **instance,
                "title": emlap_id_title_dict[work_id],
                "year": emlap_id_date_dict[work_id],
            }
        )
    return enriched

In [4]:
# Reload the table from the Parquet file that a later cell of this notebook
# writes with lexeme_df.to_parquet(...); this replaces the in-memory lexeme_df.
lexeme_df = pd.read_parquet("../data/large_files/ruland-emlap-grela.parquet")

In [None]:
def enrich_instances_target(instances_dict_list):
    """Unfinished helper that walks over a list of instance dicts.

    NOTE(review): work in progress. As written, nothing is appended to the
    result list and the function implicitly returns ``None``.
    """
    instances_dict_list_new = []  # never appended to and never returned
    for instance_dict in instances_dict_list:
        # Drops the first and last 10 entries of the "kwic_tokens" sequence.
        # NOTE(review): presumably meant to strip a window=10 context on each
        # side of the target; that only holds when the full window is present
        # on both sides. TODO confirm.
        target_tokens_data = instance_dict["kwic_tokens"][10:-10]

        # Bare subscript: the value is discarded, and a KeyError is raised if
        # the key is absent. NOTE(review): probably an unfinished assignment.
        instance_dict["target_lemmata_list"]


In [None]:
# NOTE(review): plain alias of the column; the following cell rebinds
# instances_enriched to the result of add_emlap_metadata.
instances_enriched = lexeme_df["emlap_instances"]

In [None]:
# Apply add_emlap_metadata to each row's list of instances.
instances_enriched = lexeme_df["emlap_instances"].apply(add_emlap_metadata)

In [None]:
# Overwrite the column with the enriched instances.
lexeme_df["emlap_instances"] = instances_enriched

In [1]:
# Preview the first rows. NOTE(review): the saved output is a NameError, i.e.
# this cell was last run in a kernel where lexeme_df had not been loaded yet.
lexeme_df.head(5)

NameError: name 'lexeme_df' is not defined

In [45]:
# Save the table as JSON.
lexeme_df.to_json("../data/large_files/ruland-emlap-grela.json")

In [46]:
# Save the table as Parquet; this is the file that the reload cell reads back.
lexeme_df.to_parquet("../data/large_files/ruland-emlap-grela.parquet")


In [90]:
# Save the table as JSON under a second filename (ruland-emlap.json).
# NOTE(review): confirm whether both JSON exports are still needed.
lexeme_df.to_json("../data/large_files/ruland-emlap.json")