In [21]:
import pandas as pd
import json
import pickle
import duckdb
import unicodedata
# Open the grela_v0-3 DuckDB file without write access; every query below
# goes through this connection.
conn = duckdb.connect(
    '/srv/data/grela/grela_v0-3.duckdb',
    read_only=True,
)

In [None]:
# Fetch one row per token, joined to its work (via grela_id) and its sentence
# (via sentence_id).
# Filters applied in SQL:
#   * lagt_provenience must be 'christian' or 'pagan';
#   * not_before OR not_after must lie strictly between 0 and 600, so a work
#     with only one bound inside that window is still selected;
#   * the part-of-speech restriction is disabled (it is an SQL comment inside
#     the string).
# NOTE(review): `t.*, w.*, s.*` returns the join keys from more than one
# table -- confirm how the duplicated column names reach the DataFrame.
query = '''
SELECT t.*, w.*, s.*
FROM tokens t
JOIN works w ON t.grela_id = w.grela_id
JOIN sentences s ON t.sentence_id = s.sentence_id
WHERE w.lagt_provenience IN ('christian', 'pagan')
  -- AND t.pos IN ('NOUN', 'VERB', 'ADJ', 'PROPN')
  AND (
    (w.not_before > 0 AND w.not_before < 600)
    OR (w.not_after > 0 AND w.not_after < 600)
  )
'''
# Materialise the full result set as a pandas DataFrame.
df = conn.execute(query).fetchdf()
# Last expression of the cell: shows the frame in the notebook output.
df

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [23]:

# Collapse the token-level frame to one row per sentence_id.
# Each entry is output_column -> (source_column, aggregation):
#   'first' keeps the value from the first token row of the sentence
#           (presumably constant within a sentence -- TODO confirm);
#   list    collects the per-token values into parallel lists.
# A joined-lemma column (`lamma_sentence`) is intentionally not produced here.
_sentence_agg = {
    'token_ids': ('token_id', list),
    'title': ('title', 'first'),
    'author': ('author', 'first'),
    'lagt_provenience': ('lagt_provenience', 'first'),
    'not_before': ('not_before', 'first'),
    'not_after': ('not_after', 'first'),
    'sentence_text': ('text', 'first'),
    'lemma_list': ('lemma', list),
    'token_list': ('token_text', list),
    'pos_list': ('pos', list),
    'ref_list': ('ref', list),
    'char_start_list': ('char_start', list),
    'char_end_list': ('char_end', list),
}

grouped = df.groupby('sentence_id').agg(**_sentence_agg).reset_index()

In [24]:
# Number of sentences (rows) after grouping.
grouped.shape[0]

1169805

In [25]:
# Fold the six parallel per-token lists of every sentence into a single JSON
# array of objects (one object per token), then drop the list columns.
_token_list_columns = ['lemma_list', 'token_list', 'pos_list', 'ref_list', 'char_start_list', 'char_end_list']
_token_record_keys = ('lemma', 'token', 'pos', 'ref', 'char_start', 'char_end')


def _tokens_as_json(*parallel_lists):
    """Zip the parallel lists of one sentence into per-token dicts and dump them as JSON."""
    records = [
        dict(zip(_token_record_keys, values))
        for values in zip(*parallel_lists)
    ]
    # ensure_ascii=False keeps the Greek text readable instead of \u escapes.
    return json.dumps(records, ensure_ascii=False)


grouped['tokens'] = [
    _tokens_as_json(*sentence_lists)
    for sentence_lists in zip(*(grouped[col] for col in _token_list_columns))
]

grouped = grouped.drop(columns=_token_list_columns)
grouped.head()

Unnamed: 0,sentence_id,lamma_sentence,token_ids,title,author,lagt_provenience,not_before,not_after,sentence_text,tokens
0,lagt_ogl0001.ogl001_0,πινυτός ἀντίγραφον θαυμάζω ἀποδέχομαι διονύσιος,"[365056825, 365056826, 365056828, 365056831, 3...",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,"πρὸς ἣν, ὁ Πινυτὸς ἀντιγράφων, θαυμάζει μὲν κ...","[{""lemma"": ""πινυτός"", ""token"": ""Πινυτὸς"", ""pos..."
1,lagt_ogl0001.ogl001_1,ἀντιπαρακαλέω σκληρός μεταδίδωμι τροφή τέλειος...,"[365056835, 365056837, 365056840, 365056841, 3...",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,ἀντιπαρακαλεῖ δὲ στεῤῥοτέρας ἤδη ποτὲ μεταδιδό...,"[{""lemma"": ""ἀντιπαρακαλέω"", ""token"": ""ἀντιπαρα..."
2,lagt_ogl0001.ogl001_2,ἐπιστολή πινυτός πίστις εὐσέβεια φροντίς ὑπήκο...,"[365056868, 365056872, 365056875, 365056876, 3...",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,δι᾿ ἧς ἐπιστολῆς καὶ ἡ τοῦ Πινυτοῦ περὶ τὴν πί...,"[{""lemma"": ""ἐπιστολή"", ""token"": ""ἐπιστολῆς"", ""..."
3,lagt_tlg0004.tlg001_0,φιλοσοφία ἔργον ἔνιοι φημί βάρβαρος ἄρχω,"[345704070, 345704071, 345704072, 345704073, 3...",Βίοι καὶ γνῶμαι τῶν ἐν φιλοσοφίᾳ εὐδοκιμησάντων,Diogenes Laertius,pagan,222.0,235.0,τὸ τῆς φιλοσοφίας ἔργον ἔνιοί φασιν ἀπὸ βαρβάρ...,"[{""lemma"": ""φιλοσοφία"", ""token"": ""φιλοσοφίας"",..."
4,lagt_tlg0004.tlg001_1,γίγνομαι Πέρσης Μάγος Βαβυλώνιος Ἀσσύριος Χαλδ...,"[345704078, 345704082, 345704083, 345704087, 3...",Βίοι καὶ γνῶμαι τῶν ἐν φιλοσοφίᾳ εὐδοκιμησάντων,Diogenes Laertius,pagan,222.0,235.0,"γεγενῆσθαι γὰρ παρὰ μὲν Πέρσαις Μάγους, παρὰ δ...","[{""lemma"": ""γίγνομαι"", ""token"": ""γεγενῆσθαι"", ..."


In [26]:
# Make the tokens machine-readable again: parse each JSON string back into a
# list of dicts. json.loads is the inverse of the json.dumps call that produced
# these strings; eval() would choke on JSON literals such as null/true/false/NaN
# and would execute whatever expression the string happens to contain.
# Rows that are already parsed (cell re-run) are passed through unchanged.
grouped["tokens"] = grouped["tokens"].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)


# Text normalisation for the token and lemma strings held in the tokens column.
# Combining marks listed in KEEP_COMBINING survive diacritic stripping;
# add "\u0308" to keep the diaeresis.
KEEP_COMBINING = set()  # e.g., set(["\u0308"]) to keep diaeresis

# Apostrophe-like code points that are all folded into one mark (’).
_APOSTROPHE_FOLD = str.maketrans({
    "\u1FBD": "’",  # Greek Koronis
    "\u02BC": "’",  # Modifier Letter Apostrophe
    "\u2019": "’",  # Right single quotation mark
    "\u00B4": "’",  # Spacing acute accent (rarely used)
    "'": "’",       # ASCII apostrophe
})


def normalize_greek(text: str, *, strip_diacritics: bool = True) -> str:
    """Return `text` in a canonical form for comparison.

    Steps: NFD-decompose, fold apostrophe variants into a single mark,
    optionally remove combining marks (Unicode category "Mn") that are not in
    KEEP_COMBINING, then NFC-recompose.

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Decompose first so accents/breathings become separate combining marks.
    decomposed = unicodedata.normalize("NFD", text)
    folded = decomposed.translate(_APOSTROPHE_FOLD)

    if strip_diacritics:
        surviving = [
            ch for ch in folded
            if unicodedata.category(ch) != "Mn" or ch in KEEP_COMBINING
        ]
        folded = "".join(surviving)

    return unicodedata.normalize("NFC", folded)

def norm_tokens_lemmata(tokens, *, strip_diacritics: bool = True):
    """Return normalised copies of the token dicts in `tokens`.

    For every dict entry, the string values under "lemma" and "token" are
    passed through normalize_greek; all other keys are copied as they are.
    Entries that are not dicts are dropped, and a falsy `tokens` (None, empty
    list) yields an empty list. The input dicts are never modified.
    """
    normalized = []
    for entry in (tokens or []):
        if isinstance(entry, dict):
            fresh = dict(entry)  # shallow copy so the caller's dict stays untouched
            for field in ("lemma", "token"):
                value = fresh.get(field)
                if isinstance(value, str):
                    fresh[field] = normalize_greek(value, strip_diacritics=strip_diacritics)
            normalized.append(fresh)
    return normalized


# Normalise every sentence's token dicts. Diacritics are kept
# (strip_diacritics=False), so only apostrophe folding and NFC recomposition
# take effect.
grouped["tokens"] = grouped["tokens"].map(
    lambda sentence_tokens: norm_tokens_lemmata(sentence_tokens, strip_diacritics=False)
)


In [27]:
def make_subcorpora(not_before, not_after, lagt_provenience):
    """Label a work with its provenience/period subcorpus.

    Returns "<provenience>_0_300" when the dating lies within [0, 300],
    "<provenience>_300_600" when it lies within [300, 600], for provenience
    'christian' or 'pagan'. Returns None for any other provenience and for
    datings that fit neither window (e.g. a span crossing the year 300).
    The earlier window is checked first, so a 300-300 dating is "0_300".
    """
    is_christian = lagt_provenience == 'christian'
    if not (is_christian or lagt_provenience == 'pagan'):
        return None

    if not_before >= 0 and not_after <= 300:
        return "christian_0_300" if is_christian else "pagan_0_300"
    if not_before >= 300 and not_after <= 600:
        return "christian_300_600" if is_christian else "pagan_300_600"
    return None

# Assign each sentence its subcorpus label (or None) from the work's dating
# and provenience.
grouped["enemy_subcorpus"] = [
    make_subcorpora(start, end, provenience)
    for start, end, provenience in zip(
        grouped["not_before"], grouped["not_after"], grouped["lagt_provenience"]
    )
]

In [28]:
# Persist the prepared frame twice: once in your personal location and once
# in our shared data directory.
for pickle_path in (
    '../data/large-data/grouped_df.pkl',                  # personal location
    '/srv/data/enemy-christ/large-data/grouped_df.pkl',   # shared data directory
):
    grouped.to_pickle(pickle_path)

In [29]:
### later tests...
# Reload the prepared frame from the shared data directory.
# Pickle files can run code on load, so only read files you trust.
grouped = pd.read_pickle(
    '/srv/data/enemy-christ/large-data/grouped_df.pkl'
)

In [30]:
# Python
def detect_target(tokens, target):
    """Return True if any token dict in `tokens` has `target` as its lemma.

    Strings are compared in Unicode NFC form on both sides, so a target typed
    with a different but canonically equivalent encoding (e.g. oxia vs tonos
    accents, or decomposed combining marks) still matches.

    `tokens` may be None or empty (returns False); entries that are not dicts
    are ignored. Non-string lemmas/targets are compared as they are.
    """
    def _nfc(value):
        # Only strings can be normalised; leave None/other values untouched.
        return unicodedata.normalize("NFC", value) if isinstance(value, str) else value

    wanted = _nfc(target)
    return any(
        isinstance(entry, dict) and _nfc(entry.get("lemma")) == wanted
        for entry in (tokens or [])
    )

# Lemma whose sentences we want to collect.
target = 'ἐχθρός'

# Boolean mask: True for sentences with at least one token carrying the target lemma.
_has_target = grouped["tokens"].map(lambda sentence_tokens: detect_target(sentence_tokens, target))
enemy_sentences_all = grouped.loc[_has_target]

In [31]:
# Number of sentences that contain the target lemma.
enemy_sentences_all.shape[0]

3204

In [32]:
# Display the filtered sentences (rows of `grouped` that matched the target lemma).
enemy_sentences_all

Unnamed: 0,sentence_id,lamma_sentence,token_ids,title,author,lagt_provenience,not_before,not_after,sentence_text,tokens,enemy_subcorpus
928,lagt_tlg0004.tlg001_1839,Μίθρας διοικητής Λυσίμαχος παρίστημι λέγω ἔοικ...,"[345728014, 345728017, 345728019, 345728020, 3...",Βίοι καὶ γνῶμαι τῶν ἐν φιλοσοφίᾳ εὐδοκιμησάντων,Diogenes Laertius,pagan,222.0,235.0,Μίθρου δὲ τοῦ διοικητοῦ τοῦ Λυσιμάχου παρεστῶτ...,"[{'lemma': 'Μίθρας', 'token': 'Μίθρου', 'pos':...",pagan_0_300
1906,lagt_tlg0004.tlg001_274,ἀτυχία ῥᾴδιος φέρω ἐχθρός κακός πράσσω βλέπω,"[345707739, 345707740, 345707741, 345707745, 3...",Βίοι καὶ γνῶμαι τῶν ἐν φιλοσοφίᾳ εὐδοκιμησάντων,Diogenes Laertius,pagan,222.0,235.0,"πῶς ἄν τις ἀτυχίαν ῥᾷστα φέροι, εἰ τοὺς ἐχθροὺ...","[{'lemma': 'ἀτυχία', 'token': 'ἀτυχίαν', 'pos'...",pagan_0_300
3307,lagt_tlg0004.tlg001_401,οἶδα πάσχω ἐχθρός,"[345709509, 345709515, 345709518]",Βίοι καὶ γνῶμαι τῶν ἐν φιλοσοφίᾳ εὐδοκιμησάντων,Diogenes Laertius,pagan,222.0,235.0,ἴσθι γὰρ μη δ ἄλλον τινὰ πεπονθέναι τῶν ἐμοὶ ἐ...,"[{'lemma': 'οἶδα', 'token': 'ἴσθι', 'pos': 'VE...",pagan_0_300
3763,lagt_tlg0004.tlg001_4421,εἰμί ἐχθρός Ἱερώνυμος περιπατητικός μόνος ἀπαν...,"[345758466, 345758467, 345758468, 345758470, 3...",Βίοι καὶ γνῶμαι τῶν ἐν φιλοσοφίᾳ εὐδοκιμησάντων,Diogenes Laertius,pagan,222.0,235.0,"οὕτω δ ἦν ἐχθρὸς Ἱερωνύμῳ τῷ περιπατητικῷ, ὡς ...","[{'lemma': 'εἰμί', 'token': 'ἦν', 'pos': 'VERB...",pagan_0_300
4110,lagt_tlg0004.tlg001_4735,ἐπαινέω τρυφή ἐχθρός παῖς φημί τρυφάω,"[345762062, 345762063, 345762065, 345762066, 3...",Βίοι καὶ γνῶμαι τῶν ἐν φιλοσοφίᾳ εὐδοκιμησάντων,Diogenes Laertius,pagan,222.0,235.0,"πρὸς τὸν ἐπαινοῦντα τρυφήν, ἐχθρῶν παῖδες, ἔφη...","[{'lemma': 'ἐπαινέω', 'token': 'ἐπαινοῦντα', '...",pagan_0_300
...,...,...,...,...,...,...,...,...,...,...,...
1168598,lagt_tlg4102.tlgX03_238,φημί ἐχθρός νικάω συνδοξασθήομαι συμβασιλεύω,"[382464978, 382464983, 382464984, 382464985, 3...",Commentarius In Apocalypsin,Catenae (Novum Testamentum),christian,501.0,600.0,φησὶ τοίνυν ὅτι οἱ τὸν ἐχθρὸν νικήσαντες συνδο...,"[{'lemma': 'φημί', 'token': 'φησὶ', 'pos': 'VE...",christian_300_600
1168657,lagt_tlg4102.tlgX03_291,δηλόω νοέω κδ πρέσβυς κοινωνός εἰμί ὑμνῳδία δύ...,"[382465887, 382465890, 382465892, 382465893, 3...",Commentarius In Apocalypsin,Catenae (Novum Testamentum),christian,501.0,600.0,Διὰ τούτου δηλοῦται καὶ τοὺς νοηθέντας ἡμῖν κδ...,"[{'lemma': 'δηλόω', 'token': 'δηλοῦται', 'pos'...",christian_300_600
1169097,lagt_tlg4102.tlgX03_688,πολύς γῆ ἐχθρός νικάω νικάω πλέος ἀγωνίζομαι θ...,"[382474073, 382474078, 382474080, 382474081, 3...",Commentarius In Apocalypsin,Catenae (Novum Testamentum),christian,501.0,600.0,πολλοὶ γὰρ τῶν ἐν τῇ γῇ τὸν ἐχθρὸν νικῶσι καὶ ...,"[{'lemma': 'πολύς', 'token': 'πολλοὶ', 'pos': ...",christian_300_600
1169439,lagt_tlg4102.tlgX03_996,καθηγεμὼν πᾶς παρανομία γίγνομαι ὑπήκοος πόλις...,"[382480355, 382480356, 382480357, 382480359, 3...",Commentarius In Apocalypsin,Catenae (Novum Testamentum),christian,501.0,600.0,ἢ πάντως καθηγεμων πάσης παρανομίας τούτοις γι...,"[{'lemma': 'καθηγεμὼν', 'token': 'καθηγεμων', ...",christian_300_600
