In [1]:
import pandas as pd
import json
import pickle
import duckdb
# Open the grela.duckdb database file in read-only mode; the query below runs on this connection.
# NOTE(review): absolute server path — this only works where /srv/data is available.
conn = duckdb.connect('/srv/data/greek/grela.duckdb', read_only=True)

In [3]:
# Fetch one row per token, joined to the columns of its work and its sentence.
# Filters applied in SQL:
#   - work provenience is 'christian' or 'pagan';
#   - token pos is 'n', 'v' or 'a' (presumably noun / verb / adjective —
#     confirm against the tagger's tag set);
#   - the work's not_before OR its not_after lies strictly between 0 and 600.
# NOTE(review): the three `*` selections return the join keys once per table;
# the displayed frame shows the repeats as grela_id_1, grela_id_2 and sentence_id_1.
# NOTE(review): there is no ORDER BY, so the row order of `df` should not be
# relied on by later steps.
query = '''
SELECT t.*, w.*, s.*
FROM tokens t
JOIN works w ON t.grela_id = w.grela_id
JOIN sentences s ON t.sentence_id = s.sentence_id
WHERE w.lagt_provenience IN ('christian', 'pagan')
  AND t.pos IN ('n', 'v', 'a')
  AND (
    (w.not_before > 0 AND w.not_before < 600)
    OR (w.not_after > 0 AND w.not_after < 600)
  )
'''
df = conn.execute(query).fetchdf()
df

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,sentence_id,grela_id,token_text,lemma,pos,char_start,char_end,token_id,grela_source,grela_id_1,...,place_publication,place_geonames,author_viaf,title_viaf,date_random,token_count,sentence_id_1,grela_id_2,position,text
0,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,πάντας,πᾶς,a,107,113,143512724,lagt,lagt_tlg2023.tlg006,...,,,,,263.0,45847,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,1796,"ἄγνωστοι γάρ εἰσι παντελῶς καί ἀπόρρητοι, μόνῳ..."
1,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,ἁπλουστέρους,ἁπλόος,a,124,136,143512727,lagt,lagt_tlg2023.tlg006,...,,,,,263.0,45847,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,1796,"ἄγνωστοι γάρ εἰσι παντελῶς καί ἀπόρρητοι, μόνῳ..."
2,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,νέους,νέος,a,141,146,143512729,lagt,lagt_tlg2023.tlg006,...,,,,,263.0,45847,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,1796,"ἄγνωστοι γάρ εἰσι παντελῶς καί ἀπόρρητοι, μόνῳ..."
3,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,ἐπιτηδειοτέρους,ἐπιτήδειος,a,147,162,143512730,lagt,lagt_tlg2023.tlg006,...,,,,,263.0,45847,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,1796,"ἄγνωστοι γάρ εἰσι παντελῶς καί ἀπόρρητοι, μόνῳ..."
4,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,δηλοῖ,δηλόω,v,164,169,143512732,lagt,lagt_tlg2023.tlg006,...,,,,,263.0,45847,lagt_tlg2023.tlg006_1796,lagt_tlg2023.tlg006,1796,"ἄγνωστοι γάρ εἰσι παντελῶς καί ἀπόρρητοι, μόνῳ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10043230,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,ἔργα,ἔργον,n,150,154,165283830,lagt,lagt_tlg2042.tlg028,...,,,,,134.0,5176,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,2,( ἀκούσαντες γὰρ ) γεγραμμένου ἐν τῷ νόμῳ περὶ...
10043231,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,ποιοῦντες,ποιέω,v,161,170,165283832,lagt,lagt_tlg2042.tlg028,...,,,,,134.0,5176,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,2,( ἀκούσαντες γὰρ ) γεγραμμένου ἐν τῷ νόμῳ περὶ...
10043232,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,θεαθῆναι,θεάομαι,v,181,189,165283836,lagt,lagt_tlg2042.tlg028,...,,,,,134.0,5176,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,2,( ἀκούσαντες γὰρ ) γεγραμμένου ἐν τῷ νόμῳ περὶ...
10043233,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,ἀνθρώποις,ἄνθρωπος,n,195,204,165283838,lagt,lagt_tlg2042.tlg028,...,,,,,134.0,5176,lagt_tlg2042.tlg028_2,lagt_tlg2042.tlg028,2,( ἀκούσαντες γὰρ ) γεγραμμένου ἐν τῷ νόμῳ περὶ...


In [4]:

# Collapse the token-level frame into one row per sentence.
#
# The SQL result comes from a join without ORDER BY, so the order of the tokens
# inside a sentence is not guaranteed, yet every list/join aggregation below
# depends on it. Sort explicitly first; groupby keeps the row order within each
# group. Only the aggregated columns are selected, so the sorted copy is much
# smaller than the full joined frame.
# NOTE(review): assumes char_start is the token's character offset inside the
# sentence text, i.e. ascending char_start is reading order — confirm.
sentence_input_columns = [
    'sentence_id', 'lemma', 'token_id', 'title', 'author', 'lagt_provenience',
    'not_before', 'not_after', 'text', 'token_text', 'pos', 'char_start', 'char_end',
]
tokens_in_order = df[sentence_input_columns].sort_values(
    'char_start', kind='stable'  # stable: tokens with equal offsets keep query order
)

grouped = (
    tokens_in_order.groupby('sentence_id').agg(
        # NOTE: 'lamma_sentence' (sic) is kept as-is because the column name is
        # written to the pickle files and may be read by other code.
        lamma_sentence=pd.NamedAgg(column='lemma', aggfunc=lambda x: ' '.join(x)),
        token_ids=pd.NamedAgg(column='token_id', aggfunc=list),
        # Work-level values are taken from the first token row of the sentence.
        title=pd.NamedAgg(column='title', aggfunc='first'),
        author=pd.NamedAgg(column='author', aggfunc='first'),
        lagt_provenience=pd.NamedAgg(column='lagt_provenience', aggfunc='first'),
        not_before=pd.NamedAgg(column='not_before', aggfunc='first'),
        not_after=pd.NamedAgg(column='not_after', aggfunc='first'),
        sentence_text=pd.NamedAgg(column='text', aggfunc='first'),
        # Parallel per-token lists; position i in each list describes the same token.
        lemma_list=pd.NamedAgg(column='lemma', aggfunc=list),
        token_list=pd.NamedAgg(column='token_text', aggfunc=list),
        pos_list=pd.NamedAgg(column='pos', aggfunc=list),
        char_start_list=pd.NamedAgg(column='char_start', aggfunc=list),
        char_end_list=pd.NamedAgg(column='char_end', aggfunc=list),
    ).reset_index()
)

In [5]:
# Pack each sentence's five parallel per-token lists into one JSON array of
# {lemma, token, pos, char_start, char_end} objects.
# The list columns are zipped directly instead of walking the frame with
# DataFrame.iterrows(), which builds a full Series for every row and is slow
# on large frames. The resulting JSON text is the same.
token_list_columns = ['lemma_list', 'token_list', 'pos_list', 'char_start_list', 'char_end_list']
token_record_keys = ('lemma', 'token', 'pos', 'char_start', 'char_end')  # same order as the columns above

grouped['tokens'] = [
    json.dumps(
        [dict(zip(token_record_keys, token_values)) for token_values in zip(*sentence_lists)],
        ensure_ascii=False,  # write non-ASCII characters as-is instead of \uXXXX escapes
    )
    for sentence_lists in zip(*(grouped[column] for column in token_list_columns))
]

# The helper list columns are now redundant with `tokens`.
# NOTE: this rebinds `grouped` without those columns, so this cell cannot be
# re-run on its own; re-run the aggregation cell first.
grouped = grouped.drop(columns=token_list_columns)
grouped.head()

Unnamed: 0,sentence_id,lamma_sentence,token_ids,title,author,lagt_provenience,not_before,not_after,sentence_text,tokens
0,lagt_ogl0001.ogl001_4,πινυτός ἀντιγράφω,"[157411875, 157411876]",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,"ὁ Πινυτός ἀντιγράφων,","[{""lemma"": ""πινυτός"", ""token"": ""Πινυτός"", ""pos..."
1,lagt_ogl0001.ogl001_5,θαυμάζω ἀποδέχω διονύσιος,"[157411878, 157411881, 157411883]",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,θαυμάζει μέν καί ἀποδέχεται τόν Διονύσιον·,"[{""lemma"": ""θαυμάζω"", ""token"": ""θαυμάζει"", ""po..."
2,lagt_ogl0001.ogl001_6,ἀντιπαρακαλέω στεῤῥοτέρας μεταδίδωμι τροφή,"[157411885, 157411887, 157411890, 157411891]",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,ἀντιπαρακαλεῖ δέ στεῤῥοτέρας ἤδη ποτέ μεταδιδό...,"[{""lemma"": ""ἀντιπαρακαλέω"", ""token"": ""ἀντιπαρα..."
3,lagt_ogl0001.ogl001_7,τελειοτέρω γράμμα λαός ὑποθρέψαντα διατέλος γα...,"[157411893, 157411894, 157411899, 157411900, 1...",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,τελειοτέροις γράμμασιν εἰσαῦθις τόν παῤ αὐτῷ λ...,"[{""lemma"": ""τελειοτέρω"", ""token"": ""τελειοτέροι..."
4,lagt_ogl0001.ogl001_8,ἐπιστολή πινυτός πίστις ὀρθοδοξία φροντίς ὑπήκ...,"[157411918, 157411922, 157411925, 157411926, 1...",De Epistola Pinyti ad Dionysium,Pinytus,christian,101.0,200.0,δι᾿ ἧς ἐπιστολῆς καί ἡ τοῦ Πινυτοῦ περί τήν πί...,"[{""lemma"": ""ἐπιστολή"", ""token"": ""ἐπιστολῆς"", ""..."


In [6]:
def make_subcorpora(not_before, not_after, lagt_provenience):
    """Return the subcorpus label for a dated work, or None if it fits no bucket.

    Parameters
    ----------
    not_before, not_after : number
        Lower and upper bound of the dating range.
    lagt_provenience : str
        Only 'christian' and 'pagan' produce a label.

    Returns
    -------
    str or None
        '<provenience>_0_300' when the range satisfies
        ``not_before >= 0 and not_after <= 300``; otherwise
        '<provenience>_300_600' when it satisfies
        ``not_before >= 300 and not_after <= 600``; otherwise None.
        A range that crosses 300 (e.g. 250-350) therefore gets None, as does
        any other provenience value.
    """
    # Choose the pair of labels for this provenience up front.
    if lagt_provenience == 'christian':
        early_label, late_label = "christian_0_300", "christian_300_600"
    elif lagt_provenience == 'pagan':
        early_label, late_label = "pagan_0_300", "pagan_300_600"
    else:
        return None

    # The early bucket is tested first, so a range of exactly 300-300 lands in it.
    if not_before >= 0 and not_after <= 300:
        return early_label
    if not_before >= 300 and not_after <= 600:
        return late_label
    return None

# Label every sentence with its subcorpus (None when it fits no bucket).
# The three input columns are iterated in parallel instead of using
# DataFrame.apply(axis=1), which builds a Series for every row; this form also
# produces a plain empty column when `grouped` has no rows.
grouped["enemy_subcorpus"] = [
    make_subcorpora(not_before, not_after, provenience)
    for not_before, not_after, provenience in zip(
        grouped["not_before"], grouped["not_after"], grouped["lagt_provenience"]
    )
]

In [9]:
# Write the sentence-level frame to both destinations, in this order.
for pickle_path in (
    '../data/large-data/grouped_df.pkl',                 # your personal location
    '/srv/data/enemy-christ/large-data/grouped_df.pkl',  # our shared data directory
):
    grouped.to_pickle(pickle_path)