In [35]:
import pandas as pd
import duckdb
conn = duckdb.connect('/srv/data/greek/grela.duckdb', read_only=True)

In [26]:
query = '''
SELECT *
FROM sentences
WHERE grela_id LIKE 'lagt_%'
ORDER BY RANDOM()
LIMIT 10000
'''
df = conn.execute(query).fetchdf()
df

Unnamed: 0,sentence_id,grela_id,position,text
0,lagt_tlg2042.tlg058_1505,lagt_tlg2042.tlg058,1505,"εἰς ἀγγεῖα ὀστράκινα, ἔργα χειρῶν κεραμέως"
1,lagt_tlg2036.tlg001_3583,lagt_tlg2036.tlg001,3583,Πλωτῖνος ἀπορεῖ] . .
2,lagt_tlg0018.tlg024_11865,lagt_tlg0018.tlg024,11865,κἀν ( . .): καί .
3,lagt_tlg2021.tlg002_5869,lagt_tlg2021.tlg002,5869,·
4,lagt_tlg0627.tlg036_3119,lagt_tlg0627.tlg036,3119,"ὀριγάνου φύλλα τρίψας ὡς λειότατα, ἢν μὲν ἔχῃ,..."
...,...,...,...,...
9995,lagt_tlg0087.tlg001_26008,lagt_tlg0087.tlg001,26008,"τοῖς δέ ἐκτείνουσι τό ῑ, καί μάλιστα ἐπί δισυλ..."
9996,lagt_tlg2042.tlg021_5511,lagt_tlg2042.tlg021,5511,", . ."
9997,lagt_tlg0007.tlg084a_582,lagt_tlg0007.tlg084a,582,πλήρης] .
9998,lagt_tlg2021.tlg001_693,lagt_tlg2021.tlg001,693,πόσῳ μᾶλλον ὁ θεός καί πατήρ οὐκ ἄν βουληθείη ...


In [27]:
category_names = [
    "violence, troublemaking",
    "moral depravity",
    "idolatry, heresy, magic",
    "falseness, hypocrisy, inflated self-esteem",
    "evil/dubious agents",
]

In [28]:
import torch
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from transformers.pipelines import pipeline

torch.set_num_threads(8)

embedding_model = pipeline(
    "feature-extraction",
    model="FacebookAI/xlm-roberta-base",
    device=-1  # Use CPU
)

topic_model = BERTopic(verbose=True, 
                        embedding_model=embedding_model,
                        min_topic_size=15,
                        zeroshot_topic_list=category_names,
                        zeroshot_min_similarity=.85,
                        representation_model=KeyBERTInspired())
topics, probs = topic_model.fit_transform(df['text'], )

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu
2025-07-22 15:24:26,739 - BERTopic - Embedding - Transforming documents to embeddings.
100%|██████████| 10000/10000 [03:00<00:00, 55.41it/s]
2025-07-22 15:27:27,225 - BERTopic - Embedding - Completed ✓
2025-07-22 15:27:27,227 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-22 15:27:38,561 - BERTopic - Dimensionality - Completed ✓
2025-07-22 15:27:38,563 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2025-07-22 15:27:40,347 - BERTopic - Zeroshot Step 1 - Completed ✓
2025-07-22 15:27:40,354 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-22 15:28:05,919 - BERTopic - Representation - Completed ✓


In [43]:
print(f'{topic_model.get_topic_info()['Count'][0]/len(df)}% are unlabeled outliers.')
topic_model.get_topic_info()

0.7599% are unlabeled outliers.


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,7599,0_ἐκ_ἀλλά_περὶ_διὰ,"[ἐκ, ἀλλά, περὶ, διὰ, κατὰ, πρὸς, οὕτως, ἀλλὰ,...",[ὥσπερ γὰρ ἐπὶ πάντων τῶν γινομένων ἐστί τι τὸ...
1,1,1317,1_ἐναντιοῦνται_οὕτως_ἀποδεικτόν_ἀκούσιος,"[ἐναντιοῦνται, οὕτως, ἀποδεικτόν, ἀκούσιος, ἐλ...","[καί ., καί, καί .]"
2,2,474,2_ἐξἀνάγκης_ὠριγένουσ_θῆλυς_ἐτελειώθη,"[ἐξἀνάγκης, ὠριγένουσ, θῆλυς, ἐτελειώθη, πάντα...","[τί δέ; E, ι καί ] τό ., τό δέ . . . . ; .]"
3,3,357,3_οὑν_οὕτως_ἀρχίλοχος_οὐδαμῶς,"[οὑν, οὕτως, ἀρχίλοχος, οὐδαμῶς, ἑκάστην, στεφ...","[τό] εἰς τό ., τό Β τῷ Γ, τό δ — η β — θ .]"
4,4,253,4_ἀποφατικά_διώκει_ἀληθές_διπλότερον,"[ἀποφατικά, διώκει, ἀληθές, διπλότερον, ἔχον, ...","[Ε ἐστι] ., παρόν ἐστι,, Κολαστικόν καί τιμωρη..."


In [None]:
topic_model.save("../data/large-data/topic_model")



In [None]:
topic_model.load("../data/large-data/topic_model")

In [None]:
labeled_text_df = topic_model.get_document_info(df['text'])
labeled_text_df.to_pickle("../data/large-data/labeled_text_df.pkl")

In [41]:
labeled_text_df = pd.read_pickle("../data/large-data/labeled_text_df.pkl")

In [42]:
topic_model.visualize_topics()

Zjeveni = lagt_tlg0031.tlg027
Matous = lagt_tlg0031.tlg001

In [None]:
query = '''
SELECT *
FROM works
WHERE grela_id LIKE 'lagt_tlg0031.tlg001', 'lagt_tlg0031.tlg027'
'''
textus = conn.execute(query).fetchdf()
textus

Unnamed: 0,grela_source,grela_id,author,title,not_before,not_after,lagt_tlg_epithet,lagt_genre,lagt_provenience,noscemus_place,noscemus_genre,noscemus_discipline,title_short,emlap_noscemus_id,place_publication,place_geonames,author_viaf,title_viaf,date_random,token_count
0,lagt,lagt_ggm0001.ggm001,Anonymous,Anametresis Pontou,1.0,400.0,,,,,,,,,,,,,372.0,0
1,lagt,lagt_ogl0001.ogl001,Pinytus,De Epistola Pinyti ad Dionysium,101.0,200.0,[],[],christian,,,,,,,,,,135.0,109
2,lagt,lagt_pta0001.pta001,Severian of Gabala,De fide et lege naturae,400.0,409.0,,,,,,,,,,,,,403.0,0
3,lagt,lagt_pta0001.pta002,Severian of Gabala,De paenitentia et compunctione,400.0,409.0,,,,,,,,,,,,,409.0,0
4,lagt,lagt_pta0001.pta003,Severian of Gabala,In ascensionem domini nostri Iesu Christi et i...,400.0,409.0,,,,,,,,,,,,,402.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170,lagt,lagt_tlg9006.tlg011,Gregorius II,Orationes XIII-XXX,1201.0,1300.0,,,,,,,,,,,,,1268.0,0
2171,lagt,lagt_tlg9006.tlg016,"Gregory II, of Cyprus, Patriarch of Constantin...",Declamatio de Atheniensium defensio (ad Libani...,1241.0,1290.0,,,,,,,,,,,,,1250.0,0
2172,lagt,lagt_tlg9007.tlg001,Appendix Proverbiorum,Appendix Proverbiorum,1201.0,1300.0,,,,,,,,,,,,,1273.0,0
2173,lagt,lagt_tlg9010.tlg001,Suda,Suidae lexicon,969.0,976.0,,,,,,,,,,,,,972.0,0
