In [1]:
import pandas as pd
import pickle

In [2]:
# Read the pickled corpus object (indexed like a DataFrame further down) from disk.
# NOTE: pickle.load can execute arbitrary code from the file; only use it on trusted data.
corpus_pickle = open("/srv/data/enemy-christ/large-data/grouped_df.pkl", "rb")
with corpus_pickle:
    grouped = pickle.load(corpus_pickle)

In [3]:
# One frame per value of the "enemy_subcorpus" column.
subcorpus_label = grouped["enemy_subcorpus"]
christian_0_300 = grouped[subcorpus_label == "christian_0_300"]
christian_300_600 = grouped[subcorpus_label == "christian_300_600"]
pagan_0_300 = grouped[subcorpus_label == "pagan_0_300"]
pagan_300_600 = grouped[subcorpus_label == "pagan_300_600"]

In [4]:
from nltk.probability import FreqDist

def _lemma_counts(frame):
    """Count whitespace-separated tokens over the 'lamma_sentence' column of `frame`."""
    return FreqDist(word for sent in frame['lamma_sentence'] for word in sent.split())


# One frequency distribution per subcorpus.
fdist1 = _lemma_counts(christian_0_300)
fdist2 = _lemma_counts(christian_300_600)
fdist3 = _lemma_counts(pagan_0_300)
fdist4 = _lemma_counts(pagan_300_600)

# Words that occur at least 10 times in every one of the four subcorpora.
common_words = set.intersection(*(
    {word for word, count in fdist.items() if count >= 10}
    for fdist in (fdist1, fdist2, fdist3, fdist4)
))

# Per-subcorpus counts, restricted to that shared vocabulary.
words1, words2, words3, words4 = (
    {word: fdist[word] for word in common_words}
    for fdist in (fdist1, fdist2, fdist3, fdist4)
)

In [5]:
from gensim.models import FastText
from gensim.models.callbacks import CallbackAny2Vec
from gensim.utils import RULE_KEEP, RULE_DISCARD, RULE_DEFAULT

class EpochLogger(CallbackAny2Vec):
    """Training callback that prints one line when an epoch starts and one when it ends."""
    def __init__(self):
        # Zero-based number printed for the epoch; incremented in on_epoch_end.
        self.epoch = 0
    def on_epoch_begin(self, model):
        """Print the start marker for the current epoch; `model` is not used."""
        print(f"Epoch #{self.epoch} start")
    def on_epoch_end(self, model):
        """Print the end marker for the current epoch, then advance the counter."""
        print(f"Epoch #{self.epoch} end")
        self.epoch += 1

def train_fasttext_on_subcorpus(sentences, model_path, word_freq=None):
    """Train a skip-gram FastText model on one subcorpus and save it to disk.

    Parameters
    ----------
    sentences : iterable of str, iterable of list of str, or mapping
        The training corpus, one entry per sentence: either a
        whitespace-separated string or an already tokenised list of words.
        For backward compatibility a ``{word: count}`` mapping is still
        accepted and processed exactly as before, but see Notes.
    model_path : str
        Destination passed to ``model.save``.
    word_freq : mapping of str to int, optional
        When given (and `sentences` is a real corpus), the vocabulary is fixed
        to these words through ``build_vocab_from_freq``. When omitted, the
        vocabulary is built from `sentences` with ``build_vocab``.

    Returns
    -------
    FastText
        The trained model (also written to `model_path`).

    Notes
    -----
    A ``{word: count}`` mapping contains no word order. Iterating it yields
    single words, so each "sentence" handed to ``train`` is one string whose
    items are characters; no word ever sees a neighbouring word. That legacy
    call path is kept so existing callers do not break, but it emits a
    ``UserWarning``. Pass the sentences themselves (plus `word_freq` to keep a
    shared vocabulary) to learn contextual vectors.
    """
    # Local imports keep this block self-contained for the notebook cell.
    import warnings
    from collections.abc import Mapping

    model = FastText(
        vector_size=100,
        window=10,  # vojta used 10
        min_count=1,
        sg=1,
        workers=16,
    )

    if isinstance(sentences, Mapping):
        # Legacy call: behave as before, but make the problem visible.
        warnings.warn(
            "train_fasttext_on_subcorpus received a word->count mapping instead of "
            "sentences; the model is trained without any word context. Pass the "
            "sentences and supply the counts via word_freq=... instead.",
            UserWarning,
            stacklevel=2,
        )
        model.build_vocab_from_freq(word_freq=sentences)
        corpus = sentences
    else:
        # Materialise once: the corpus is iterated for every epoch and len() is needed.
        corpus = [s.split() if isinstance(s, str) else list(s) for s in sentences]
        if word_freq is None:
            model.build_vocab(corpus_iterable=corpus)
        else:
            # corpus_count is not inferred when the vocabulary comes from frequencies.
            model.build_vocab_from_freq(word_freq=dict(word_freq), corpus_count=len(corpus))

    model.train(
        corpus_iterable=corpus,
        total_examples=len(corpus),
        epochs=20,
        callbacks=[EpochLogger()],
    )
    model.save(model_path)
    return model


In [6]:
%%time
# Train and save one FastText model per subcorpus.
# NOTE(review): words1..words4 are {word: count} dicts, not sentences. The trainer
# iterates whatever it receives as the corpus, so confirm that feeding frequency
# dicts (rather than the tokenised sentences) is really intended here.
christian_0_300_model = train_fasttext_on_subcorpus(
    sentences=words1,
    model_path="../data/large-data/fasttext_christian_0_300.model",
)
christian_300_600_model = train_fasttext_on_subcorpus(
    sentences=words2,
    model_path="../data/large-data/fasttext_christian_300_600.model",
)
pagan_0_300_model = train_fasttext_on_subcorpus(
    sentences=words3,
    model_path="../data/large-data/fasttext_pagan_0_300.model",
)
pagan_300_600_model = train_fasttext_on_subcorpus(
    sentences=words4,
    model_path="../data/large-data/fasttext_pagan_300_600.model",
)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
Epoch #10 start
Epoch #10 end
Epoch #11 start
Epoch #11 end
Epoch #12 start
Epoch #12 end
Epoch #13 start
Epoch #13 end
Epoch #14 start
Epoch #14 end
Epoch #15 start
Epoch #15 end
Epoch #16 start
Epoch #16 end
Epoch #17 start
Epoch #17 end
Epoch #18 start
Epoch #18 end
Epoch #19 start
Epoch #19 end
Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end
Epoch #10 start
Epoch #10 end
Epoch #11 start
Epoch #11 end
Epoch #12 start
Epoch #12 end
Epoch #13 start
Epoch #13 end
Epoch #14 start
Epoc

In [1]:
from gensim.models import FastText

# Restore the four saved FastText models, one per subcorpus, in a fixed order.
(
    christian_0_300_model,
    christian_300_600_model,
    pagan_0_300_model,
    pagan_300_600_model,
) = (
    FastText.load(saved_path)
    for saved_path in (
        "../data/large-data/fasttext_christian_0_300.model",
        "../data/large-data/fasttext_christian_300_600.model",
        "../data/large-data/fasttext_pagan_0_300.model",
        "../data/large-data/fasttext_pagan_300_600.model",
    )
)

In [7]:
%%time
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

def _project_model(model, label, z):
    """Fit a 2-D t-SNE on all word vectors of `model`; return one row per word.

    The t-SNE is fitted separately for each model passed in.
    """
    words = list(model.wv.index_to_key)
    matrix = np.array([model.wv[word] for word in words])
    embedding = TSNE(n_components=2, random_state=42, verbose=1).fit_transform(matrix)
    return pd.DataFrame({
        "word": words,
        "x": embedding[:, 0],
        "y": embedding[:, 1],
        "z": z,
        "subcorpus": label,
    })


# (model, subcorpus label, integer layer index stored in column "z")
models = [
    (christian_0_300_model, "christian_0_300", 0),
    (christian_300_600_model, "christian_300_600", 1),
    (pagan_0_300_model, "pagan_0_300", 2),
    (pagan_300_600_model, "pagan_300_600", 3),
]

dfs = [_project_model(model, label, z) for model, label, z in models]

# Stack the per-subcorpus tables and persist them for the plotting step.
df_plot = pd.concat(dfs, ignore_index=True)
df_plot.to_pickle("../data/large-data/df_plot.pkl")

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3929 samples in 0.001s...
[t-SNE] Computed neighbors for 3929 samples in 0.087s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3929
[t-SNE] Computed conditional probabilities for sample 2000 / 3929
[t-SNE] Computed conditional probabilities for sample 3000 / 3929
[t-SNE] Computed conditional probabilities for sample 3929 / 3929
[t-SNE] Mean sigma: 0.003285
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.338364
[t-SNE] KL divergence after 1000 iterations: 3.307391
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3929 samples in 0.000s...
[t-SNE] Computed neighbors for 3929 samples in 0.039s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3929
[t-SNE] Computed conditional probabilities for sample 2000 / 3929
[t-SNE] Computed conditional probabilities for sample 3000 / 3929
[t-SNE] Computed conditional probabilities for sample 3929 / 3929
[t-SNE] Mean sigma: 0.003289


In [3]:
# Reload the t-SNE coordinate table from disk (the same path the projection cell writes with to_pickle).
df_plot = pd.read_pickle("../data/large-data/df_plot.pkl")

In [8]:
%%time
# Look up the nearest neighbours of every vocabulary word in every subcorpus model
# and store the result as {subcorpus: {word: [(neighbour, similarity), ...]}}.
from collections import defaultdict
import pickle

N_NEIGHBORS = 100
model_map = {
    "christian_0_300": christian_0_300_model,
    "christian_300_600": christian_300_600_model,
    "pagan_0_300": pagan_0_300_model,
    "pagan_300_600": pagan_300_600_model,
}


def _top_neighbors(keyed_vectors, word):
    """Return the N_NEIGHBORS most similar (word, similarity) pairs, or [] on KeyError."""
    try:
        return keyed_vectors.most_similar(word, topn=N_NEIGHBORS)
    except KeyError:
        return []


neighbors_dict = defaultdict(dict)
for subcorpus, model in model_map.items():
    for word in model.wv.index_to_key:
        neighbors_dict[subcorpus][word] = _top_neighbors(model.wv, word)

# Convert to a plain dict before pickling.
with open("../data/large-data/word_neighbors.pkl", "wb") as out_file:
    pickle.dump(dict(neighbors_dict), out_file)

CPU times: user 18.4 s, sys: 1min 5s, total: 1min 23s
Wall time: 2.86 s


# TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the grouped corpus for the TF-IDF section.
# NOTE(review): the first cell of the notebook reads grouped_df.pkl from an absolute /srv/... path
# with pickle.load, while this cell uses a relative path — confirm both refer to the same file.
grouped = pd.read_pickle('../data/large-data/grouped_df.pkl')

In [11]:
# Slice the corpus by the "enemy_subcorpus" column, one frame per subcorpus.
christian_0_300, christian_300_600, pagan_0_300, pagan_300_600 = (
    grouped[grouped["enemy_subcorpus"] == label]
    for label in ("christian_0_300", "christian_300_600", "pagan_0_300", "pagan_300_600")
)

In [12]:
from nltk.probability import FreqDist

def _lowercased_counts(frame):
    """Count lower-cased, whitespace-separated tokens of the 'lamma_sentence' column."""
    return FreqDist(word.lower() for sent in frame['lamma_sentence'] for word in sent.split())


# Lower-cased token frequencies, one distribution per subcorpus.
fdist1, fdist2, fdist3, fdist4 = map(
    _lowercased_counts,
    (christian_0_300, christian_300_600, pagan_0_300, pagan_300_600),
)

# Shared vocabulary: words seen at least 10 times in each of the four subcorpora.
vocab = set.intersection(*(
    {word for word, count in fdist.items() if count >= 10}
    for fdist in (fdist1, fdist2, fdist3, fdist4)
))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(sentences):
    """Fit TF-IDF on `sentences` using the shared vocabulary.

    The vectorizer is restricted to the module-level `vocab`, looked up when
    the function is called.

    Returns a two-element list: the matrix produced by ``fit_transform`` and
    the array from ``get_feature_names_out``.
    """
    tfidf = TfidfVectorizer(vocabulary=vocab)  # use common vocab
    matrix = tfidf.fit_transform(sentences)
    return [matrix, tfidf.get_feature_names_out()]

In [14]:
# TF-IDF [matrix, feature_names] for each subcorpus, computed in a fixed order.
(
    tfidf_christian_0_300,
    tfidf_christian_300_600,
    tfidf_pagan_0_300,
    tfidf_pagan_300_600,
) = (
    compute_tfidf(frame['lamma_sentence'])
    for frame in (christian_0_300, christian_300_600, pagan_0_300, pagan_300_600)
)

In [15]:
import numpy as np

def compute_weighted_cooccurrence(tfidf_data):
    """
    Given [tfidf_matrix, feature_names] as returned by compute_tfidf,
    returns a DataFrame of TF-IDF weighted term-term association.

    The association matrix is ``tfidf_matrix.T @ tfidf_matrix``; its diagonal
    (each term with itself) is set to 0 in the returned frame. Both sparse
    matrices (anything exposing ``toarray``) and dense arrays are accepted.
    Rows and columns of the result are labelled with `feature_names`.
    """
    tfidf_matrix, feature_names = tfidf_data
    product = tfidf_matrix.T @ tfidf_matrix
    # Densify exactly once and zero the diagonal on that same array.
    # (toarray() returns a fresh copy on every call, so filling the diagonal
    # of one copy and returning another would leave the diagonal untouched.)
    dense = product.toarray() if hasattr(product, "toarray") else np.array(product)
    np.fill_diagonal(dense, 0)
    return pd.DataFrame(dense, index=feature_names, columns=feature_names)

In [16]:
import pickle

# Weighted co-occurrence frame for every subcorpus, keyed by subcorpus name.
tfidf_cooc = {
    name: compute_weighted_cooccurrence(tfidf_data)
    for name, tfidf_data in (
        ("christian_0_300", tfidf_christian_0_300),
        ("christian_300_600", tfidf_christian_300_600),
        ("pagan_0_300", tfidf_pagan_0_300),
        ("pagan_300_600", tfidf_pagan_300_600),
    )
}

# Individual names for the same frames.
cooc_christian_0_300 = tfidf_cooc["christian_0_300"]
cooc_christian_300_600 = tfidf_cooc["christian_300_600"]
cooc_pagan_0_300 = tfidf_cooc["pagan_0_300"]
cooc_pagan_300_600 = tfidf_cooc["pagan_300_600"]

# Persist the whole mapping in one pickle.
with open("../data/large-data/tfidf_cooc.pkl", "wb") as out_file:
    pickle.dump(tfidf_cooc, out_file)

In [20]:
def strongest_cooccurrences(tfidf_cooc, target_word, top_n=20, subcorpus=None):
    """
    Returns the top_n strongest co-occurring words with target_word.

    Parameters
    ----------
    tfidf_cooc : dict of {subcorpus name: DataFrame} or DataFrame
        Square term-term co-occurrence frame(s). A single DataFrame is
        also accepted and used directly.
    target_word : str
        Column to read the scores from.
    top_n : int
        Number of rows to return.
    subcorpus : str, optional
        Key of the frame to read from. When omitted, `target_word` must be
        present in every frame and the scores come from the LAST frame in
        the mapping's iteration order (the historical behaviour of this
        function). Pass the key explicitly to inspect another subcorpus.

    Returns
    -------
    pandas.Series or None
        Scores sorted in descending order with `target_word` itself removed,
        or None (after printing a message) when the word is missing from a
        checked frame or the mapping is empty.

    Raises
    ------
    KeyError
        If `subcorpus` is given but is not a key of `tfidf_cooc`.
    """
    if isinstance(tfidf_cooc, pd.DataFrame):
        frames = [tfidf_cooc]
    elif subcorpus is not None:
        frames = [tfidf_cooc[subcorpus]]
    else:
        frames = list(tfidf_cooc.values())

    # An empty mapping has nothing to report; a missing word anywhere aborts, as before.
    if not frames or any(target_word not in frame.columns for frame in frames):
        print(f"'{target_word}' not found in vocabulary.")
        return None

    # Explicitly select the frame instead of relying on a leaked loop variable.
    cooc_df = frames[-1]
    # Get co-occurrence scores for the target word, sort descending, exclude itself
    scores = cooc_df[target_word].drop(target_word, errors="ignore").sort_values(ascending=False)
    return scores.head(top_n)

# Top 20 TF-IDF-weighted co-occurrences of ἐχθρός. The scores are read from a single frame of
# tfidf_cooc — the one iterated last, i.e. "pagan_300_600" for the dict built above.
strongest_cooccurrences(tfidf_cooc, "ἐχθρός", top_n=20)

φίλος       7.418677
ποιέω       4.051001
εἰμί        3.539491
θεός        3.526939
πολέμιος    2.419786
πόλις       2.071748
ἔχω         1.945513
γίγνομαι    1.772934
δίκαιος     1.726710
ἀνήρ        1.623366
πολύς       1.445067
ἄνθρωπος    1.376607
κακός       1.362612
νομίζω      1.331194
πᾶς         1.327142
δίδωμι      1.313136
οἶδα        1.272845
δίκη        1.267826
ἥκω         1.092527
ἡγέομαι     1.092482
Name: ἐχθρός, dtype: float64

# PMI

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load the grouped corpus for the PMI section (same relative path as the TF-IDF section).
grouped = pd.read_pickle('../data/large-data/grouped_df.pkl')

In [None]:
# Boolean-mask the corpus once per subcorpus label in the "enemy_subcorpus" column.
in_subcorpus = grouped["enemy_subcorpus"].eq
christian_0_300 = grouped[in_subcorpus("christian_0_300")]
christian_300_600 = grouped[in_subcorpus("christian_300_600")]
pagan_0_300 = grouped[in_subcorpus("pagan_0_300")]
pagan_300_600 = grouped[in_subcorpus("pagan_300_600")]

In [21]:
from nltk.probability import FreqDist

# Lower-cased token frequencies for each subcorpus.
fdist1 = FreqDist(token.lower() for text in christian_0_300['lamma_sentence'] for token in text.split())
fdist2 = FreqDist(token.lower() for text in christian_300_600['lamma_sentence'] for token in text.split())
fdist3 = FreqDist(token.lower() for text in pagan_0_300['lamma_sentence'] for token in text.split())
fdist4 = FreqDist(token.lower() for text in pagan_300_600['lamma_sentence'] for token in text.split())

# Per-subcorpus sets of words with at least 10 occurrences ...
frequent_sets = [
    {token for token in fdist if fdist[token] >= 10}
    for fdist in (fdist1, fdist2, fdist3, fdist4)
]
# ... and the words that clear that threshold in all four.
vocab = frequent_sets[0] & frequent_sets[1] & frequent_sets[2] & frequent_sets[3]

In [22]:
import numpy as np
from collections import Counter
from itertools import combinations

def compute_pmi_from_sentences(sentences, vocab=vocab, min_count=10):
    """Compute sentence-level pointwise mutual information for word pairs.

    Every probability is estimated over the same sample space, the sentences:
    ``p(x)`` is the share of sentences containing ``x`` and ``p(x, y)`` the
    share containing both, so ``pmi = log(p(x, y) / (p(x) * p(y)))``.
    (Using raw token counts for the marginals would let ``p(x)`` exceed 1 for
    words repeated inside a sentence and bias their PMI downwards.)

    Parameters
    ----------
    sentences : iterable of str
        Whitespace-tokenisable sentences.
    vocab : collection of str or None
        Words to consider. The default is the module-level ``vocab`` as it
        existed when this ``def`` was executed; re-run this cell after
        rebuilding it. With ``None`` the vocabulary is every token occurring
        at least `min_count` times in `sentences`.
    min_count : int
        Token-frequency threshold, used only when `vocab` is None.

    Returns
    -------
    pandas.DataFrame
        Columns ``word1``, ``word2``, ``pmi``, ``count`` (number of sentences
        containing both words); one row per unordered pair with
        ``word1 < word2``. The columns are present even when no pair is found.
    """
    # NOTE(review): the module-level vocab in this notebook is built from lower-cased
    # tokens, while tokens here are compared as-is, so capitalised forms never match —
    # confirm whether that is intended.
    tokenized = [sent.split() for sent in sentences]
    n_sentences = len(tokenized)

    if vocab is None:
        token_counts = Counter(word for sent in tokenized for word in sent)
        vocab = {word for word, count in token_counts.items() if count >= min_count}
    else:
        # Guarantee O(1) membership tests whatever collection was passed in.
        vocab = set(vocab)

    sentence_counts = Counter()  # number of sentences containing each word
    cooc_counts = Counter()      # number of sentences containing each pair
    for sent in tokenized:
        present = sorted(set(sent) & vocab)
        sentence_counts.update(present)
        cooc_counts.update(combinations(present, 2))

    results = []
    for (w1, w2), c_xy in cooc_counts.items():
        # log(p_xy / (p_x * p_y)) with the common 1/N factors simplified.
        pmi = np.log(c_xy * n_sentences / (sentence_counts[w1] * sentence_counts[w2]))
        results.append({'word1': w1, 'word2': w2, 'pmi': pmi, 'count': c_xy})
    return pd.DataFrame(results, columns=['word1', 'word2', 'pmi', 'count'])

In [23]:
import pickle

# PMI table for every subcorpus, keyed by subcorpus name.
pmi_dict = {
    name: compute_pmi_from_sentences(frame['lamma_sentence'])
    for name, frame in (
        ("christian_0_300", christian_0_300),
        ("christian_300_600", christian_300_600),
        ("pagan_0_300", pagan_0_300),
        ("pagan_300_600", pagan_300_600),
    )
}

# Individual names for the same tables.
pmi_christian_0_300 = pmi_dict["christian_0_300"]
pmi_christian_300_600 = pmi_dict["christian_300_600"]
pmi_pagan_0_300 = pmi_dict["pagan_0_300"]
pmi_pagan_300_600 = pmi_dict["pagan_300_600"]

# Persist all four tables in a single pickle.
with open("../data/large-data/pmi_all_subcorpora.pkl", "wb") as out_file:
    pickle.dump(pmi_dict, out_file)

# Texts to label for Zdenka
- Zjevení (Revelation) = lagt_tlg0031.tlg027
- Matouš (Gospel of Matthew) = lagt_tlg0031.tlg001

In [None]:
# Select every token whose lemma is ἐχθρός, together with all columns of its work (w.*)
# and the text of its sentence, for works whose lagt_provenience is 'christian' or 'pagan'
# and whose not_before or not_after lies strictly between 0 and 600.
# The query selects from `tokens`, so presumably a sentence containing the lemma more than
# once yields more than one row — verify before sampling.
# NOTE(review): `conn` is not created in any cell visible here; fetchdf() suggests a DuckDB
# connection — confirm and create it in an earlier cell so the notebook runs top to bottom.
query = '''
SELECT t.sentence_id, w.*, s.text
FROM tokens t
JOIN works w ON t.grela_id = w.grela_id
JOIN sentences s ON t.sentence_id = s.sentence_id
WHERE w.lagt_provenience IN ('christian', 'pagan')
  AND t.lemma IN ('ἐχθρός')
  AND (
    (w.not_before > 0 AND w.not_before < 600)
    OR (w.not_after > 0 AND w.not_after < 600)
  )
'''
df = conn.execute(query).fetchdf()
df

Unnamed: 0,sentence_id,grela_source,grela_id,author,title,not_before,not_after,lagt_tlg_epithet,lagt_genre,lagt_provenience,...,noscemus_discipline,title_short,emlap_noscemus_id,place_publication,place_geonames,author_viaf,title_viaf,date_random,token_count,text
0,lagt_tlg0545.tlg003_92,lagt,lagt_tlg0545.tlg003,Aelian,ἐκ τῶν Αἰλιανοῦ ἀγροικικῶν ἐπιστολῶν,175.0,235.0,['Sophistae'],[],pagan,...,,,,,,,,186.0,2455,τί γὰρ παθὼν ῥυθμίζεις με καὶ πρᾶον ἀποφῆναι γ...
1,lagt_tlg2200.tlg00427_48,lagt,lagt_tlg2200.tlg00427,Libanius,Oratio 27,385.0,385.0,['Rhetorici' 'Sophistae'],[],pagan,...,,,,,,,,385.0,5267,οὐδέν οὖν ἕτερον λέγεις ἤ ὅτι φιλοῦσι τούς πον...
2,lagt_tlg0031.tlg014_29,lagt,lagt_tlg0031.tlg014,Pauline literature,New Testament - 2 Thessalonians,80.0,115.0,[],[],christian,...,,,,,,,,85.0,941,"καὶ μὴ ὡς ἐχθρὸν ἡγεῖσθε, ἀλλὰ νουθετεῖτε ὡς ἀ..."
3,lagt_tlg0007.tlg070_4,lagt,lagt_tlg0007.tlg070,Plutarch,Πῶς ἄν τις διακρίνειε τὸν κόλακα τοῦ φίλου,96.0,120.0,['Biographi' 'Philosophici/-ae'],[],pagan,...,,,,,,,,104.0,13729,"εἰ δὲ δὴ θεῖον ἡ ἀλήθεια καὶ "" πάντων μὲν ἀγαθ..."
4,lagt_tlg0007.tlg070_30,lagt,lagt_tlg0007.tlg070,Plutarch,Πῶς ἄν τις διακρίνειε τὸν κόλακα τοῦ φίλου,96.0,120.0,['Biographi' 'Philosophici/-ae'],[],pagan,...,,,,,,,,104.0,13729,"ἡμεῖς δέ, εἰ μηδαμῆ μηδαμῶς ἐπαινοῦμεν τὸ "" ἐρ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3461,lagt_tlg0007.tlg027_134,lagt,lagt_tlg0007.tlg027,Plutarch,Philopoemen,68.0,120.0,['Biographi' 'Philosophici/-ae'],[],pagan,...,,,,,,,,87.0,6386,"ἦν μὲν γὰρ ἐστεφανωμένους ἰδεῖν, ἦν δὲ τοὺς αὐ..."
3462,lagt_tlg0007.tlg035_60,lagt,lagt_tlg0007.tlg035,Plutarch,Κίμων,96.0,114.0,['Biographi' 'Philosophici/-ae'],[],pagan,...,,,,,,,,101.0,6645,τὸν δ’ ὑπὸ τοῦ ψόφου ταραχθέντα καὶ σπασάμενον...
3463,lagt_tlg0007.tlg035_153,lagt,lagt_tlg0007.tlg035,Plutarch,Κίμων,96.0,114.0,['Biographi' 'Philosophici/-ae'],[],pagan,...,,,,,,,,101.0,6645,ἐκεῖθεν δὲ ῥᾳδίως ἐπιβῆναι Μακεδονίας καὶ πολλ...
3464,lagt_tlg0007.tlg035_195,lagt,lagt_tlg0007.tlg035,Plutarch,Κίμων,96.0,114.0,['Biographi' 'Philosophici/-ae'],[],pagan,...,,,,,,,,101.0,6645,ἡ δὲ βουλὴ τῶν πεντακοσίων πυθομένη καὶ φοβηθε...


In [6]:
# Draw a reproducible random sample of up to 500 christian and up to 500 pagan rows.
# DataFrame.sample raises ValueError when n exceeds the number of rows (without
# replacement), so n is capped at the size of each group.
SAMPLE_PER_GROUP = 500

christian_rows = df[df['lagt_provenience'] == 'christian']
pagan_rows = df[df['lagt_provenience'] == 'pagan']

df_christian = christian_rows.sample(n=min(SAMPLE_PER_GROUP, len(christian_rows)), random_state=42)
df_pagan = pagan_rows.sample(n=min(SAMPLE_PER_GROUP, len(pagan_rows)), random_state=42)

# Christian rows first, then pagan rows, with a fresh 0..N-1 index (written to the CSV as well).
df_sampled = pd.concat([df_christian, df_pagan], ignore_index=True)
df_sampled.to_csv('../data/enemy_sample.csv')