# Analyze sentence structures

## Analyze frequency of conjunctions in total and per sentence

In [3]:
import json

text = json.load(open("../data/cleaned/cleaned_texts.json", "r"))

In [4]:
def count_conj(text_content, normalize=True):
    """Count the total number of conjunctions.
    If normalize is set to True, 
    """
    cconj = 0
    for word in text_content:
        if word["pos"] == "CCONJ":
            cconj += 1
    if normalize:    
        return round(cconj/len(text_content)*100, 2)
    else:
        return cconj

def count_sconj(text_content, normalize=True):
    """Count the total number of conjunctions per sentence.
    """
    sconj = 0
    for word in text_content:
        if word["pos"] == "SCONJ":
            sconj += 1
    if normalize:    
        return round(sconj/len(text_content)*100, 2)
    else:
        return sconj


In [5]:
import json
import pandas as pd

text = json.load(open("../data/cleaned/cleaned_texts.json", "r"))


compute_cconj_ = {}
for text_name, content in text.items():
    compute_cconj_[text_name] = count_conj(content)


compute_sconj_ = {}
for text_name, content in text.items():
    compute_sconj_[text_name] = count_sconj(content)


CCONJ = pd.DataFrame.from_dict(compute_cconj_, orient="index").sort_values(by=0)
CCONJ.columns = ["Ratio of conjunctions"]

SCONJ = pd.DataFrame.from_dict(compute_sconj_, orient="index").sort_values(by=0)
SCONJ.columns = ["Ratio of subordinate conjunctions"]

In [6]:
df = pd.concat([SCONJ, CCONJ], axis=1)

In [7]:
print(df.sort_values("Ratio of subordinate conjunctions").to_latex(float_format="{:.2f}".format))

\begin{tabular}{lrr}
\toprule
 & Ratio of subordinate conjunctions & Ratio of conjunctions \\
\midrule
MartAndrBonTert & 0.62 & 7.36 \\
MartAndrBonPrius & 1.16 & 6.61 \\
MartAndrPrieurB & 1.21 & 6.51 \\
MartAndrPrieurA & 1.46 & 6.52 \\
ActAndrPrieur & 1.65 & 5.32 \\
MartAndrBonAlt & 1.66 & 5.55 \\
ActAndrBon & 1.75 & 5.01 \\
AndrMattTisch & 1.95 & 7.19 \\
AndrMattBon & 2.00 & 6.82 \\
PAndr2 & 2.06 & 3.89 \\
AndrMattBon1115 & 2.15 & 6.80 \\
PAndr1 & 2.17 & 4.23 \\
AndrMattTisch1115 & 2.27 & 6.54 \\
\bottomrule
\end{tabular}



## Analyze verbal positions

In [8]:
# Analyze the positions of the verbs in the sentence

from spacy.matcher import Matcher

nlp = spacy.load('grc_proiel_lg')


full_lemmatized_content = {}

for text_title, text_content in text.items():
    full_lemmatized_content[text_title] = " "
    for content in text_content:
        full_lemmatized_content[text_title] += content["lemma"] + " "

non_lemmatized_content = {}


for text_title, text_content in text.items():
    non_lemmatized_content[text_title] = " "
    for content in text_content:
        non_lemmatized_content[text_title] += content["raw"] + " "


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from unicodedata import normalize


def detect_eimi_participle(text, normalized=False):
    """Detect the structure of eimi and participles.
    """
    matches = 0
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    for sent in text.sents:
        has_eimi = any((token.lemma_ == normalize("NFC", "εἰμί") and token.morph.to_dict()["VerbForm"] != "Part" and token.morph.to_dict()["Tense"] == "Past") for token in sent if token.morph.to_dict().get("VerbForm"))
        has_participle = any((token.morph.to_dict()["VerbForm"] == "Part") for token in sent if (token.morph.to_dict().get("VerbForm")))
        
        if has_eimi and has_participle:
            print(f"Match: {' '.join([token.text for token in sent])}")
            matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches


def detect_verb_first_position(text, normalized=False):
    """Detect the number of verbs in the first position of a sentence.
    """
    matches = 0
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    for sent in text.sents:
        first_token = sent[0]
        if first_token.pos_ == "VERB" and first_token.morph.to_dict()["VerbForm"] == "Fin":
            print(f"Matched sentence: {sent.text}")
            matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches

def detect_verb_last_position(text, normalized=False):
    """Detect the number of verbs in the last position of a sentence.
    """
    matches = 0
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    for sent in text.sents:
        try:
            last_token = sent[-2]
            if last_token.pos_ == "VERB" and last_token.morph.to_dict()["VerbForm"] == "Fin":
                print(f"Matched sentence: {sent.text}")
                matches += 1
        except IndexError:
            continue

    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches

def detect_verb_first_part_sentence(text, normalized=False):
    """Detect the verb as the first part of sentence (1/3).
    """
    matches = 0
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    for sent in text.sents:
        match_part = 0
        sentence_third = round(len(list(sent))/3)
        for token in sent[:sentence_third+1]:
            print(sent[:sentence_third+1])
            if token.pos_ == "VERB":
                match_part += 1
        if match_part:
            matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches

def detect_verb_second_part_sentence(text, normalized=False):
    """Detect the verb as the first part of sentence (1/3).
    """
    matches = 0
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    for sent in text.sents:
        match_part = 0
        sentence_third = round(len(list(sent))/3)
        for token in sent[sentence_third:2*sentence_third+1]:
            if token.pos_ == "VERB":
                match_part += 1
        if match_part:
            matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches

def detect_verb_middle_part_sentence(text, normalized=False):
    """Detect the verb as the first part of sentence (1/3).
    """
    matches = 0
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    for sent in text.sents:
        match_part = 0
        sentence_third = round(len(list(sent))/3)
        for token in sent[sentence_third:2*sentence_third+1]:
            if token.pos_ == "VERB":
                match_part += 1
        if match_part:
            matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches

def detect_verb_last_part_sentence(text, normalized=False):
    """Detect the verb as the first part of sentence (1/3).
    """
    matches = 0
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    for sent in text.sents:
        match_part = 0
        sentence_third = round(len(list(sent))/3)
        for token in sent[2*sentence_third:]:
            if token.pos_ == "VERB":
                match_part += 1
        if match_part:
            matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches

In [37]:
import spacy
from spacy.tokens import Doc



def is_genitive(token):
    """Check if a token has genitive case."""
    return "Gen" in token.morph.get("Case", [])


def detect_genitive_absolute(text,
                             normalized=True):
    """
    Finds genitive absolute constructions in an Ancient Greek spaCy Doc.
    
    Returns:
        List of strings describing the matches.
    """
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))

    doc = nlp(text)
    results = []
    matches = 0

    for sent in doc.sents:
        tokens = list(sent)
        for i in range(len(tokens) - 1):
            t1 = tokens[i]

            t2 = tokens[i+1]

            if is_genitive(t1) and is_genitive(t2):
                # VERB + NOUN or NOUN + VERB (genitive)
                if {t1.pos_, t2.pos_} == {"VERB", "NOUN"}:
                    results.append(f"[VERB + NOUN] {t1.text} + {t2.text} in: {sent.text}")
                    matches += 1

                # VERB + PRON or PRON + VERB (genitive)
                elif {t1.pos_, t2.pos_} == {"VERB", "PRON"}:
                    results.append(f"[VERB + PRON] {t1.text} + {t2.text} in: {sent.text}")
                    matches += 1

            # Check for [DET + NOUN] + VERB (all genitive)
            if (
                t1.pos_ == "DET"
                and is_genitive(t1)
                and i + 2 < len(tokens)
            ):
                t2 = tokens[i + 1]
                t3 = tokens[i + 2]

                if (
                    t2.pos_ == "NOUN" and is_genitive(t2) and
                    t3.pos_ == "VERB" and is_genitive(t3)
                ):
                    results.append(f"[DET + NOUN + VERB] {t1.text} + {t2.text}, {t3.text} in: {sent.text}")
                    matches += 1
    print("\n".join(results))
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        return matches

In [None]:

def detect_subordinate_verbs(text, normalized=True):
    """Detect subordinate verbs in the text with the following ruleset:
        - tag starts with 'V-' and VerbForm == 'Fin'
        - immediately preceded by SCONJ
        - immediately followed by '.', '·', or ';'.
    """
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    matches = 0
    doc = nlp(text)
    subverbs = []
    for sent in doc.sents:
        for i, token in enumerate(sent):
            if token.tag_.startswith("V-") and token.morph.get("VerbForm") == ["Fin"]:
                if (
                    i > 0 and sent[i - 1].pos_ == "SCONJ" and
                    i + 1 < len(sent) and sent[i + 1].lemma_ in {".", "·", ";"}
                ):
                    subverbs.append(f"Sent with subordinate verbs: {sent.text}")
                    matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        print("\n".join(subverbs))
        return matches


def detect_main_verb(text,  normalized=True):
    """Detect main verbs in the text.

    - tag starts with 'V-'
    - immediately preceded by '.', '·', ';' or CCONJ
    - immediately followed by '.', '·', ';' or CCONJ or SCONJ
    """
    text = nlp(text.replace("·", ".").replace(";",".").replace(",", "."))
    matches = 0
    doc = nlp(text)

    mainverbs = []
    for sent in doc.sents:
        for i, token in enumerate(sent):
            if token.tag_.startswith("V-"):
                before = i > 0 and (
                    sent[i - 1].lemma_ in {".", "·", ";"} or sent[i - 1].pos_ == "CCONJ"
                )
                after = i + 1 < len(sent) and (
                    sent[i + 1].lemma_ in {".", "·", ";"} or sent[i + 1].pos_ in {"CCONJ", "SCONJ"}
                )
                if before and after:
                    mainverbs.append(f"Sent with main verbs: {sent.text}")
                    matches += 1
    if normalized:
        return round(matches/len(list(text.sents))*100, 3)
    else:
        print("\n".join(mainverbs))
        return matches


In [39]:
detect_eimi_participles_ = {}
detect_eimi_participles_normalized_ = {}

for text_name, text in non_lemmatized_content.items():
    print("=====")

    print(f" Considered text: {text_name}")
    detect_eimi_participles_[text_name] = detect_eimi_participle(non_lemmatized_content[text_name])
    detect_eimi_participles_normalized_[text_name] = detect_eimi_participle(non_lemmatized_content[text_name], normalized=True)

=====
 Considered text: AndrMattTisch1115
=====
 Considered text: AndrMattTisch
Match: κατά ἐκεῖνον τὸν καιρὸν ἦσαν πάντες οἱ ἀπόστολοι ἐπὶ τὸ αὐτὸ συναχθέντες καὶ ἐμέριζον ἑαυτοῖς τὰς χώρας βάλλοντες κλή ρους .
Match: άλλὰ ἦσαν ἐσθίοντες σάρκας ἀνθρώπων καὶ πίνοντες αὐτῶν τὸ αἷμα .
Match: άλλὰ ἦν εὐχόμενος τῷ θεῷ κλαίων καὶ λέγων κύριε ἰησοῦ χριστέ .
Match: τότε οὖν ὁ ματθείας ἐκαθέσθη ἐν τῇ φυλακῇ καὶ ἦν ψάλλων .
Match: ἐφάνη ὁ κύριος ἐν τῇ χώρᾳ ทἡ ἦν διδάσκων ὁ ᾿ανδρέας .
Match: καὶ ἦσαν ἐν τῷ πλοίῳ καθεζόμενοι .
Match: ἦν γὰρ ὁ ἰησοῦς κρύψας τὴν ἑαυτοῦ θεότητα .
Match: καὶ ἦν φαινόμενος τῷ ᾿ανδρέᾳ ὡς ἄνθρωπος πρωρεύς .
Match: οὐκ ἦν γὰρ κοιμώμενος .
Match: ἦσαν δὲ οἱ ἐσθίοντες πεντακισχίλιοι ἄνδρες καὶ ἐχορτάσθησαν .
Match: καὶ ἀπέθετο αὐτοὺς ἡ νεφέλη ἐν τῷ ὄρει ὅπου ἦν ὁ πέτρος διδάσκων .
Match: καὶ ἦν κλίβανος ᾠκοδομημένος ἐν μέσῳ τῆς πόλεως .
Match: ἔθος γὰρ ἦν ἐν τῇ πόλει ἐκείνῃ καὶ τοὺς τελευτώντας οὐκ ἐνεταφίαζον .
Match: ἦν δὲ ὁ ᾿ανδρέας θεωρῶν τὸν διάβολον πῶς ὡμίλει τοῖς ὄ

In [13]:
detected_eimi = pd.DataFrame.from_dict(detect_eimi_participles_, orient="index")
detected_eimi_normalized = pd.DataFrame.from_dict(detect_eimi_participles_normalized_, orient="index")


print(pd.concat([detected_eimi, detected_eimi_normalized], axis=1).to_latex())

\begin{tabular}{lrr}
\toprule
 & 0 & 0 \\
\midrule
AndrMattTisch1115 & 0 & 0.000000 \\
AndrMattTisch & 21 & 3.004000 \\
MartAndrBonAlt & 2 & 1.429000 \\
MartAndrBonPrius & 0 & 0.000000 \\
PAndr2 & 3 & 0.741000 \\
AndrMattBon & 21 & 2.561000 \\
PAndr1 & 3 & 1.128000 \\
AndrMattBon1115 & 0 & 0.000000 \\
ActAndrPrieur & 7 & 0.713000 \\
MartAndrPrieurB & 0 & 0.000000 \\
MartAndrBonTert & 3 & 0.670000 \\
MartAndrPrieurA & 2 & 0.551000 \\
ActAndrBon & 2 & 0.533000 \\
\bottomrule
\end{tabular}



In [14]:
detect_verb_first_position_ = {}
detect_verb_first_position_normalized_ = {}
detect_verb_last_position_ = {}
detect_verb_last_position_normalized_ = {}

detect_verb_first_sentence_part_ = {}
detect_verb_first_sentence_part_normalized_ = {}
detect_verb_middle_sentence_part_ = {}
detect_verb_middle_sentence_part_normalized_ = {}
detect_verb_last_sentence_part_ = {}
detect_verb_last_sentence_part_normalized_ = {}



for text_name, text in non_lemmatized_content.items():
    detect_verb_first_position_[text_name] = detect_verb_first_position(non_lemmatized_content[text_name])
    detect_verb_first_position_normalized_[text_name] = detect_verb_first_position(non_lemmatized_content[text_name],normalized=True)

    detect_verb_last_position_[text_name] = detect_verb_last_position(non_lemmatized_content[text_name])
    detect_verb_last_position_normalized_[text_name] = detect_verb_last_position(non_lemmatized_content[text_name],normalized=True)

    detect_verb_first_sentence_part_[text_name] = detect_verb_first_part_sentence(non_lemmatized_content[text_name])
    detect_verb_first_sentence_part_normalized_[text_name] = detect_verb_first_part_sentence(non_lemmatized_content[text_name],normalized=True)

    detect_verb_middle_sentence_part_[text_name] = detect_verb_middle_part_sentence(non_lemmatized_content[text_name])
    detect_verb_middle_sentence_part_normalized_[text_name] = detect_verb_middle_part_sentence(non_lemmatized_content[text_name],normalized=True)

    detect_verb_last_sentence_part_[text_name] = detect_verb_last_part_sentence(non_lemmatized_content[text_name])
    detect_verb_last_sentence_part_normalized_[text_name] = detect_verb_last_part_sentence(non_lemmatized_content[text_name],normalized=True)


Matched sentence: ἐποίησεν καὶ ἐνώπιον τῶν ἀρχιερέων .
Matched sentence: φανέρωσόν μοι αὐτάς .
Matched sentence: εγένετο πορευομένων ἡμῶν τῶν δώδεκα μαθητῶν μετὰ τοῦ κυρίου ἡμῶν εἰς ἱερὸν τῶν ἐθνῶν ἵνα γνωρίσῃ ἡμῖν τὴν ἄγνοιαν τοῦ διαβόλου .
Matched sentence: ἐστράφησαν αἱ καρδίαι ἡμῶν εἰς ἀσθένεια». γνοὺς δὲ ὁ ἰησοῦς ὅτι ἐξέκλιναν αἱ καρδίαι ἡμῶν .
Matched sentence: εἶπεν αὐτῇ σοὶ λέγω .
Matched sentence: ἀποκολλήθητι ἀπὸ τοῦ τόπου σου καὶ ἐλθὲ κάτω .
Matched sentence: λέγω γὰρ ὑμῖν ὅτι καλλίονά εἰσι τὰ ἱερὰ τῆς συναγωγής ὑμῶν .
Matched sentence: καθαρίζουσιν ἑαυτοὺς ἡμέρας ἑπτὰ διὰ τὸν φόβον .
Matched sentence: αἴρετε τὸν νόμον τοῦ θεοῦ καὶ εἰσέρχεσθε ὡς τὴν συναγωγὴν τοῦ θεοῦ καὶ καθαρίζετε καὶ ἀναγινώσκετε καὶ οὐκ εὐλαβεῖσθε τοὺς λόγους τοὺς ἐνδόξους τοῦ θεοῦ .
Matched sentence: ἐγινώσκετε τὴν πλάνην αὐτοῦ .
Matched sentence: ἄπελθε εἰς γῆν τῶν χαναναίων καὶ ἄπελθε εἰς τὸ σπηλαῖον τὸ διπλοῦν εἰς τὸν ἀγρὸν μαμβρή .
Matched sentence: ἀνάστηθι σὺ καὶ ὁ υἱός σου ἰσαὰκ καὶ ὁ υἱὸς τοῦ υἱ

In [15]:
detect_verb_last_position_[text_name] = detect_verb_last_position(non_lemmatized_content[text_name], normalized=True)

Matched sentence: αἰδεσθῶμεν .
Matched sentence: ὑπὸ τίνος ἠγάπηται .
Matched sentence: ὑπὸ τίνος ἠλέηται .
Matched sentence: διὰ τοῦτο ἀπὸ τοῦ χείρονος φεύγομεν .
Matched sentence: διὰ οὗ τὸ ἄδικον ῥίπτομεν .
Matched sentence: διὰ οὗ τὸν ἀνελεήμονα ἀφίεμεν .
Matched sentence: διὰ οὗ τὸν ἀπολλύντα ἐγνωρίσαμεν .
Matched sentence: διὰ οὗ τὸ σκότος ἐρρίψαμεν .
Matched sentence: διὰ οὗ τὰ πολλὰ ἀπεστράμμεθα .
Matched sentence: διὰ οὗ τὰ ἐπίγεια ἐμάθομεν .
Matched sentence: διὰ οὗ τὰ μένοντα εἴδομεν .
Matched sentence: καὶ ὥσπερ τις ἐμμανὴς γενόμενος ἀφίησιν ἣν ἐν χερσὶν δίκην ἔσχεν .
Matched sentence: καὶ εἰσελθὼν πρὸς αὐτὴν ἔλεγεν .
Matched sentence: καὶ σὲ εὖ κατὰ πάντα ποιήσαιμι .
Matched sentence: ἐπεὶ μᾶλλον καὶ ὃν ἔχω ἐν τῷ δεσμωτηρίῳ ξένον ἀπολύσω .
Matched sentence: εἰ δὲ μὴ βούλει .
Matched sentence: σοὶ μὲν χαλεπὸν οὐδὲν ἀγάγοιμι .
Matched sentence: οὐδὲ γὰρ δύναμαι .
Matched sentence: ἐκεῖνον δὲ ὃν μάλιστα ἐμοῦ στέργεις πλεῖον ἀνιάσω .
Matched sentence: πρὸς ὁπότερον τοιγαροῦν ὃ

In [16]:
detect_verb_last_position_normalized_[text_name] = detect_verb_last_position(non_lemmatized_content[text_name],normalized=True)

Matched sentence: αἰδεσθῶμεν .
Matched sentence: ὑπὸ τίνος ἠγάπηται .
Matched sentence: ὑπὸ τίνος ἠλέηται .
Matched sentence: διὰ τοῦτο ἀπὸ τοῦ χείρονος φεύγομεν .
Matched sentence: διὰ οὗ τὸ ἄδικον ῥίπτομεν .
Matched sentence: διὰ οὗ τὸν ἀνελεήμονα ἀφίεμεν .
Matched sentence: διὰ οὗ τὸν ἀπολλύντα ἐγνωρίσαμεν .
Matched sentence: διὰ οὗ τὸ σκότος ἐρρίψαμεν .
Matched sentence: διὰ οὗ τὰ πολλὰ ἀπεστράμμεθα .
Matched sentence: διὰ οὗ τὰ ἐπίγεια ἐμάθομεν .
Matched sentence: διὰ οὗ τὰ μένοντα εἴδομεν .
Matched sentence: καὶ ὥσπερ τις ἐμμανὴς γενόμενος ἀφίησιν ἣν ἐν χερσὶν δίκην ἔσχεν .
Matched sentence: καὶ εἰσελθὼν πρὸς αὐτὴν ἔλεγεν .
Matched sentence: καὶ σὲ εὖ κατὰ πάντα ποιήσαιμι .
Matched sentence: ἐπεὶ μᾶλλον καὶ ὃν ἔχω ἐν τῷ δεσμωτηρίῳ ξένον ἀπολύσω .
Matched sentence: εἰ δὲ μὴ βούλει .
Matched sentence: σοὶ μὲν χαλεπὸν οὐδὲν ἀγάγοιμι .
Matched sentence: οὐδὲ γὰρ δύναμαι .
Matched sentence: ἐκεῖνον δὲ ὃν μάλιστα ἐμοῦ στέργεις πλεῖον ἀνιάσω .
Matched sentence: πρὸς ὁπότερον τοιγαροῦν ὃ

In [17]:
import pandas as pd

# Convert each dictionary to a DataFrame
df_verb_first = pd.DataFrame.from_dict(detect_verb_first_position_, orient="index")
df_verb_first.columns = ['verb_first_position']

df_verb_first_norm = pd.DataFrame.from_dict(detect_verb_first_position_normalized_, orient="index")
df_verb_first_norm.columns = ['verb_first_position_normalized']

df_verb_last = pd.DataFrame.from_dict(detect_verb_last_position_, orient="index")
df_verb_last.columns = ['verb_last_position']

df_verb_last_norm = pd.DataFrame.from_dict(detect_verb_last_position_normalized_, orient="index")
df_verb_last_norm.columns = ['verb_last_position_normalized']

df_first_part = pd.DataFrame.from_dict(detect_verb_first_sentence_part_, orient="index")
df_first_part.columns = ['verb_first_sentence_part']

df_first_part_norm = pd.DataFrame.from_dict(detect_verb_first_sentence_part_normalized_, orient="index")
df_first_part_norm.columns = ['verb_first_sentence_part_normalized']

df_middle_part = pd.DataFrame.from_dict(detect_verb_middle_sentence_part_, orient="index")
df_middle_part.columns = ['verb_middle_sentence_part']

df_middle_part_norm = pd.DataFrame.from_dict(detect_verb_middle_sentence_part_normalized_, orient="index")
df_middle_part_norm.columns = ['verb_middle_sentence_part_normalized']

df_last_part = pd.DataFrame.from_dict(detect_verb_last_sentence_part_, orient="index")
df_last_part.columns = ['verb_last_sentence_part']

df_last_part_norm = pd.DataFrame.from_dict(detect_verb_last_sentence_part_normalized_, orient="index")
df_last_part_norm.columns = ['verb_last_sentence_part_normalized']

# Concatenate all the DataFrames on the index (text names)
result_df = pd.concat([
    df_verb_first,
    df_verb_first_norm,
    df_verb_last,
    df_verb_last_norm,
    df_first_part,
    df_first_part_norm,
    df_middle_part,
    df_middle_part_norm,
    df_last_part,
    df_last_part_norm
], axis=1)

# result_df now contains all your data


In [18]:
half_columns = result_df.columns[:round(len(result_df.columns)/2)]

print(result_df[half_columns].to_latex().replace("verb_", " ").replace("_", " "))

\begin{tabular}{lrrrrr}
\toprule
 &  first position &  first position normalized &  last position &  last position normalized &  first sentence part \\
\midrule
AndrMattTisch1115 & 14 & 12.844000 & 15.000000 & 13.761000 & 84 \\
AndrMattTisch & 73 & 10.443000 & 100.000000 & 14.306000 & 536 \\
MartAndrBonAlt & 17 & 12.143000 & 32.000000 & 22.857000 & 102 \\
MartAndrBonPrius & 45 & 16.423000 & 60.000000 & 21.898000 & 171 \\
PAndr2 & 40 & 9.877000 & 139.000000 & 34.321000 & 222 \\
AndrMattBon & 113 & 13.780000 & 114.000000 & 13.902000 & 639 \\
PAndr1 & 26 & 9.774000 & 109.000000 & 40.977000 & 146 \\
AndrMattBon1115 & 19 & 15.447000 & 20.000000 & 16.260000 & 95 \\
ActAndrPrieur & 126 & 12.831000 & 278.000000 & 28.310000 & 517 \\
MartAndrPrieurB & 43 & 13.651000 & 65.000000 & 20.635000 & 178 \\
MartAndrBonTert & 37 & 8.259000 & 98.000000 & 21.875000 & 286 \\
MartAndrPrieurA & 42 & 11.570000 & 93.000000 & 25.620000 & 191 \\
ActAndrBon & 44 & 11.733000 & 28.533000 & 28.533000 & 184 \\
\bottomr

In [19]:
half_columns = result_df.columns[round(len(result_df.columns)/2):]

print(result_df[half_columns].to_latex().replace("verb_", " ").replace("_", " "))

\begin{tabular}{lrrrrr}
\toprule
 &  first sentence part normalized &  middle sentence part &  middle sentence part normalized &  last sentence part &  last sentence part normalized \\
\midrule
AndrMattTisch1115 & 77.064000 & 65 & 59.633000 & 31 & 28.440000 \\
AndrMattTisch & 76.681000 & 441 & 63.090000 & 240 & 34.335000 \\
MartAndrBonAlt & 72.857000 & 91 & 65.000000 & 68 & 48.571000 \\
MartAndrBonPrius & 62.409000 & 145 & 52.920000 & 115 & 41.971000 \\
PAndr2 & 54.815000 & 247 & 60.988000 & 225 & 55.556000 \\
AndrMattBon & 77.927000 & 495 & 60.366000 & 266 & 32.439000 \\
PAndr1 & 54.887000 & 175 & 65.789000 & 138 & 51.880000 \\
AndrMattBon1115 & 77.236000 & 73 & 59.350000 & 33 & 26.829000 \\
ActAndrPrieur & 52.648000 & 565 & 57.536000 & 427 & 43.483000 \\
MartAndrPrieurB & 56.508000 & 151 & 47.937000 & 118 & 37.460000 \\
MartAndrBonTert & 63.839000 & 291 & 64.955000 & 225 & 50.223000 \\
MartAndrPrieurA & 52.617000 & 200 & 55.096000 & 149 & 41.047000 \\
ActAndrBon & 49.067000 & 214 & 5

In [38]:
detect_absolute_genetives_ = {}
detect_absolute_genetives_normalized_ = {}


for text_name, text in non_lemmatized_content.items():
    print("============")
    print(text_name)
    detect_absolute_genetives_[text_name] = detect_genitive_absolute(text, normalized=False)
    detect_absolute_genetives_normalized_[text_name] = detect_genitive_absolute(text, normalized=True)

df_absolute_genetive = pd.DataFrame.from_dict(detect_absolute_genetives_, orient="index")
df_absolute_genetive.columns = ['absolute_genetive_count']

df_absolute_genetive_normalized = pd.DataFrame.from_dict(detect_absolute_genetives_normalized_, orient="index")
df_absolute_genetive_normalized.columns = ["ratio_genetive_count"]


absolute_genetive = pd.concat([df_absolute_genetive, df_absolute_genetive_normalized], axis=1)

print(absolute_genetive.to_latex().replace("_", " "))

AndrMattTisch1115
[VERB + NOUN] λεγομένου + ἰησοῦ in: μαθητὰ τοῦ λεγομένου ἰησοῦ .
[VERB + PRON] πορευομένων + ἡμῶν in: εγένετο πορευομένων ἡμῶν τῶν δώδεκα μαθητῶν μετὰ τοῦ κυρίου ἡμῶν εἰς ἱερὸν τῶν ἐθνῶν ἵνα γνωρίσῃ ἡμῖν τὴν ἄγνοιαν τοῦ διαβόλου .
[VERB + NOUN] λεγομένου + ἰησοῦ in: μαθητὰ τοῦ λεγομένου ἰησοῦ .
[VERB + PRON] πορευομένων + ἡμῶν in: εγένετο πορευομένων ἡμῶν τῶν δώδεκα μαθητῶν μετὰ τοῦ κυρίου ἡμῶν εἰς ἱερὸν τῶν ἐθνῶν ἵνα γνωρίσῃ ἡμῖν τὴν ἄγνοιαν τοῦ διαβόλου .
AndrMattTisch
[VERB + NOUN] λεγομένου + ἰησοῦ in: ἀληθῶς γὰρ βούλομαι ὑμᾶς τοὺς μαθητὰς τοῦ λεγομένου ἰησοῦ ἀνελθεῖν ἐν τῷ πλοίῳ μου ἢ τοὺς παρέχοντάς μοι χρυσίου καὶ ἀργυρίου .
[VERB + NOUN] λεγομένου + ἰησοῦ in: αποκριθεὶς δὲ ὁ ἰησοῦς εἶπεν τῷ ᾿ανδρέᾳ εἰ ἀληθῶς μαθητὴς εἶ τοῦ λεγομένου ἰησοῦ .
[DET + NOUN + VERB] τῆς + θαλάσσης, κυμαινούσης in: καὶ ἀνέμου μεγάλου γενομένου καὶ τῆς θαλάσσης κυμαινούσης .
[VERB + NOUN] θαλάσσης + κυμαινούσης in: καὶ ἀνέμου μεγάλου γενομένου καὶ τῆς θαλάσσης κυμαινούσης .
[VERB + PR

In [56]:
detect_subordinate_verbs_ = {}
detect_subordinate_verbs_normalized_ = {}


for text_name, text in non_lemmatized_content.items():
    print("============")
    print(text_name)
    detect_subordinate_verbs_[text_name] = detect_subordinate_verbs(text, normalized=False)
    detect_subordinate_verbs_normalized_[text_name] = detect_subordinate_verbs(text, normalized=True)


df_subordinate_verbs = pd.DataFrame.from_dict(detect_subordinate_verbs_, orient="index")
df_subordinate_verbs.columns = ['subordinate_verbs_count']

df_subordinate_verbs_normalized = pd.DataFrame.from_dict(detect_subordinate_verbs_normalized_, orient="index")
df_subordinate_verbs_normalized.columns = ["subordinate_verbs_ratio"]

subordinate_verbs = pd.concat([df_subordinate_verbs, df_subordinate_verbs_normalized], axis=1)

print(subordinate_verbs.to_latex().replace("_", " "))

AndrMattTisch1115
Sent with subordinate verbs: ὑμεῖς δὲ ἐὰν πορνεύσητε .
AndrMattTisch
Sent with subordinate verbs: άλλὰ εἰ δύνασαι .
MartAndrBonAlt
Sent with subordinate verbs: ἐπεὶ εἶχεν .
MartAndrBonPrius

PAndr2
Sent with subordinate verbs: καὶ μᾶλλον ὤφειλον εὔξασθαι ἐντυχεῖν τῷ τροπαίῳ τοῦ σταυροῦ ἤπερ δειλιᾶσαι .
Sent with subordinate verbs: ἐπεὶ εἰ εἶχεν .
AndrMattBon
Sent with subordinate verbs: άλλὰ εἰ δύνασαι .
Sent with subordinate verbs: ἀλλὰ ταῦτά σοι ἐποίησα ὅτι εἶπας .
PAndr1

AndrMattBon1115
Sent with subordinate verbs: ὑμεῖς δὲ ἐὰν πορνεύσητε .
ActAndrPrieur
Sent with subordinate verbs: τὰ δὲ ὅλα ὅπως εἰδείης .
Sent with subordinate verbs: καὶ εἰποῦσα ταῦτα προσεδόκα λοιπὸν λύχνους ἁφθήσεσθαι ὅπως ἐξέλθοι .
Sent with subordinate verbs: ’ καὶ ἑτέροις δὲ ἐκέλευσε τέσσαρσι περὶ τὸν κοιτῶνα αὐτῆς γενέσθαι καὶ ἐπιτηρεῖν εἰ ἐξέρχοιτο .
Sent with subordinate verbs: ἐγὼ μὲν οὖν ταῦτα εἰπὼν ὡς εἶπον .
Sent with subordinate verbs: ὅπως διατεθῇς .
MartAndrPrieurB

MartAndrBonTer

In [None]:
detect_subordinate_verbs_ = {}
detect_subordinate_verbs_normalized_ = {}


for text_name, text in non_lemmatized_content.items():
    print("============")
    print(text_name)
    detect_subordinate_verbs_[text_name] = detect_subordinate_verbs(text, normalized=False)
    detect_subordinate_verbs_normalized_[text_name] = detect_subordinate_verbs(text, normalized=True)


df_subordinate_verbs = pd.DataFrame.from_dict(detect_subordinate_verbs_, orient="index")
df_subordinate_verbs.columns = ['subordinate_verbs_count']

df_subordinate_verbs_normalized = pd.DataFrame.from_dict(detect_subordinate_verbs_normalized_, orient="index")
df_subordinate_verbs_normalized.columns = ["subordinate_verbs_ratio"]

subordinate_verbs = pd.concat([df_subordinate_verbs, df_subordinate_verbs_normalized], axis=1)

print(subordinate_verbs.to_latex().replace("_", " "))

In [58]:
detect_main_verbs_ = {}
detect_main_verbs_normalized_ = {}


for text_name, text in non_lemmatized_content.items():
    print("============")
    print(text_name)
    detect_main_verbs_[text_name] = detect_main_verb(text, normalized=False)
    detect_main_verbs_normalized_[text_name] = detect_main_verb(text, normalized=True)


df_main_verbs = pd.DataFrame.from_dict(detect_main_verbs_, orient="index")
df_main_verbs.columns = ['main_verbs_count']

df_main_verbs_normalized = pd.DataFrame.from_dict(detect_main_verbs_normalized_, orient="index")
df_main_verbs_normalized.columns = ["subordinate_verbs_ratio"]

main_verbs = pd.concat([df_main_verbs, df_main_verbs_normalized], axis=1)

print(main_verbs.to_latex().replace("_", " "))

AndrMattTisch1115


TypeError: sequence item 0: expected str instance, spacy.tokens.token.Token found