In [2]:
# Standard library imports
import os
import string
import unicodedata

# SpaCy for NLP tasks
import spacy
from spacy.tokens import Doc, DocBin
from spacy.tokens import Doc, MorphAnalysis

# Parsing CoNLL-U formatted files
from conllu import parse

from pathlib import Path
from typing import List, Tuple, Dict

from sklearn.model_selection import train_test_split


In [3]:
model_name = "grc_proiel_trf"

# Check if the model is available and load it
if model_name in spacy.info()['pipelines']:
    nlp = spacy.load(model_name)
else:
    raise ImportError(f"The SpaCy model '{model_name}' is not installed.\n"
                      "Please install it using 'python -m spacy download {model_name}'.")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


# Load CoNLL-U Data

In this section, we will load and process text data in the CoNLL-U format. CoNLL-U is a standard for annotating text data with linguistic information, such as part-of-speech tags, syntactic dependencies, and morphological features. Our data represents annotated anatomical ancient texts, which is crucial for creating a model that performs well on anatomical texts.

In [4]:
import unicodedata
import string
from conllu import parse
from typing import List
from sklearn.model_selection import train_test_split
from conllu import parse_incr
from io import StringIO

def remove_accents_to_lowercase(text: str) -> str:
    """
    Cleans the given text by removing diacritics (accents), except for specific characters,
    and converting it to lowercase.
    """
    allowed_characters = [' ̓', "᾿", "᾽", "'", "’", "‘", 'ʼ']  # Including the Greek apostrophe
    if not isinstance(text, str):
        raise ValueError("Input must be a string.")
    try:
        non_accent_chars = [c for c in unicodedata.normalize('NFKD', text) 
        if unicodedata.category(c) != 'Mn' or c in allowed_characters]
    # Use str.lower() for converting to lowercase, which works for Unicode characters
        return ''.join(non_accent_chars).lower()
    
    except Exception as e:
        # A more generic exception handling if unexpected errors occur
        print(f"An error occurred: {e}")
        return text
    

def normalize_optional_remove_accents(text: str, norm_form: str, apply_cleaning: bool = False) -> str:
    """
    Normalize and optionally clean the given text using specified Unicode normalization form.
    """    
    normalized_text = unicodedata.normalize(norm_form, text)
    if apply_cleaning:
        # Implement the remove_accents_to_lowercase logic to remove unwanted characters or clean text
        text = remove_accents_to_lowercase(normalized_text) # Clean the text
    return text

def uniform_apostrophe(token, apostrophes=[' ̓', "᾿", "᾽", "'", "’", "‘"]):
    """Replace specified apostrophes with a uniform representation."""
    return 'ʼ' if token in apostrophes else token

def process_sentences(sentences_data, nlp, file_name, debug=False):
    """Converts sentences into spaCy's Doc objects."""
    #print(sentences_data) if debug else None
    words = [uniform_apostrophe(t['form']) for t in sentences_data]
    spaces = [not (t['misc'] and t['misc'].get('SpaceAfter') == 'No') for t in sentences_data]
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    set_token_attributes(doc, sentences_data)
    doc.user_data["source_info"] = file_name
    #print(doc, spaces) if debug else None 
    return doc

def set_token_attributes(doc, sentence, debug=False):
    """Set attributes like POS tags and lemmas for each token in the doc."""
    root_token = None  # Initialize root token
    
    for i, token in enumerate(doc):
        t = sentence[i]
        if t['form'] in string.punctuation:
            doc[i].pos_ = 'PUNCT'
        else:
            doc[i].pos_ = '' if t['upos'] == '_' else t['upos']
        doc[i].lemma_ = '' if t['lemma'] in ['', '_', '—', '-'] else t['lemma']
        # Tagger adjustment - POS tags
        if t['xpos']:
            if t['xpos'] in ['', '_', '—', '-', 'X', 'END', 'MID']:
                doc[i].tag_ = '' # Setting an empty string for the POS tag
        if t['upos'] in ['', '_', '—', '-', 'X', 'END', 'MID']:
            doc[i].pos_ = '' # Setting an empty string for the POS tag
        else:
            doc[i].pos_ = t['upos']
        # Morphological adjustment - morph tags
        if t['feats']:
            morph_analysis_hash = nlp.vocab.morphology.add(t['feats'])
            morph_analysis = MorphAnalysis.from_id(nlp.vocab, morph_analysis_hash)
            doc[i].morph = morph_analysis
        # Dependency parser adjustment, performed during Doc creation based on 'dep' and 'head' relation
        if t['deprel'] in ['', '_', '—', '-']:
            doc[i].dep_ = 'None'
        # Dependency parser adjustment, performed during Doc creation based on 'dep' and 'head' relation
        if t['deprel'] in ['', '_', '—', '-']:
            doc[i].dep_ = 'None'
        else:
            doc[i].dep_ = t['deprel']
            print("dep: ", doc[i].dep_) if debug else None
        # Identify the root token for later use
        if t['deprel'] == 'root':
            print("root token: ", root_token) if debug else None
            root_token = doc[i]
        else:
            print ("root token not found") if debug else None
    print("roooot: ", root_token if root_token else "elsee None") if debug else None
    
    # Once all other attributes are set, adjust the head
    for i, token in enumerate(doc):
        t = sentence[i]
        # Adjust head - handle case when head is specified
        if t['head'] not in [None, '', '_', '—', '-']: # Check if head is specified
            print("head: ", t['head']) if debug else None # Print head index
            head_idx = int(t['head']) - 1  # Adjust index if necessary
            if 0 <= head_idx < len(doc): # Ensure head index is within bounds
                print("0 < head_idx < len(doc): ", head_idx) if debug else None # Print head index
                doc[i].head = doc[head_idx] # Assign head token
                doc[i].head.pos_ = doc[head_idx].pos_ if doc[head_idx].pos_ else ''
                print("0 < : ", doc[i].head) if debug else None
        else:
            # Optionally, assign the root token as head if needed
            if root_token and root_token != token: # Check if root token is available and not the current token
                doc[i].head = root_token # Assign root token as head 
                print("doc i is root token: ", root_token) if debug else None
                
        #print(f"Token: {doc[i].text}, POS: {doc[i].pos_}, TAG: {doc[i].tag_}, LEMMA: {doc[i].lemma_}, DEP: {doc[i].dep_}, HEAD: {doc[i].head}") if debug else None

def clean_and_print_docs(docs, debug=False):
    """Clean and print docs for review."""
    for doc in docs:
        original_sentence = ' '.join(str(doc).split()) # Remove extra spaces
        if debug:
            print(f"Original sentence: {original_sentence}") 
        cleaned_sentence = ' '.join(str(doc).replace('\r', ' ').replace('\n', ' ').split())
        if debug:
            print(f"Cleaned sentence: {cleaned_sentence}")
            if original_sentence != cleaned_sentence:
                print(f"Original sentence: {original_sentence}")
                print(f"Cleaned sentence: {cleaned_sentence}")
            else:
                print(f"Clean sentence: {cleaned_sentence}")
        #for token in doc:
        #    print("token: ", token.text, "LEMMA: ", token.lemma_, "POS: ", token.pos_, "TAG: ", token.tag_, "DEP: ", token.dep_, "HEAD: ", token.head)

def serialize_docs(docs: List[Doc], norm_form: str, output_dir: Path, store_user_data: bool = True, debug: bool = False, output_file_name: str = None):
    output_dir.mkdir(parents=True, exist_ok=True)  # Ensure output directory exists

    # Split documents to subsets
    if docs:  # Check that docs is non-empty to avoid the error
        train_docs, test_docs = train_test_split(docs, test_size=0.2, random_state=42)
        train_docs, dev_docs = train_test_split(train_docs, test_size=0.25, random_state=42)  # Results in 0.2 split for dev
    else:
        
        print(f"No documents found for normalization form {norm_form}. Skipping serialization.")
        return  # Early return if docs is empty to avoid proceeding with undefined variables

    
    subsets = {'train': train_docs, 'dev': dev_docs, 'test': test_docs}
    
    for subset_name, subset_docs in subsets.items():
        if output_file_name is not None:
            output_path = output_dir / f"{subset_name}/pos_{subset_name}/{output_file_name}_{subset_name}_{norm_form}.spacy"
        else:
            output_path = output_dir / f"{subset_name}/pos_{subset_name}/pos_{subset_name}_{norm_form}.spacy"
        doc_bin = DocBin(docs=subset_docs, store_user_data=store_user_data)
        doc_bin.to_disk(output_path)
        print(f"Saved {len(subset_docs)} docs for normalization form {norm_form} to {output_path}") # Progress indicator

def process_conllu_file(file_path, nlp, normalization_forms, docs_by_norm, apply_cleaning=False, debug=False):
    file_name = file_path.stem
    print(f"Processing file {file_name}...")  # Progress indicator

    with open(file_path, 'r', encoding='utf-8') as f:
        raw_data = f.read()

    # Process each normalization form separately
    for norm in normalization_forms:
        normalized_data = normalize_optional_remove_accents(raw_data, norm, apply_cleaning)
        print(normalized_data) if debug else None
        # Directly parse the normalized .conllu content into sentences
        sentences = parse(normalized_data)
        for sentence_data in sentences:
        # Process sentences and create doc objects
            doc = process_sentences(sentence_data, nlp, debug=debug, file_name=file_name)  # Assuming this function is defined correctly
            doc.user_data["source_info"] = file_name
            docs_by_norm[norm].extend([doc])  # Correctly using extend to avoid list of lists

def process_folder_and_serialize(input_path, nlp, normalization_forms, output_dir, apply_cleaning=False, debug: bool = False, output_file_name: str = None):
    docs_by_norm = {norm: [] for norm in normalization_forms}

# Assume process_conllu_file updates docs_by_norm with processed docs for each conllu file

    for file_path in Path(input_path).glob("*.conllu"):
        print("file name: ",file_path.name)  # Progress indicator
        process_conllu_file(file_path, nlp, normalization_forms, docs_by_norm, apply_cleaning, debug=debug)
    
    # Serialize once all documents for a normalization form have been accumulated
    for norm, docs in docs_by_norm.items():
        print(f"Serializing {len(docs)} documents for normalization form {norm} to {output_dir}... for type {type(docs)}")  # Progress indicator
        serialize_docs(docs, norm, output_dir, debug=debug, output_file_name=output_file_name)


In [5]:
process_folder_and_serialize(Path("../assets/INCEpTION_Conllu/"), nlp, ['NFKD', 'NFKC'], Path("../corpus/"), apply_cleaning=False, debug=False, output_file_name="pos")

file name:  Aristotle_Historia_Animalium_510b5-20.conllu
Processing file Aristotle_Historia_Animalium_510b5-20...
file name:  On Anatomy (2).conllu
Processing file On Anatomy (2)...
file name:  HA 519 Bladder.conllu
Processing file HA 519 Bladder...
file name:  Hippocrates epidemics 2.4.1 pt.1.conllu
Processing file Hippocrates epidemics 2.4.1 pt.1...
file name:  hippocrates places in man 6.9-10.conllu
Processing file hippocrates places in man 6.9-10...
file name:  Aristotle, History of Animals, HA 1.17 p. 85 Balme (496b 29-34).conllu
Processing file Aristotle, History of Animals, HA 1.17 p. 85 Balme (496b 29-34)...
file name:  hippocrates places in man 4.2-5.1.conllu
Processing file hippocrates places in man 4.2-5.1...
file name:  galen ΑΑ ΙΙ 4 Andrés .conllu
Processing file galen ΑΑ ΙΙ 4 Andrés ...
file name:  Aristotle_Historia_Animalium_511a22-34.conllu
Processing file Aristotle_Historia_Animalium_511a22-34...
file name:  hippocrates places in man 2.3-3.1.conllu
Processing file hip

In [6]:
process_folder_and_serialize(Path("../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD/"), nlp, ['NFKD', 'NFKC'], Path("../corpus/"), output_file_name="UD_Ancient_Greek-Perseus", apply_cleaning=False, debug=False)

file name:  grc_perseus-ud-test_NFKD.conllu
Processing file grc_perseus-ud-test_NFKD...
file name:  grc_perseus-ud-dev_NFKD.conllu
Processing file grc_perseus-ud-dev_NFKD...
file name:  grc_perseus-ud-train_NFKD.conllu
Processing file grc_perseus-ud-train_NFKD...
Serializing 13919 documents for normalization form NFKD to ../corpus... for type <class 'list'>
Saved 8351 docs for normalization form NFKD to ../corpus/train/pos_train/UD_Ancient_Greek-Perseus_train_NFKD.spacy
Saved 2784 docs for normalization form NFKD to ../corpus/dev/pos_dev/UD_Ancient_Greek-Perseus_dev_NFKD.spacy
Saved 2784 docs for normalization form NFKD to ../corpus/test/pos_test/UD_Ancient_Greek-Perseus_test_NFKD.spacy
Serializing 13919 documents for normalization form NFKC to ../corpus... for type <class 'list'>
Saved 8351 docs for normalization form NFKC to ../corpus/train/pos_train/UD_Ancient_Greek-Perseus_train_NFKC.spacy
Saved 2784 docs for normalization form NFKC to ../corpus/dev/pos_dev/UD_Ancient_Greek-Perseus

In [7]:
process_folder_and_serialize(Path("../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD/"), nlp, ['NFKD', 'NFKC'], Path("../corpus/"), output_file_name="UD_Ancient_Greek-PROIEL", apply_cleaning=False, debug=False)

file name:  grc_proiel-ud-train_NFKD.conllu
Processing file grc_proiel-ud-train_NFKD...
file name:  grc_proiel-ud-test_NFKD.conllu
Processing file grc_proiel-ud-test_NFKD...
file name:  grc_proiel-ud-dev_NFKD.conllu
Processing file grc_proiel-ud-dev_NFKD...
Serializing 17080 documents for normalization form NFKD to ../corpus... for type <class 'list'>
Saved 10248 docs for normalization form NFKD to ../corpus/train/pos_train/UD_Ancient_Greek-PROIEL_train_NFKD.spacy
Saved 3416 docs for normalization form NFKD to ../corpus/dev/pos_dev/UD_Ancient_Greek-PROIEL_dev_NFKD.spacy
Saved 3416 docs for normalization form NFKD to ../corpus/test/pos_test/UD_Ancient_Greek-PROIEL_test_NFKD.spacy
Serializing 17080 documents for normalization form NFKC to ../corpus... for type <class 'list'>
Saved 10248 docs for normalization form NFKC to ../corpus/train/pos_train/UD_Ancient_Greek-PROIEL_train_NFKC.spacy
Saved 3416 docs for normalization form NFKC to ../corpus/dev/pos_dev/UD_Ancient_Greek-PROIEL_dev_NFKC

## Tests

In [None]:
import os
from pathlib import Path

# load and compare the documents
# Define the paths to the files
path_to_train_NFKD = Path("../corpus/train/pos_train/pos_train_NFKD.spacy")
path_to_train_NFKC = Path("../corpus/train/pos_train/pos_train_NFKC.spacy")
path_to_dev_NFKD = Path("../corpus/dev/pos_dev/pos_dev_NFKD.spacy")
path_to_dev_NFKC = Path("../corpus/dev/pos_dev/pos_dev_NFKC.spacy")
path_to_test_NFKD = Path("../corpus/test/pos_test/pos_test_NFKD.spacy")
path_to_test_NFKC = Path("../corpus/test/pos_test/pos_test_NFKC.spacy")


# Load the documents
doc_bin_train_NFKD = DocBin().from_disk(path_to_train_NFKD)
doc_bin_train_NFKC = DocBin().from_disk(path_to_train_NFKC)
doc_bin_dev_NFKD = DocBin().from_disk(path_to_dev_NFKD)
doc_bin_dev_NFKC = DocBin().from_disk(path_to_dev_NFKC)
doc_bin_test_NFKD = DocBin().from_disk(path_to_test_NFKD)
doc_bin_test_NFKC = DocBin().from_disk(path_to_test_NFKC)
#
# Get the documents as a list
docs_train_NFKD = list(doc_bin_train_NFKD.get_docs(nlp.vocab))
docs_train_NFKC = list(doc_bin_train_NFKC.get_docs(nlp.vocab))
docs_dev_NFKD = list(doc_bin_dev_NFKD.get_docs(nlp.vocab))
docs_dev_NFKC = list(doc_bin_dev_NFKC.get_docs(nlp.vocab))
docs_test_NFKD = list(doc_bin_test_NFKD.get_docs(nlp.vocab))
docs_test_NFKC = list(doc_bin_test_NFKC.get_docs(nlp.vocab))

# Load the documents
#docs = {name: load_docs(path, nlp) for name, path in paths.items()}

# Now you can compare the documents
# For example, you can print the first document from each list to see if they are the same
#print(docs_NFKD[145].text == docs_NFKC[145].text)
#print(docs_NFKD[145].text)
# print attributes for tokens of text
#for token in docs_NFKD[145]:
#    print("token: ", token.text, "LEMMA: ", token.lemma_, "POS: ", token.pos_, "TAG: ", token.tag_, "DEP: ", token.dep_, "HEAD: ", token.head)

#print(docs_NFKC[145].text)
#for token in docs_NFKC[145]:
#    print("token: ", token.text, "LEMMA: ", token.lemma_, "POS: ", token.pos_, "TAG: ", token.tag_, "DEP: ", token.dep_, "HEAD: ", token.head)

# find in any of the files doc with specific text
#sample_text = "τα γαρ προ αὐτων και τα ἐτι παλαιτερα σαφως" # sample from persus
sample_text = "ταῦτα δὲ τὰ γεγραμμένα πάσιν ὁμοίως εἰσί, καὶ φλέβες αἱ γεγραμμένοι"
#sample_text = "πάσιν ὁμοίως εἰσίν, άλλα τε φλαβιά εἰσιν ἄλλοισιν, ἀλλ' οὐκ ἄξια λόγου"
#sample_text = "ταῦτα δὲ τὰ γεγραμμένα πάσιν ὁμοίως εἰσί"
#sample_text = "ταῦτα δὲ τὰ γεγραμμένα πάσιν ὁμοίως εἰσί"
#sample_text = "νόσημα"
#sample_text = "καὶ οἱ μὲν πλείονας ἔχοντες"


def find_text_in_docs(docs, doc_type):
    for doc in docs:
        if sample_text in doc.text:
            print(f"Found in {doc_type}")
            print(doc.text)
            print(doc.user_data["file_name"])
            for token in doc:
                print("token: ", token.text, "LEMMA: ", token.lemma_, "POS: ", token.pos_, "TAG: ", token.tag_, "DEP: ", token.dep_, "HEAD: ",token.head, "Head POS: ", token.head.pos_,
                      "Children: ",[child for child in token.children])
        else:
            #print(f"Not found in {doc_type}")
            continue  # stop after finding the first match


find_text_in_docs(docs_train_NFKD, "train NFKD")
find_text_in_docs(docs_train_NFKC, "train NFKC")
find_text_in_docs(docs_dev_NFKD, "dev NFKD")
find_text_in_docs(docs_dev_NFKC, "dev NFKC")
find_text_in_docs(docs_test_NFKD, "test NFKD")
find_text_in_docs(docs_test_NFKC, "test NFKC")

In [None]:
for doc in docs_test_NFKC:
    for token in doc:
        #if token.pos_ not '':
        if token.pos_ != '':
            print("token: ", token.text, "LEMMA: ", token.lemma_, "POS: ", token.pos_, "TAG: ", token.tag_, "DEP: ", token.dep_, "HEAD: ", token.head, "Head POS: ", token.head.pos_,
                "Children: ", [child for child in token.children]) 

In [None]:
# load and compare pos_train_NFKD and pos_train_NFKC
# Define the paths to the files
path_to_NFKD = Path("../corpus/train/pos_train/pos_train_NFKD.spacy")
path_to_NFKC = Path("../corpus/train/pos_train/pos_train_NFKC.spacy")

# Load the documents
doc_bin_NFKD = DocBin().from_disk(path_to_NFKD)
doc_bin_NFKC = DocBin().from_disk(path_to_NFKC)

# Get the documents as a list
docs_NFKD = list(doc_bin_NFKD.get_docs(nlp.vocab))
docs_NFKC = list(doc_bin_NFKC.get_docs(nlp.vocab))

# Now you can compare the documents
# For example, you can print the first document from each list to see if they are the same
print(docs_NFKD[145].text == docs_NFKC[145].text)
print(docs_NFKD[145].text)
# print attributes for tokens of text
for token in docs_NFKD[145]:
    print("token: ", token.text, "LEMMA: ", token.lemma_, "POS: ", token.pos_, "TAG: ", token.tag_, "DEP: ", token.dep_, "HEAD: ", token.head)

print(docs_NFKC[145].text)
for token in docs_NFKC[145]:
    print("token: ", token.text, "LEMMA: ", token.lemma_, "POS: ", token.pos_, "TAG: ", token.tag_, "DEP: ", token.dep_, "HEAD: ", token.head)
    



In [None]:
# print doc that contains the text ταῦτα δὲ τὰ γεγραμμένα πάσιν ὁμοίως εἰσί, καὶ φλέβες αἱ γεγραμμένοι
if any("ταῦτα δὲ τὰ γεγραμμένα πάσιν ὁμοίως εἰσί, καὶ φλέβες αἱ γεγραμμένοι" in doc.text for doc in docs_NFKD):
    print("Found the text in the documents.")

In [None]:
Cleaned sentence: οἱ δὲ ἐπὶ τοῦ ποδὸς οὐκ ἔχω φάναι διὰ τί παρώφθησαν ἐνίοις, καὶ μάλιστά γʼ ὅσοι τοὺς ἔνδον τῆς χειρὸς ἑπτὰ μύας ἐθεάσαντο.
Cleaned sentence: παρέλιπον μὲν γὰρ κἀκεῖ τοὺς ἐν τῷ βάθει κειμένους ἐπʼ αὐτοῖς τοῖς ὀστοῖς, ὡς ἔμπροσθεν εἶπον, οὐ μὴν τούς γε προφανεῖς τοὺς ζʼ.
Cleaned sentence: κατὰ δὲ τὸν πόδα τέτταρα γένη μυῶν εἰσιν, οὐχ, ὡς ἐν τῇ χειρί, δύο·
Cleaned sentence: τρία μὲν ἐν τοῖς κάτω τοῦ ποδός, ἓν δὲ <ἐν> τοῖς ἄνω κατὰ τοῦ ταρσοῦ τεταγμένον.

In [None]:
docs = main(data, nlp, "../corpus/", debug=False, normalization_forms=['NFKC'])

# get tokens of first sentence
print(docs[243])
tokens = [t for t in docs[24]]
for t in tokens:
    print(t.text, t.pos_, t.tag_, t.lemma_, t.whitespace_)


## Check Spacy docbin file

In [8]:
# load spacy object
# load docs from file
docs = DocBin().from_disk("../corpus/train/pos_train/UD_Ancient_Greek-PROIEL_train_NFKD.spacy")
#docs = DocBin().from_disk("../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD/grc_proiel-ud-test_NFKD.spacy")
test_docbin_docs = list(docs.get_docs(nlp.vocab))


In [9]:
for doc in test_docbin_docs:
    print (doc)

ἦ Ὀνήσιλος Γόργου μὲν τοῦ Σαλαμινίων βασιλέος ἀδελφεὸς νεώτερος Χέρσιος δὲ τοῦ Σιρώμου τοῦ Εὐέλθοντος παῖς 
νόμοισι δὲ χρέωνται τοιοῖσιδε 
ἀμὴν λέγω ὑμῖν ὅτι εἰσίν τινες τῶν ὧδε ἑστώτων οἵτινες οὐ μὴ γεύσωνται θανάτου ἕως ἂν ἴδωσιν τὸν υἱὸν τοῦ ἀνθρώπου ἐρχόμενον ἐν τῇ βασιλείᾳ αὐτοῦ 
πέποιθεν ἐπὶ τὸν θεόν 
ὁρᾶτε καὶ φυλάσσεσθε ἀπὸ πάσης πλεονεξίας ὅτι οὐκ ἐν τῷ περισσεύειν τινὶ ἡ ζωὴ αὐτοῦ ἐστιν ἐκ τῶν ὑπαρχόντων αὐτῷ 
οὐχ ὑμῶν ἐστιν γνῶναι χρόνους ἢ καιροὺς οὓς ὁ πατὴρ ἔθετο ἐν τῇ ἰδίᾳ ἐξουσίᾳ ἀλλὰ λήμψεσθε δύναμιν ἐπελθόντος τοῦ ἁγίου πνεύματος ἐφʼ ὑμᾶς καὶ ἔσεσθέ μου μάρτυρες ἔν τε Ἱερουσαλὴμ καὶ ἐν πάσῃ τῇ Ἰουδαίᾳ καὶ Σαμαρίᾳ καὶ ἕως ἐσχάτου τῆς γῆς 
σὺ εἶ ὁ Χριστὸς ὁ υἱὸς τοῦ θεοῦ τοῦ ζῶντος 
ἑπτά 
προσέχετε δὲ ἀπὸ τῶν ἀνθρώπων 
ἐστρατήγεε δὲ αὐτῶν Δημόφιλος Διαδρόμεω 
ἐπύθοντο Λακεδαιμόνι

In [10]:
import pandas as pd

# create list of rows
rows = []
for doc in test_docbin_docs:
    for token in doc:
        row = [token.orth_, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.orth_, token.morph]
        rows.append(row)

# create dataframe
df = pd.DataFrame(rows, columns=["Orth", "Lemma", "POS", "Tag", "Dep", "Head", "Morph"])

# print dataframe
print(df)

                 Orth       Lemma    POS Tag        Dep        Head  \
0                 ἦ         ἦ    ADV           root         ἦ   
1          Ὀνήσιλος  Ὀνήσιλος  PROPN          nsubj         ἦ   
2             Γόργου     Γόργος  PROPN           nmod  ἀδελφεὸς   
3                μὲν        μέν    ADV      discourse  ἀδελφεὸς   
4                τοῦ          ὁ    DET            det   βασιλέος   
...               ...         ...    ...  ..        ...         ...   
128379           ἣν        ὅς   PRON            obj       ἔχω   
128380          ἔχω       ἔχω   VERB            acl    ἀγάπην   
128381  περισσοτέρως   περισσῶς    ADV         advmod      ὑμᾶς   
128382           εἰς        εἰς    ADP           case      ὑμᾶς   
128383         ὑμᾶς     ὑμεῖς   PRON            obl       ἔχω   

                                                    Morph  
0                                                      ()  
1                    (Case=

In [11]:
df

Unnamed: 0,Orth,Lemma,POS,Tag,Dep,Head,Morph
0,ἦ,ἦ,ADV,,root,ἦ,()
1,Ὀνήσιλος,Ὀνήσιλος,PROPN,,nsubj,ἦ,"(Case=Nom, Gender=Masc, Number=Sing)"
2,Γόργου,Γόργος,PROPN,,nmod,ἀδελφεὸς,"(Case=Gen, Gender=Masc, Number=Sing)"
3,μὲν,μέν,ADV,,discourse,ἀδελφεὸς,()
4,τοῦ,ὁ,DET,,det,βασιλέος,"(Case=Gen, Definite=Def, Gender=Masc, Number=S..."
...,...,...,...,...,...,...,...
128379,ἣν,ὅς,PRON,,obj,ἔχω,"(Case=Acc, Gender=Fem, Number=Sing, PronType=Rel)"
128380,ἔχω,ἔχω,VERB,,acl,ἀγάπην,"(Mood=Ind, Number=Sing, Person=1, Tense=Pres, ..."
128381,περισσοτέρως,περισσῶς,ADV,,advmod,ὑμᾶς,(Degree=Cmp)
128382,εἰς,εἰς,ADP,,case,ὑμᾶς,()


In [13]:
for token in test_docbin_docs[2]:
#print attributes
    print('text: ', token.text, 'lemma :', token.lemma_, 'POS: ', token.pos_, 'tag: ', token.tag_, 'DEP: ', token.dep_, 'HEAD: ', token.head, 'Morph: ', token.morph)

text:  ἀμὴν lemma : ἀμήν POS:  INTJ tag:   DEP:  vocative HEAD:  λέγω Morph:  
text:  λέγω lemma : λέγω POS:  VERB tag:   DEP:  root HEAD:  λέγω Morph:  Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act
text:  ὑμῖν lemma : ὑμεῖς POS:  PRON tag:   DEP:  iobj HEAD:  λέγω Morph:  Case=Dat|Gender=Fem,Masc|Number=Plur|Person=2|PronType=Prs
text:  ὅτι lemma : ὅτι POS:  SCONJ tag:   DEP:  mark HEAD:  γεύσωνται Morph:  
text:  εἰσίν lemma : εἰμί POS:  AUX tag:   DEP:  cop HEAD:  γεύσωνται Morph:  Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act
text:  τινες lemma : τίς POS:  ADJ tag:   DEP:  nsubj HEAD:  γεύσωνται Morph:  Case=Nom|Gender=Masc|Number=Plur
text:  τῶν lemma : ὁ POS:  DET tag:   DEP:  det HEAD:  ἑστώτων Morph:  Case=Gen|Definite=Def|Gender=Masc|Number=Plur|PronType=Dem
text:  ὧδε lemma : ὧδε POS:  ADV tag:   DEP:  advmod HEAD:  ἑστώτων Morph:  
text:  ἑστώτων lemma : ἵστημι POS:  VERB tag:   DEP:  nmod HEAD:  τινε

In [None]:
# print the first doc in spacy docbin docs
print(test_docbin_docs)

In [None]:
from conllu import parse

def print_feats_from_conllu(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    sentences = parse(data)

    for sentence in sentences:
        for token in sentence:
            feats = token['feats']
            if feats:
                print(f"Token: {token['form']}, Feats: {feats}")
            else:
                print(f"Token: {token['form']}, Feats: -")
        print()  # Print an empty line to separate sentences