In [77]:
import os
import re
from pathlib import Path

import spacy
import pandas as pd

In [78]:
# install spacy grc model if not already installed
nlp = spacy.load("grc_proiel_trf") # Use your preferred model here


# Preprocess Text

In [79]:
import re
import unicodedata

# apostrophes and correct_apostrophe are defined as follows:
apostrophes = ["᾽", "᾿", "'", "’", "‘"]
correct_apostrophe = "ʼ"

def clean_and_remove_accents(text: str) -> str:
    """
    Cleans the given text by removing diacritics (accents), except for specific characters,
    and converting it to lowercase.
    """
    allowed_characters = [' ̓', "᾿", "᾽", "'", "’", "‘", 'ʼ', '̓']  # Including the Greek apostrophe
    if not isinstance(text, str):
        raise ValueError("Input must be a string.")
    try:
        non_accent_chars = [c for c in unicodedata.normalize('NFKD', text) 
        if unicodedata.category(c) != 'Mn' or c in allowed_characters]
        return ''.join(non_accent_chars)
    
    except Exception as e:
        # A more generic exception handling if unexpected errors occur
        print(f"An error occurred: {e}")
        return text
    

def normalize_text(text: str, form: str = 'NFKD', 
                   remove_accents: bool = False, 
                   lowercase: bool = False, 
                   standardize_apostrophe: bool = True, 
                   remove_brackets: bool = False, 
                   remove_trailing_numbers: bool = False, 
                   remove_extra_spaces: bool = False, 
                   debug: bool = False) -> str:
    """
    Applies multiple text normalization and cleaning steps on the input text.

    Parameters:
    - text (str): The text to be normalized.
    - form (str): Unicode normalization form ('NFC', 'NFD', 'NFKC', 'NFKD').
    - lowercase (bool): If True, the text is converted to lowercase.
    - standardize_apostrophe (bool): If True, replaces all defined apostrophe characters with a standard one.
    - remove_brackets_only (bool): If True, removes the brackets themselves.
    - remove_trailing_numbers (bool): If True, strips leading or trailing digits from the text.
    
    Returns:
    - str: The processed text.
    """
    normalized_text = text  # Initialize normalized_text with the original text

    # Function to print before and after states for each operation during debugging
    def debug_print(operation_name, before, after):
        if debug:
            print(f"{operation_name} - Before: {before}")
            print(f"{operation_name} - After: {after}")

    # Standardize apostrophe characters if required
    if standardize_apostrophe:
        before_text = normalized_text
        for apos in apostrophes:
            normalized_text = normalized_text.replace(apos, correct_apostrophe)
        debug_print("Standardizing apostrophes", before_text, normalized_text)
        
    if remove_accents:
        before_text = normalized_text
        try:
            normalized_text = clean_and_remove_accents(normalized_text)
        except Exception as e:
            print(f"An error occurred while removing accents: {e}")
            # Decide what to do here: return the original text, a special value, or stop the process
            return text        
        debug_print("Removing accents", before_text, normalized_text)
        
    # Convert to lowercase if required
    if lowercase:
        before_text = normalized_text
        normalized_text = normalized_text.lower()
        debug_print("Lowercase conversion", before_text, normalized_text)

    # Unicode normalization
    if form:
        before_text = normalized_text
        normalized_text = unicodedata.normalize(form, normalized_text)
        debug_print("Unicode normalization", before_text, normalized_text)
            
    # Remove brackets only if required
    if remove_brackets:
        before_text = normalized_text
        normalized_text = re.sub(r'[\(\)\[\]]', '', normalized_text)
        debug_print("Removing brackets", before_text, normalized_text)
        
    # Remove trailing numbers if required
    if remove_trailing_numbers:
        before_text = normalized_text
        normalized_text = re.sub(r'^\d+|\d+$', '', normalized_text)
        debug_print("Removing trailing numbers", before_text, normalized_text)

    # Remove multiple spaces and leading/trailing spaces
    if remove_extra_spaces:
        before_text = normalized_text
        normalized_text = ' '.join(normalized_text.split()).strip()
        debug_print("Removing extra spaces", before_text, normalized_text)

    return normalized_text

In [80]:

# Load the dataset
FILE_PATH = "../assets/NER_assets/Ancient_Words_12_5_22.csv"
df = pd.read_csv(FILE_PATH)

# Renaming columns
df.rename(columns={'Word': 'Keyword', 'Category Types': 'Label'}, inplace=True)

# Filling NaN values more efficiently and appropriately
for early_col, new_col in [('Early Quote', 'Quote'), ('Early Word Before', 'Word Before'), 
                           ('Early Word After', 'Word After'), ('Early Category Type', 'Label')]:
    df[new_col].fillna(df[early_col], inplace=True)

# Dropping rows with no Keyword and non-Greek Keywords
pat = '[ء-ي]+'
df = df.dropna(subset=['Keyword']).copy()
df = df[~df['Keyword'].str.contains(pat, na=False)]

# Cleaning data with combined regex patterns

# Define a dictionary of patterns and replacements for the entire dataframe
df_replacements = {
    '\d+': '',  # Numbers
    '-': '',  # Hyphens
    ' +': ' ',  # Multiple spaces
}

# Apply the replacements to the text columns ('Early Quote', 'Quote', 'Early Word Before', 'Word Before', 'Early Word After', 'Word After', 'Keyword')
for col in ['Early Quote', 'Quote', 'Early Word Before', 'Word Before', 'Early Word After', 'Word After', 'Keyword']:
    for pattern, replacement in df_replacements.items():
        df[col].replace(pattern, replacement, regex=True, inplace=True)

# Define a dictionary of patterns and replacements for the 'Keyword' column
keyword_replacements = {
    '\n': '',  # New line
    ',': '',  # Comma
    '\.': '',  # Period
    '\·': '',  # Interpunkt
    '\s+$': ''  # End punctuation
}

# Apply the replacements to the 'Keyword' column
for pattern, replacement in keyword_replacements.items():
    df['Keyword'].replace(pattern, replacement, regex=True, inplace=True)
# Resetting the dataframe index
df.reset_index(drop=True, inplace=True)

# Normalizing the text in the columns
columns_to_normalize = ['Early Quote', 'Quote', 'Early Word Before', 'Word Before', 'Early Word After', 'Word After', 'Keyword']

for col in columns_to_normalize:
    if df[col].dtype == 'object':
        print(df[col].apply(lambda x: print(type(x))))
        df[col] = df[col].apply(lambda x: normalize_text(x, remove_accents=False, lowercase=False, standardize_apostrophe=True, remove_brackets=False, debug=True) if pd.notna(x) else x)

  '\d+': '',  # Numbers
  '\.': '',  # Period
  '\s+$': ''  # End punctuation
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[new_col].fillna(df[early_col], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace(pattern, replacement, regex=True, inplace=True)
The behavior will change in pandas 3.0. Th

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [81]:
df.head(10)
len(df)

1411

In [82]:
# if any of the fields "KeyWord", "Quote", "Word Before", "Word After" are "0", print the row and drop it
for i in range(len(df)):
    if df.iloc[i, 0] == "0" or df.iloc[i, 1] == "0" or df.iloc[i, 2] == "0" or df.iloc[i, 3] == "0":
        print(df.iloc[i])
        df.drop(i, inplace=True)

In [83]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.tokens import Doc, DocBin
from unicodedata import normalize
import random


# Create dictionaries from dendrosearch and conllu files (supplied by Jacobo)

In [84]:
# create the coda dictionary for word : lemma 

coda_lemma_dict = df.dropna(subset=['Keyword', 'Lemma']).set_index('Keyword')['Lemma'].to_dict()

In [85]:
# create the dendrosearch dictionary for word : lemma 

# punctuation to be removed
PUNCTUATION = set(['.', ")", "·", "(", "[", "]", ":", ";", ",", "?", "!", "،", "_"])

dendrosearch_lemma_dict = {} 
with open('../assets/dendrosearch_lemma_dict.txt', 'r', encoding='utf-8') as f:
    dendrosearch_lemma_dict = {line.split()[0]: line.split()[1] for line in f if len(line.split()) > 1 and line.split()[0] not in PUNCTUATION} 
    # clean the dictionary with remove_accents_to_lowercase
    dendrosearch_lemma_dict = {normalize_text(k, remove_accents=False, lowercase=False, standardize_apostrophe=True): normalize_text(v, remove_accents=False, lowercase=True, standardize_apostrophe=True) for k, v in dendrosearch_lemma_dict.items()}

In [86]:
dendrosearch_lemma_dict

{'Θουκυδίδης': 'θουκυδίδης',
 'Ἀθηναῖος': 'ἀθηναῖος',
 'ξυνέγραψε': 'συγγράφω',
 'τὸν': 'ὁ',
 'πόλεμον': 'πόλεμος',
 'τῶν': 'ὁ',
 'Πελοποννησίων': 'πελοποννήσιος',
 'καὶ': 'καί',
 'Ἀθηναίων': 'ἀθηναῖος',
 'ὡς': 'ὡς',
 'ἐπολέμησαν': 'πολεμέω',
 'πρὸς': 'πρός',
 'ἀλλήλους': 'ἀλλήλων',
 'ἀρξάμενος': 'ἄρχω',
 'εὐθὺς': 'εὐθύς',
 'καθισταμένου': 'καθίστημι',
 'ἐλπίσας': 'ἐλπίζω',
 'μέγαν': 'μέγας',
 'τε': 'τε',
 'ἔσεσθαι': 'εἰμί',
 'ἀξιολογώτατον': 'ἀξιόλογος',
 'προγεγενημένων': 'προγίγνομαι',
 'τεκμαιρόμενος': 'τεκμαίρομαι',
 'ὅτι': 'ὅτι2',
 'ἀκμάζοντές': 'ἀκμάζω',
 'ᾖσαν': 'ἀείδω',
 'ἐς': 'εἰς',
 'αὐτὸν': 'αὐτός',
 'ἀμφότεροι': 'ἀμφότερος',
 'παρασκευῇ': 'παρασκευή',
 'τῇ': 'ὁ',
 'πάσῃ': 'πᾶς',
 'τὸ': 'ὁ',
 'ἄλλο': 'ἄλλος',
 'Ἑλληνικὸν': 'ἑλληνικός',
 'ὁρῶν': 'ὁράω',
 'ξυνιστάμενον': 'συνίστημι',
 'ἑκατέρους': 'ἑκάτερος',
 'μὲν': 'μέν',
 'εὐθύς': 'εὐθύς',
 'δὲ': 'δὲ',


In [87]:
# create the greCy conllu dictionary for word : lemma

import os
import glob  # Recommended for easy file pattern matching

PUNCTUATION = set(['.', ")", "·", "(", "[", "]", ":", ";", ",", "?", "!", "،", "_"])  # Using a set for performance
input_path = "../assets/Lemmatization_training_files/Processed"
conllu_lemma_dict = {}

def process_conllu_file(file_path, nlp, debug=False):
    file_name = file_path.stem
    print(f"Processing file {file_name}...")  # Progress indicator
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip() or line.startswith('#'):
                continue  # Skip empty lines and comments
                
            parts = line.split()
            if len(parts) > 2 and parts[1] not in PUNCTUATION:
                # Ensure at least id, keyword, and lemma are present and keyword isn't punctuation
                conllu_lemma_dict[parts[1]] = parts[2]
                
    # return the dictionary
    return conllu_lemma_dict    

# Utilize glob for cleaner file selection
for file_path in Path(input_path).glob("*.conllu"):
    #print("file name: ",file_path.name)  # Progress indicator
    process_conllu_file(file_path, nlp, debug=False)
print("Done")
    

Processing file euripides_NFKD...
Processing file plato_ii_NFKD...
Processing file thucydides_NFKD...
Processing file plato_iv_NFKD...
Processing file orators_NFKD...
Processing file herodotus_NFKD...
Processing file xenophon_iv_NFKD...
Processing file galen_NFKD...
Processing file aeschylus_NFKD...
Processing file homer_NFKD...
Processing file aristophanes_NFKD...
Processing file apollonius_NFKD...
Processing file arrian_NFKD...
Processing file sophocles_NFKD...
Processing file xenophon_iii_NFKD...
Processing file pausanias_NFKD...
Processing file pindar_NFKD...
Processing file plato_iii_NFKD...
Processing file lucian_NFKD...
Processing file xenophon_i_NFKD...
Processing file xenophon_ii_NFKD...
Processing file plutarch_NFKD...
Processing file aristotle_NFKD...
Done


## Create dictionary amd sentences data from INCEpTION files

In [88]:
from cassis import load_typesystem, load_cas_from_xmi
import zipfile
from pathlib import Path
import tempfile
from tqdm import tqdm

In [89]:
# Define paths using pathlib
inception_files_path = Path("../assets/NER_assets/INCEpTION_files/")
tempdir_path = Path(tempfile.mkdtemp())  # Create a temporary directory and get its path

try:
    # Extract all .zip files found in the inception files path
    for zip_file_path in inception_files_path.glob("*.zip"):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(tempdir_path)

    typesystem_file_path = tempdir_path / "TypeSystem.xml"

    # Check for the existence of TypeSystem.xml before proceeding
    if not typesystem_file_path.exists():
        raise FileNotFoundError("TypeSystem.xml not found in the extracted files.")

    with open(typesystem_file_path, 'rb') as f:
        typesystem = load_typesystem(f)

    inception_dict = {}
    inception_sentences = []  # List of tuples (sentence, source_file)

    # Process each .xmi file found in the temporary directory
    for xmi_file_path in tempdir_path.glob("*.xmi"):
        with open(xmi_file_path, 'rb') as f:
            cas = load_cas_from_xmi(f, typesystem=typesystem, lenient=True)
            # Update inception_dict dictionary with lemmas
            inception_dict.update(
                {token.get_covered_text(): token.value for token in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma')}
            )
            # Extend inception_sentences list with sentences
            inception_sentences.extend(
                [(sentence.get_covered_text(), xmi_file_path.name) for sentence in cas.select("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")]
            )
    # Clean and normalize inception_sentences
    inception_sentences = [
        (normalize_text(' '.join(sentence[0].replace('\r', ' ').replace('\n', ' ').split()), remove_accents=False, lowercase=False, standardize_apostrophe=True), sentence[1])
        for sentence in inception_sentences
    ] 
    
    # Cleanup function added to ensure the temporary directory is always cleaned up
    def cleanup_tempdir(directory):
        for child in directory.iterdir():
            if child.is_file():
                child.unlink()
        directory.rmdir()
        
except FileNotFoundError as e:
    print(f"Error: {e}")
    cleanup_tempdir(tempdir_path)
    # Exit or raise the error for further handling depending on the script's usage
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    cleanup_tempdir(tempdir_path)
    # Exit or raise the error for further handling depending on the script's usage
else:
    # This block runs if no exceptions were raised
    # Print or process the inception_dict and inception_sentences as needed
    # Example: print some of the processed sentences
        print(f"\nFinito!")
finally:
    # print how many sentences were processed
    print(f"Processed {len(inception_sentences)} sentences.")
    # Ensure the temporary directory is always cleaned up
    cleanup_tempdir(tempdir_path)





Finito!
Processed 332 sentences.


In [90]:
inception_sentences

[('Ὑπὸ δὲ τὸ ὑπόζωμα κεῖται ἡ κοιλία τοῖς ζῴοις, τοῖς μὲν ἔχουσιν οἰσοφάγον ᾗ τελευτᾷ τοῦτο τὸ μόριον, τοῖς δὲ μὴ ἔχουσιν εὐθὺς πρὸς τῷ στόματι·',
  'Aristotle_Partibus_Animalium_674a9-16.xmi'),
 ('τῆς δὲ κοιλίας ἐχόμενον τὸ καλούμενον ἔντερον.',
  'Aristotle_Partibus_Animalium_674a9-16.xmi'),
 ('Διʼ ἣν δʼ αἰτίαν ἔχει ταῦτα τὰ μόρια τῶν ζῴων ἕκαστον, φανερὸν πᾶσιν.',
  'Aristotle_Partibus_Animalium_674a9-16.xmi'),
 ('Καὶ γὰρ δέξασθαι τὴν εἰσελθοῦσαν τροφὴν καὶ τὴν ἐξικμασμένην ἀναγκαῖον ἐκπέμψαι, καὶ μὴ τὸν αὐτὸν τόπον εἶναι τῆς τε ἀπέπτου καὶ τοῦ περιττώματος, εἶναί τέ τινα δεῖ τόπον ἐν ᾧ μεταβάλλει.',
  'Aristotle_Partibus_Animalium_674a9-16.xmi'),
 ('πρὸς δὲ τὸ γόνυ τὸ ὀστέον τοῦ μηροῦ τοιόνδʼ ἐστὶ δίκραιον·',
  'hippocrates places in man 6.9-10.xmi'),
 ('τῷ δὲ δικραίῳ τούτῳ τὸ ὀστέον ἡ κνήμη καλεσμένη οἷον ἐν γιγγλύμῳ ἀνήρμοσται·',
  'hippocrates pl

## Create large dictionary from all sources

In [91]:
import json
import unicodedata
import re

big_dict = {
    'Conllu': conllu_lemma_dict,
    'Inception': inception_dict,
    'Coda': coda_lemma_dict,
    'Dendrosearch': dendrosearch_lemma_dict
}


# Placeholder structure for the processed pairs
processed_pairs_with_sources = {
    'NFKD': {},
    'NFKC': {}
}

for form in ['NFKD', 'NFKC']:
    for source, lemma_dict in big_dict.items():
        for word, lemma in lemma_dict.items():
            norm_word = normalize_text(word, form, remove_accents=False, lowercase=False, standardize_apostrophe=True, remove_brackets=True, remove_trailing_numbers=True, debug=False)
            norm_lemma = normalize_text(lemma, form, remove_accents=False, lowercase=True, standardize_apostrophe=True, remove_brackets=True, remove_trailing_numbers=True, debug=False)

            # Skip empty normalized word entries
            if not norm_word or norm_lemma in ["_", " ", ""]:  
                continue

            if norm_word not in processed_pairs_with_sources[form]:
                processed_pairs_with_sources[form][norm_word] = {}

            # Aggregate lemmas with their sources
            if norm_lemma in processed_pairs_with_sources[form][norm_word]:
                processed_pairs_with_sources[form][norm_word][norm_lemma].add(source)
            else:
                processed_pairs_with_sources[form][norm_word][norm_lemma] = {source}

In [92]:
# Process the pairs and choose the most common lemma for each word

import json
import random

processed_counter = 0  # Counter for processed words
no_lemma_counter = 0  # Counter for words with no lemma
deleted_counter = 0  # Counter for deleted words
for form, word_lemmas in processed_pairs_with_sources.items():
    keys_to_delete = []  # List to store keys of pairs to be deleted
    for word, lemmas in word_lemmas.items():
        most_common_lemma = None
        most_common_count = 0
        source_info_for_most_common = set()
        tied_lemmas = []  # For handling ties
        
        for lemma, sources in lemmas.items():
            current_count = len(sources)
            if current_count > most_common_count:
                most_common_lemma = lemma
                most_common_count = current_count
                source_info_for_most_common = sources
                tied_lemmas = [lemma]  # Reset ties because a new max is found
            elif current_count == most_common_count:
                tied_lemmas.append(lemma)  # Add lemma to ties
        
        # Handle ties: choose the lexically first lemma if there's a tie
        if len(tied_lemmas) > 1:
            tied_lemmas.sort()  # Sort the list to ensure consistent processing
            most_common_lemma = tied_lemmas[0]  # Lexically first lemma
            # Combine sources of tied lemmas since more than one had the "most common" status
            source_info_for_most_common = set().union(*(lemmas[lemma] for lemma in tied_lemmas))
            
        if most_common_lemma:
            processed_pairs_with_sources[form][word] = {most_common_lemma: source_info_for_most_common}
            processed_counter += 1
        else:
            keys_to_delete.append(word)
            no_lemma_counter += 1

    # Delete marked words outside the loop
    for word in keys_to_delete:
        del processed_pairs_with_sources[form][word]
        deleted_counter += 1

print(f"Total processed words: {processed_counter}")
print(f"Total words with no lemma: {no_lemma_counter}")
print(f"Total deleted words: {deleted_counter}")

# Function to save the processed_pairs_with_sources to a json file
#def save_processed_pairs(processed_pairs_with_sources, filename):
    # Convert sets to lists for JSON serialization
#    for form, word_lemmas in processed_pairs_with_sources.items():
#        for word, lemmas in word_lemmas.items():
#            for lemma, sources in lemmas.items():
#                lemmas[lemma] = list(sources)

    # Save to file
    #with open(filename, 'w', encoding='utf-8') as f:
    #    json.dump(processed_pairs_with_sources, f, ensure_ascii=False)

# Example usage of save function
#save_processed_pairs(processed_pairs_with_sources, 'processed_pairs_with_sources.json')

Total processed words: 426346
Total words with no lemma: 0
Total deleted words: 0


# Run NLP pipeline on INCEpTION and Coda files

In [93]:
import spacy
from collections import OrderedDict

# Assuming the model is correctly installed:
#nlp = spacy.load("grc_proiel_trf")  # Use your preferred model here

In [94]:
# Assuming 'inception_sentences' is a list of tuples like (sentence_text, source_file),
# and 'df' is your DataFrame containing Coda sentences.
sentences = OrderedDict()


# Add inception sentences with source file
for sentence, source_file in inception_sentences:
    if sentence not in sentences:
        sentences[sentence] = ('Inception', source_file)
    
# source for Coda is made from the values in columns:Modern Edition, Book, Chapter, Section, Page, Line Number

def format_value(col, value):
    if pd.isna(value) or value == '':
        return None
    if col == 'Modern Edition':
        return str(value)
    if isinstance(value, (int, float)):
        value = int(value)
    return f"{col[0]}({value})"

for row in df.index:
    formatted_values = []
    for col in ['Modern Edition', 'Book', 'Chapter', 'Section', 'Page', 'Line Number']:
        value = df.loc[row, col]
        formatted_value = format_value(col, value)
        if formatted_value is not None:
            formatted_values.append(formatted_value)

# Add Coda sentences from DataFrame
    sentence = df.loc[row, 'Quote']
    
    source = ', '.join(formatted_values)
    cleaned_sentence = normalize_text(sentence, standardize_apostrophe=True, remove_extra_spaces=True)
    if cleaned_sentence not in sentences:
        sentences[cleaned_sentence] = (f"Coda, {source}")

# Convert OrderedDict back to a list of tuples if needed for further processing (sentence, source_type, source_file)
sentences_list = list(sentences.items())

In [95]:
sentences_list

[('Ὑπὸ δὲ τὸ ὑπόζωμα κεῖται ἡ κοιλία τοῖς ζῴοις, τοῖς μὲν ἔχουσιν οἰσοφάγον ᾗ τελευτᾷ τοῦτο τὸ μόριον, τοῖς δὲ μὴ ἔχουσιν εὐθὺς πρὸς τῷ στόματι·',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('τῆς δὲ κοιλίας ἐχόμενον τὸ καλούμενον ἔντερον.',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('Διʼ ἣν δʼ αἰτίαν ἔχει ταῦτα τὰ μόρια τῶν ζῴων ἕκαστον, φανερὸν πᾶσιν.',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('Καὶ γὰρ δέξασθαι τὴν εἰσελθοῦσαν τροφὴν καὶ τὴν ἐξικμασμένην ἀναγκαῖον ἐκπέμψαι, καὶ μὴ τὸν αὐτὸν τόπον εἶναι τῆς τε ἀπέπτου καὶ τοῦ περιττώματος, εἶναί τέ τινα δεῖ τόπον ἐν ᾧ μεταβάλλει.',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('πρὸς δὲ τὸ γόνυ τὸ ὀστέον τοῦ μηροῦ τοιόνδʼ ἐστὶ δίκραιον·',
  ('Inception', 'hippocrates places in man 6.9-10.xmi')),
 ('τῷ δὲ δικραίῳ τούτῳ τὸ ὀστέον η

In [96]:
len(sentences_list)

550

In [97]:
random.sample(sentences_list, 50)

[('πλὴν οὐχ οὕτως ἀλλὰ κατὰ στενοτέραν σύριγγα πολλῷ κοινωνεῖ',
  'Coda, Aristotle, History of Animals, B(3), C(3), P(134), L(5)'),
 ('Ὑπὸ δὲ τὸ ὑπόζωμα κεῖται ἡ κοιλία τοῖς ζῴοις, τοῖς μὲν ἔχουσιν οἰσοφάγον ᾗ τελευτᾷ τοῦτο τὸ μόριον, τοῖς δὲ μὴ ἔχουσιν εὐθὺς πρὸς τῷ στόματι',
  'Coda, Aristotle, Parts of Animals, B(3), C(14), P(99), L(9)'),
 ('ἀλλὰ πῶς ὕδωρ ἀναιδὲς ἐνοροῦον ὄχλον καὶ βῆχα παρέχει πολλήν; οὕνεκα, φημί, ἀπάντικρυ τῆς ἀναπνοῆς φέρεται.',
  ('Inception', 'Hippocratic_De corde Part A chs 1-4.xmi')),
 ('οὗτος μὲν οὖν ὁ μῦς εἰκότως ἠγνοήθη, κατακεκρυμμένης αὐτοῦ τῆς κεφαλῆς.',
  ('Inception', 'Gal AA, 2, 9, 116, 9-26 Garofalo, 325-326 Kühn.xmi')),
 ('ἤδη δʼ ὤφθη καὶ ἀνδρὸς κεφαλὴ οὐκ ἔχουσα ῥαφάς.',
  ('Inception', 'Aristotle, History of Animals, 3.7, p. 141-142 Balme.xmi')),
 ('Τούτῳ οὖν τῷ φλεβίῳ ἀνάγκη ἀσθενέστερον εἶναι καὶ ἐνδεέστερον τοῦτο τοῦ σώμα

In [98]:
processed_pairs_with_sources['NFKD']

{'ἥκω': {'ἥκω': {'Conllu', 'Dendrosearch'}},
 'Διὸς': {'ζεύς': {'Conllu', 'Dendrosearch'}},
 'παῖς': {'παῖς': {'Conllu', 'Dendrosearch'}},
 'τήνδε': {'ὅδε': {'Conllu', 'Dendrosearch', 'Inception'}},
 'Θηβαίων': {'θηβαῖος': {'Conllu', 'Dendrosearch'}},
 'χθόνα': {'χθών': {'Conllu', 'Dendrosearch'}},
 'Διόνυσος': {'διόνυσος': {'Conllu', 'Dendrosearch'}},
 'ὃν': {'ὅς': {'Conllu', 'Dendrosearch', 'Inception'}},
 'τίκτει': {'τίκτω': {'Conllu', 'Dendrosearch'}},
 'ποθʼ': {'ποτε': {'Conllu', 'Dendrosearch'}},
 'ἡ': {'ὁ': {'Coda', 'Conllu', 'Dendrosearch', 'Inception'}},
 'Κάδμου': {'κάδμος': {'Conllu', 'Dendrosearch'}},
 'κόρη': {'κόρη': {'Conllu', 'Dendrosearch'}},
 'Σεμέλη': {'σεμέλη': {'Conllu', 'Dendrosearch'}},
 'λοχευθεῖσʼ': {'λοχέυω': {'Conllu'}},
 'ἀστραπηφόρῳ': {'ἀστραπηφόρος': {'Conllu'}},
 'πυρί': {'πῦρ': {'Conllu', 'Dendrosearch'}},
 'μορφὴν': {'μορφή': {'Conllu', 'Dendrosearch'}},
 'δʼ': {'δέ': {'Coda', 'Conllu', 'Dendrosearch', 'Incepti

In [99]:
sentences_list

[('Ὑπὸ δὲ τὸ ὑπόζωμα κεῖται ἡ κοιλία τοῖς ζῴοις, τοῖς μὲν ἔχουσιν οἰσοφάγον ᾗ τελευτᾷ τοῦτο τὸ μόριον, τοῖς δὲ μὴ ἔχουσιν εὐθὺς πρὸς τῷ στόματι·',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('τῆς δὲ κοιλίας ἐχόμενον τὸ καλούμενον ἔντερον.',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('Διʼ ἣν δʼ αἰτίαν ἔχει ταῦτα τὰ μόρια τῶν ζῴων ἕκαστον, φανερὸν πᾶσιν.',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('Καὶ γὰρ δέξασθαι τὴν εἰσελθοῦσαν τροφὴν καὶ τὴν ἐξικμασμένην ἀναγκαῖον ἐκπέμψαι, καὶ μὴ τὸν αὐτὸν τόπον εἶναι τῆς τε ἀπέπτου καὶ τοῦ περιττώματος, εἶναί τέ τινα δεῖ τόπον ἐν ᾧ μεταβάλλει.',
  ('Inception', 'Aristotle_Partibus_Animalium_674a9-16.xmi')),
 ('πρὸς δὲ τὸ γόνυ τὸ ὀστέον τοῦ μηροῦ τοιόνδʼ ἐστὶ δίκραιον·',
  ('Inception', 'hippocrates places in man 6.9-10.xmi')),
 ('τῷ δὲ δικραίῳ τούτῳ τὸ ὀστέον η

In [100]:
import pandas as pd
from tqdm.auto import tqdm
from collections import OrderedDict
from typing import List, Dict
from spacy.tokens import Doc
from spacy.language import Language

# Assuming 'processed_pairs_with_sources', 'nlp' (a SpaCy Language model), and 'sentences_list' are already defined

def process_sentences(sentences_list, nlp, processed_pairs, debug=False):
    docs_nfkd: List[Doc] = []
    docs_nfkc: List[Doc] = []

    corrections_nfkd = []
    corrections_nfkc = []

    sentences_and_metadata = [
        (normalize_text(sentence, form, remove_accents=False, lowercase=False, standardize_apostrophe=True, remove_brackets=False, remove_trailing_numbers=False, debug=debug), (source_info, form))
        for sentence, source_info in sentences_list
        for form in ['NFKD', 'NFKC']
    ]

    sentences_for_processing = [sentence for sentence, _ in sentences_and_metadata]
    metadata = [meta for _, meta in sentences_and_metadata]

    corrected_count = 0
    not_corrected_count = 0
    
    for doc, meta in tqdm(zip(nlp.pipe(sentences_for_processing, batch_size=1000), metadata), total=len(sentences_and_metadata)):
        form = meta[1]  # 'NFKD' or 'NFKC'   
        print(f"Processing sentence '{doc.text}'") if debug else None
        print(f"Form: {form}") if debug else None     
        # Determine the correct list to append corrections based on the form
        corrections_list = corrections_nfkd if form == 'NFKD' else corrections_nfkc
        doc.user_data["source_info"] = meta[0]
        print("userdata: ", meta[0]) if debug else None
        print(f"Added source info '{meta[0]}' to the Doc object: ", doc.user_data["source_info"]) if debug else None

        for token in doc:
            lemma_sources = None
            print(f"Processing token '{token.text}' in sentence '{doc.text}'") if debug else None
            if token.text in processed_pairs[form]:
                lemma_sources = processed_pairs[form][token.text]
                print(f"Found token '{token.text}'") if debug else None
            elif token.text.lower() in processed_pairs[form]:
                lemma_sources = processed_pairs[form][token.text.lower()]
                print(f"Found token '{token.text}' in lowercase") if debug else None
            if lemma_sources is not None:
                for lemma, sources in lemma_sources.items():
                    if lemma != token.lemma_:
                        corrections_list.append({
                            'sentence': doc.text,
                            'source_info': meta[0],
                            'token': token.text,
                            'lemma': token.lemma_,
                            'lemma_corrected': lemma,
                            'correction_source': ', '.join(sources)
                        })
                        token.lemma_ = lemma  # Correct the lemma in the Doc object
                        corrected_count += 1
                        print(f"Corrected token '{token.text}' to '{lemma}' in sentence '{doc.text}'") if debug else None
                        break
                else:
                    not_corrected_count += 1
                    print(f"Same Lemma, did not correct token '{token.text}' in sentence '{doc.text}'") if debug else None
            else:
                not_corrected_count += 1
                print(f"Couldn't find lemma in sources, did not correct token '{token.text}', {token.lemma_} in sentence '{doc.text}'") if debug else None
                # change the lemma to an empty string
                token.lemma_ = ""
                print(f"Token Lemma set to skip: {token.lemma_}") if debug else None
            # add source info to the token attributes
            
        # append the processed sentences to the appropriate list
        if form == 'NFKD':
            docs_nfkd.append(doc)
        if form == 'NFKC':
            docs_nfkc.append(doc)
    print(f"Corrected {corrected_count} tokens") if debug else None
    print(f"Did not correct {not_corrected_count} tokens") if debug else None

    return corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc


In [101]:
def test_corrections():
    # Get corrections for both normalization forms
    corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc = process_sentences(sentences_list, nlp, processed_pairs_with_sources, debug=True)

    # Convert the corrections lists to DataFrames
    corrections_df_nfkd = pd.DataFrame(corrections_nfkd)
    corrections_df_nfkc = pd.DataFrame(corrections_nfkc)

    print(f"Total corrections (NFKD): {len(corrections_df_nfkd)}")
    print(f"Total corrections (NFKC): {len(corrections_df_nfkc)}")
    
    # find how many corrected by each dictionary
    print(corrections_df_nfkd.groupby('correction_source').count())
    # total number of corrections
    print(corrections_df_nfkd.groupby('correction_source').count().sum())
    
        # find how many corrected by each dictionary
    print(corrections_df_nfkc.groupby('correction_source').count())
    # total number of corrections
    print(corrections_df_nfkc.groupby('correction_source').count().sum())


In [102]:
#test_corrections()

The spacy dataset should be exported to '../corpus/' folder.\
More specifically:\
train to '..corpus/train/lemma_train/'\
dev to '../corpus/dev/lemma_dev/'\
test to '../corpus/test/lemma_test/

In [103]:
from sklearn.model_selection import train_test_split
from pathlib import Path
from spacy.tokens import DocBin

# Function to split docs into train, dev, test sets and save them
def split_and_save_docs(sentences_list, nlp, processed_pairs_with_sources, base_path: str = "../corpus"):
    # Call process_sentences to get the processed docs and corrections
    corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc = process_sentences(sentences_list, nlp, processed_pairs_with_sources, debug=False)

    """
    Splits documents into train, test, and dev sets for both 'NFKD' and 'NFKC' normalization forms.
    Saves each set to disk in SpaCy's DocBin format.
    
    Args:
    - docs_nfkd: List of SpaCy Doc objects for 'NFKD' normalized text.
    - docs_nfkc: List of SpaCy Doc objects for 'NFKC' normalized text.
    - base_path: Base path to save the split documents.
    """
    assert docs_nfkd and docs_nfkc, "Document lists must not be empty"
    
    # Split documents for 'NFKD' normalization
    train_docs_nfkd, temp_docs_nfkd = train_test_split(docs_nfkd, test_size=0.2, random_state=42) # 80% for training
    test_docs_nfkd, dev_docs_nfkd = train_test_split(temp_docs_nfkd, test_size=0.5, random_state=42)  # Split the remaining 20% equally for testing and development

    # Split documents for 'NFKC' normalization
    train_docs_nfkc, temp_docs_nfkc = train_test_split(docs_nfkc, test_size=0.2, random_state=42) # 80% for training
    test_docs_nfkc, dev_docs_nfkc = train_test_split(temp_docs_nfkc, test_size=0.5, random_state=42)  # Split the remaining 20% equally for testing and development

    # Directories for saving split docs
    directories = ['train/lemma_train', 'dev/lemma_dev', 'test/lemma_test']
    forms = ['NFKD', 'NFKC']
    
    # Ensure directories exist
    for directory in directories:
        for form in forms:
            Path(f"{base_path}/{directory}/").mkdir(parents=True, exist_ok=True)

    # Function to save docs to disk
    def save_docs(docs, path):
        doc_bin = DocBin(docs=docs, store_user_data=True)
        doc_bin.to_disk(path)
    
    # Saving documents
    save_paths = {'train': (train_docs_nfkd, 'train'), 'dev': (dev_docs_nfkd, 'dev'), 'test': (test_docs_nfkd, 'test')}
    for split, (docs, subset) in save_paths.items():
        save_docs(docs, f"{base_path}/{split}/lemma_{split}/{subset}_lemma_NFKD.spacy")

    save_paths = {'train': (train_docs_nfkc, 'train'), 'dev': (dev_docs_nfkc, 'dev'), 'test': (test_docs_nfkc, 'test')}
    for split, (docs, subset) in save_paths.items():
        save_docs(docs, f"{base_path}/{split}/lemma_{split}/{subset}_lemma_NFKC.spacy")
    
    print(f"Documents are successfully split and saved for 'NFKD' and 'NFKC' forms.")

split_and_save_docs(sentences_list, nlp, processed_pairs_with_sources)

  0%|          | 0/1100 [00:00<?, ?it/s]

Documents are successfully split and saved for 'NFKD' and 'NFKC' forms.


In [104]:
# for complete dataset (non-split), uncomment the following lines

#corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc = process_sentences(sentences_list, nlp, processed_pairs_with_sources)

# save each one to DocBin
#def save_docs(docs, path):
#        doc_bin = DocBin(docs=docs)
#        doc_bin.to_disk(path)

#save_docs(docs_nfkd, "../corpus/train/lemma_train/NFKD/train_lemma_NFKD_full.spacy")
#save_docs(docs_nfkc, "../corpus/train/lemma_train/NFKD/train_lemma_NFKC_full.spacy")


## Process conllu greCy files

In [105]:
from spacy.tokens import DocBin
import spacy
from pathlib import Path

# install spacy grc model if not already installed
nlp = spacy.load("grc_proiel_trf") # Use your preferred model here


In [106]:
def modify_token_attributes(doc, debug=False):
    """
    Modify token attributes in a doc according to specific rules.
    """

    lemma_count = 0
    pos_count = 0
    dep_count = 0
    head_dep_count = 0
    
    for token in doc:
        
        # Lemmatizer rules: Set lemma to "" for trainable lemmatizer
        if token.lemma_ in ['', "_", '—', '-']:
            token.lemma_ = ''
            lemma_count += 1
            print(f"Adjusted lemma for token {token.text}, lemma: {token.lemma_}") if debug else None
            
        # Tagger rules: Set POS tags to "" (empty string)
        if token.pos_ in ['', "_", '—', '-', 'X', 'END', 'MID']:
            token.pos_ = ""  # This won't work as spaCy does not allow direct setting of .pos_ after Doc creation
            pos_count += 1
            print(f"Adjusted lemma for token {token.text}, POS: {token.pos_}") if debug else None


        ## Dependency parser rules: Set dep_ to "None" for empty strings
        if token.dep_ in ['', "_", '—', '-']:
            token.dep_ = "None"
            dep_count += 1
            print(f"Adjusted dependency for token {token.text}, dep: {token.dep_}") if debug else None
            
        if token.head.dep_ in ['', "_", '—', '-']:
            token.head.dep_ = "None"
            head_dep_count += 1
            print(f"Adjusted head dependency for token {token.head.text}, dep: {token.head.dep_}") if debug else None

    print(f"Total lemmas adjusted: {lemma_count}") if debug else None
    print(f"Total POS tags adjusted: {pos_count}") if debug else None
    print(f"Total dependencies adjusted: {dep_count}") if debug else None
    print(f"Total head dependencies adjusted: {head_dep_count}") if debug else None
    
    return doc

# Function to save processed docs to disk
def save_processed_docs(docs, path):
    doc_bin = DocBin(docs=docs, store_user_data=True)
    doc_bin.to_disk(path)
    print(f"Processed and saved {path}")

def process_and_save_docs(input_dir, base_path, nlp, debug=False):
    """
    Load spaCy Docs from .spacy files, modify token attributes, and save to the corresponding lemma_train/lemma_dev/lemma_test directory with the appropriate normalization form and preserving the original file name.
    """
    input_path = Path(input_dir)

    directories = ['train/lemma_train', 'dev/lemma_dev', 'test/lemma_test']
    forms = ['NFKD', 'NFKC']

    # Ensure directories exist
    for directory in directories:
        for form in forms:
            Path(f"{base_path}/{directory}/").mkdir(parents=True, exist_ok=True)

    for doc_file in input_path.glob("*.spacy"):
        doc_bin = DocBin().from_disk(doc_file)
        docs = list(doc_bin.get_docs(nlp.vocab))
        modified_docs = [modify_token_attributes(doc, debug=debug) for doc in docs]

        file_name = doc_file.name
        if "train" in file_name:
            if "NFKD" in file_name:
                output_path = f"{base_path}/train/lemma_train/{file_name.replace('.spacy', '_NFKD.spacy')}"
                save_processed_docs(modified_docs, output_path)
            elif "NFKC" in file_name:
                output_path = f"{base_path}/train/lemma_train/{file_name.replace('.spacy', '_NFKC.spacy')}"
                save_processed_docs(modified_docs, output_path)
        elif "dev" in file_name:
            if "NFKD" in file_name:
                output_path = f"{base_path}/dev/lemma_dev/{file_name.replace('.spacy', '_NFKD.spacy')}"
                save_processed_docs(modified_docs, output_path)
            elif "NFKC" in file_name:
                output_path = f"{base_path}/dev/lemma_dev/{file_name.replace('.spacy', '_NFKC.spacy')}"
                save_processed_docs(modified_docs, output_path)
        elif "test" in file_name:
            if "NFKD" in file_name:
                output_path = f"{base_path}/test/lemma_test/{file_name.replace('.spacy', '_NFKD.spacy')}"
                save_processed_docs(modified_docs, output_path)
            elif "NFKC" in file_name:
                output_path = f"{base_path}/test/lemma_test/{file_name.replace('.spacy', '_NFKC.spacy')}"
                save_processed_docs(modified_docs, output_path)
        else:
            print(f"Warning: Skipping file {file_name} as it does not contain 'train', 'dev', or 'test' in the name.")


# Example usage:
# nlp = spacy.blank("en")  # Use the appropriate language model
# process_and_save_docs("../assets/Lemmatization_training_files/Processed/lemma_train/spaCy", "../corpus/", nlp, debug=False)


# Example usage:
#nlp = spacy.blank("en")  # Use the appropriate language model
#process_and_save_docs("path/to/input/folder", "path/to/output/folder", nlp)

In [107]:
process_and_save_docs("../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKC/", "../corpus/", nlp, debug=False)
process_and_save_docs("../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD/", "../corpus/", nlp, debug=False)

process_and_save_docs("../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKC/", "../corpus/", nlp, debug=False)
process_and_save_docs("../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD/", "../corpus/", nlp, debug=False)


Processed and saved ../corpus//train/lemma_train/grc_proiel-ud-train_NFKC_NFKC.spacy
Processed and saved ../corpus//test/lemma_test/grc_proiel-ud-test_NFKC_NFKC.spacy
Processed and saved ../corpus//dev/lemma_dev/grc_proiel-ud-dev_NFKC_NFKC.spacy
Processed and saved ../corpus//test/lemma_test/grc_proiel-ud-test_NFKD_NFKD.spacy
Processed and saved ../corpus//train/lemma_train/grc_proiel-ud-train_NFKD_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/grc_proiel-ud-dev_NFKD_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/grc_perseus-ud-dev_NFKC_NFKC.spacy
Processed and saved ../corpus//test/lemma_test/grc_perseus-ud-test_NFKC_NFKC.spacy
Processed and saved ../corpus//train/lemma_train/grc_perseus-ud-train_NFKC_NFKC.spacy
Processed and saved ../corpus//train/lemma_train/grc_perseus-ud-train_NFKD_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/grc_perseus-ud-dev_NFKD_NFKD.spacy
Processed and saved ../corpus//test/lemma_test/grc_perseus-ud-test_NFKD_NFKD.spacy


In [108]:
process_and_save_docs("../assets/Lemmatization_training_files/Processed/lemma_train/spaCy", "../corpus/", nlp, debug=False)

Processed and saved ../corpus//train/lemma_train/herodotus_NFKD_NFKD_train_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/pausanias_NFKD_NFKC_dev_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/orators_NFKD_NFKD_dev_NFKD.spacy
Processed and saved ../corpus//train/lemma_train/plato_iii_NFKD_NFKC_train_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/orators_NFKD_NFKC_dev_NFKD.spacy
Processed and saved ../corpus//train/lemma_train/pindar_NFKD_NFKD_train_NFKD.spacy
Processed and saved ../corpus//train/lemma_train/arrian_NFKD_NFKD_train_NFKD.spacy
Processed and saved ../corpus//train/lemma_train/xenophon_ii_NFKD_NFKD_train_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/plutarch_NFKD_NFKD_dev_NFKD.spacy
Processed and saved ../corpus//train/lemma_train/plato_ii_NFKD_NFKD_train_NFKD.spacy
Processed and saved ../corpus//dev/lemma_dev/aristotle_NFKD_NFKD_dev_NFKD.spacy
Processed and saved ../corpus//train/lemma_train/euripides_NFKD_NFKD_train_NFKD.spacy
Processe

## Tests

In [109]:
# load a docbin
test_doc_bin = DocBin().from_disk("../corpus/train/lemma_train/grc_proiel-ud-train_NFKD_NFKD.spacy")
# get the docs
test_docs = list(test_doc_bin.get_docs(nlp.vocab))
# check the first doc
print(test_docs[0].text)


Ἡροδότου Ἁλικαρνησσέος ἱστορίης ἀπόδεξις ἥδε ὡς μήτε τὰ γενόμενα ἐξ ἀνθρώπων τῷ χρόνῳ ἐξίτηλα γένηται μήτε ἔργα μεγάλα τε καὶ θωμαστά τὰ μὲν Ἕλλησι τὰ δὲ βαρβάροισι ἀποδεχθέντα ἀκλεᾶ γένηται τά τε ἄλλα καὶ διʼ ἣν αἰτίην ἐπολέμησαν ἀλλήλοισι Περσέων μέν νυν οἱ λόγιοι Φοίνικας αἰτίους φασὶ γενέσθαι τῆς διαφορῆς τούτους γὰρ ἀπὸ τῆς Ἐρυθρῆς καλεομένης θαλάσσης ἀπικομένους ἐπὶ τήνδε τὴν θάλασσαν καὶ οἰκήσαντας τοῦτον τὸν χῶρον τὸν καὶ νῦν οἰκέουσι αὐτίκα ναυτιλίῃσι μακρῇσι ἐπιθέσθαι ἀπαγινέοντας δὲ φορτία Αἰγύπτιά τε καὶ Ἀσσύρια τῇ τε ἄλλῃ ἐσαπικνέεσθαι καὶ δὴ καὶ ἐς Ἄργος τὸ δὲ Ἄργος τοῦτον τὸν χρόνον προεῖχε ἅπασι τῶν ἐν τῇ νῦν Ἑλλάδι καλεομένῃ χωρῇ ἀπικομένους δὲ τούς Φοίνικας ἐς δὴ τὸ Ἄργος τοῦτο διατίθεσθαι τὸν φόρτον πέμπτῃ δὲ ἢ ἕκτῃ ἡμέρῃ ἀπʼ ἧς ἀπίκοντο ἐξεμπολημένων σφι σχεδόν πάντων ἐλθεῖν ἐπὶ τὴν θά

In [110]:
#  list all unique dep attributes in docbin
dep_attributes = set()
for doc in test_docs:
    for token in doc:
        dep_attributes.add(token.pos_)

        
dep_attributes



# for each word in the found doc, make a datafarame with attributes



{'',
 'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PRON',
 'PROPN',
 'SCONJ',
 'VERB'}

ner: use None as the IOB tag, as explained in Training NER on Incomplete Annotations #11114
tagger: use "" as the tag, e.g. ["", "V", "S", "J", ""]
parser: use None both for heads and deps e.g. [1, 1, 1, None] and ["nsubj", "ROOT", "dobj", None]
trainable_lemmatizer: set the lemma to "", e.g. ["", "like", "green", ""]

In [111]:
# load a docbin
test_doc_bin = DocBin().from_disk("../corpus/train/lemma_train/train_lemma_NFKD.spacy")
# get the docs
test_docs = list(test_doc_bin.get_docs(nlp.vocab))
# check the first doc
for doc in test_docs:
    print(doc.text)
    print(doc.user_data["source_info"])


Πάλιν δʼ ἐντεῦθεν εἰς τέτταρας σχίζονται φλέβας, ὧν μία μὲν ἐπανακάμψασα καταβαίνει διὰ τοῦ τραχήλου καὶ τοῦ ὤμου, καὶ συμβάλλει τῇ πρότερον ἀποσχίσει τῆς φλεβὸς κατὰ τὴν τοῦ βραχίονος καμπήν, τὸ δʼ ἕτερον μόριον εἰς τὴν χεῖρα τελευτᾷ καὶ τοὺς δακτύλους
Coda, Aristotle, History of Animals, B(3), C(3), P(136), L(11)
Ὅσα μὲν οὖν ἐστι ζῳοτόκα καὶ δίποδα ἢ τετράποδα, τούτων μὲν ἡ ὑστέρα πάντων ἐστὶ κάτω τοῦ ὑποζώματος, οἷον ἀνθρώπῳ καὶ κυνὶ καὶ ὑῒ καὶ ἵππῳ καὶ βοΐ·
('Inception', 'Aristotle_Historia_Animalium_510b5-20.xmi')
ἔτι δὲ μᾶλλον, ἐὰν καὶ τὸν πόδα κατὰ τὴν διάρθρωσιν ἀποτέμῃς.
('Inception', 'Galen, AA, 2, 4, 87 1-28 Garofalo. 295-296 Kühn.xmi')
Ἔχει δὲ διʼ αὑτοῦ καὶ φλέβας τεταμένας
Coda, Aristotle, History of Animals, B(1), C(17), P(84)
οἱ γὰρ δὴ τένοντες οἱ ὑμενώδεις οἱ περιλαμβάνοντες τοὺς ὀρθίους μυς, οὓς ἀπὸ τῶν λοξῶν μυῶν ἔφην γεννᾶσθαι, κατά

In [112]:
import os
import spacy
from spacy.tokens import DocBin
# install spacy grc model if not already installed
nlp = spacy.load("grc_proiel_trf") # Use your preferred model here


In [113]:

def analyze_spacy_objects(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith(".spacy"):  # Check if the file is a spacy object
            tested_doc = DocBin().from_disk(os.path.join(folder_path, filename))  # Load the spacy object
            docs = list(tested_doc.get_docs(nlp.vocab))  # Convert generator to list to reuse it
            num_docs = len(docs)  # Calculate the number of documents
            total_length = sum(len(doc.text) for doc in docs)  # Calculate the total length
            avg_length = total_length / num_docs if num_docs > 0 else 0  # Calculate the average length
            print(f"Spacy object: {filename}")
            print(f"Number of documents: {num_docs}")
            print(f"Total length: {total_length}")
            print(f"Average length: {avg_length}\n")



In [114]:
# Call the function with the path to the folder containing the spacy objects
analyze_spacy_objects("../corpus/dev/lemma_dev")

Spacy object: pindar_NFKD_NFKD_dev_NFKD.spacy
Number of documents: 17
Total length: 15509
Average length: 912.2941176470588

Spacy object: sophocles_NFKD_NFKD_dev_NFKD.spacy
Number of documents: 55
Total length: 52357
Average length: 951.9454545454546

Spacy object: xenophon_iv_NFKD_NFKD_dev_NFKD.spacy
Number of documents: 24
Total length: 34163
Average length: 1423.4583333333333

Spacy object: herodotus_NFKD_NFKC_dev_NFKD.spacy
Number of documents: 101
Total length: 118180
Average length: 1170.09900990099

Spacy object: thucydides_NFKD_NFKD_dev_NFKD.spacy
Number of documents: 61
Total length: 119607
Average length: 1960.7704918032787

Spacy object: arrian_NFKD_NFKC_dev_NFKD.spacy
Number of documents: 34
Total length: 49670
Average length: 1460.8823529411766

Spacy object: apollonius_NFKD_NFKC_dev_NFKD.spacy
Number of documents: 32
Total length: 26425
Average length: 825.78125

Spacy object: plato_ii_NFKD_NFKC_dev_NFKD.spacy
Number of documents: 56
Total length: 58929
Average length: 1

In [115]:
import os
import spacy

def analyze_spacy_objects(folder_path):
    # Load the English language model
    nlp = spacy.load("grc_proiel_trf")

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path) and filename.endswith(".spacy"):
            # Load the Spacy object
            doc_bin = nlp.from_bytes(open(file_path, "rb").read())

            # Calculate the number of documents
            num_docs = len(list(doc_bin.sents))

            # Calculate the total length (in characters)
            total_length = sum(len(str(sent)) for sent in doc_bin.sents)

            # Calculate the average length of a document
            avg_length = total_length / num_docs

            print(f"File: {filename}")
            print(f"Number of documents: {num_docs}")
            print(f"Total length (in characters): {total_length}")
            print(f"Average length of a document: {avg_length:.2f}")
            print()

# Example usage
analyze_spacy_objects("../corpus/dev/lemma_dev")

ExtraData: unpack(b) received extra data.