In [1]:
import os
import re
import spacy
import pandas as pd

In [2]:
# install spacy grc model if not already installed
nlp = spacy.load("grc_proiel_trf") # Use your preferred model here




# Preprocess Text

In [3]:
import unicodedata

def clean_text(text: str) -> str:
    """
    Cleans the given text by removing diacritics (accents), except for specific characters,
    and converting it to lowercase.
    
    Args:
        text (str): The text to clean.
        
    Returns:
        str: The cleaned text.
        
    Raises:
        ValueError: If the input is not a string.
    """
    if not isinstance(text, str):
        return text  # Return the original value if it's not a string
        #raise ValueError("Input must be a string.")
    
    try:
        non_accent_characters = [
            char for char in unicodedata.normalize('NFKD', text)
            if unicodedata.category(char) != 'Mn' or char == '̓'  # Keep Greek coronis
        ]
        # Use str.lower() for converting to lowercase, which works for Unicode characters
        return ''.join(non_accent_characters).lower()
    except Exception as e:
        # A more generic exception handling if unexpected errors occur
        print(f"An error occurred: {e}")
        return text

In [4]:
# Load the dataset
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"
df = pd.read_csv(FILE_PATH)

In [5]:
df.columns

Index(['Word', 'Word Before', 'Word After', 'Quote', 'Category Types', 'Lemma',
       'Early Category Type', 'Early Word', 'Early Word Before',
       'Early Word After', 'Early Quote', 'Lemma arabic'],
      dtype='object')

In [6]:

# Load the dataset
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"
df = pd.read_csv(FILE_PATH)

# Renaming columns
df.rename(columns={'Word': 'Keyword', 'Category Types': 'Label'}, inplace=True)

# Filling NaN values more efficiently and appropriately
for early_col, new_col in [('Early Quote', 'Quote'), ('Early Word Before', 'Word Before'), 
                           ('Early Word After', 'Word After'), ('Early Category Type', 'Label')]:
    df[new_col].fillna(df[early_col], inplace=True)

# Dropping rows with no Keyword and non-Greek Keywords
pat = '[ء-ي]+'
df = df.dropna(subset=['Keyword']).copy()
df = df[~df['Keyword'].str.contains(pat, na=False)]

# Cleaning data with combined regex patterns

# Define a dictionary of patterns and replacements for the entire dataframe
df_replacements = {
    '\d+': '',  # Numbers
    '-': '',  # Hyphens
    ' +': ' ',  # Multiple spaces
}

# Apply the replacements to the entire dataframe
for pattern, replacement in df_replacements.items():
    df.replace(pattern, replacement, regex=True, inplace=True)

# Define a dictionary of patterns and replacements for the 'Keyword' column
keyword_replacements = {
    '\n': '',  # New line
    ',': '',  # Comma
    '\.': '',  # Period
    '\·': '',  # Interpunkt
    '\s+$': ''  # End punctuation
}

# Apply the replacements to the 'Keyword' column
for pattern, replacement in keyword_replacements.items():
    df['Keyword'].replace(pattern, replacement, regex=True, inplace=True)
# Resetting the dataframe index
df.reset_index(drop=True, inplace=True)

# Cleaning all text in the dataframe. must be a string
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(clean_text)

In [7]:
df.head(10)
len(df)

1534

In [8]:
# if any of the fields "KeyWord", "Quote", "Word Before", "Word After" are "0", print the row and drop it
for i in range(len(df)):
    if df.iloc[i, 0] == "0" or df.iloc[i, 1] == "0" or df.iloc[i, 2] == "0" or df.iloc[i, 3] == "0":
        print(df.iloc[i])
        df.drop(i, inplace=True)

In [9]:
df.head(10)
len(df)

1534

In [10]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.tokens import Doc, DocBin
from unicodedata import normalize
import random


# Create dictionaries from dendrosearch and conllu files (supplied by Jacobo)

In [11]:
# create the coda dictionary for word : lemma 

coda_lemma_dict = df.dropna(subset=['Keyword', 'Lemma']).set_index('Keyword')['Lemma'].to_dict()

In [12]:
# create the dendrosearch dictionary for word : lemma 

# punctuation to be removed
PUNCTUATION = set(['.', ")", "·", "(", "[", "]", ":", ";", ",", "?", "!", "،", "_"])

dendrosearch_lemma_dict = {} 
with open('../assets/dendrosearch_lemma_dict.txt', 'r', encoding='utf-8') as f:
    dendrosearch_lemma_dict = {line.split()[0]: line.split()[1] for line in f if len(line.split()) > 1 and line.split()[0] not in PUNCTUATION} 
    # clean the dictionary with clean_text
    dendrosearch_lemma_dict = {clean_text(k): clean_text(v) for k, v in dendrosearch_lemma_dict.items()}

In [13]:
dendrosearch_lemma_dict

{'θουκυδιδης': 'θουκυδιδης',
 'ἀθηναιος': 'ἀθηναιος',
 'ξυνεγραψε': 'συγγραφω',
 'τον': 'ο',
 'πολεμον': 'πολεμος',
 'των': 'ο',
 'πελοποννησιων': 'πελοποννησιος',
 'και': 'και',
 'ἀθηναιων': 'ἀθηναιος',
 'ως': 'ως',
 'ἐπολεμησαν': 'πολεμεω',
 'προς': 'προς',
 'ἀλληλους': 'ἀλληλων',
 'ἀρξαμενος': 'ἀρχω',
 'εὐθυς': 'εὐθυς',
 'καθισταμενου': 'καθιστημι',
 'ἐλπισας': 'ἐλπιζω',
 'μεγαν': 'μεγας',
 'τε': 'τε',
 'ἐσεσθαι': 'εἰμι',
 'ἀξιολογωτατον': 'ἀξιολογος',
 'προγεγενημενων': 'προγιγνομαι',
 'τεκμαιρομενος': 'τεκμαιρομαι',
 'οτι': 'οτι',
 'ἀκμαζοντες': 'ἀκμαζω',
 'ἠσαν': 'ἀω3',
 'ἐς': 'εἰς',
 'αὐτον': 'αὐτος',
 'ἀμφοτεροι': 'ἀμφοτερος',
 'παρασκευη': 'παρασκευη',
 'τη': 'ο',
 'παση': 'πας',
 'το': 'ο',
 'ἀλλο': 'ἀλλος',
 'ελληνικον': 'ελληνικος',
 'ορων': 'ορος',
 'ξυνισταμενον': 'συνιστημι',
 'εκατερους': 'εκατερος',
 'μεν': 'μεν',
 'δε': 'δε',
 'διανοουμενον': 'διανοεω',
 'κινησις': 'κινησις',
 'γαρ': 'γαρ',
 'αυτη': 'εαυτου',
 'μεγιστη': 'μεγιστη',
 'δη': 

In [14]:
import os
import glob  # Recommended for easy file pattern matching

PUNCTUATION = set(['.', ")", "·", "(", "[", "]", ":", ";", ",", "?", "!", "،", "_"])  # Using a set for performance
PATH = "../assets/Lemmatization_training_files/test"
conllu_lemma_dict = {}

# Utilize glob for cleaner file selection
for file_path in glob.glob(os.path.join(PATH, '*.conllu')):
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip() or line.startswith('#'):
                continue  # Skip empty lines and comments
                
            parts = line.split()
            if len(parts) > 2 and parts[1] not in PUNCTUATION:
                # Ensure at least id, keyword, and lemma are present and keyword isn't punctuation
                conllu_lemma_dict[parts[1]] = parts[2]

## Create dictionary from INCEpTION files

In [15]:
from cassis import load_typesystem, load_cas_from_xmi
import zipfile
from pathlib import Path
import tempfile
from tqdm import tqdm

In [16]:
# Define paths using pathlib
inception_files_path = Path("../assets/NER_assets/INCEpTION_files/")
tempdir_path = Path(tempfile.mkdtemp())  # Create a temporary directory and get its path

try:
    # Extract all .zip files found in the inception files path
    for zip_file_path in inception_files_path.glob("*.zip"):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(tempdir_path)

    typesystem_file_path = tempdir_path / "TypeSystem.xml"

    # Check for the existence of TypeSystem.xml before proceeding
    if not typesystem_file_path.exists():
        raise FileNotFoundError("TypeSystem.xml not found in the extracted files.")

    with open(typesystem_file_path, 'rb') as f:
        typesystem = load_typesystem(f)

    inception_dict = {}
    inception_sentences = []  # List of tuples (sentence, source_file)
    apostrophes = ["᾽", "᾿", "'", "’", "‘"]

    # Process each .xmi file found in the temporary directory
    for xmi_file_path in tempdir_path.glob("*.xmi"):
        with open(xmi_file_path, 'rb') as f:
            cas = load_cas_from_xmi(f, typesystem=typesystem)
            # Update inception_dict dictionary with lemmas
            inception_dict.update(
                {token.get_covered_text(): token.value for token in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma')}
            )
            # Extend inception_sentences list with sentences
            inception_sentences.extend(
                [(sentence.get_covered_text(), xmi_file_path.name) for sentence in cas.select("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")]
            )

    # Cleanup function added to ensure the temporary directory is always cleaned up
    def cleanup_tempdir(directory):
        for child in directory.iterdir():
            if child.is_file():
                child.unlink()
        directory.rmdir()

    # Clean sentence from line breaks, extra white spaces and replace koronis with apostrophe
    inception_sentences = [
        (clean_text(' '.join(sentence[0].replace('\r', ' ').replace('\n', ' ').translate(str.maketrans("".join(apostrophes), "ʼ" * len(apostrophes))).split())), sentence[1])
        for sentence in inception_sentences
    ]

except FileNotFoundError as e:
    print(f"Error: {e}")
    cleanup_tempdir(tempdir_path)
    # Exit or raise the error for further handling depending on the script's usage
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    cleanup_tempdir(tempdir_path)
    # Exit or raise the error for further handling depending on the script's usage
else:
    # This block runs if no exceptions were raised
    # Print or process the inception_dict and inception_sentences as needed
    # Example: print some of the processed sentences
    for sentence, source in inception_sentences[:5]:
        print(f"Sentence: {sentence}, Source File: {source}")
finally:
    # Ensure the temporary directory is always cleaned up
    cleanup_tempdir(tempdir_path)


Sentence: υπο δε το υποζωμα κειται η κοιλια τοις ζωοις, τοις μεν ἐχουσιν οἰσοφαγον η τελευτα τουτο το μοριον, τοις δε μη ἐχουσιν εὐθυς προς τω στοματι·, Source File: Aristotle_Partibus_Animalium_674a9-16.xmi
Sentence: της δε κοιλιας ἐχομενον το καλουμενον ἐντερον., Source File: Aristotle_Partibus_Animalium_674a9-16.xmi
Sentence: διʼ ην δʼ αἰτιαν ἐχει ταυτα τα μορια των ζωων εκαστον, φανερον πασιν., Source File: Aristotle_Partibus_Animalium_674a9-16.xmi
Sentence: και γαρ δεξασθαι την εἰσελθουσαν τροφην και την ἐξικμασμενην ἀναγκαιον ἐκπεμψαι, και μη τον αὐτον τοπον εἰναι της τε ἀπεπτου και του περιττωματος, εἰναι τε τινα δει τοπον ἐν ω μεταβαλλει., Source File: Aristotle_Partibus_Animalium_674a9-16.xmi
Sentence: προς δε το γονυ το ὀστεον του μηρου τοιονδʼ ἐστι δικραιον·, Source File: hippocrates places in man 6.9-10.xmi


## Create large dictionary from all sources

In [17]:
import json
import unicodedata
import re


# Consolidating various apostrophe characters
apostrophes = ["᾽", "᾿", "'", "’", "‘"]
correct_apostrophe = "ʼ"

big_dict = {
    'Conllu': conllu_lemma_dict,
    'Inception': inception_dict,
    'Coda': coda_lemma_dict,
    'Dendrosearch': dendrosearch_lemma_dict
}

# Utility function for text normalization and apostrophe replacement
def remove_brackets(word):
    # Removes content within brackets and the brackets themselves
    return re.sub(r'[\(\)\[\]]', '', word)

def normalize_and_correct(text, form):
    normalized = unicodedata.normalize(form, text)
    normalized = normalized.lower()  # Convert all text to lowercase
    for char in apostrophes:
        normalized = normalized.replace(char, correct_apostrophe)
    normalized = re.sub(r'^\d+|\d+$', '', normalized)  # Remove trailing numbers from the lemma
    without_brackets = remove_brackets(normalized)  # Remove brackets after normalization and apostrophe correction
    return without_brackets

In [18]:
# Placeholder structure for the processed pairs
processed_pairs_with_sources = {
    'NFKD': {},
    'NFKC': {}
}

for form in ['NFKD', 'NFKC']:
    for source, lemma_dict in big_dict.items():
        for word, lemma in lemma_dict.items():
            norm_word = normalize_and_correct(word, form)
            norm_lemma = normalize_and_correct(lemma, form)

            # Skip empty normalized word entries
            if not norm_word or norm_lemma in ["_", " ", ""]:  
                continue

            if norm_word not in processed_pairs_with_sources[form]:
                processed_pairs_with_sources[form][norm_word] = {}

            # Aggregate lemmas with their sources
            if norm_lemma in processed_pairs_with_sources[form][norm_word]:
                processed_pairs_with_sources[form][norm_word][norm_lemma].add(source)
            else:
                processed_pairs_with_sources[form][norm_word][norm_lemma] = {source}

In [19]:
import json
import random

processed_counter = 0  # Counter for processed words
no_lemma_counter = 0  # Counter for words with no lemma
deleted_counter = 0  # Counter for deleted words

for form, word_lemmas in processed_pairs_with_sources.items():
    keys_to_delete = []  # List to store keys of pairs to be deleted
    for word, lemmas in word_lemmas.items():
        most_common_lemma = None
        most_common_count = 0
        source_info_for_most_common = set()
        tied_lemmas = []  # For handling ties
        
        for lemma, sources in lemmas.items():
            current_count = len(sources)
            if current_count > most_common_count:
                most_common_lemma = lemma
                most_common_count = current_count
                source_info_for_most_common = sources
                tied_lemmas = [lemma]  # Reset ties because a new max is found
            elif current_count == most_common_count:
                tied_lemmas.append(lemma)  # Add lemma to ties
        
        # Handle ties: choose the lexically first lemma if there's a tie
        if len(tied_lemmas) > 1:
            tied_lemmas.sort()  # Sort the list to ensure consistent processing
            most_common_lemma = tied_lemmas[0]  # Lexically first lemma
            # Combine sources of tied lemmas since more than one had the "most common" status
            source_info_for_most_common = set().union(*(lemmas[lemma] for lemma in tied_lemmas))
            
        if most_common_lemma:
            processed_pairs_with_sources[form][word] = {most_common_lemma: source_info_for_most_common}
            processed_counter += 1
        else:
            keys_to_delete.append(word)
            no_lemma_counter += 1

    # Delete marked words outside the loop
    for word in keys_to_delete:
        del processed_pairs_with_sources[form][word]
        deleted_counter += 1

print(f"Total processed words: {processed_counter}")
print(f"Total words with no lemma: {no_lemma_counter}")
print(f"Total deleted words: {deleted_counter}")

# Function to save the processed_pairs_with_sources to a json file
def save_processed_pairs(processed_pairs_with_sources, filename):
    # Convert sets to lists for JSON serialization
    for form, word_lemmas in processed_pairs_with_sources.items():
        for word, lemmas in word_lemmas.items():
            for lemma, sources in lemmas.items():
                lemmas[lemma] = list(sources)

    # Save to file
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(processed_pairs_with_sources, f, ensure_ascii=False)

# Example usage of save function
save_processed_pairs(processed_pairs_with_sources, 'processed_pairs_with_sources.json')

Total processed words: 389296
Total words with no lemma: 0
Total deleted words: 0


# Run NLP pipeline on INCEpTION and Coda files

In [20]:
import spacy
from collections import OrderedDict

# Assuming the model is correctly installed:
nlp = spacy.load("grc_proiel_trf")  # Use your preferred model here



In [21]:
apostrophes = ["᾽", "᾿", "'", "’", "‘"]
apostrophe_map = str.maketrans("".join(apostrophes), "ʼ" * len(apostrophes))

# Assuming 'inception_sentences' is a list of tuples like (sentence_text, source_file),
# and 'df' is your DataFrame containing Coda sentences.
sentences = OrderedDict()

# Add inception sentences with source file
for sentence_text, source_file in inception_sentences:
    cleaned_sentence = ' '.join(sentence_text.translate(apostrophe_map).split()).strip()
    # (sentence, source_type, source_file)
    sentences[cleaned_sentence] = ('Inception', source_file)

# Add Coda sentences from DataFrame
for sentence in df['Quote'].tolist():
    cleaned_sentence = ' '.join(sentence.translate(apostrophe_map).split()).strip()
    
    if cleaned_sentence not in sentences:
        # No explicit source file for Coda, assuming not available or using a generic placeholder if needed
        sentences[cleaned_sentence] = ('Coda', 'Unknown/Not Applicable')

# Convert OrderedDict back to a list of tuples if needed for further processing
# (sentence, source_type, source_file)
sentences_list = list(sentences.items())

In [22]:
import pandas as pd
from tqdm.auto import tqdm
from collections import OrderedDict
from typing import List, Dict
from spacy.tokens import Doc
from spacy.language import Language
from unicodedata import normalize

# Assuming 'processed_pairs_with_sources', 'nlp' (a SpaCy Language model), and 'sentences_list' are already defined

def process_sentences(sentences_list, nlp, processed_pairs):
    docs_nfkd: List[Doc] = []
    docs_nfkc: List[Doc] = []

    corrections_nfkd = []
    corrections_nfkc = []
    
    sentences_and_metadata = [
        (normalize(form, sentence), (source_type, source_file, form))
        for sentence, (source_type, source_file) in sentences_list
        for form in ['NFKD', 'NFKC']
    ]

    sentences_for_processing = [sentence for sentence, _ in sentences_and_metadata]
    metadata = [meta for _, meta in sentences_and_metadata]

    for doc, meta in tqdm(zip(nlp.pipe(sentences_for_processing), metadata), total=len(sentences_for_processing)):
        form = meta[2]  # 'NFKD' or 'NFKC'
        
        # Determine the correct list to append corrections based on the form
        corrections_list = corrections_nfkd if form == 'NFKD' else corrections_nfkc
        
        for token in doc:
            if token.text in processed_pairs[form]:
                lemma_sources = processed_pairs[form][token.text]
                for lemma, sources in lemma_sources.items():
                    if lemma != token.lemma_:
                        corrections_list.append({
                            'sentence': doc.text,
                            'source_type': meta[0],
                            'source_file': meta[1],
                            'token': token.text,
                            'lemma': token.lemma_,
                            'lemma_corrected': lemma,
                            'correction_source': ', '.join(sources)
                        })
                        token.lemma_ = lemma  # Correct the lemma in the Doc object
                        break
        # append the processed sentences to the appropriate list
        if form == 'NFKD':
            docs_nfkd.append(doc)
        if form == 'NFKC':
            docs_nfkc.append(doc)

    return corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc


In [None]:

# Get corrections for both normalization forms
corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc = process_sentences(sentences_list, nlp, processed_pairs_with_sources)

# Convert the corrections lists to DataFrames
corrections_df_nfkd = pd.DataFrame(corrections_nfkd)
corrections_df_nfkc = pd.DataFrame(corrections_nfkc)

print(f"Total corrections (NFKD): {len(corrections_df_nfkd)}")
print(f"Total corrections (NFKC): {len(corrections_df_nfkc)}")

In [None]:
# find how many corrected by each dictionary
corrections_df_nfkd.groupby('correction_source').count()

In [None]:
corrections_df_nfkc.groupby('correction_source').count()

The spacy dataset should be exported to '../corpus/' folder.\
More specifically:\
train to '..corpus/train/lemma_train/'\
dev to '../corpus/dev/lemma_dev/'\
test to '../corpus/test/lemma_test/

In [23]:
from sklearn.model_selection import train_test_split
from pathlib import Path
from spacy.tokens import DocBin

# Function to split docs into train, dev, test sets and save them
def split_and_save_docs(sentences_list, nlp, processed_pairs_with_sources, base_path: str = "../corpus"):
    # Call process_sentences to get the processed docs and corrections
    corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc = process_sentences(sentences_list, nlp, processed_pairs_with_sources)

    """
    Splits documents into train, test, and dev sets for both 'NFKD' and 'NFKC' normalization forms.
    Saves each set to disk in SpaCy's DocBin format.
    
    Args:
    - docs_nfkd: List of SpaCy Doc objects for 'NFKD' normalized text.
    - docs_nfkc: List of SpaCy Doc objects for 'NFKC' normalized text.
    - base_path: Base path to save the split documents.
    """
    assert docs_nfkd and docs_nfkc, "Document lists must not be empty"
    
    # Split documents for 'NFKD' normalization
    train_docs_nfkd, test_docs_nfkd = train_test_split(docs_nfkd, test_size=0.2, random_state=42)
    train_docs_nfkd, dev_docs_nfkd = train_test_split(train_docs_nfkd, test_size=0.25, random_state=42)  # Note the adjustment to 0.25 for dev to make the split 60/20/20

    # Split documents for 'NFKC' normalization
    train_docs_nfkc, test_docs_nfkc = train_test_split(docs_nfkc, test_size=0.2, random_state=42)
    train_docs_nfkc, dev_docs_nfkc = train_test_split(train_docs_nfkc, test_size=0.25, random_state=42)

    # Directories for saving split docs
    directories = ['train/lemma_train', 'dev/lemma_dev', 'test/lemma_test']
    forms = ['NFKD', 'NFKC']
    
    # Ensure directories exist
    for directory in directories:
        for form in forms:
            Path(f"{base_path}/{directory}/{form}").mkdir(parents=True, exist_ok=True)

    # Function to save docs to disk
    def save_docs(docs, path):
        doc_bin = DocBin(docs=docs)
        doc_bin.to_disk(path)
    
    # Saving documents
    save_paths = {'train': (train_docs_nfkd, 'train'), 'dev': (dev_docs_nfkd, 'dev'), 'test': (test_docs_nfkd, 'test')}
    for split, (docs, subset) in save_paths.items():
        save_docs(docs, f"{base_path}/{split}/lemma_{split}/{subset}_lemma_NFKD.spacy")

    save_paths = {'train': (train_docs_nfkc, 'train'), 'dev': (dev_docs_nfkc, 'dev'), 'test': (test_docs_nfkc, 'test')}
    for split, (docs, subset) in save_paths.items():
        save_docs(docs, f"{base_path}/{split}/lemma_{split}/{subset}_lemma_NFKC.spacy")
    
    print(f"Documents are successfully split and saved for 'NFKD' and 'NFKC' forms.")

split_and_save_docs(sentences_list, nlp, processed_pairs_with_sources)

  0%|          | 0/916 [00:00<?, ?it/s]

Documents are successfully split and saved for 'NFKD' and 'NFKC' forms.


In [None]:
# for complete dataset (non-split)

#corrections_nfkd, corrections_nfkc, docs_nfkd, docs_nfkc = process_sentences(sentences_list, nlp, processed_pairs_with_sources)

# save each one to DocBin
#def save_docs(docs, path):
#        doc_bin = DocBin(docs=docs)
#        doc_bin.to_disk(path)

#save_docs(docs_nfkd, "../corpus/train/lemma_train/NFKD/train_lemma_NFKD_full.spacy")
#save_docs(docs_nfkc, "../corpus/train/lemma_train/NFKD/train_lemma_NFKC_full.spacy")


## Process conllu greCy files

In [25]:
from spacy.tokens import DocBin
import spacy
from pathlib import Path

def modify_token_attributes(doc):
    """
    Modify token attributes in a doc according to specific rules.
    """
    for token in doc:
        
        # Lemmatizer rules: Set lemma to "" for trainable lemmatizer
        if token.lemma_ in ['', "_", '—', '-']:
            token.lemma_ = ''
            print(f"Adjusted lemma for token {token.text}, lemma: {token.lemma_}")
            
        # Tagger rules: Set POS tags to "" (empty string)
        if token.pos_ in ['', "_", '—', '-', 'X', 'END', 'MID']:
            token.pos_ = ""  # This won't work as spaCy does not allow direct setting of .pos_ after Doc creation
            print(f"Adjusted lemma for token {token.text}, POS: {token.pos_}")


        ## Dependency parser rules: Set dep_ to "None" for empty strings
        if token.dep_ in ['', "_", '—', '-']:
            token.dep_ = "None"
            
        if token.head.dep_ in ['', "_", '—', '-']:
            token.head.dep_ = "None"
        # Note: Direct modification of .dep_ and entity annotations can follow similar patterns

    return doc

def process_and_save_docs(input_dir, output_dir, nlp):
    """
    Load spaCy Docs from .spacy files, modify token attributes, and save to new .spacy files.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for doc_file in input_path.glob("*.spacy"):
        doc_bin = DocBin().from_disk(doc_file)
        output_bin = DocBin()

        for doc in doc_bin.get_docs(nlp.vocab):
            modified_doc = modify_token_attributes(doc)
            output_bin.add(modified_doc)

        output_filename = output_path / doc_file.name
        output_bin.to_disk(output_filename)
        print(f"Processed and saved {output_filename}")

# Example usage:
nlp = spacy.blank("en")  # Use the appropriate language model
#process_and_save_docs("path/to/input/folder", "path/to/output/folder", nlp)

In [26]:
process_and_save_docs("../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKC/", "../corpus/train/lemma_train", nlp)
process_and_save_docs("../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD/", "../corpus/train/lemma_train", nlp)

process_and_save_docs("../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKC/", "../corpus/train/lemma_train", nlp)
process_and_save_docs("../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD/", "../corpus/train/lemma_train", nlp)


Adjusted lemma for token ηλει, POS: 
Adjusted lemma for token ηλει, POS: 
Adjusted lemma for token λεμα, POS: 
Adjusted lemma for token σαβαχθανει, POS: 
Adjusted lemma for token κορβαν, POS: 
Adjusted lemma for token ἐφφαθα, POS: 
Adjusted lemma for token ελωι, POS: 
Adjusted lemma for token ελωι, POS: 
Adjusted lemma for token λεμα, POS: 
Adjusted lemma for token σαβαχθανει, POS: 
Adjusted lemma for token σαβαωθ, POS: 
Adjusted lemma for token μαραν, POS: 
Adjusted lemma for token ἀθα, POS: 
Adjusted lemma for token σαβαωθ, POS: 
Processed and saved ../corpus/train/lemma_train/grc_proiel-ud-train_NFKC.spacy
Adjusted lemma for token σαν, POS: 
Processed and saved ../corpus/train/lemma_train/grc_proiel-ud-test_NFKC.spacy
Adjusted lemma for token σπακα, POS: 
Adjusted lemma for token ρακα, POS: 
Adjusted lemma for token ταλιθα, POS: 
Adjusted lemma for token κουμ, POS: 
Processed and saved ../corpus/train/lemma_train/grc_proiel-ud-dev_NFKC.spacy
Adjusted lemma for token σαν, POS: 
Proce

In [27]:
process_and_save_docs("../assets/Lemmatization_training_files/test/lemma_train/spaCy", "../corpus/train/lemma_train", nlp)

Adjusted lemma for token μιμεισθαι, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token γενοισθην, lemma: 
Adjusted lemma for token κλειτοφων, lemma: 
Adjusted lemma for token μαχεισθαι, lemma: 
Adjusted lemma for token ἰωμενος, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token μειζονι, lemma: 
Adjusted lemma for token μυθολογουντες, lemma: 
Adjusted lemma for token δημιουργον, lemma: 
Adjusted lemma for token τοιαυται, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token μεγαροι, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token _, lemma: 
Adjusted lemma for token εὐρυθμον, lemma: 
Adjusted lemma for token ηττον, lemma: 
Adjusted lemma for token βιωτον, lemma: 
Adjusted lemma for token απτεον, lemma: 
Adjusted lemma for token ἀρεως, lemma: 
Adjusted lemma for token φυλατ

## Tests

In [None]:
# load a docbin
test_doc_bin = DocBin().from_disk("../corpus/train/lemma_train/grc_proiel-ud-train_NFKD.spacy")
# get the docs
test_docs = list(test_doc_bin.get_docs(nlp.vocab))
# check the first doc
print(test_docs[0].text)


In [None]:
#  list all unique dep attributes in docbin
dep_attributes = set()
for doc in test_docs:
    for token in doc:
        dep_attributes.add(token.pos_)

        
dep_attributes



# for each word in the found doc, make a datafarame with attributes



In [None]:
ner: use None as the IOB tag, as explained in Training NER on Incomplete Annotations #11114
tagger: use "" as the tag, e.g. ["", "V", "S", "J", ""]
parser: use None both for heads and deps e.g. [1, 1, 1, None] and ["nsubj", "ROOT", "dobj", None]
trainable_lemmatizer: set the lemma to "", e.g. ["", "like", "green", ""]