In [2]:
import os
import conllu
#from conllu import parse
import spacy


In [2]:
# if using vectors
#import spacy
#from spacy.language import Language
#spacy.prefer_gpu()
#nlp = spacy.load("grc_proiel_trf")
#!python -m spacy init vectors grc ../assets/grc_floret_cbow_nn2_xn10_b200k_dim300.floret ../vectors/large --mode floret

In [3]:
import unicodedata

def clean_text(text: str) -> str:
    """
    Strip diacritics from ``text`` and lowercase the result.

    The text is decomposed with NFKD, after which every nonspacing mark
    (Unicode category 'Mn') is dropped, except for the single mark that is
    explicitly kept (the Greek coronis).

    Args:
        text (str): The text to clean.

    Returns:
        str: The lowercased text without the dropped marks. If an unexpected
        error occurs while cleaning, the error is printed and ``text`` is
        returned unchanged.

    Raises:
        ValueError: If the input is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input must be a string.")

    kept_mark = '̓'  # Keep Greek coronis

    try:
        pieces = []
        for symbol in unicodedata.normalize('NFKD', text):
            is_nonspacing_mark = unicodedata.category(symbol) == 'Mn'
            if symbol == kept_mark or not is_nonspacing_mark:
                pieces.append(symbol)
        # str.lower() is Unicode-aware, so it also lowercases Greek letters
        return ''.join(pieces).lower()
    except Exception as e:
        # Best effort: report the problem and hand the input back untouched
        print(f"An error occurred: {e}")
        return text

In [4]:
import conllu
import os

def process_sentences(input_files, output_file, combine=False):
    """
    Processes .conllu files: cleans text, re-segments sentences on punctuation,
    and optionally combines multiple .conllu files.

    Every token's 'form' and 'lemma' are passed through ``clean_text``. Tokens
    are then regrouped: a new sentence is closed whenever a cleaned form is
    "." or "·", regardless of the sentence boundaries in the input. Token ids
    are renumbered from 1 inside each rebuilt sentence, and each rebuilt
    sentence gets a running ``sent_id`` and the placeholder text "NA".

    NOTE(review): only 'id' is renumbered. 'head' (and any other field that
    refers to token ids) keeps its original value, so dependency heads may no
    longer point at the intended token after re-segmentation.

    Args:
        input_files (list): List of paths to input .conllu files.
        output_file (str): Path to the output .conllu file.
        combine (bool, optional): If True, sentences from all input files are
            concatenated. If False (default), only the FIRST file in
            ``input_files`` is processed and the rest are ignored.
    """
    all_sentences = []

    for input_file in input_files:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()
        sentences = conllu.parse(text)
        if combine:
            all_sentences.extend(sentences)  # Combine sentences from all files
        else:
            all_sentences = sentences  # Use sentences from the current file only
            break  # Exit loop after the first file if not combining

    rebuilt_sentences = []
    current_sentence_tokens = []

    def close_current_sentence():
        # sent_id is the 1-based position of the sentence in the rebuilt list
        metadata = {"sent_id": str(len(rebuilt_sentences) + 1), "text": "NA"}
        rebuilt_sentences.append(conllu.TokenList(tokens=current_sentence_tokens, metadata=metadata))

    for sentence in all_sentences:
        for token in sentence:
            # Clean the text for 'form' and 'lemma'
            token['form'] = clean_text(token['form'])
            token['lemma'] = clean_text(token['lemma'])
            # Running id inside the sentence currently being rebuilt (starts at 1)
            token['id'] = len(current_sentence_tokens) + 1
            current_sentence_tokens.append(token)

            # A full stop or a middle dot closes the sentence being rebuilt
            if token["form"] in [".", "·"]:
                close_current_sentence()
                current_sentence_tokens = []  # Start collecting the next sentence

    # Finalize the last sentence if it doesn't end with specified punctuation
    if current_sentence_tokens:
        close_current_sentence()

    # serialize() already terminates each sentence with the blank line that
    # CoNLL-U requires, so nothing extra is written between sentences (an
    # additional "\n\n" would produce several consecutive blank lines).
    with open(output_file, "w", encoding="utf-8") as out_f:
        for sentence in rebuilt_sentences:
            out_f.write(sentence.serialize())

    # Print summary
    summary = "Combined" if combine else "Original"
    print(f"{summary} number of sentences from input files: {len(all_sentences)}")
    print(f"Rebuilt and separated {len(rebuilt_sentences)} sentences.\n")


# Example usage for processing a single file
# process_sentences(["input_file.conllu"], "output_file.conllu", combine=False)

# Example usage for combining multiple files into one output file
# process_sentences(["input_file1.conllu", "input_file2.conllu"], "combined_output.conllu", combine=True)

In [5]:
# Clean and re-segment every .conllu file found directly in `directory`,
# writing the results into its `test/` sub-directory.
directory = "../assets/Lemmatization_training_files"
cleaned_directory = "../assets/Lemmatization_training_files/test/"
# open(..., "w") does not create missing parent directories, so make sure the
# target directory exists before the first file is written.
os.makedirs(cleaned_directory, exist_ok=True)
# sorted() gives a stable processing order; os.listdir's order is arbitrary.
for entry in sorted(os.listdir(directory)):
    full_path = os.path.join(directory, entry)
    if os.path.isfile(full_path) and entry.endswith(".conllu"):
        stem = entry[:-len(".conllu")]
        process_sentences([full_path], cleaned_directory + stem + "_no_accents_NFKD.conllu")

Original number of sentences from input files: 3180
Rebuilt and separated 11680 sentences.

Original number of sentences from input files: 3577
Rebuilt and separated 6055 sentences.

Original number of sentences from input files: 2763
Rebuilt and separated 5860 sentences.

Original number of sentences from input files: 437
Rebuilt and separated 2695 sentences.

Original number of sentences from input files: 1
Rebuilt and separated 305 sentences.

Original number of sentences from input files: 4338
Rebuilt and separated 10022 sentences.

Original number of sentences from input files: 2896
Rebuilt and separated 1292 sentences.

Original number of sentences from input files: 1417
Rebuilt and separated 4615 sentences.

Original number of sentences from input files: 1476
Rebuilt and separated 3732 sentences.

Original number of sentences from input files: 3363
Rebuilt and separated 3363 sentences.

Original number of sentences from input files: 5834
Rebuilt and separated 3154 sentences.

Or

In [6]:
import os
import random
import unicodedata
import conllu
from pathlib import Path

def normalize_text(text, normalization_form='NFKD'):
    """
    Return ``text`` converted to the requested Unicode normalization form.

    ``normalization_form`` is handed straight to ``unicodedata.normalize``
    and defaults to 'NFKD'.
    """
    normalized = unicodedata.normalize(normalization_form, text)
    return normalized

def adjust_tokens_for_spacy(sentences):
    """
    Replace empty or placeholder token fields with fixed defaults, in place.

    For every token of every sentence:
      * a placeholder 'form' is replaced by the token's 'lemma', or by "_"
        when the lemma is a placeholder as well;
      * a placeholder 'lemma' becomes "_";
      * an empty-string 'id' becomes "UNK";
      * a placeholder 'upos' becomes "_", and the values 'END' / 'MID'
        become "NOUN".

    The tokens are modified in place and the same ``sentences`` object is
    returned.
    """
    # Values treated as "no real content" for form, lemma and upos
    placeholders = ('', "_", '—', '-')

    for sentence in sentences:
        for token in sentence:
            form_is_blank = token["form"] in placeholders
            lemma_is_blank = token["lemma"] in placeholders

            # Forms fall back to the lemma; lemmas fall back to "_"
            if form_is_blank:
                token["form"] = "_" if lemma_is_blank else token["lemma"]
            if lemma_is_blank:
                token["lemma"] = "_"

            # Arbitrary marker for tokens that came without an id
            if token["id"] == '':
                token["id"] = "UNK"

            upos = token["upos"]
            if upos in placeholders:
                token["upos"] = "_"
            elif upos in ('END', 'MID'):
                # Non-standard tags found in the data are mapped to NOUN
                token["upos"] = "NOUN"

    return sentences

def process_and_normalize_files(input_directory, output_directory, normalization_form='NFKD'):
    """
    Normalize every .conllu file in a directory and split it into train/dev sets.

    For each ``*.conllu`` file found directly in ``input_directory`` the
    sentences are parsed, passed through ``adjust_tokens_for_spacy``, and the
    'form' and 'lemma' of every token are normalized with ``normalize_text``.
    The sentences are then shuffled with a fixed seed (42) and split 80/20
    into ``<name>_<normalization_form>_train.conllu`` and
    ``<name>_<normalization_form>_dev.conllu`` inside ``output_directory``.

    Args:
        input_directory (str): Directory containing the input .conllu files.
        output_directory (str): Directory for the train/dev files; created
            if it does not exist.
        normalization_form (str, optional): Unicode normalization form passed
            to ``normalize_text``. Defaults to 'NFKD'.

    Returns:
        None. If ``input_directory`` does not exist an error is printed and
        nothing is written.
    """
    # Validate the input first, so a wrong path does not leave an empty
    # output directory behind.
    if not os.path.exists(input_directory):
        print(f"Error: The input directory '{input_directory}' does not exist.")
        return

    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)
        print(f"Created output directory: {output_directory}")

    for file_name in os.listdir(input_directory):
        if not file_name.endswith(".conllu"):
            continue
        print("\n", file_name)

        # Read and parse; the context manager closes the handle right away
        file_path = os.path.join(input_directory, file_name)
        with open(file_path, "r", encoding="utf-8") as source:
            sentences = conllu.parse(source.read())

        # Adjust tokens before normalization and spaCy conversion
        sentences = adjust_tokens_for_spacy(sentences)

        for sentence in sentences:
            for token in sentence:
                token["form"] = normalize_text(token["form"], normalization_form)
                token["lemma"] = normalize_text(token["lemma"], normalization_form)

        # A dedicated generator seeded with 42 yields the same permutation as
        # seeding the module-level generator, without resetting global RNG
        # state for the rest of the notebook.
        random.Random(42).shuffle(sentences)

        # First 80% of the shuffled sentences for training, the rest for dev
        split_index = int(len(sentences) * 0.8)
        train_data, dev_data = sentences[:split_index], sentences[split_index:]

        stem = file_name[:-len(".conllu")]
        train_output_file = os.path.join(output_directory, f"{stem}_{normalization_form}_train.conllu")
        dev_output_file = os.path.join(output_directory, f"{stem}_{normalization_form}_dev.conllu")
        for output_path, subset in ((train_output_file, train_data), (dev_output_file, dev_data)):
            with open(output_path, "w", encoding="utf-8") as output_file:
                for sentence in subset:
                    output_file.write(sentence.serialize())

        print(f"Processed and normalized {file_name}. Train and dev data saved.")

# Example usage:
# Make sure to specify your actual paths for the input directory and output directory
# process_and_normalize_files("../assets/Lemmatization_training_files/test", "../assets/Lemmatization_training_files/lemma_train", "NFKD")

In [7]:
# Shuffle, split and NFKD-normalize the cleaned files into train/dev sets.
process_and_normalize_files(
    input_directory="../assets/Lemmatization_training_files/test",
    output_directory="../assets/Lemmatization_training_files/test/lemma_train",
    normalization_form="NFKD",
)


 euripides_no_accents_NFKD.conllu
Processed and normalized euripides_no_accents_NFKD.conllu. Train and dev data saved.

 plato_ii_no_accents_NFKD.conllu
Processed and normalized plato_ii_no_accents_NFKD.conllu. Train and dev data saved.

 lucian_no_accents_NFKD.conllu
Processed and normalized lucian_no_accents_NFKD.conllu. Train and dev data saved.

 plato_iii_no_accents_NFKD.conllu
Processed and normalized plato_iii_no_accents_NFKD.conllu. Train and dev data saved.

 orators_no_accents_NFKD.conllu
Processed and normalized orators_no_accents_NFKD.conllu. Train and dev data saved.

 homer_no_accents_NFKD.conllu
Processed and normalized homer_no_accents_NFKD.conllu. Train and dev data saved.

 plutarch_no_accents_NFKD.conllu
Processed and normalized plutarch_no_accents_NFKD.conllu. Train and dev data saved.

 thucydides_no_accents_NFKD.conllu
Processed and normalized thucydides_no_accents_NFKD.conllu. Train and dev data saved.

 aristotle_no_accents_NFKD.conllu
Processed and normalized 

In [8]:
# Same split as the NFKD run, but with NFKC-normalized forms and lemmas.
process_and_normalize_files(
    input_directory="../assets/Lemmatization_training_files/test",
    output_directory="../assets/Lemmatization_training_files/test/lemma_train",
    normalization_form="NFKC",
)


 euripides_no_accents_NFKD.conllu
Processed and normalized euripides_no_accents_NFKD.conllu. Train and dev data saved.

 plato_ii_no_accents_NFKD.conllu
Processed and normalized plato_ii_no_accents_NFKD.conllu. Train and dev data saved.

 lucian_no_accents_NFKD.conllu
Processed and normalized lucian_no_accents_NFKD.conllu. Train and dev data saved.

 plato_iii_no_accents_NFKD.conllu
Processed and normalized plato_iii_no_accents_NFKD.conllu. Train and dev data saved.

 orators_no_accents_NFKD.conllu
Processed and normalized orators_no_accents_NFKD.conllu. Train and dev data saved.

 homer_no_accents_NFKD.conllu
Processed and normalized homer_no_accents_NFKD.conllu. Train and dev data saved.

 plutarch_no_accents_NFKD.conllu
Processed and normalized plutarch_no_accents_NFKD.conllu. Train and dev data saved.

 thucydides_no_accents_NFKD.conllu
Processed and normalized thucydides_no_accents_NFKD.conllu. Train and dev data saved.

 aristotle_no_accents_NFKD.conllu
Processed and normalized 

## Preparing spaCy Files

In [9]:
import os
import conllu
import subprocess

def validate_head_indices(sentences):
    """
    Check that every token's head refers to a token of its own sentence or to the root.

    Validation stops at the first problem found: a message naming the token
    and the sentence's 'text' metadata ('NA' if absent) is printed and False
    is returned.

    Args:
        sentences (List[TokenList]): List of sentences parsed from a .conllu file.

    Returns:
        bool: True if every token has a head that is either 0 (root) or the
        id of a token in the same sentence, False otherwise.
    """
    for sentence in sentences:
        # Ids that a head is allowed to point at within this sentence
        known_ids = set()
        for token in sentence:
            known_ids.add(token["id"])

        for token in sentence:
            head = token.get("head")  # None when the entry is missing or empty

            if head is None:
                problem = f"Missing head for token '{token['form']}'"
            elif head in known_ids or head == 0:
                # Points at a sibling token or at the root: nothing to report
                continue
            else:
                problem = f"Invalid head index {head} for token '{token['form']}'"

            print(f"{problem} in sentence: {sentence.metadata.get('text', 'NA')}")
            return False

    return True

def read_and_parse_conllu(file_path):
    """
    Reads and parses a .conllu file from the given path.

    Args:
        file_path (str): Path of the UTF-8 encoded .conllu file.

    Returns:
        The sentences returned by ``conllu.parse`` for the file's content.
    """
    # This function only reads: directory checks live in process_conllu_files
    # and convert_to_spacy, which actually receive the directories as
    # arguments (no reliance on notebook-level globals).
    with open(file_path, "r", encoding="utf-8") as data:
        annotations = data.read()
    return conllu.parse(annotations)


def convert_to_spacy(file_path, output_directory, sentences):
    """
    Converts a .conllu file to spaCy format by running ``spacy convert``.

    The converter is invoked with ``-c conllu -m --merge-subtokens``;
    ``--n-sents 10`` is added when the file holds at least 10 sentences.
    A failing conversion is reported with a printed message, not raised.

    Args:
        file_path (str): Path of the .conllu file to convert.
        output_directory (str): Directory that receives the converted file;
            created if it does not exist.
        sentences: Parsed sentences of the file; only their count is used.
    """
    import sys

    # Create the target here so the conversion does not depend on the
    # directory having been created somewhere else beforehand.
    os.makedirs(output_directory, exist_ok=True)

    # Build the argument list directly: splitting a formatted command string
    # on whitespace would break paths that contain spaces. sys.executable
    # runs the conversion with the interpreter that is executing this code.
    convert_command = [
        sys.executable or "python", "-m", "spacy", "convert",
        file_path, output_directory,
        "-c", "conllu", "-m", "--merge-subtokens",
    ]
    if len(sentences) >= 10:
        convert_command += ["--n-sents", "10"]

    try:
        subprocess.run(convert_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error converting file '{os.path.basename(file_path)}':", e)


def process_conllu_files(input_directory, output_directory):
    """
    Converts all .conllu files in the input directory to spaCy format.

    Each file is parsed (to count its sentences) and handed to
    ``convert_to_spacy``. Head-index validation (``validate_head_indices``)
    is currently not applied: every file is converted.

    Args:
        input_directory (str): Directory containing the .conllu files.
        output_directory (str): Directory that receives the converted files.
    """
    if not os.path.exists(input_directory):
        print(f"Error: The input directory '{input_directory}' does not exist.")
        return

    for file_name in os.listdir(input_directory):
        if file_name.endswith(".conllu"):
            file_path = os.path.join(input_directory, file_name)

            print(f"\nProcessing file: {file_name}")
            sentences = read_and_parse_conllu(file_path)
            convert_to_spacy(file_path, output_directory, sentences)
# Example usage
#input_directory = "../assets/Lemmatization_training_files/test/"
#output_directory = "../assets/Lemmatization_training_files/test/"
#process_conllu_files(input_directory, output_directory)

In [10]:
# Convert every train/dev .conllu split into a .spacy file under spaCy/.
input_directory = "../assets/Lemmatization_training_files/test/lemma_train/"
output_directory = "../assets/Lemmatization_training_files/test/lemma_train/spaCy/"
process_conllu_files(
    input_directory=input_directory,
    output_directory=output_directory,
)


Processing file: aristophanes_no_accents_NFKD_NFKD_dev.conllu
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (85 documents):
../assets/Lemmatization_training_files/test/lemma_train/spaCy/aristophanes_no_accents_NFKD_NFKD_dev.spacy[0m

Processing file: plato_iv_no_accents_NFKD_NFKC_train.conllu
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (25 documents):
../assets/Lemmatization_training_files/test/lemma_train/spaCy/plato_iv_no_accents_NFKD_NFKC_train.spacy[0m

Processing file: aristophanes_no_accents_NFKD_NFKD_train.conllu
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (337 documents):
../assets/Lemmatization_training_files/test/lemma_train/spaCy/aristophanes_no_accents_NFKD_NFKD_train.spacy[0m

Processing file: lucian_no_accents_NFKD_NFKD_train.conllu
[38;5;4mℹ Grouping every 10 sentences into a document.[0m
[38;5;2m✔ Generated output file (

In [None]:
import os
import conllu
import unicodedata

def process_conllu_files(input_directory, output_directory, normalization_form):
    """
    Clean and normalize every .conllu file of a directory into a new directory.

    For each ``*.conllu`` file found directly in ``input_directory`` the
    'form' and 'lemma' of every token are passed through ``clean_text`` and
    then ``normalize_text`` with ``normalization_form``. The result is written
    to ``output_directory`` as ``<name>_<normalization_form>.conllu``.

    NOTE: this function has the same name as the two-argument
    ``process_conllu_files`` defined earlier in the notebook; running this
    cell replaces that earlier definition.
    """
    # Ensure output directory exists
    os.makedirs(output_directory, exist_ok=True)

    conllu_names = (name for name in os.listdir(input_directory) if name.endswith(".conllu"))
    for file_name in conllu_names:
        source_path = os.path.join(input_directory, file_name)
        with open(source_path, "r", encoding="utf-8") as source:
            sentences = conllu.parse(source.read())

        target_path = os.path.join(output_directory, f"{file_name[:-7]}_{normalization_form}.conllu")

        # Sentences are cleaned and written one at a time
        with open(target_path, "w", encoding="utf-8") as target:
            for sentence in sentences:
                for token in sentence:
                    for field in ("form", "lemma"):
                        token[field] = normalize_text(clean_text(token[field]), normalization_form)
                target.write(sentence.serialize())

        print(f"Processed file: {file_name}")

# Example usage:
#input_directory = "../assets/UD_Ancient_Greek-Perseus"
#output_directory_nfkd = "../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD"
#process_conllu_files(input_directory, output_directory_nfkd, 'NFKD')

#output_directory_nfkc = "../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKC"
#process_conllu_files(input_directory, output_directory_nfkc, 'NFKC')

In [None]:
# Clean and normalize the UD Perseus treebank, once per normalization form.
input_directory = "../assets/UD_Ancient_Greek-Perseus"
output_directory_nfkd = "../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD"
process_conllu_files(input_directory, output_directory_nfkd, 'NFKD')

output_directory_nfkc = "../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKC"
process_conllu_files(input_directory, output_directory_nfkc, 'NFKC')

# Same for the UD PROIEL treebank.
input_directory = "../assets/UD_Ancient_Greek-PROIEL"
# The directory name must end in "_NFKD": the conversion and dataset-test
# cells below read from UD_Ancient_Greek-PROIEL_NFKD.
output_directory_nfkd = "../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD"
process_conllu_files(input_directory, output_directory_nfkd, 'NFKD')

output_directory_nfkc = "../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKC"
process_conllu_files(input_directory, output_directory_nfkc, 'NFKC')

In [None]:
# Convert the normalized UD_Ancient_Greek-Perseus and UD_Ancient_Greek-PROIEL
# .conllu files to spaCy format. Each command uses the same directory as both
# input and output, once for the NFKD and once for the NFKC variant.
!python -m spacy convert ../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD/ ../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD/ -c conllu -m --n-sents 10 --merge-subtokens
!python -m spacy convert ../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKC/ ../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKC/ -c conllu -m --n-sents 10 --merge-subtokens

!python -m spacy convert ../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD/ ../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD/ -c conllu -m --n-sents 10 --merge-subtokens
!python -m spacy convert ../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKC/ ../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKC/ -c conllu -m --n-sents 10 --merge-subtokens



# Dataset tests

In [None]:
# DocBin is not imported anywhere else in the notebook.
from spacy.tokens import DocBin

# DocBin.get_docs needs a Vocab. No cell above defines `nlp` (the only
# assignment is commented out), so fall back to a blank Ancient Greek
# pipeline; an `nlp` that already exists is left untouched.
if "nlp" not in globals():
    nlp = spacy.blank("grc")

# Lemmatizer training corpus
lemma_train = DocBin().from_disk('../corpus/train/lemma_train/lemma_train.spacy')
lemma_train_docs = list(lemma_train.get_docs(nlp.vocab))

# UD PROIEL training sets (NFKD / NFKC)
PROIEL_NFKD = DocBin().from_disk('../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKD/grc_proiel-ud-train_NFKD.spacy')
PROIEL_NFKD_docs = list(PROIEL_NFKD.get_docs(nlp.vocab))

# File name suffix is "_NFKC" (the normalization form), not "_NKFC".
PROIEL_NFKC = DocBin().from_disk('../assets/UD_Ancient_Greek-PROIEL/UD_Ancient_Greek-PROIEL_NFKC/grc_proiel-ud-train_NFKC.spacy')
PROIEL_NFKC_docs = list(PROIEL_NFKC.get_docs(nlp.vocab))

# UD Perseus training sets (NFKD / NFKC)
Perseus_NFKD = DocBin().from_disk('../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKD/grc_perseus-ud-train_NFKD.spacy')
Perseus_NFKD_docs = list(Perseus_NFKD.get_docs(nlp.vocab))

Perseus_NFKC = DocBin().from_disk('../assets/UD_Ancient_Greek-Perseus/UD_Ancient_Greek-Perseus_NFKC/grc_perseus-ud-train_NFKC.spacy')
Perseus_NFKC_docs = list(Perseus_NFKC.get_docs(nlp.vocab))

In [None]:
# Iterate through the sentences in lemma_train. If a sentence also occurs in
# any of the UD corpora, print the name of that corpus and the sentence.
#
# Membership is tested on sentence text, not on Doc objects: docs loaded from
# different DocBins are separate objects, and a document groups several
# sentences, so `doc in other_docs` does not answer the question above.
# NOTE(review): spaCy's Doc does not appear to define content equality — confirm.

def _iter_sentence_texts(docs):
    """Yield the text of every sentence in ``docs``, in order.

    A doc without sentence boundaries is yielded as a single text.
    """
    for doc in docs:
        if doc.has_annotation("SENT_START"):
            for sent in doc.sents:
                yield sent.text
        else:
            yield doc.text


# One set of sentence texts per reference corpus, for constant-time lookups.
reference_sentences = {
    "PROIEL_NFKD": set(_iter_sentence_texts(PROIEL_NFKD_docs)),
    "PROIEL_NFKC": set(_iter_sentence_texts(PROIEL_NFKC_docs)),
    "Perseus_NFKD": set(_iter_sentence_texts(Perseus_NFKD_docs)),
    "Perseus_NFKC": set(_iter_sentence_texts(Perseus_NFKC_docs)),
}

for sentence_text in _iter_sentence_texts(lemma_train_docs):
    for corpus_name, known_texts in reference_sentences.items():
        # Every corpus is checked with the same "is contained in" test
        if sentence_text in known_texts:
            print(corpus_name)
            print(sentence_text)
        

In [None]:
# Print the text of every lemma_train doc that contains "XLV.".
for text in (doc.text for doc in lemma_train_docs):
    if "XLV." not in text:
        continue
    print(text)


