In [15]:
import os
import re
from trankit import Pipeline

# Folder that holds the raw .txt documents to lemmatize.
input_folder = 'example'

# Destination folder for each Trankit language. The extra 'all' entry receives
# the text after it has been passed through every pipeline in turn.
output_folders = {
    'ancient-greek': 'lemmatized_trankit_ag',
    'ancient-greek-perseus': 'lemmatized_trankit_agp',
    'greek': 'lemmatized_trankit_g',
    'all': 'lemmatized_trankit_all',
}

# One Trankit pipeline per language, keyed by the same codes as output_folders.
# Each Pipeline(...) call loads its own models, so this cell is slow to run.
pipelines = {
    language: Pipeline(language)
    for language in ('ancient-greek', 'ancient-greek-perseus', 'greek')
}

# Personal-pronoun forms (weak and strong) that are dropped from the output.
# The list keeps its historical name `noun`; duplicates are harmless because
# the list is only used for membership tests.
# Fix: the entry for "αυτό" used to end in the Cyrillic letters "то", so it
# could never match Greek text.
noun = ['με', 'μας', 'σε', 'σας', 'τον', 'τους', 'την', 'τις', 'το', 'τα', 'μου', 'μας', 'σου', 'σας', 'του', 'τους',
        'της', 'τους', 'του', 'τους', 'εγώ', 'εμείς', 'εσύ', 'εσείς', 'αυτός', 'αυτοί', 'αυτή', 'αυτές', 'αυτό', 'αυτά']

# Cardinal numerals that are dropped from the output.
# Fix: "ογδόντα" used to end in a Cyrillic "а" and therefore never matched.
# NOTE(review): the multi-word entries ("δέκα χιλιάδες", ...) are compared
# against single lemmas downstream, so they presumably never match -- confirm
# whether they are still needed.
numerals = [
    "ένα", "δύο", "τρία", "τέσσερα", "πέντε", "έξι", "επτά", "οκτώ", "εννέα", "δέκα", "είκοσι", "τριάντα",
    "σαράντα", "πενήντα", "εξήντα", "εβδομήντα", "ογδόντα", "ενενήντα", "εκατό", "διακόσια", "τριακόσια",
    "τετρακόσια", "πεντακόσια", "εξακόσια", "επτακόσια", "οκτακόσια", "εννιακόσια", "χίλια", "δέκα χιλιάδες",
    "εκατό χιλιάδες", "ένα εκατομμύριο"
]

# Combined stop-word list: pronouns followed by numerals.
words_to_remove = noun + numerals

# Maps every precomposed monotonic Greek letter that carries a tonos to the
# same letter without it.
tonos_replacements = {
    'ά': 'α',
    'έ': 'ε',
    'ή': 'η',
    'ί': 'ι',
    'ό': 'ο',
    'ύ': 'υ',
    'ώ': 'ω',
    'Ά': 'Α',
    'Έ': 'Ε',
    'Ή': 'Η',
    'Ί': 'Ι',
    'Ό': 'Ο',
    'Ύ': 'Υ',
    'Ώ': 'Ω',
    # Letters with dialytika *and* tonos were missing from the table, so their
    # tonos was never removed. Only the tonos is dropped; the diaeresis stays.
    '\u0390': '\u03ca',  # iota with dialytika and tonos -> iota with dialytika
    '\u03b0': '\u03cb',  # upsilon with dialytika and tonos -> upsilon with dialytika
}


def remove_tonos_greek(text):
    """Return ``text`` with the tonos removed from Greek letters.

    Only the precomposed characters listed in ``tonos_replacements`` are
    rewritten; every other character is returned unchanged.

    Args:
        text: The string to normalise.

    Returns:
        A new string of the same length with the accented letters replaced.
    """
    # The table is rebuilt on each call so later edits to tonos_replacements
    # are honoured. Keys are single characters and no replacement value is
    # itself a key, so one translate pass equals the former chain of replaces.
    return text.translate(str.maketrans(tonos_replacements))

# Strip the tonos from every stop word, updating the existing list in place.
# The input text has its tonos removed before lemmatization, so the stop words
# are brought to the same accent-free form.
words_to_remove[:] = [remove_tonos_greek(word) for word in words_to_remove]

# Placeholder written to the output instead of the lemma for tokens whose
# universal POS tag is one of these keys.
label_dict = dict(PROPN='person1', PRON='pron1', NUM='ordinal1')

Loading pretrained XLM-Roberta, this may take a while...




Loading tokenizer for ancient-greek
Loading tagger for ancient-greek
Loading lemmatizer for ancient-greek
Active language: ancient-greek
Loading pretrained XLM-Roberta, this may take a while...
Loading tokenizer for ancient-greek-perseus
Loading tagger for ancient-greek-perseus
Loading lemmatizer for ancient-greek-perseus
Active language: ancient-greek-perseus
Loading pretrained XLM-Roberta, this may take a while...
Loading tokenizer for greek
Loading tagger for greek
Loading multi-word expander for greek
Loading lemmatizer for greek
Active language: greek


In [16]:
# Make sure every destination folder exists before any file is written;
# folders that are already present are left untouched.
for out_dir in output_folders.values():
    os.makedirs(out_dir, exist_ok=True)

def clean_text(text):
    """Delete every character that is not on the allowed list.

    Allowed characters are: unaccented Greek letters, Greek letters with
    dialytika, ASCII digits, whitespace and the punctuation
    ``. , ; : ! ? ( ) [ ] { } ' " « » -``.

    The previous character class covered only the ranges α-ω and Α-Ω, which
    leave out the dialytika letters (U+0390, U+03AA, U+03AB, U+03B0, U+03CA,
    U+03CB). Those letters were deleted from the middle of words, corrupting
    them before lemmatization; they are now kept.

    NOTE(review): letters that carry a tonos and all polytonic letters
    (U+1F00-U+1FFF) are still deleted. Callers are expected to strip accents
    first; confirm this is intended for the Ancient Greek inputs.

    Args:
        text: The string to filter.

    Returns:
        ``text`` with all disallowed characters removed.
    """
    cleaned_text = re.sub(
        r'[^0-9α-ωΑ-Ω\u0390\u03aa\u03ab\u03b0\u03ca\u03cb\s\.,;:!?\(\)\[\]\{\}\'"«»\-]',
        '',
        text,
    )
    return cleaned_text

def remove_words_to_remove(text, words_to_remove):
    """Drop the whitespace-separated words of ``text`` listed in ``words_to_remove``.

    The comparison is exact: it is case-sensitive, and punctuation attached to
    a word is part of that word. The remaining words are joined with single
    spaces, so the original whitespace is not preserved.

    Args:
        text: The string to filter.
        words_to_remove: A container supporting ``in`` with the words to drop.

    Returns:
        The surviving words joined by single spaces.
    """
    kept_words = []
    for candidate in text.split():
        if candidate in words_to_remove:
            continue
        kept_words.append(candidate)
    return ' '.join(kept_words)

def process_text_with_trankit(text, pipeline, labels=None, stopwords=None):
    """Lemmatize ``text`` with ``pipeline`` and return the lemmas as one string.

    For every word in the pipeline output:

    * a word without a ``upos`` value is skipped;
    * a word whose ``upos`` is a key of ``labels`` is replaced by that
      placeholder;
    * punctuation (``PUNCT``) is dropped;
    * any other word contributes its lower-cased lemma (falling back to the
      surface text when no lemma is available), unless that lemma is a stop
      word.

    Changes compared with the previous version:

    * Multi-word tokens are read through their ``expanded`` word list. The
      surface token of such a token carries no ``upos``, so the whole token
      used to be dropped silently.
    * A blank string returns ``''`` without calling the pipeline. There is
      nothing to lemmatize, and Trankit is expected to reject empty input.
    * A ``lemma`` that is present but empty/``None`` no longer crashes on
      ``.lower()``.

    Args:
        text: The text to process.
        pipeline: A callable returning a dict with a ``'sentences'`` list,
            each sentence holding a ``'tokens'`` list of dicts.
        labels: Optional ``upos -> placeholder`` mapping. Defaults to the
            module-level ``label_dict``.
        stopwords: Optional iterable of lemmas to drop. Defaults to the
            module-level ``words_to_remove``.

    Returns:
        The placeholders and lemmas joined by single spaces.
    """
    # Only strings are checked, so any other input type is still passed through.
    if isinstance(text, str) and not text.strip():
        return ''

    if labels is None:
        labels = label_dict
    if stopwords is None:
        stopwords = words_to_remove
    # Hashed lookup instead of scanning a list for every lemma.
    stopwords = frozenset(stopwords)

    processed_text = pipeline(text)
    prepared_text = []

    for sentence in processed_text['sentences']:
        for token in sentence['tokens']:
            # A multi-word token keeps its syntactic words under 'expanded';
            # an ordinary token is its own single word.
            for word in token.get('expanded') or [token]:
                upos = word.get('upos')
                if upos is None:
                    continue

                if upos in labels:
                    prepared_text.append(labels[upos])
                elif upos != 'PUNCT':
                    lemma = (word.get('lemma') or word['text']).lower()
                    if lemma not in stopwords:
                        prepared_text.append(lemma)

    return ' '.join(prepared_text)

# Lemmatize every .txt file of input_folder. For each file this writes one
# output per language pipeline, plus an 'all' output obtained by feeding the
# text through every pipeline in turn.
# Files are visited in sorted order so repeated runs behave identically
# (os.listdir returns names in arbitrary order).
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.txt'):
        continue

    input_path = os.path.join(input_folder, filename)
    with open(input_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Normalise first: strip the tonos, then drop every disallowed character.
    text = remove_tonos_greek(text)
    text = clean_text(text)

    combined_text = text
    for position, (lang_code, pipeline) in enumerate(pipelines.items()):
        # Blank text has nothing to lemmatize; skip the model call, which is
        # expected to reject empty input.
        if text.strip():
            processed_text = process_text_with_trankit(text, pipeline)
        else:
            processed_text = ''

        output_path = os.path.join(output_folders[lang_code], filename)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(processed_text)

        if position == 0:
            # The first step of the chain is this same pipeline applied to
            # this same text, so the result is reused rather than recomputed.
            combined_text = processed_text
        elif combined_text.strip():
            # Later steps receive the previous pipeline's output. Once the
            # chain has become empty it stays empty.
            combined_text = process_text_with_trankit(combined_text, pipeline)

    combined_output_path = os.path.join(output_folders['all'], filename)
    with open(combined_output_path, 'w', encoding='utf-8') as combined_output_file:
        combined_output_file.write(combined_text)

# The recorded cell output ends with this completion message, but no statement
# produced it; print it so that a fresh run reproduces the saved output.
print('Лемматизация и обработка завершены.')

Лемматизация и обработка завершены.
