In [3]:
import os
import re
import stanza

# Directory that is scanned for input documents; the cells below only pick up
# files whose name ends in '.txt'.
input_folder = 'example'
# Destination directory for each result set. Every key except 'all' matches a
# key of `pipelines`; 'all' receives the output of the chained run in which the
# text is passed through every pipeline in turn.
output_folders = {
    'ancient-greek-proiel': 'exam_lemmatized_stanza_ag',
    'ancient-greek-perseus': 'exam_lemmatized_stanza_agp',
    'greek-1': 'exam_lemmatized_stanza_g_1',
    'greek-2': 'exam_lemmatized_stanza_g_2',
    'all': 'exam_lemmatized_stanza_all'
}

# Options shared by every pipeline below: models are looked up in the local
# 'stanza_resources' directory and a GPU is requested.
# NOTE(review): download_method=None presumably disables model downloads so only
# local files are used -- confirm against the Stanza documentation.
# NOTE(review): the recorded run logged "Using device: cpu" despite use_gpu=True.
_shared_pipeline_options = {
    'model_dir': 'stanza_resources',
    'download_method': None,
    'use_gpu': True,
}

# One Stanza pipeline per model, keyed by the same names used in
# `output_folders`. Construction order (and therefore iteration order) is:
# Ancient Greek default package, Ancient Greek 'proiel', Modern Greek default
# package (with the mwt processor), Modern Greek 'gud'.
pipelines = {
    'ancient-greek-perseus': stanza.Pipeline(
        lang='grc',
        processors='tokenize, pos, lemma',
        **_shared_pipeline_options,
    ),
    'ancient-greek-proiel': stanza.Pipeline(
        lang='grc',
        processors='tokenize, pos, lemma',
        package='proiel',
        **_shared_pipeline_options,
    ),
    'greek-1': stanza.Pipeline(
        lang='el',
        processors='tokenize, mwt, pos, lemma',
        **_shared_pipeline_options,
    ),
    'greek-2': stanza.Pipeline(
        lang='el',
        processors='tokenize, pos, lemma',
        package='gud',
        **_shared_pipeline_options,
    ),
}


2025-03-14 07:58:41 INFO: Loading these models for language: grc (Ancient_Greek):
| Processor | Package          |
--------------------------------
| tokenize  | perseus          |
| pos       | perseus_nocharlm |
| lemma     | perseus_nocharlm |

2025-03-14 07:58:41 INFO: Using device: cpu
2025-03-14 07:58:41 INFO: Loading: tokenize
2025-03-14 07:58:41 INFO: Loading: pos
2025-03-14 07:58:41 INFO: Loading: lemma
2025-03-14 07:58:41 INFO: Done loading processors!
2025-03-14 07:58:41 INFO: Loading these models for language: grc (Ancient_Greek):
| Processor | Package         |
-------------------------------
| tokenize  | proiel          |
| pos       | proiel_nocharlm |
| lemma     | proiel_nocharlm |

2025-03-14 07:58:41 INFO: Using device: cpu
2025-03-14 07:58:41 INFO: Loading: tokenize
2025-03-14 07:58:41 INFO: Loading: pos
2025-03-14 07:58:41 INFO: Loading: lemma
2025-03-14 07:58:41 INFO: Done loading processors!
2025-03-14 07:58:41 INFO: Loading these models for language: el (Greek)

In [4]:
# Pronoun forms (weak and strong, all persons) that should be dropped from the
# lemmatized output. Some forms appear more than once because they serve
# several cases; the duplicates are harmless for membership tests.
# Every entry must be written with Greek letters only: look-alike Cyrillic or
# Latin characters never compare equal to text coming from a Greek document.
noun = ['με', 'μας', 'σε', 'σας', 'τον', 'τους', 'την', 'τις', 'το', 'τα', 'μου', 'μας', 'σου', 'σας', 'του', 'τους',
        'της', 'τους', 'του', 'τους', 'εγώ', 'εμείς', 'εσύ', 'εσείς', 'αυτός', 'αυτοί', 'αυτή', 'αυτές', 'αυτό', 'αυτά']
# Cardinal numerals that should be dropped as well.
# NOTE(review): the multi-word entries at the end are compared against single
# lemmas elsewhere in this file, so they only match a lemma that itself
# contains a space.
numerals = [
    "ένα", "δύο", "τρία", "τέσσερα", "πέντε", "έξι", "επτά", "οκτώ", "εννέα", "δέκα", "είκοσι", "τριάντα",
    "σαράντα", "πενήντα", "εξήντα", "εβδομήντα", "ογδόντα", "ενενήντα", "εκατό", "διακόσια", "τριακόσια",
    "τετρακόσια", "πεντακόσια", "εξακόσια", "επτακόσια", "οκτακόσια", "εννιακόσια", "χίλια", "δέκα χιλιάδες",
    "εκατό χιλιάδες", "ένα εκατομμύριο"
]
# Combined stop list (a new list object, so `noun` and `numerals` stay intact
# when this list is normalised later).
words_to_remove = noun + numerals

# Maps each monotonic Greek vowel carrying a tonos (acute accent) to the same
# vowel without it: the seven lowercase vowels first, then the seven uppercase.
tonos_replacements = dict((
    ('ά', 'α'), ('έ', 'ε'), ('ή', 'η'), ('ί', 'ι'), ('ό', 'ο'), ('ύ', 'υ'), ('ώ', 'ω'),
    ('Ά', 'Α'), ('Έ', 'Ε'), ('Ή', 'Η'), ('Ί', 'Ι'), ('Ό', 'Ο'), ('Ύ', 'Υ'), ('Ώ', 'Ω'),
))

def remove_tonos_greek(text):
    """Return `text` with accented Greek vowels replaced by their plain form.

    Every character that is a key of the module-level `tonos_replacements`
    mapping is swapped for the mapped value; all other characters are left
    untouched.

    Args:
        text: The string to normalise.

    Returns:
        A new string of the same length with the replacements applied.
    """
    # A single translate pass gives the same result as replacing the keys one
    # after another because every key is one character and no replacement
    # value is itself a key.
    accent_table = str.maketrans(tonos_replacements)
    return text.translate(accent_table)

# Strip the accents from the stop list in place (same list object), so its
# entries are comparable with text that went through remove_tonos_greek.
words_to_remove[:] = [remove_tonos_greek(entry) for entry in words_to_remove]

# Placeholder token written to the output instead of the lemma for words with
# these universal POS tags.
label_dict = {
    'PROPN': 'person1',
    'PRON': 'pron1',
    'NUM': 'ordinal1',
}

In [15]:
# Make sure every output directory exists before anything is written;
# exist_ok=True makes re-running this cell harmless.
for folder in output_folders.values():
    os.makedirs(folder, exist_ok=True)

def clean_text(text):
    """Strip accents from `text` and drop every character outside the allowed set.

    The allowed set is: unaccented Greek letters (α-ω, Α-Ω), ASCII digits,
    whitespace, and the punctuation characters . , ; : ! ? ( ) [ ] { } ' " « » -

    Args:
        text: Raw input string.

    Returns:
        The normalised string. Accented vowels handled by
        `remove_tonos_greek` are kept as their base letter; any other
        character outside the allowed set is removed.
    """
    # Accents must be removed BEFORE filtering: accented vowels lie outside
    # the α-ω / Α-Ω ranges, so filtering first would delete them outright
    # instead of reducing them to their base letter.
    unaccented = remove_tonos_greek(text)
    cleaned_text = re.sub(r'[^α-ωΑ-Ω0-9\s\.,;:!?\(\)\[\]\{\}\'"«»\-]', '', unaccented)
    return cleaned_text

In [16]:
def process_text_with_stanza(text, pipeline):
    """Run `pipeline` on `text` and return the filtered lemmas as one string.

    For every word of every sentence in the pipeline result:
      * words without a UPOS tag are skipped;
      * words whose UPOS tag is a key of `label_dict` are replaced by the
        mapped placeholder;
      * punctuation (UPOS 'PUNCT') is skipped;
      * all other words contribute their lower-cased lemma (or lower-cased
        surface text when the lemma is empty), unless it is listed in
        `words_to_remove`.

    Args:
        text: The text to annotate.
        pipeline: Callable returning an object with a `.sentences` sequence,
            each sentence exposing `.words` with `upos`, `lemma` and `text`.

    Returns:
        The kept tokens joined by single spaces.
    """
    kept_tokens = []

    for sentence in pipeline(text).sentences:
        for word in sentence.words:
            tag = word.upos

            if tag is None:
                continue

            if tag in label_dict:
                kept_tokens.append(label_dict[tag])
                continue

            if tag == 'PUNCT':
                continue

            # Fall back to the surface form when no lemma was produced.
            lemma = (word.lemma or word.text).lower()
            if lemma in words_to_remove:
                continue
            kept_tokens.append(lemma)

    return ' '.join(kept_tokens)

In [17]:
# Process every '.txt' file of the input folder one at a time:
#   1. read and normalise the text,
#   2. write one lemmatized file per pipeline,
#   3. write an 'all' file produced by chaining the pipelines, i.e. the output
#      of each pipeline is the input of the next one.
for filename in os.listdir(input_folder):
    if not filename.endswith('.txt'):
        continue

    input_path = os.path.join(input_folder, filename)
    with open(input_path, 'r', encoding='utf-8') as file:
        text = file.read()

    text = clean_text(remove_tonos_greek(text))

    # Independent run of each pipeline on the same cleaned text.
    for lang_code, pipeline in pipelines.items():
        processed_text = process_text_with_stanza(text, pipeline)

        output_path = os.path.join(output_folders[lang_code], filename)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(processed_text)

    # Chained run: each pipeline consumes the previous pipeline's output.
    combined_text = text
    for pipeline in pipelines.values():
        combined_text = process_text_with_stanza(combined_text, pipeline)

    combined_output_path = os.path.join(output_folders['all'], filename)
    with open(combined_output_path, 'w', encoding='utf-8') as combined_output_file:
        combined_output_file.write(combined_text)

In [14]:
# Placeholder token per universal POS tag, used by extract_lemmas below.
# This repeats the mapping assigned in an earlier cell of this notebook.
label_dict = {'PROPN': 'person1', 'PRON': 'pron1', 'NUM': 'ordinal1'}
# Make sure every output directory exists before the batch run writes to it.
for folder in output_folders.values():
    os.makedirs(folder, exist_ok=True)

def load_documents_in_batches(input_folder, batch_size=100):
    """Read all '.txt' files of `input_folder` and group them into batches.

    Each file is read as UTF-8, stripped of surrounding whitespace, passed
    through `remove_tonos_greek` and `clean_text`, and wrapped in a
    `stanza.Document` that carries only the raw text.

    Args:
        input_folder: Directory to scan (non-recursive, order as returned by
            `os.listdir`).
        batch_size: Maximum number of files per batch.

    Returns:
        A list of `(documents, filenames)` tuples; both lists of a tuple have
        the same length and order. All batches are loaded eagerly.
    """
    txt_names = [name for name in os.listdir(input_folder) if name.endswith('.txt')]

    batches = []
    for start in range(0, len(txt_names), batch_size):
        chunk_names = txt_names[start:start + batch_size]

        chunk_docs = []
        for name in chunk_names:
            with open(os.path.join(input_folder, name), 'r', encoding='utf-8') as handle:
                raw_text = handle.read().strip()
            prepared_text = clean_text(remove_tonos_greek(raw_text))
            chunk_docs.append(stanza.Document([], text=prepared_text))

        batches.append((chunk_docs, chunk_names))

    return batches

def process_documents(documents, pipeline):
    """Apply `pipeline` to `documents` and return whatever it produces.

    Args:
        documents: The input handed to the pipeline unchanged (the batch code
            in this file passes a list of Stanza documents).
        pipeline: A callable taking `documents` as its only argument.

    Returns:
        The pipeline's return value.
    """
    annotated = pipeline(documents)
    return annotated

def extract_lemmas(doc):
    """Turn an annotated document into a space-separated string of lemmas.

    Words without a UPOS tag and punctuation are skipped, words whose UPOS tag
    is a key of `label_dict` are replaced by the mapped placeholder, and every
    other word contributes its lower-cased lemma (lower-cased surface text if
    the lemma is empty) unless that value is in `words_to_remove`.

    Args:
        doc: Object with a `.sentences` sequence whose items expose `.words`
            with `upos`, `lemma` and `text` attributes.

    Returns:
        The kept tokens joined by single spaces.
    """
    collected = []
    # Flatten sentences -> words so the filtering logic is a single loop.
    every_word = (word for sentence in doc.sentences for word in sentence.words)

    for word in every_word:
        pos_tag = word.upos

        if pos_tag is None:
            continue

        if pos_tag in label_dict:
            collected.append(label_dict[pos_tag])
        elif pos_tag != 'PUNCT':
            base_form = word.lemma if word.lemma else word.text
            base_form = base_form.lower()
            if base_form not in words_to_remove:
                collected.append(base_form)

    return ' '.join(collected)

def process_all_documents_in_batches(batch_size=100):
    """Lemmatize every input file with every pipeline, one batch at a time.

    Loads the batches from the module-level `input_folder`, runs each entry of
    `pipelines` over each batch, and hands the per-pipeline results to
    `save_results` before moving on to the next batch.

    Args:
        batch_size: Maximum number of files per batch.
    """
    for documents, filenames in load_documents_in_batches(input_folder, batch_size=batch_size):
        # Pipelines run in the iteration order of `pipelines`.
        processed_results = {
            lang_code: process_documents(documents, pipeline)
            for lang_code, pipeline in pipelines.items()
        }
        save_results(processed_results, filenames, output_folders)

def save_results(processed_results, filenames, output_folders):
    """Write the lemmatized text of every processed document to disk.

    Args:
        processed_results: Mapping of pipeline key to a sequence of annotated
            documents, in the same order as `filenames`.
        filenames: File name to use for the document at the same position.
        output_folders: Mapping of pipeline key to its output directory.

    Each document is converted with `extract_lemmas` and written as UTF-8 to
    `<output folder>/<file name>`, overwriting any existing file.
    """
    for lang_code, out_docs in processed_results.items():
        target_dir = output_folders[lang_code]

        for position, doc in enumerate(out_docs):
            lemmatized = extract_lemmas(doc)
            destination = os.path.join(target_dir, filenames[position])
            with open(destination, 'w', encoding='utf-8') as handle:
                handle.write(lemmatized)

# Run the batch workflow over the whole input folder, 100 files per batch.
process_all_documents_in_batches(batch_size=100)
