In [1]:
# Surface forms that are dropped outright during normalisation.
#
# `noun` holds Greek pronoun/clitic forms, written here as (singular, plural)
# pairs and flattened in order. Repeated forms are kept on purpose so the list
# stays identical to the hand-written original.
noun = [
    form
    for singular_plural in (
        ('με', 'μας'),
        ('σε', 'σας'),
        ('τον', 'τους'),
        ('την', 'τις'),
        ('το', 'τα'),
        ('μου', 'μας'),
        ('σου', 'σας'),
        ('του', 'τους'),
        ('της', 'τους'),
        ('του', 'τους'),
        ('εγώ', 'εμείς'),
        ('εσύ', 'εσείς'),
        ('αυτός', 'αυτοί'),
        ('αυτή', 'αυτές'),
        ('αυτό', 'αυτά'),
    )
    for form in singular_plural
]

# Spelled-out numerals, grouped by magnitude.
# NOTE(review): the multi-word entries at the end can only match a token whose
# text contains a space -- confirm the tokenizer ever produces one.
numerals = (
    ["ένα", "δύο", "τρία", "τέσσερα", "πέντε",
     "έξι", "επτά", "οκτώ", "εννέα", "δέκα"]
    + ["είκοσι", "τριάντα", "σαράντα", "πενήντα",
       "εξήντα", "εβδομήντα", "ογδόντα", "ενενήντα"]
    + ["εκατό"]
    + ["διακόσια", "τριακόσια", "τετρακόσια", "πεντακόσια",
       "εξακόσια", "επτακόσια", "οκτακόσια", "εννιακόσια"]
    + ["χίλια", "δέκα χιλιάδες", "εκατό χιλιάδες", "ένα εκατομμύριο"]
)

# Combined removal list: pronoun forms first, numerals second.
words_to_remove = [*noun, *numerals]

In [2]:
import os
def get_file_lengths(directory):
    """Return the character count of every ``.txt`` file directly in ``directory``.

    Args:
        directory: path of the folder to scan (not recursive).

    Returns:
        dict mapping file name (not full path) to the number of characters
        obtained by reading the file as UTF-8 text. Entries whose name ends in
        ``.txt`` but which are not regular files are skipped.
    """
    lengths_by_name = {}

    for entry in os.listdir(directory):
        if not entry.endswith('.txt'):
            continue

        full_path = os.path.join(directory, entry)
        if not os.path.isfile(full_path):
            continue

        with open(full_path, 'r', encoding='utf-8') as handle:
            lengths_by_name[entry] = len(handle.read())

    return lengths_by_name

# Character counts (not byte sizes) for every .txt file in the 'pure' folder.
# The result is not read again by any of the cells visible below.
directory = 'pure'
file_lengths = get_file_lengths(directory)

In [3]:
import spacy
import grecy
import el_core_news_lg

# Load the three pipelines used throughout the notebook, each with its
# 'parser' component disabled.
nlp_news = el_core_news_lg.load(disable=['parser'])
nlp_grc1 = spacy.load('grecy/grc_perseus_trf', disable=['parser'])
nlp_grc2 = spacy.load('grecy/grc_proiel_trf', disable=['parser'])

# Per-document character limit of each pipeline. The chunking code further
# down splits long files at this same 1000000 figure, so keep them in sync.
nlp_news.max_length = 1000000
nlp_grc1.max_length = 1000000
nlp_grc2.max_length = 1000000

# Input folder and one output folder per pipeline (plus the chained run).
# These names are assigned again in the cells that launch the processing.
pure_folder = 'example'
processed_folder_news = 'example_processed_news_test'
processed_folder_perseus = 'example_processed_perseus'
processed_folder_proiel = 'example_processed_proiel'
combined_processed_folder = 'example_combined_processed'


If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current 'transformers' and 'spacy-transformers' versions. For more details and available updates, run: python -m spacy validate


In [4]:
def prepare_greek_text(nlp, texts, stop_words=None):
    """Lemmatise and normalise texts, returning one token list per input text.

    Every text is analysed on its own through ``nlp.pipe``. Earlier versions
    glued all texts together with a literal ``' br '`` marker and split on it
    afterwards, which (a) leaked the marker token into the output, (b) split a
    text in two whenever it contained the token ``br`` itself, and (c) could
    push the combined string past ``nlp.max_length``.

    Normalisation applied to each token, in this order:
      * tokens whose surface text is in ``stop_words`` are dropped;
      * PROPN / PRON / NUM tokens become ``person1`` / ``pron1`` / ``ordinal1``;
      * tokens whose lemma consists of digits become ``ordinal1``;
      * punctuation is dropped;
      * everything else is replaced by its lower-cased lemma.

    NOTE: a later cell of this notebook defines a function with the same name;
    whichever cell ran last is the one in effect.

    Args:
        nlp: spaCy pipeline (anything exposing ``pipe(texts)`` that yields
            iterables of tokens with ``text``, ``pos_`` and ``lemma_``).
        texts: iterable of raw strings.
        stop_words: optional collection of surface forms to drop. When omitted,
            the module-level ``words_to_remove`` list is used.

    Returns:
        list[list[str]]: ``result[i]`` holds the normalised tokens of the i-th
        input text (an empty list if nothing survives).
    """
    pos_dict = {'PROPN': 'person1', 'PRON': 'pron1', 'NUM': 'ordinal1'}
    # Build the lookup once; a set makes the per-token membership test O(1).
    skip = frozenset(words_to_remove if stop_words is None else stop_words)

    res = []
    for nlp_doc in nlp.pipe(texts):
        doc = []
        for token in nlp_doc:
            # Matching is on the exact surface form (case-sensitive).
            if token.text in skip:
                continue

            if token.pos_ in pos_dict:
                doc.append(pos_dict[token.pos_])
            elif token.lemma_.isdigit():
                doc.append('ordinal1')
            elif token.pos_ != 'PUNCT':
                doc.append(token.lemma_.lower())
        res.append(doc)

    return res

def process_files_in_batches(pure_folder, processed_folder, nlp, model_name, batch_size=3):
    """Normalise every ``.txt`` file of ``pure_folder`` with a single pipeline.

    Each input file produces an output file of the same name in
    ``processed_folder`` containing the space-joined normalised tokens followed
    by a newline.

    Args:
        pure_folder: folder holding the raw ``.txt`` files.
        processed_folder: destination folder; created if missing.
        nlp: pipeline handed to ``prepare_greek_text``.
        model_name: label used only in the progress messages.
        batch_size: number of files read into memory and reported together.
    """
    os.makedirs(processed_folder, exist_ok=True)

    def flush(batch):
        """Normalise and write every (filename, text) pair of ``batch``."""
        for filename, text in batch:
            # One text per call keeps output and file name aligned no matter
            # how many token lists come back; flattening keeps all of them.
            token_lists = prepare_greek_text(nlp, [text])
            tokens = [tok for token_list in token_lists for tok in token_list]

            output_file = os.path.join(processed_folder, filename)
            with open(output_file, 'w', encoding='utf-8') as fout:
                fout.write(' '.join(tokens) + '\n')

    batch = []
    # Sorted so that runs are reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(pure_folder)):
        if not filename.endswith('.txt'):
            continue

        input_file = os.path.join(pure_folder, filename)
        # Explicit UTF-8: the platform default encoding may not decode Greek.
        with open(input_file, 'r', encoding='utf-8') as fin:
            batch.append((filename, fin.read().strip()))

        if len(batch) >= batch_size:
            print(f"Processing batch with {model_name} -> {processed_folder}")
            flush(batch)
            batch = []

    if batch:
        print(f"Processing remaining texts with {model_name} -> {processed_folder}")
        flush(batch)

def process_text_through_all_models(text, nlp_news, nlp_grc1, nlp_grc2):
    """Run ``text`` through the three pipelines in sequence.

    The space-joined output of one stage is the input of the next, in the
    order ``nlp_news`` -> ``nlp_grc1`` -> ``nlp_grc2``.

    All token lists returned by ``prepare_greek_text`` are joined, not just
    the first one, so no part of the text is dropped if a stage hands back
    more than one list.

    NOTE: a later cell of this notebook defines a function with the same name;
    whichever cell ran last is the one in effect.

    Args:
        text: raw input string.
        nlp_news, nlp_grc1, nlp_grc2: pipelines applied in that order.

    Returns:
        str: space-joined tokens after the final stage ('' if nothing is left).
    """
    current = text
    for nlp in (nlp_news, nlp_grc1, nlp_grc2):
        token_lists = prepare_greek_text(nlp, [current])
        current = ' '.join(tok for token_list in token_lists for tok in token_list)
    return current

def process_files_through_all_models(pure_folder, combined_processed_folder, nlp_news, nlp_grc1, nlp_grc2, batch_size=1):
    """Run every ``.txt`` file of ``pure_folder`` through all three pipelines.

    Each file is passed to ``process_text_through_all_models`` and the result
    is written, followed by a newline, to a file of the same name in
    ``combined_processed_folder``.

    NOTE: a later cell of this notebook defines a function with the same name;
    whichever cell ran last is the one in effect.

    Args:
        pure_folder: folder holding the raw ``.txt`` files.
        combined_processed_folder: destination folder; created if missing.
        nlp_news, nlp_grc1, nlp_grc2: pipelines, applied in that order.
        batch_size: number of files read into memory and reported together.
    """
    os.makedirs(combined_processed_folder, exist_ok=True)

    def flush(batch):
        """Process and write every (filename, text) pair of ``batch``."""
        for filename, text in batch:
            final_processed_text = process_text_through_all_models(text, nlp_news, nlp_grc1, nlp_grc2)

            output_file = os.path.join(combined_processed_folder, filename)
            with open(output_file, 'w', encoding='utf-8') as fout:
                fout.write(final_processed_text + '\n')

    batch = []
    # Sorted so that runs are reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(pure_folder)):
        if not filename.endswith('.txt'):
            continue

        input_file = os.path.join(pure_folder, filename)
        # Explicit UTF-8: the platform default encoding may not decode Greek.
        with open(input_file, 'r', encoding='utf-8') as fin:
            batch.append((filename, fin.read().strip()))

        if len(batch) >= batch_size:
            print(f"Processing batch through all models -> {combined_processed_folder}")
            flush(batch)
            batch = []

    if batch:
        print(f"Processing remaining texts through all models -> {combined_processed_folder}")
        flush(batch)

In [7]:
# Run all four variants on the 'example' folder. The folder names repeat the
# values already assigned in the model-loading cell.
pure_folder = 'example'
processed_folder_news = 'example_processed_news_test'
processed_folder_perseus = 'example_processed_perseus'
processed_folder_proiel = 'example_processed_proiel'
combined_processed_folder = 'example_combined_processed'
# One independent pass per pipeline, each writing to its own output folder.
process_files_in_batches(pure_folder, processed_folder_news, nlp_news, 'news')
process_files_in_batches(pure_folder, processed_folder_perseus, nlp_grc1, 'perseus')
process_files_in_batches(pure_folder, processed_folder_proiel, nlp_grc2, 'proiel')

# Chained pass: news -> perseus -> proiel applied to the same text.
process_files_through_all_models(pure_folder, combined_processed_folder, nlp_news, nlp_grc1, nlp_grc2)

Processing remaining texts with news -> example_processed_news_test
Processing remaining texts with perseus -> example_processed_perseus
Processing remaining texts with proiel -> example_processed_proiel
Processing batch through all models -> example_combined_processed


In [11]:
def prepare_greek_text(nlp, texts, stop_words=None):
    """Lemmatise and normalise texts, returning one token list per input text.

    Every text is analysed on its own through ``nlp.pipe``. Earlier versions
    glued all texts together with a literal ``' br '`` marker and split on it
    afterwards, which (a) leaked the marker token into the output, (b) split a
    text in two whenever it contained the token ``br`` itself, and (c) could
    push the combined string past ``nlp.max_length``.

    Normalisation applied to each token, in this order:
      * tokens whose surface text is in ``stop_words`` are dropped;
      * PROPN / PRON / NUM tokens become ``person1`` / ``pron1`` / ``ordinal1``;
      * tokens whose lemma consists of digits become ``ordinal1``;
      * punctuation is dropped;
      * everything else is replaced by its lower-cased lemma.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; this definition replaces it once the cell has run.

    Args:
        nlp: spaCy pipeline (anything exposing ``pipe(texts)`` that yields
            iterables of tokens with ``text``, ``pos_`` and ``lemma_``).
        texts: iterable of raw strings.
        stop_words: optional collection of surface forms to drop. When omitted,
            the module-level ``words_to_remove`` list is used.

    Returns:
        list[list[str]]: ``result[i]`` holds the normalised tokens of the i-th
        input text (an empty list if nothing survives).
    """
    pos_dict = {'PROPN': 'person1', 'PRON': 'pron1', 'NUM': 'ordinal1'}
    # Build the lookup once; a set makes the per-token membership test O(1).
    skip = frozenset(words_to_remove if stop_words is None else stop_words)

    res = []
    for nlp_doc in nlp.pipe(texts):
        doc = []
        for token in nlp_doc:
            # Matching is on the exact surface form (case-sensitive).
            if token.text in skip:
                continue

            if token.pos_ in pos_dict:
                doc.append(pos_dict[token.pos_])
            elif token.lemma_.isdigit():
                doc.append('ordinal1')
            elif token.pos_ != 'PUNCT':
                doc.append(token.lemma_.lower())
        res.append(doc)

    return res

def process_text_through_all_models(text, nlp_news, nlp_grc1, nlp_grc2):
    """Run ``text`` through the three pipelines in sequence.

    The space-joined output of one stage is the input of the next, in the
    order ``nlp_news`` -> ``nlp_grc1`` -> ``nlp_grc2``.

    All token lists returned by ``prepare_greek_text`` are joined, not just
    the first one, so no part of the text is dropped if a stage hands back
    more than one list.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; this definition replaces it once the cell has run.

    Args:
        text: raw input string.
        nlp_news, nlp_grc1, nlp_grc2: pipelines applied in that order.

    Returns:
        str: space-joined tokens after the final stage ('' if nothing is left).
    """
    current = text
    for nlp in (nlp_news, nlp_grc1, nlp_grc2):
        token_lists = prepare_greek_text(nlp, [current])
        current = ' '.join(tok for token_list in token_lists for tok in token_list)
    return current

def split_text_into_chunks(text, max_length=1000000):
    """Yield whitespace-delimited chunks of ``text`` that fit in ``max_length``.

    The text is split on whitespace and words are greedily packed into chunks;
    each chunk is the packed words joined by single spaces, so the original
    spacing and line breaks are not preserved. Every word is budgeted as
    ``len(word) + 1`` to account for its joining space.

    A chunk is only emitted when it contains at least one word, so a leading
    word that on its own exceeds ``max_length`` no longer produces an empty
    first chunk. Such an oversized word is never cut: it is yielded as a chunk
    of its own, which is then longer than ``max_length``.

    Args:
        text: string to split.
        max_length: character budget per chunk.

    Yields:
        str: consecutive non-empty chunks; nothing for blank input.
    """
    current_chunk = []
    current_length = 0

    for word in text.split():
        word_length = len(word) + 1  # +1 for the space that joins it
        # Guard on current_chunk so an oversized first word cannot flush ''.
        if current_chunk and current_length + word_length > max_length:
            yield ' '.join(current_chunk)
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_length

    if current_chunk:
        yield ' '.join(current_chunk)

def process_files_through_all_models(pure_folder, combined_processed_folder, nlp_news, nlp_grc1, nlp_grc2, batch_size=1, max_chars=1000000):
    """Run every ``.txt`` file of ``pure_folder`` through all three pipelines.

    Files of at most ``max_chars`` characters are processed in one go. Longer
    files are cut with ``split_text_into_chunks``, each chunk is processed
    separately, and the results are joined with single spaces. The result is
    written, followed by a newline, to a file of the same name in
    ``combined_processed_folder``.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; this definition replaces it once the cell has run.

    Args:
        pure_folder: folder holding the raw ``.txt`` files.
        combined_processed_folder: destination folder; created if missing.
        nlp_news, nlp_grc1, nlp_grc2: pipelines, applied in that order.
        batch_size: accepted for compatibility with the earlier definition;
            not used here because files are handled one at a time.
        max_chars: largest text handed to the pipelines in one piece. Keep it
            no larger than the ``max_length`` configured on the pipelines.
    """
    os.makedirs(combined_processed_folder, exist_ok=True)

    # Sorted so that runs are reproducible; os.listdir order is arbitrary.
    filenames = sorted(f for f in os.listdir(pure_folder) if f.endswith('.txt'))

    for filename in filenames:
        input_file = os.path.join(pure_folder, filename)
        output_file = os.path.join(combined_processed_folder, filename)

        with open(input_file, 'r', encoding='utf-8') as fin:
            text = fin.read()

        if len(text) <= max_chars:
            final_processed_text = process_text_through_all_models(text, nlp_news, nlp_grc1, nlp_grc2)
        else:
            # Materialised so the progress message can report the total.
            chunks = list(split_text_into_chunks(text, max_length=max_chars))

            final_processed_parts = []
            for i, chunk in enumerate(chunks):
                print(f"Processing part {i+1} of {len(chunks)} for file {filename}")
                final_processed_parts.append(
                    process_text_through_all_models(chunk, nlp_news, nlp_grc1, nlp_grc2)
                )

            final_processed_text = ' '.join(final_processed_parts)

        # Single write path shared by both branches.
        with open(output_file, 'w', encoding='utf-8') as fout:
            fout.write(final_processed_text + '\n')

In [8]:
# Full run: switch the input to the 'pure' folder and the outputs to the
# 'greek_lemmatized_*' folders.
pure_folder = 'pure'
processed_folder_news = 'greek_lemmatized_news'
processed_folder_perseus = 'greek_lemmatized_perseus'
processed_folder_proiel = 'greek_lemmatized_proiel'
combined_processed_folder = 'greek_lemmatized_combined'

# Single-pipeline passes are disabled for this run; only the chained pass
# below is executed.
# process_files_in_batches(pure_folder, processed_folder_news, nlp_news, 'news')
# process_files_in_batches(pure_folder, processed_folder_perseus, nlp_grc1, 'perseus')
# process_files_in_batches(pure_folder, processed_folder_proiel, nlp_grc2, 'proiel')

process_files_through_all_models(pure_folder, combined_processed_folder, nlp_news, nlp_grc1, nlp_grc2)

KeyboardInterrupt: 