# In [None]:
import os
import re

# Input folder holding the raw text files, and output folder that receives
# the cleaned, one-segment-per-file results.
original_folder = 'txt_files'
preprocessed_folder = 'final_data'

# Make sure the output folder exists. exist_ok=True replaces a separate
# os.path.exists() check, which is racy (the folder can appear between the
# check and the creation), and it fails right away if the path is already
# taken by something that is not a directory.
os.makedirs(preprocessed_folder, exist_ok=True)

# Inclusive code-point range copied wholesale into the allowed set. Note that
# the range also covers the non-letter code points U+0557-U+055F that sit
# between the capital and the small letters, so the punctuation marks
# U+055A-U+055F listed further down are already part of it.
armenian_letters_start = 0x0531  # 'Ա' ARMENIAN CAPITAL LETTER AYB
armenian_letters_end = 0x0587    # 'և' ARMENIAN SMALL LIGATURE ECH YIWN

# Additional allowed code points (character names as in the Unicode charts)
armenian_punctuation_and_numbers = [
    0x058A,  # ARMENIAN HYPHEN (֊)
    0x0589,  # ARMENIAN FULL STOP (։)
    0x055A,  # ARMENIAN APOSTROPHE (՚)
    0x055B,  # ARMENIAN EMPHASIS MARK (՛)
    0x055C,  # ARMENIAN EXCLAMATION MARK (՜)
    0x055D,  # ARMENIAN COMMA (՝)
    0x055E,  # ARMENIAN QUESTION MARK (՞)
    0x055F,  # ARMENIAN ABBREVIATION MARK (՟)
    # ASCII digits 0-9
    *range(0x0030, 0x003A),
    # Space, colon and Latin punctuation
    *map(ord, ' :,.!?'),
    0x002D,  # HYPHEN-MINUS (-)
    0x2013,  # EN DASH (–)
    0x2014,  # EM DASH (—)
]

# Every character that survives cleaning: first the whole letter range ...
armenian_characters = set(
    map(chr, range(armenian_letters_start, armenian_letters_end + 1))
)

# ... then the punctuation, digits and dashes listed above
armenian_characters.update(map(chr, armenian_punctuation_and_numbers))

def has_valid_uppercase(text):
    """Return True if any character of *text*, other than the first and the
    last one, is uppercase.

    Texts shorter than three characters have no such interior character, so
    the result for them is always False.
    """
    interior = text[1:-1]
    return any(ch.isupper() for ch in interior)

# Segment delimiters: Armenian full stop (։) plus ASCII colon, period,
# exclamation mark and question mark. Compiled once rather than on each call.
_SEGMENT_DELIMITERS = re.compile(r'[։:.!?]')

# Runs of whitespace other than the plain space (newlines, tabs, ...)
_NON_SPACE_WHITESPACE = re.compile(r'[^\S ]+')


def preprocess_file(file_path, output_folder):
    """Clean one text file and write each qualifying segment to its own file.

    The file is read as UTF-8, every character that is not in the
    module-level ``armenian_characters`` set is dropped, and the remaining
    text is split on the segment delimiters. A segment is kept when, after
    stripping, it is longer than 12 characters and ``has_valid_uppercase``
    accepts it. Each kept segment is written to
    ``<output_folder>/<input file stem>_<idx>.txt`` where ``idx`` is the
    segment's position among all segments of the file, so rejected segments
    leave gaps in the numbering. A line is printed for every file written.

    Args:
        file_path: Path of the UTF-8 text file to process.
        output_folder: Directory that receives the segment files; it is
            created if it does not exist yet.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Line breaks and tabs are not in the allowed set, so filtering alone
    # would glue the last word of a line to the first word of the next one.
    # Replace such whitespace with a regular space before filtering.
    content = _NON_SPACE_WHITESPACE.sub(' ', content)

    # Keep only Armenian characters, punctuation, and numbers
    cleaned_text = ''.join(
        [char for char in content if char in armenian_characters]
    )

    segments = _SEGMENT_DELIMITERS.split(cleaned_text)

    # Same for every segment of this file, so computed once up front
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    os.makedirs(output_folder, exist_ok=True)

    for idx, segment in enumerate(segments):
        cleaned_segment = segment.strip()
        # Skip segments that are too short or have no interior uppercase letter
        if len(cleaned_segment) <= 12 or not has_valid_uppercase(cleaned_segment):
            continue

        new_filename = f'{base_name}_{idx}.txt'
        new_file_path = os.path.join(output_folder, new_filename)
        with open(new_file_path, 'w', encoding='utf-8') as new_file:
            new_file.write(cleaned_segment)
        print(f'Saved preprocessed segment to {new_filename}')

# Run the preprocessing over every entry of the input folder that
# os.path.isfile accepts; anything else (e.g. sub-directories) is skipped.
candidate_paths = (
    os.path.join(original_folder, entry_name)
    for entry_name in os.listdir(original_folder)
)
for file_path in filter(os.path.isfile, candidate_paths):
    preprocess_file(file_path, preprocessed_folder)