# In [6]:
import os
import re
from unidecode import unidecode

def _clean_words(line):
    """Return the cleaned, lower-cased words extracted from one raw line."""
    # ASCII letters already present in the raw line are dropped *before*
    # unidecode runs, so only letters produced by unidecode survive.
    # NOTE(review): presumably the corpus is non-Latin script and Latin
    # letters are editorial markup -- confirm against the data.
    line = re.sub(r'[A-Za-z]+', '', line)
    line = unidecode(line)
    line = line.strip().replace('-', ' ')
    # Keep only letters and spaces; split() below handles repeated spaces.
    line = re.sub(r'[^A-Za-z ]+', '', line)

    kept = []
    for word in line.split():
        # Skip words whose second or third character is upper-case; these
        # are treated as all-caps labels. Index 2 is checked as well,
        # presumably because one capital can come out of unidecode as two
        # characters such as "Th" -- TODO confirm.
        if any(ch.isupper() for ch in word[1:3]):
            continue
        kept.append(word.lower())
    return kept


def processFile(filename, dataType, outputFile, label):
    """
    Clean one raw text file and append it to the output file as a single line.

    The file is read from ``data_raw/<dataType>/<filename>`` as UTF-8. The
    appended line has the form ``"<label> <name> <word> <word> ..."``, where
    ``<name>`` is the filename without its extension and with spaces removed.

    Args:
        filename: Name of the file inside ``data_raw/<dataType>``.
        dataType: Sub-directory of ``data_raw`` that holds the file.
        outputFile: Path of the file to append to (opened in ``'a'`` mode).
        label: Value written as the first token of the line.
    """
    input_path = os.path.join("data_raw", dataType, filename)

    words = []
    with open(input_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            words.extend(_clean_words(line))

    # splitext removes whatever extension is present (or none), instead of
    # blindly cutting the last four characters of the name.
    stem = os.path.splitext(filename)[0]
    filename_clean = stem.replace(' ', '')
    formatted_text = f"{label} {filename_clean} " + ' '.join(words)

    with open(outputFile, 'a', encoding='utf-8') as output_file:
        output_file.write(formatted_text + '\n')

def clean(dataType, outputFile, label=None):
    """
    Process every regular file in ``data_raw/<dataType>`` into one output file.

    Each file is passed to ``processFile`` together with ``outputFile``.
    Sub-directories and other non-file entries are skipped.

    Args:
        dataType: Sub-directory of ``data_raw`` to scan.
        outputFile: Path handed to ``processFile`` for every file.
        label: Label passed to ``processFile``; ``None`` is replaced by ``0``.
    """
    source_dir = os.path.join("data_raw", dataType)
    effective_label = 0 if label is None else label

    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the output lines reproducible across runs and platforms.
    for filename in sorted(os.listdir(source_dir)):
        if os.path.isfile(os.path.join(source_dir, filename)):
            processFile(filename, dataType, outputFile, effective_label)

# In [7]:
# Output locations for the labelled corpus and for the "dubia" texts.
combinedFile = "data_clean/combined.txt"
dubiaFile = "data_clean/dubia.txt"

# Create the output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("data_clean", exist_ok=True)

# Truncate both output files so a re-run starts from empty files.
with open(combinedFile, 'w'):
    pass
with open(dubiaFile, 'w'):
    pass

# "plato" and "notplato" share one file with labels 1 and 0; "dubia" goes to
# its own file with label -1.
clean("plato", combinedFile, label=1)
clean("notplato", combinedFile, label=0)
clean("dubia", dubiaFile, label=-1)