<a href="https://colab.research.google.com/github/ArtsARKADE/versemagic/blob/main/poems/Preprocess_Poems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output

def preprocess_poem_files(input_dir):
    """Write a cleaned copy of every ``.txt`` file found under *input_dir*.

    A new ``cleaned_up_poems_<timestamp>`` directory is created inside
    *input_dir* and the sub-directory layout of the input tree is mirrored
    beneath it. Each output file holds a ``Name:`` line, a ``Title:`` line
    and the poem body wrapped in ``<poem>`` / ``</poem>`` tags.

    Files are read as UTF-8 first and as ISO-8859-1 if that fails to decode.
    Files that cannot be read are reported with ``print`` and skipped.

    Args:
        input_dir: Path of the directory tree to scan. If it is not an
            existing directory a message is printed and nothing is created.

    Returns:
        None. Progress is reported with ``print``.
    """
    output_prefix = "cleaned_up_poems_"

    # os.makedirs below would otherwise silently create a mistyped path and
    # then "process" an empty tree.
    if not os.path.isdir(input_dir):
        print(f"Input directory not found: {input_dir}")
        return

    # Create output directory with timestamp
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_dir = os.path.join(input_dir, f"{output_prefix}{timestamp}")
    os.makedirs(output_dir, exist_ok=True)

    # Compiled once; the patterns do not change between files.
    # The name is the first line of the file.
    name_re = re.compile(r'^(.*?)(?:\n|$)')
    # NOTE(review): this pattern appears to expect the title on the line just
    # before the first blank line, after one optional intermediate line --
    # confirm against real poem files before changing it.
    title_re = re.compile(r'\n(?:.*\n)?(.*?)\n(?:\n|[^\n]*?)\n')
    # The poem is everything after the first blank line, minus trailing newlines.
    poem_re = re.compile(r'\n\n(.*?)\n*$', re.DOTALL)

    # Iterate over the main directory and all subdirectories
    for root, dirs, files in os.walk(input_dir):
        if root == input_dir:
            # The output directory lives inside input_dir. Prune it (and the
            # output of earlier runs) in place so the walk never feeds
            # already-cleaned files back through the pipeline.
            dirs[:] = [d for d in dirs if not d.startswith(output_prefix)]

        for filename in files:
            if not filename.endswith('.txt'):
                continue

            input_path = os.path.join(root, filename)
            relative_path = os.path.relpath(input_path, input_dir)
            output_path = os.path.join(output_dir, relative_path)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            try:
                with open(input_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Not valid UTF-8: retry with a single-byte encoding.
                try:
                    with open(input_path, 'r', encoding='ISO-8859-1') as f:
                        content = f.read()
                except Exception as e:
                    print(f"Failed to read {filename} with ISO-8859-1: {e}")
                    continue
            except OSError as e:
                # One unreadable file should not abort the whole batch.
                print(f"Failed to read {filename}: {e}")
                continue

            # Extract the title, name, and poem text
            name_match = name_re.search(content)
            title_match = title_re.search(content)
            poem_text_match = poem_re.search(content)

            name = name_match.group(1).strip() if name_match else "Unknown Name"
            title = title_match.group(1).strip() if title_match else "Unknown Title"
            poem_text = poem_text_match.group(1).strip() if poem_text_match else ""

            # A missing match and a whitespace-only body both count as "not found".
            if poem_text:
                print(f"Poem text found in file {filename}")
            else:
                print(f"No poem text found in file {filename}")
                poem_text = "No poem text found"

            # Write the cleaned content to the output directory
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(f"Name: {name}\n")
                f.write(f"Title: {title}\n")
                f.write(f"<poem>\n{poem_text}\n</poem>")

            print(f"Processed and saved {filename} to {output_path}")

# Input widgets: a text box for the directory path and the button that starts the run
input_dir_widget = widgets.Textarea(
    description="Input Directory:",
    layout={'width': '500px', 'height': '50px'},
)
preprocess_button = widgets.Button(
    description="Preprocess Poems",
)

def on_preprocess_button_clicked(b):
    """Run the preprocessing on the directory typed into the input widget.

    Args:
        b: The object passed by the button's click callback; unused.
    """
    # The value comes from a multi-line text area, so it can carry a trailing
    # newline or be whitespace only; neither is a usable path.
    input_dir = input_dir_widget.value.strip()
    if input_dir:
        preprocess_poem_files(input_dir)
    else:
        print("Please provide an input directory.")

# Hook the click handler up to the button
preprocess_button.on_click(on_preprocess_button_clicked)

# Stack the path box above the button and render the result in the cell output
preprocess_ui = widgets.VBox(children=[input_dir_widget, preprocess_button])
display(preprocess_ui)