In [6]:
from pathlib import Path
import chardet
import re

In [7]:


# Source manuscript to process. NOTE: this is a machine-specific absolute path;
# point it at your own copy of the book when running elsewhere.
book_file = Path(r'/Users/jeremy/Development/family-altar-book-processor/Book 1 Complete.txt')

# Folder that receives the cleaned-up copy of the book (created on first run).
output_folder = Path("processed_books")
output_folder.mkdir(exist_ok=True)
output_file = output_folder.joinpath("text-file-creation-test.txt")

def read_book(file_path: Path) -> str:
    """Read the book into a string, auto-detecting its encoding.

    The raw bytes are sniffed with ``chardet``; if no encoding can be
    determined (chardet reports ``None``, e.g. for an empty file) the bytes
    are decoded as UTF-8 instead of crashing.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded text with every line stripped of surrounding whitespace
        and all blank lines removed, joined with ``"\\n"``.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    raw_data = file_path.read_bytes()
    result = chardet.detect(raw_data)
    encoding = result['encoding']
    confidence = result['confidence'] or 0.0

    if encoding is None:
        # bytes.decode(None) raises TypeError, so pick a sane default.
        encoding = "utf-8"
        print(f"Encoding not detected; falling back to {encoding}")
    else:
        print(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")

    text = raw_data.decode(encoding)

    # Trim each line and keep only the non-empty ones. This also removes any
    # leading/trailing blank lines, so no whole-text strip is needed.
    trimmed = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in trimmed if line)

if __name__ == "__main__":
    # Bail out early when the manuscript is missing; otherwise write the
    # cleaned text to the output file as UTF-8.
    if not book_file.exists():
        print(f"File not found: {book_file}")
    else:
        text = read_book(book_file)
        output_file.write_text(text, encoding="utf-8")


Detected encoding: UTF-16 (confidence: 1.00)


In [8]:


def find_family_altar_lines(text: str):
    """Find all headings like 'Day {number}    The Family Altar    {Month} {number}'.

    Matching is case-insensitive and tolerant of any amount of whitespace
    between the parts of the heading.

    Args:
        text: The full book text to scan.

    Returns:
        A 2-tuple ``(headings, headings_copy)``. Both elements are lists of
        the complete matched heading strings, in order of appearance. The
        second element is kept only so callers that unpack two values keep
        working; the pattern has no ``^``/``$`` anchors, so a multiline search
        yields the same result as the plain one.
    """
    # The month alternation is non-capturing: with a capturing group,
    # findall() would return just the month names instead of the headings.
    pattern = re.compile(
        r'Day\s+\d+\s+The Family Altar\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d+',
        re.IGNORECASE
    )
    headings = pattern.findall(text)
    # Do NOT pass re.MULTILINE to Pattern.findall(): its second positional
    # argument is the start position, so the flag value (8) would silently
    # skip the first eight characters of the text.
    return headings, list(headings)

if __name__ == "__main__":
    if not book_file.exists():
        print(f"File not found: {book_file}")
    else:
        text = read_book(book_file)
        # The month group is non-capturing, so findall() yields whole headings.
        heading_re = re.compile(
            r'Day\s+\d+\s+The Family Altar\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d+'
        )
        matches = heading_re.findall(text)
        print("Found lines:")
        for heading in matches:
            print(heading)


Detected encoding: UTF-16 (confidence: 1.00)
Found lines:
Day 1                        The Family Altar                     January 1
Day 2                        The Family Altar                     January 2
Day 3                        The Family Altar                     January 3
Day 4                        The Family Altar                     January 4
Day 5                        The Family Altar                     January 5
Day 6                        The Family Altar                     January 6
Day 7                        The Family Altar                     January 7
Day 8                        The Family Altar                     January 8
Day 9                        The Family Altar                     January 9
Day 10                       The Family Altar                    January 10
Day 11                       The Family Altar                    January 11
Day 12                       The Family Altar                    January 12
Day 13                       T

In [10]:
import re
from pathlib import Path

def save_days_to_files(book_text: str, output_folder: Path):
    """
    Split the Family Altar book text into daily sections and save each one as a separate file.

    A section starts at a heading of the form
    'Day {number}  The Family Altar  {Month} {number}' (matched
    case-insensitively) and runs up to the next heading, or to the end of the
    text for the last one. Any text before the first heading is not saved.

    Each file is named after the date in its heading, with the month
    capitalised (e.g. 'January_6.txt'). If two headings carry the same date,
    the later section overwrites the earlier file.

    Args:
        book_text: The full book text.
        output_folder: Directory to write the per-day files into; it is
            created (including missing parents) if it does not exist.

    Returns:
        None. One "Saved: <filename>" line is printed per file written.
    """
    # parents=True so nested destinations such as "out/days" work as well.
    output_folder.mkdir(parents=True, exist_ok=True)

    # Named groups expose the date directly, so no second regex pass is needed
    # and the filename logic follows the same case-insensitive rules as the
    # heading match itself.
    pattern = re.compile(
        r'Day\s+\d+\s+The Family Altar\s+'
        r'(?P<month>January|February|March|April|May|June|July|August|September|October|November|December)'
        r'\s+(?P<day>\d+)',
        re.IGNORECASE
    )

    # Find all start positions of each day's heading
    matches = list(pattern.finditer(book_text))

    for i, match in enumerate(matches):
        start = match.start()
        # A section ends where the next heading begins (or at end of text).
        end = matches[i + 1].start() if i + 1 < len(matches) else len(book_text)

        # Extract the day's full text block
        day_text = book_text[start:end].strip()

        # Normalise the month's casing so 'JANUARY 6' and 'January 6' both
        # produce 'January_6.txt'.
        month = match.group("month").capitalize()
        day = match.group("day")
        filename = f"{month}_{day}.txt"

        # Save the section to a file
        output_path = output_folder / filename
        output_path.write_text(day_text, encoding="utf-8")

        print(f"Saved: {filename}")

if __name__ == "__main__":
    # Per-day files get their own folder; note this rebinds the notebook-level
    # `output_folder` name.
    output_folder = Path("days")

    if not book_file.exists():
        print(f"File not found: {book_file}")
    else:
        text = read_book(book_file)
        save_days_to_files(text, output_folder)

Detected encoding: UTF-16 (confidence: 1.00)
Saved: January_1.txt
Saved: January_2.txt
Saved: January_3.txt
Saved: January_4.txt
Saved: January_5.txt
Saved: January_6.txt
Saved: January_7.txt
Saved: January_8.txt
Saved: January_9.txt
Saved: January_10.txt
Saved: January_11.txt
Saved: January_12.txt
Saved: January_13.txt
Saved: January_14.txt
Saved: January_15.txt
Saved: January_16.txt
Saved: January_17.txt
Saved: January_18.txt
Saved: January_19.txt
Saved: January_20.txt
Saved: January_21.txt
Saved: January_22.txt
Saved: January_23.txt
Saved: January_24.txt
Saved: January_25.txt
Saved: January_26.txt
Saved: January_27.txt
Saved: January_28.txt
Saved: January_29.txt
Saved: January_30.txt
Saved: January_31.txt
Saved: February_1.txt
Saved: February_2.txt
Saved: February_3.txt
Saved: February_4.txt
Saved: February_5.txt
Saved: February_6.txt
Saved: February_7.txt
Saved: February_8.txt
Saved: February_9.txt
Saved: February_10.txt
Saved: February_11.txt
Saved: February_12.txt
Saved: February