- Use WSL Ubuntu 24

# Flatten NYT data 

In [2]:
from flatten_directory import flatten_directory
flatten_directory("raw_data", "flattened")
print("Flattened directory structure of raw_data into flattened/")

Flattened directory structure of raw_data into flattened/


# Render crosswords

In [2]:
from pathlib import Path
import random
import puz
from render_crossword import render_crossword

# CONFIG
PUZ_DIR = "flattened"  # folder containing .puz files
OUT_IMG_DIR = "dataset/images"
OUT_MASK_DIR = "dataset/masks"
OUT_SOLUTION_DIR = "dataset/solutions"

files = list(Path(PUZ_DIR).glob("*.puz"))
print(f"Found {len(files)} .puz files")

# Take a random subset of no more than:
MAX_FILES = 500
if len(files) > MAX_FILES:
    files = random.sample(files, MAX_FILES)
print(f"Processing {len(files)} files (random subset)")

last_printed_percent = 0

for i, file in enumerate(files, 1):  # start at 1
    try:
        puzzle = puz.read(str(file))
        stem = file.stem
        render_crossword(puzzle, stem, OUT_IMG_DIR, OUT_MASK_DIR, OUT_SOLUTION_DIR)
        message = f"Processed {file.name}"
    except Exception as e:
        message = f"Error parsing {file.name}: {e}"

    percent = (i / len(files)) * 100

    # Only print if we've reached the next 10% increment or it's the last file
    if percent - last_printed_percent >= 10 or i == len(files):
        print(f"[{i}/{len(files)}] ({percent:.1f}%) {message}")
        last_printed_percent = percent


Found 8914 .puz files
Processing 500 files (random subset)
[50/500] (10.0%) Processed daily-1997-08-Aug1597.puz
[100/500] (20.0%) Processed daily-2006-10-Oct1806.puz
[150/500] (30.0%) Processed daily-2007-03-Mar1507.puz
[200/500] (40.0%) Processed daily-2005-04-Apr2305.puz
[250/500] (50.0%) Processed daily-2014-02-Feb1914.puz
[300/500] (60.0%) Processed daily-1998-10-Oct1198.puz
[350/500] (70.0%) Processed daily-1993-12-Dec2493.puz
[400/500] (80.0%) Processed daily-2012-10-Oct1212.puz
[450/500] (90.0%) Processed daily-2000-08-Aug1000.puz
[500/500] (100.0%) Processed daily-2015-10-Oct1215.puz
