# Crossword data generation
- `../raw_data` (relative to this notebook) contains ~9,000 historical NYT crosswords
- The aim is to generate crossword images, masks, and solutions for training ML models.

# Flatten NYT data 

In [3]:
from flatten_directory import flatten_directory
# The rendering cell further down reads its .puz files from "flattened",
# so the raw archive is flattened into that folder first.
raw_root = "../raw_data"
flat_root = "flattened"
flatten_directory(raw_root, flat_root)

print("Flattened directory structure of raw_data into flattened/")

Flattened directory structure of raw_data into flattened/


# Render crosswords

In [6]:
from pathlib import Path
import random
import puz
from render_crossword import render_crossword

# CONFIG
PUZ_DIR = "flattened"  # folder containing .puz files
OUT_IMG_DIR = "../dataset/images"
OUT_MASK_DIR = "../dataset/masks"
OUT_SOLUTION_DIR = "../dataset/solutions"

files = list(Path(PUZ_DIR).rglob("*.puz"))
print(f"Found {len(files)} .puz files")

# Take a random subset of no more than:
MAX_FILES = 100
if len(files) > MAX_FILES:
    files = random.sample(files, MAX_FILES)
print(f"Processing {len(files)} files (random subset)")

total = len(files)

# (file name, error text) for every puzzle that could not be processed.
# Kept so failures are visible after the run instead of being lost between
# progress lines.
failures = []

# Progress is tracked in whole 10% steps using integer arithmetic, so the
# reporting boundaries cannot be missed or drift due to float rounding.
last_printed_decile = 0

for i, file in enumerate(files, 1):  # start at 1 so i is the number of files handled
    try:
        puzzle = puz.read(str(file))
        render_crossword(puzzle, file.stem, OUT_IMG_DIR, OUT_MASK_DIR, OUT_SOLUTION_DIR)
    except Exception as e:
        # Best effort: one bad puzzle must not abort the whole batch, but it
        # must be reported (see the print condition below).
        failed = True
        message = f"Error parsing {file.name}: {e}"
        failures.append((file.name, str(e)))
    else:
        failed = False
        message = f"Processed {file.name}"

    decile = i * 10 // total

    # Print when a new 10% step is reached, on the last file, and always for
    # a failure so that no error is silently dropped.
    if failed or decile > last_printed_decile or i == total:
        percent = i * 100 / total
        print(f"[{i}/{total}] ({percent:.1f}%) {message}")
        last_printed_decile = decile

# Only emitted when something went wrong, so a clean run prints exactly the
# progress lines and nothing else.
if failures:
    print(f"{len(failures)} of {total} files failed to render")


Found 8914 .puz files
Processing 100 files (random subset)
[10/100] (10.0%) Processed daily-1997-11-Nov0397.puz
[20/100] (20.0%) Processed daily-2006-04-Apr0106.puz
[30/100] (30.0%) Processed daily-1999-12-Dec1699.puz
[40/100] (40.0%) Processed daily-2003-10-Oct2203.puz
[50/100] (50.0%) Processed variety-2007-Aug1207.2.puz
[60/100] (60.0%) Processed daily-1994-05-May2994.puz
[70/100] (70.0%) Processed daily-2008-05-May1308.puz
[80/100] (80.0%) Processed daily-2011-08-Aug1811.puz
[90/100] (90.0%) Processed daily-1999-03-Mar2099.puz
[100/100] (100.0%) Processed daily-1999-11-Nov2999.puz
