<a href="https://colab.research.google.com/github/1ucky40nc3/letter_league/blob/main/search_proposals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Load the Corpus

!git clone https://github.com/dwyl/english-words.git
%cd english-words

with open("words.txt", "r", encoding="utf-8") as f:
    text = f.read()

Cloning into 'english-words'...
remote: Enumerating objects: 248, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 248 (delta 97), reused 83 (delta 83), pack-reused 123[K
Receiving objects: 100% (248/248), 27.63 MiB | 18.84 MiB/s, done.
Resolving deltas: 100% (117/117), done.
/content/english-words


In [2]:
# @title Implement the Search Algorithm

from typing import Any
import re

from typing import (
    Any,
    Dict,
    Tuple, 
    Union,
    Generator, 
    Optional
)
from collections.abc import Sequence

import re
import itertools


# A query is an ordered sequence of slots: a ``str`` slot is a fixed letter,
# a ``None`` slot is a gap to be filled with one of the available options.
Query = Sequence[Optional[str]]
# Maps a character to the number of times it occurs.
Counts = Dict[str, int]


def num_none(seq: Sequence[Union[str, None]]) -> int:
    """Return how many entries of ``seq`` are ``None``.

    Only ``None`` itself is counted. Other falsy entries, such as an empty
    string, are real (if empty) strings and are not treated as gaps.

    Args:
        seq: Sequence of strings and/or ``None`` placeholders.

    Returns:
        The number of ``None`` entries in ``seq``.
    """
    return sum(1 for item in seq if item is None)


def corpus_item(match: re.Match, corpus: str) -> str:
    """Return the complete newline-delimited entry of ``corpus`` containing ``match``.

    The match span is widened to the nearest newline on each side, or to the
    start/end of ``corpus`` when there is no newline in that direction. This
    also covers the first entry and a last entry without a trailing newline.

    Args:
        match: A match object produced by searching ``corpus``.
        corpus: The newline-separated text the match was found in.

    Returns:
        The full line around the match, without the surrounding newlines.
    """
    start, end = match.span()
    # rfind() yields -1 when no newline precedes the match, so adding 1 gives
    # index 0 (start of the corpus) in that case.
    line_start = corpus.rfind("\n", 0, start) + 1
    line_end = corpus.find("\n", end)
    if line_end == -1:
        # No newline after the match: the entry runs to the end of the corpus.
        line_end = len(corpus)
    return corpus[line_start:line_end]


def build_regex(query: Query, options: Tuple[str, ...]) -> str:
    """Build the regular expression used to find candidates for ``query``.

    The pattern has three parts: an optional run of option letters, the query
    itself (string slots matched literally, every other slot matched by one
    option letter), and another optional run of option letters. The length of
    each optional run is capped at the number of options left over after the
    gaps in the query have been filled.

    All letters are passed through ``re.escape`` so that characters with a
    special meaning in a regular expression are matched literally.

    Args:
        query: Fixed letters (``str``) and gaps (``None``), in order.
        options: Letters that may fill gaps or extend the word on either side.

    Returns:
        The regular expression source as a string (not compiled).
    """
    if options:
        option_regex = "[" + "".join(re.escape(option) for option in options) + "]"
    else:
        # "[]" is not an empty character class; with nothing to choose from,
        # use a class that cannot match any character.
        option_regex = r"[^\s\S]"

    query_regex = "".join(
        re.escape(slot) if isinstance(slot, str) else option_regex
        for slot in query
    )

    # Never let the bound go negative when there are more gaps than options:
    # "{0,-1}" is not a valid repetition.
    n_options = max(0, len(options) - num_none(query))
    options_regex = f"{option_regex}{{0,{n_options}}}"
    return f"{options_regex}{query_regex}{options_regex}"


def char_counts(string: str) -> Counts:
    """Count how often each character occurs in ``string``.

    Args:
        string: The text to analyse.

    Returns:
        A dict mapping every distinct character of ``string`` to its number
        of occurrences; empty for an empty string.
    """
    tally = {}
    for letter in string:
        tally[letter] = tally.get(letter, 0) + 1
    return tally


def le_counts(source: Counts, target: Counts) -> bool:
    """Tell whether ``source`` is covered by ``target``.

    Args:
        source: Character counts that need to be satisfied.
        target: Character counts that are available.

    Returns:
        ``True`` when every key of ``source`` is also a key of ``target`` and
        its count in ``source`` does not exceed the count in ``target``;
        ``False`` otherwise. An empty ``source`` is always covered.
    """
    for char, needed in source.items():
        if char in target and needed <= target[char]:
            continue
        # Either the character is unavailable or too many copies are needed.
        return False
    return True


def scrabble(query: Query, options: Tuple[str, ...], corpus: str) -> Generator[str, None, None]:
    """Yield corpus entries that can be built from ``query`` plus ``options``.

    The corpus is scanned with the pattern from ``build_regex``. Every match
    is widened to its full corpus entry, which is yielded when:

    * it has not been yielded before,
    * it uses at least one and at most all of the options in addition to the
      fixed letters of the query, and
    * its letters are covered by the fixed query letters plus ``options``
      (per-character counts).

    Args:
        query: Fixed letters (``str``) and gaps (``None``), in order.
        options: Letters that may fill gaps or extend the word.
        corpus: Newline-separated text to search.

    Yields:
        Each qualifying corpus entry exactly once, in order of first match.
    """
    pattern = re.compile(build_regex(query, options))

    # Letters contributed by the query itself (everything that is not a gap).
    n_fixed = len(query) - num_none(query)
    # A proposal must use at least one option. Counting from the fixed letters
    # (not from len(query)) keeps words that only fill the gaps of the query,
    # i.e. words exactly as long as the query.
    min_length = n_fixed + 1
    # ...and it cannot use more options than are available.
    max_length = n_fixed + len(options)

    fixed_letters = "".join(slot for slot in query if slot)
    available_counts = char_counts(fixed_letters + "".join(options))

    seen = set()
    for match_obj in pattern.finditer(corpus):
        proposal = corpus_item(match_obj, corpus)
        if proposal in seen:
            continue
        if not min_length <= len(proposal) <= max_length:
            continue
        if not le_counts(char_counts(proposal), available_counts):
            continue
        seen.add(proposal)
        yield proposal

In [3]:
# @title Search!

# Each query is a tuple of slots: a string slot is matched literally, a None
# slot is filled with one of ``options``.
# The trailing comma matters: ("h") is just the string "h", ("h",) is a tuple.
queries = [("h",), ("n", None, "a")]
# Letters that may be used to fill gaps and to extend the word.
options = ("e", "t", "r", "f", "a")
for query in queries:
    for string in scrabble(query, options, text):
        print(f'{query=} | {string=}')

NameError: ignored