In [1]:
import random
import spacy

# --- spaCy model ---
# One-time setup (run in a shell): python -m spacy download en_core_web_sm
# The pipeline is loaded once into the module-level `nlp`;
# choose_two_nouns_from_label() calls it to tag every non-empty label.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'spacy'" - spaCy has to be installed
# in the kernel's environment before this cell can succeed.
nlp = spacy.load("en_core_web_sm")

def choose_two_nouns_from_label(
    gt_text_label: str,
    *,
    seed: int | None = None,
    prefer_distinct: bool = True,
) -> tuple[str, str]:
    """
    Takes a ground-truth text label (the full joke) and returns exactly TWO nouns.

    Strategy (cheap + robust):
    1) Extract noun / proper noun lemmas via spaCy.
    2) Filter trivial lemmas.
    3) Prefer two distinct nouns. If not possible, duplicate the only one available.
    4) If zero nouns, fall back to content words (adjectives/verbs) or finally random tokens.

    Returns:
        (noun1, noun2)  # exactly two strings
    """
    if seed is not None:
        rnd = random.Random(seed)
    else:
        rnd = random

    text = (gt_text_label or "").strip()
    if not text:
        return ("", "")

    doc = nlp(text)

    # 1) noun candidates (lemmas)
    nouns = []
    for t in doc:
        if t.pos_ in {"NOUN", "PROPN"}:
            lemma = t.lemma_.strip().lower()
            if lemma and lemma.isalpha() and len(lemma) > 2:
                nouns.append(lemma)

    # 2) lightweight filtering (remove very generic junk)
    junk = {
        "thing", "stuff", "something", "anything", "everything",
        "someone", "anyone", "everyone",
    }
    nouns = [n for n in nouns if n not in junk]

    if nouns:
        if prefer_distinct:
            unique = list(dict.fromkeys(nouns))  # preserve order, unique
            if len(unique) >= 2:
                # pick two distinct nouns (random but deterministic if seed set)
                w1, w2 = rnd.sample(unique, 2)
                return (w1, w2)
            else:
                # only one unique noun exists
                return (unique[0], unique[0])
        else:
            # allow duplicates / frequency-based sampling
            if len(nouns) >= 2:
                return tuple(rnd.sample(nouns, 2))  # type: ignore
            return (nouns[0], nouns[0])

    # Fallback A: take content words (adjectives/verbs) if no nouns exist
    content = []
    for t in doc:
        if t.pos_ in {"ADJ", "VERB"}:
            lemma = t.lemma_.strip().lower()
            if lemma and lemma.isalpha() and len(lemma) > 2:
                content.append(lemma)

    if content:
        unique = list(dict.fromkeys(content))
        if len(unique) >= 2:
            return tuple(rnd.sample(unique, 2))  # type: ignore
        return (unique[0], unique[0])

    # Fallback B: any alphabetic tokens
    tokens = [t.text.lower() for t in doc if t.text.isalpha() and len(t.text) > 2]
    if len(tokens) >= 2:
        return tuple(rnd.sample(tokens, 2))  # type: ignore
    if len(tokens) == 1:
        return (tokens[0], tokens[0])

    return ("", "")

ModuleNotFoundError: No module named 'spacy'

In [4]:
# Smoke test: passing a fixed seed makes the function use its own
# random.Random(seed), so the sampled pair is repeatable for the same tagged tokens.
label = "I tried to teach my cactus to swim, but the submarine filed a complaint."
print(choose_two_nouns_from_label(label, seed=42))

('complaint', 'cactus')
