In [45]:
from typing import Any, List, Union
import spacy
from spacy.language import Language


def ensure_doc(
    input: Union[str, List[str], spacy.tokens.doc.Doc], nlp: Union[Language, str], batch_size: int = 1000
) -> spacy.tokens.doc.Doc:
    """Converts string or list inputs to spaCy docs.

    Args:
        input: A string, list of tokens, or a spaCy doc.
        nlp: The language model to use.
        batch_size: The number of texts to accumulate in an internal buffer.

    Returns:
        A spaCy doc, unannotated if derived from a string or list of tokens.
    """
    if isinstance(input, spacy.tokens.doc.Doc):
        return input
    else:
        if isinstance(nlp, str):
            nlp = spacy.load(nlp)
        if isinstance(input, str):
            return list(nlp.tokenizer.pipe([input], batch_size=batch_size))[0]
        elif isinstance(input, list):
            return list(nlp.tokenizer.pipe([" ".join(input)], batch_size=batch_size))[0]
        else:
            raise Exception("Bad data type.")


In [46]:
doc2 = ensure_doc(123, "en_core_web_sm")
print(type(doc2))
for t in doc2:
    print(t.text)


Exception: Bad data type.

In [47]:
from typing import Iterable

isinstance(3.2, Iterable)

False

In [63]:
from typing import Iterable
from pydantic import (
    BaseModel,
    ValidationError,
)
import spacy
nlp = spacy.load("en_core_web_sm")

from typing import Protocol

class Windows(Protocol):
    windows = 1

class AveragesModel(BaseModel):
    patterns: Union[list, str]
    windows: Windows
    search_method: str
    nlp: spacy.vocab.Vocab

    class Config:
        arbitrary_types_allowed = True



In [64]:
a = AveragesModel(patterns=1, windows=[], search_method="x", nlp=nlp.vocab)

ValidationError: 1 validation error for AveragesModel
windows
  instance of Windows expected (type=type_error.arbitrary_type; expected_arbitrary_type=Windows)