In [4]:

import spacy
# Load the small English pipeline and run it over one sample sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple isn't looking at buying U.K. startup for $1 billion")

# Show the surface text of every token, one per line.
print(*[tok.text for tok in doc], sep="\n")

Apple
is
n't
looking
at
buying
U.K.
startup
for
$
1
billion


## Tokenization

In [5]:
# Per-token annotations: text, lemma, coarse and fine-grained POS tags,
# dependency label, orthographic shape, and the is_alpha / is_stop flags.
for tok in doc:
    annotations = (
        tok.text, tok.lemma_, tok.pos_, tok.tag_,
        tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop,
    )
    print(*annotations)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
n't not PART RB neg x'x False True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP nsubj X.X. False False
startup startup VERB VBD ccomp xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [7]:
# List every named entity with its character offsets and entity label.
for entity in doc.ents:
    entity_row = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*entity_row)

Apple 0 5 ORG
U.K. 30 34 GPE
$1 billion 47 57 MONEY


In [None]:
from spacy.symbols import ORTH
# Tokenization before any custom rule has been registered.
doc = nlp("gimme that")
print([tok.text for tok in doc])

# Register a special case that splits "gimme" into the pieces "gim" + "me".
special_case = [{ORTH: piece} for piece in ("gim", "me")]
nlp.tokenizer.add_special_case("gimme", special_case)

# Tokenize the same text again to see the effect of the new rule.
print([tok.text for tok in nlp("gimme that")])

['gimme', 'that']
['gim', 'me', 'that']


In [10]:
# Debug view: pairs each token with the tokenizer rule or pattern that produced it.
nlp.tokenizer.explain("Let's Go!")

[('SPECIAL-1', 'Let'), ('SPECIAL-2', "'s"), ('TOKEN', 'Go'), ('SUFFIX', '!')]

In [11]:
from spacy.lang.en import English

# Use a bare English pipeline for the tokenizer-explain demo.
nlp = English()
text = '''"Let's go!"'''
doc = nlp(text)
tok_exp = nlp.tokenizer.explain(text)
# explain() must agree with the real tokenizer output (whitespace tokens aside).
assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
# Each entry is (rule or pattern name, token text); print the text first.
for t in tok_exp:
    # "\t" is a real tab; the previous "\\t" printed a literal backslash-t.
    print(t[1], "\t", t[0])

" \t PREFIX
Let \t SPECIAL-1
's \t SPECIAL-2
go \t TOKEN
! \t SUFFIX
" \t SUFFIX


In [16]:
import re
import spacy
from spacy.tokenizer import Tokenizer

# Special-case rule: keep the ":)" emoticon together as a single token.
special_cases = {":)": [{"ORTH": ":)"}]}
# A leading [, (, " or ' is split off as a prefix. Inside a raw string a single
# backslash is the regex escape; the doubled form also matched a literal
# backslash and left an unescaped "[" in the class (FutureWarning: nested set).
prefix_re = re.compile(r'''^[\[\("']''')
# Suffix counterpart, left disabled; pattern corrected the same way so it
# compiles if it is ever re-enabled.
# suffix_re = re.compile(r'''[\]\)"']$''')
# Split tokens on "-" and "~" wherever they occur inside a string.
infix_re = re.compile(r'''[-~]''')
# Anything starting with http:// or https:// is treated as a URL.
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    """Return a ``Tokenizer`` on ``nlp.vocab`` wired to the module-level rules.

    Uses ``special_cases`` as the rule table and the compiled ``prefix_re``,
    ``infix_re`` and ``simple_url_re`` patterns; no suffix search is set.
    """
    tokenizer_options = {
        "rules": special_cases,
        "prefix_search": prefix_re.search,
        # "suffix_search": suffix_re.search,
        "infix_finditer": infix_re.finditer,
        "url_match": simple_url_re.match,
    }
    return Tokenizer(nlp.vocab, **tokenizer_options)

In [17]:
# Swap in the custom tokenizer and try it on a hyphenated word plus an emoticon.
nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp("hello-world. :)")
print(list(tok.text for tok in doc))

['hello', '-', 'world.', ':)']


In [18]:
# Append a pattern for a trailing run of hyphens to the default suffixes,
# recompile, and install the result on the active tokenizer.
suffixes = nlp.Defaults.suffixes + [r"-+$"]
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search

In [None]:
# Work on a copy so the shared defaults on nlp.Defaults are not mutated.
suffixes = list(nlp.Defaults.suffixes)
# The original literal "\" was an unterminated string (SyntaxError).
# NOTE(review): assumed the intended target is the "[" suffix pattern, stored
# escaped as \[ -- this follows spaCy's documented example; confirm.
suffixes.remove("\\[")

In [None]:
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
# Load a fresh pipeline (rebinding `nlp`) and check how a hyphenated
# compound is split.
nlp = spacy.load("en_core_web_sm")
doc = nlp("mother-in-lay")
print(list(tok.text for tok in doc))

['mother', '-', 'in', '-', 'lay']


In [None]:
# Modify tokenizer infix patterns.
# NOTE: this cell only builds the pattern list; it has no effect on `nlp`
# until it is compiled (spacy.util.compile_infix_regex) and the result's
# `finditer` is assigned to `nlp.tokenizer.infix_finditer`.
infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # Operator between digits. With doubled backslashes, "\\-\\" parsed as
        # a backslash-to-backslash range and the hyphen was never matched.
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # Literal period between a lowercase and an uppercase letter; the
        # doubled form matched a backslash followed by any character.
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # Comma between two letters.
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # Commented out regex that splits on hyphens between letters:
        # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)


In [25]:
from spacy.tokens import Doc
# Build a Doc by hand from (word, followed-by-space) pairs.
words, spaces = map(
    list,
    zip(("Let", False), ("'s", True), ("go", False), ("!", False)),
)
doc = Doc(nlp.vocab, words=words, spaces=spaces)
# Print the reconstructed text, then the vocab's string store object.
print(doc, nlp.vocab.strings, sep="\n")

Let's go!
<spacy.strings.StringStore object at 0x76993e9e5380>


In [None]:
# basic whitespace tokenizer
class WhitespaceTokenizer:
    """Tokenizer that splits text on single space characters only.

    Calling an instance returns a ``Doc`` built on the stored vocab.
    """

    def __init__(self, vocab):
        # Vocabulary handed to every Doc this tokenizer creates.
        self.vocab = vocab

    def __call__(self, text):
        """Split ``text`` on " " and return the resulting ``Doc``."""
        pieces = text.split(" ")
        # An empty piece marks an extra space (leading, trailing or doubled);
        # it becomes a " " token that is not followed by a further space.
        words = [piece if piece != "" else " " for piece in pieces]
        spaces = [piece != "" for piece in pieces]

        if words[-1] != " ":
            # The last real word is never followed by a space.
            spaces[-1] = False
        else:
            # Text ended with a space: drop the placeholder token it produced.
            words = words[:-1]
            spaces = spaces[:-1]

        return Doc(self.vocab, words=words, spaces=spaces)
    
# Plug the whitespace tokenizer into a blank English pipeline and try it out.
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print(list(tok.text for tok in doc))

["What's", 'happened', 'to', 'me?', 'he', 'thought.', 'It', "wasn't", 'a', 'dream.']


In [None]:
from tokenizers import BertWordPieceTokenizer
from spacy.tokens import Doc

class BertTokenizer:
    def __init__(self, vocab, vocab_file, lowercase=True):

        self.vocab=vocab
        self._tokenizer=BertWordPieceTokenizer(vocab_file, lowercase=lowercase)

    def __call__(self, text):
        tokens=self._tokenizer.encode(text)
        words=[]
        spaces=[]
        for i, (text, (start,end)) in enumerate()
        
        
        
        