# Writing a stricter tokenizer for the blank German spaCy Model

## Tokenizer Theory Notes

Tokenizer exception: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied.

Prefix: Character(s) at the beginning, e.g. $, (, “, ¿.

Suffix: Character(s) at the end, e.g. km, ), ”, !.

Infix: Character(s) in between, e.g. -, --, /, ….

In [1]:
# === SETUP ===

# ---Importing---
import os
from pathlib import Path


# ---Setting Path---
# Print the current working directory of the notebook kernel.
print(os.getcwd())

# `path` is the project folder the later cells save models under. Keep exactly
# one assignment active. The alternatives are written as Path(...) because the
# saving cells build sub-paths with the "/" operator, which a plain string
# does not support.

# Coco Path
path = Path("C:/Users/CocoL/Universität St.Gallen/STUD-Capstoneproject Tell 6 - General/02-Coding")

# Jona Path
# path = Path("/Users/jhoff/Universität St.Gallen/STUD-Capstoneproject Tell 6 - Dokumente/General/02-Coding")

# Giovanni Path
# path = Path("/Users/jonathanebner/Universität St.Gallen/STUD-Capstoneproject Tell 6 - General/02-Coding")

# Leo Path
# path = Path("/Users/Leonidas/Universität St.Gallen/STUD-Capstoneproject Tell 6 - General/02-Coding")

c:\Users\CocoL\OneDrive - Universitaet St.Gallen\2-Academics\Bachelor\7-Semester\Capstone\ReciParse_Scripts\04_building models\tokenizer


In [None]:
# === VARIANT 1 ===

# ---Importing---
from spacy.lang.de import German

# Fresh German language object; its tokenizer is still the stock one here.
nlp = German()


# ---Before---
# Baseline output: how the unmodified tokenizer splits the sample sentences.
print("VORHER:")
print(list(nlp("bedeckt sein).Im ca. 180° vorgeheizten Ofen bei")))  # "sein).Im" is the bad case
print(list(nlp("1.Obst waschen(sauber).Und dann ")))
print(list(nlp("1.)Herdäpfel sind geil.2)Deine Mum auch.")))

# ---Change Tokenizer---
import re
def custom_tokenizer(nlp):
    """Build a stricter tokenizer for *nlp* (variant 1).

    The language defaults of *nlp* are kept and extended with:

    * prefixes: a digit followed by a period (e.g. ``1.Obst``) and a period;
    * infixes: three or more periods, a hyphen between two digits, any of
      ``! & : , ( )``, and a single period.

    Suffix rules and tokenizer exceptions are the unmodified defaults.

    Args:
        nlp: Language object whose ``vocab`` and ``Defaults`` are used.

    Returns:
        A new ``Tokenizer``. The caller has to assign it to ``nlp.tokenizer``.
    """
    # The surrounding cell only imports ``German``; without these two imports
    # the names ``spacy`` and ``Tokenizer`` are unbound and the call raises
    # NameError.
    import spacy
    from spacy.tokenizer import Tokenizer

    my_prefixes = [r'[0-9]\.', r"\.", r"^\."]
    all_prefixes_re = spacy.util.compile_prefix_regex(
        tuple(list(nlp.Defaults.prefixes) + my_prefixes)
    )

    # Handle "(" (and the other listed characters) that do not have proper
    # spacing around them.
    custom_infixes = [r'\.\.\.+', r'(?<=[0-9])-(?=[0-9])', r'[!&:,()]', r"\."]
    infix_re = spacy.util.compile_infix_regex(
        tuple(list(nlp.Defaults.infixes) + custom_infixes)
    )

    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)

    # token_match is passed as None explicitly, exactly as before.
    return Tokenizer(
        nlp.vocab,
        nlp.Defaults.tokenizer_exceptions,
        prefix_search=all_prefixes_re.search,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
        token_match=None,
    )

# Swap the custom tokenizer into the language object.
nlp.tokenizer = custom_tokenizer(nlp)

# ---After---
# Same sample sentences as in the "before" section, now with the new rules.
print("\n\nNACHER:")
print(list(nlp("bedeckt sein).Im ca. 180° vorgeheizten Ofen bei")))
print(list(nlp("1.Obst waschen(sauber).Und dann ")))
print(list(nlp("1.)Herdäpfel sind geil.2)Deine Mum auch.")))

# ---Saving---
# Write the language object (including the replaced tokenizer) to disk.
nlp.to_disk(path.joinpath("04-Models", "Custom Tokenizer", "de_custom_v1"))

In [None]:
# === VARIANT 2 ===

# ---Importing---
from spacy.lang.de import German

# Start again from a fresh German language object with the stock tokenizer.
nlp = German()


# ---Before---
# Baseline output: how the unmodified tokenizer splits the sample sentences.
print("VORHER:")
print(list(nlp("bedeckt sein).Im ca. 180° vorgeheizten Ofen bei")))  # "sein).Im" is the bad case
print(list(nlp("1.Obst waschen(sauber).Und dann ")))
print(list(nlp("1.)Herdäpfel) sind geil.2)Deine Mum auch.")))


# ---Change Tokenizer---
import re
def custom_tokenizer(nlp):
    """Build a stricter tokenizer for *nlp* (variant 2).

    Prefix and suffix rules and the tokenizer exceptions are the unmodified
    language defaults of *nlp*. Only the infix rules are extended, with one
    character class that matches any of ``! & : , ( ) .``.

    Args:
        nlp: Language object whose ``vocab`` and ``Defaults`` are used.

    Returns:
        A new ``Tokenizer``. The caller has to assign it to ``nlp.tokenizer``.
    """
    # The surrounding cell only imports ``German``; without these two imports
    # the names ``spacy`` and ``Tokenizer`` are unbound and the call raises
    # NameError.
    import spacy
    from spacy.tokenizer import Tokenizer

    # No extra prefixes in this variant: the defaults are compiled as they are.
    all_prefixes_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)

    # Handle "(", ")" and "." (and the other listed characters) that do not
    # have proper spacing around them.
    custom_infixes = [r'[!&:,\(\)\.]']
    infix_re = spacy.util.compile_infix_regex(
        tuple(list(nlp.Defaults.infixes) + custom_infixes)
    )

    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)

    # token_match is passed as None explicitly, exactly as before.
    return Tokenizer(
        nlp.vocab,
        nlp.Defaults.tokenizer_exceptions,
        prefix_search=all_prefixes_re.search,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
        token_match=None,
    )

# Swap the custom tokenizer into the language object.
nlp.tokenizer = custom_tokenizer(nlp)


# ---After---
# Same sample sentences as in the "before" section, now with the new rules.
print("\n\nNACHER:")
print(list(nlp("bedeckt sein).Im ca. 180° vorgeheizten Ofen bei")))
print(list(nlp("1.Obst waschen(sauber).Und dann ")))
print(list(nlp("1.)Herdäpfel) sind geil.2)Deine Mum auch.")))

# ---Saving---
# Write the language object (including the replaced tokenizer) to disk.
nlp.to_disk(path.joinpath("04-Models", "Custom Tokenizer", "de_custom_v2"))

In [None]:
# === RANDOM TESTING CELL ===

# German.Defaults.prefixes = German.Defaults.prefixes + tuple([r'[0-9]\.'])

# German.Defaults.infixes = German.Defaults.infixes + tuple(['\.\.\.+'])
# German.Defaults.infixes = German.Defaults.infixes + tuple(['(?<=[0-9])-(?=[0-9])'])
# German.Defaults.infixes = German.Defaults.infixes + tuple(['[!&:,()]'])
# nlp.Defaults.prefixes == German.Defaults.prefixes 