In [82]:
import spacy
import warnings

from IPython.display import display
from spacy.lang.en import English
from spacy.attrs import ORTH
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy.tokens.token import Token

# NOTE(review): with no category or module filter this silences every warning for the
# rest of the session, including deprecation notices; consider a narrower filter.
warnings.filterwarnings("ignore")

In [83]:
# Create a blank English pipeline; its tokenizer is what the following cells exercise.
nlp: English = spacy.blank("en")

In [84]:
# Sample sentence containing an abbreviation ("Dr."), a currency symbol and a final period.
doc: Doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only two $ per plate.")
# Alternative input to experiment with (quotes, a contraction and an abbreviation);
# uncomment to replace the sentence above:
# doc: Doc = nlp('"Let\'s go to N.Y.!"')

## Word Tokenization

In [85]:
# Iterating over a Doc yields its tokens in order; show the text of each one.
for token in doc:
    print(token.text)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
two
$
per
plate
.


In [86]:
# Integer indexing picks out a single token; slicing yields a span of tokens.
token: Token = doc[0]
token_span: Span = doc[1:5]

print(token)
print(token_span)

Dr.
Strange loves pav bhaji


In [87]:
# Report the index and a few lexical flags for every token in the document.
for token in doc:
    print(
        f"Token: {token}\n"
        f"\tIndex: {token.i}\n"
        f"\tis_alpha: {token.is_alpha}\n"
        f"\tis_punct: {token.is_punct}\n"
        f"\tlike_num: {token.like_num}\n"
        f"\tis_currency: {token.is_currency}"
    )

Token: Dr.
	Index: 0
	is_alpha: False
	is_punct: False
	like_num: False
	is_currency: False
Token: Strange
	Index: 1
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: loves
	Index: 2
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: pav
	Index: 3
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: bhaji
	Index: 4
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: of
	Index: 5
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: mumbai
	Index: 6
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: as
	Index: 7
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: it
	Index: 8
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: costs
	Index: 9
	is_alpha: True
	is_punct: False
	like_num: False
	is_currency: False
Token: only
	Index: 10
	is_alpha: True
	is_punct: False
	like_num: False
	is_cu

### A Use Case: Extracting Email Addresses

In [88]:
# Read the roster as a list of lines. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale.
with open("students.txt", mode="r", encoding="utf-8") as f:
    text: list[str] = f.readlines()

display(text)

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [89]:
# Stitch the lines read from students.txt back into one string so the whole file
# can be handed to nlp() in a single call.
# NOTE(review): this rebinds `text` from list[str] to str; a distinct name would keep
# the two annotations from contradicting each other.
text: str = "".join(text)
print(text)

Dayton high school, 8th grade students information

Name	birth day   	email
-----	------------	------
Virat   5 June, 1882    virat@kohli.com
Maria	12 April, 2001  maria@sharapova.com
Serena  24 June, 1998   serena@williams.com 
Joe      1 May, 1997    joe@root.com






In [90]:
# Tokenize the whole roster in one pass.
doc = nlp(text)

# Keep only the tokens whose like_email flag is set.
emails: list[Token] = [token for token in doc if token.like_email]

display(emails)

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

## A Sinhala Language Model

In [91]:
# Build a separate blank Sinhala pipeline under its own names. Rebinding `nlp`/`doc`
# here would contradict the `nlp: English` annotation and would leave the English
# cells tokenizing with Sinhala rules if the notebook were run out of order.
nlp_si = spacy.blank("si")

doc_si: Doc = nlp_si("මගේ නම දුලින පෙරේරා. ඔබේ නම කුමක්ද?")

for token in doc_si:
    print(token)

මගේ
නම
දුලින
පෙරේරා
.
ඔබේ
නම
කුමක්ද
?


## Customize Tokenization Rules

In [92]:
# Fresh blank English pipeline for the tokenizer-customisation demo.
nlp = spacy.blank("en")

# Baseline run, before any special-case rule is registered.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens: list[str] = [tok.text for tok in doc]

display(tokens)

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [93]:
# Register a tokenizer exception that splits "gimme" into the two pieces "gim" + "me".
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

# Re-tokenize the same sentence to see the rule take effect.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens: list[str] = [tok.text for tok in doc]

display(tokens)

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

## Sentence Tokenization

In [94]:
# The blank pipeline has no sentence-boundary component, so add the rule-based
# sentencizer. The guard keeps the cell re-runnable: adding a component whose name
# is already in the pipeline raises an error.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi.")

# doc.sents yields one span per detected sentence.
for sent in doc.sents:
    print(sent)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi.
