In [2]:
#ProcessingPipeLineComponent
#tokenization

import spacy

nlp = spacy.load('en_core_web_md')
doc = nlp('I own a ginger cat.')
tokenized_doc = [token.text for token in doc]
print(tokenized_doc)


['I', 'own', 'a', 'ginger', 'cat', '.']


In [3]:
import spacy
nlp = spacy.load('en_core_web_md')
doc = nlp("It's been a crazy week!!!")
print([token.text for token in doc])

['It', "'s", 'been', 'a', 'crazy', 'week', '!', '!', '!']


In [4]:
#add special case rule to an existing tokenizer

import spacy
from spacy.symbols import ORTH
nlp = spacy.load('en_core_web_md')
doc = nlp('lemme that')
print([w.text for w in doc])

['lemme', 'that']


In [6]:
special_case = [{ORTH: "lem"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("lemme", special_case)
print([w.text for w in nlp("lemme that")])

['lem', 'me', 'that']


In [7]:
print([w.text for w in nlp("lemme!!")])

['lem', 'me', '!', '!']


In [8]:
#debugging the tokenizer

import spacy
nlp = spacy.load("en_core_web_md")
text = "Let's go!"
doc = nlp(text)

tok_exp = nlp.tokenizer.explain(text)
for t in tok_exp:
    print(t[1], "\t", t[0])

Let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX


In [10]:
#sentence segmentation
import spacy
nlp = spacy.load('en_core_web_md')
text = "I flied to N.Y yesterday. It was around 5pm"

doc = nlp(text)
for sent in doc.sents:
    print(sent.text)

I flied to N.Y yesterday.
It was around 5pm


In [12]:
#understanding lemmatization
import spacy
nlp = spacy.load('en_core_web_md')
doc = nlp("I went there for working and worked for 3 years.")
for token in doc:
    print(token.text, token.lemma_)

I I
went go
there there
for for
working working
and and
worked work
for for
3 3
years year
. .


In [23]:
#lemmatization in NLU
import spacy
from spacy.symbols import ORTH

nlp = spacy.load('en_core_web_md')
special_case = [{ORTH: 'Angeltown', ORTH: 'Los Angeles'}]

nlp.tokenizer.add_special_case('Angeltown', special_case)

doc = nlp('I am flying to Angeltown')
for token in doc:
    print(token.text, token.lemma_)

ValueError: [E997] Tokenizer special cases are not allowed to modify the text. This would map 'Angeltown' to 'LosAngeles' given token attributes '[{65: 'LosAngeles'}]'.

 spacy container objects

In [26]:
#Doc
import spacy
nlp = spacy.load('en_core_web_md')

doc = nlp("I like cats.")
doc.text

'I like cats.'

In [27]:
for token in doc:
    print(token.text)

I
like
cats
.


In [28]:
doc[1]

like

In [30]:
len(doc) #number of token

4

In [31]:
doc = nlp("This is a first sentence. This is the second sentence/")
doc.sents

<generator at 0x16e6e8c0ae0>

In [33]:
sentences = list(doc.sents)
print(sentences)

[This is a first sentence., This is the second sentence/]


In [34]:
doc = nlp("I flied to New York with Ashley.")
doc.ents

(New York, Ashley)

In [38]:
doc = nlp("Sweet brown fox jumped over the fence.")
list(doc.noun_chunks)

[Sweet brown fox, the fence]

In [39]:
doc.lang_

'en'

In [41]:
doc = nlp('Hi')
json_doc = doc.to_json()
print(json_doc)

{'text': 'Hi', 'ents': [], 'sents': [{'start': 0, 'end': 2}], 'tokens': [{'id': 0, 'start': 0, 'end': 2, 'tag': 'UH', 'pos': 'INTJ', 'morph': '', 'lemma': 'hi', 'dep': 'ROOT', 'head': 0}]}


In [42]:
#Token
#class: token.text, token.text_with_ws, token.i, token.idx, token.doc, token.sent, token.is_sent_start, token.ent_type

doc = nlp("Hello Madam!")
doc[0]

Hello

In [43]:
doc[0].text

'Hello'

In [44]:
doc[0].text_with_ws

'Hello '

In [45]:
doc[2].text_with_ws

'!'

In [46]:
len(doc[0])

5

In [47]:
token = doc[2]
token.i

2

In [48]:
doc[0].idx

0

In [49]:
doc[1].idx

6

In [50]:
token = doc[0]
token.doc

Hello Madam!

In [52]:
token = doc[1]
token.sent

Hello Madam!

In [53]:
doc = nlp("He entered the room. Then he nodded.")
doc[0].is_sent_start

True

In [54]:
doc[5].is_sent_start

True

In [56]:
doc[6].is_sent_start

False

In [57]:
#lemmatize
doc = nlp("I went there.")
doc[1].lemma_

'go'

In [58]:
doc = nlp("President Trump visited Mexico City.")
doc.ents

(Trump, Mexico City)

In [59]:
doc[1].ent_type_

'PERSON'

In [60]:
doc[3].ent_type_

'GPE'

In [61]:
doc[4].ent_type_

'GPE'

In [62]:
doc[0].ent_type_

''

In [63]:
#Span

doc = nlp("I know that you have been to USA.")
doc[2:4]

that you

In [1]:
import spacy
nlp = spacy.load('en_core_web_md')

In [3]:
doc = nlp("President visited Mexico City")
doc[3:]

City

In [4]:
doc[2: -1]

Mexico

In [5]:
doc[6:]



In [6]:
doc = nlp("You love Atlanta sice you're 20.")
doc.char_span(4, 16)

love Atlanta

In [8]:
doc = nlp('You went there after you saw me.')
span = doc[2:4]
for token in span:
    print(token)

there
after


In [9]:
doc = nlp("Hello Madam!")
span = doc[1:2]
len(span)

1

In [10]:
doc = nlp("You went there after you saw me.")
span = doc[2:6]
span

there after you saw

In [12]:
subspan = span[1:3]
subspan

after you

In [13]:
doc = nlp("You went there after you saw me.")
span = doc[2:6]
span.start

2

In [14]:
span.end

6

In [15]:
span.start_char

9

In [16]:
span.end_char

28

In [17]:
doc = nlp("You went there after you saw me.")
span = doc[2:6]
type(span)

spacy.tokens.span.Span

In [20]:
small_doc = span.as_doc()
type(small_doc)

spacy.tokens.doc.Doc

More Spacy Features

In [21]:
doc = nlp("Hello, hi!")
doc[0].lower_

'hello'

In [23]:
doc = nlp("HELLO, Hello, hello, hEllO")
doc[0].is_upper


True

In [24]:
doc[0].is_lower

False

In [25]:
doc[1].is_upper

False

In [26]:
doc[1].is_lower

False

In [27]:
doc[1].is_alpha

False

In [28]:
doc = nlp("Cat and Cat123")
doc[0].is_alpha

True

In [29]:
doc[2].is_alpha

False

In [30]:
doc = nlp("Hamburg and Göttingen")
doc[0].is_ascii

True

In [31]:
doc[2].is_ascii

False

In [32]:
doc = nlp("Cat Cat123 123")
doc[0].is_digit

False

In [33]:
doc[1].is_digit

False

In [34]:
doc[2].is_digit

True

In [35]:
doc = nlp("You, him and Sally")
doc[1]

,

In [36]:
doc[1].is_punct

True

In [48]:
doc = nlp("([He said yes,])")
doc[0]

(

In [49]:
doc[0].is_left_punct

True

In [50]:
doc[1]

[

In [51]:
doc[1].is_left_punct

True

In [52]:
doc[-1]

)

In [53]:
doc[-1].is_right_punct

True

In [54]:
doc[-2]

]

In [55]:
doc[-2].is_right_punct

True

In [56]:
doc = nlp(" ")
doc[0]

 

In [57]:
len(doc[0])

1

In [58]:
doc[0].is_space

True

In [59]:
doc = nlp("(You said [1] and {2} is not applicable.)")
doc[0].is_bracket, doc[-1].is_bracket

(True, True)

In [60]:
doc[3].is_bracket, doc[5].is_bracket

(True, True)

In [61]:
doc[7].is_bracket, doc[9].is_bracket

(True, True)

In [62]:
doc = nlp("( You said '1\" is not applicable)")
doc[3]

'

In [63]:
doc[3].is_quote

True

In [64]:
doc[5]

"

In [65]:
doc[5].is_quote

True

In [66]:
doc = nlp("I paid 12$ for the tshirt.")
doc[3]

$

In [67]:
doc[3].is_currency

True

In [68]:
doc = nlp("I emailed you at least 100 times")
doc[-2]


100

In [69]:
doc[-2].like_num

True

In [70]:
doc = nlp("I emailed you at least hundred times")
doc[-2]


hundred

In [72]:
doc[-2].like_num

True

In [73]:
doc = nlp("My email is duygu@packt.com and you can visit me under https://duygua.github.io any time you want.")
doc[3]

duygu@packt.com

In [74]:
doc[3].like_email

True

In [75]:
doc[10]

https://duygua.github.io

In [76]:
doc[10].like_url

True

In [77]:
doc = nlp("Girl called Kathy has a nickname Cat123.")
for token in doc:
    print(token.text, token.shape_)

Girl Xxxx
called xxxx
Kathy Xxxxx
has xxx
a x
nickname xxxx
Cat123 Xxxddd
. .


In [78]:
doc = nlp("I visited Jenny at Mynks Resort.")
for token in doc:
    print(token, token.is_oov)

I False
visited False
Jenny False
at False
Mynks True
Resort False
. False


In [79]:
doc = nlp("I just wnat to inform you that I was with the principle.")
for token in doc:
    print(token, token.is_stop)

I True
just True
wnat False
to True
inform False
you True
that True
I True
was True
with True
the True
principle False
. False
