In [1]:
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# A blank English pipeline: it is created without loading any trained model.
nlp = spacy.blank("en")

doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")

# Emit the text of every token on its own line.
print(*(token.text for token in doc), sep="\n")

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [5]:
# Inspect the class of the pipeline object returned by spacy.blank("en").
type(nlp)

spacy.lang.en.English

In [4]:
# Inspect the class of the object produced by calling nlp on a string.
type(doc)

spacy.tokens.doc.Doc

Calling `nlp` on the text has already performed word tokenization. A `Doc` object is a sequence of `Token` objects.

Accessing tokens by index:

In [3]:
# Integer indexing on a Doc returns the token at that position.
doc[0]

Dr.

In [11]:
# Slicing a Doc yields a Span rather than a list of tokens.
subset = doc[0:5]
print(subset)
# The last expression is displayed as the cell result: the slice's type.
type(subset)

Dr. Strange loves pav bhaji


spacy.tokens.span.Span

In [12]:
# A single element of a Doc is a Token.
type(doc[0])

spacy.tokens.token.Token

Each token object has many attributes. For example, token.text returns the string representation of the token. token.lemma_ returns the lemma of the token. token.pos_ returns the coarse-grained POS tag of the token. token.tag_ returns the fine-grained POS tag of the token. token.dep_ returns the dependency label of the token. token.shape_ returns the shape of the token. token.is_alpha returns True if the token consists of alphabetic characters. token.is_stop returns True if the token is a stop word. Note that a blank pipeline contains only the tokenizer, so attributes that are predicted by trained components (lemma_, pos_, tag_, dep_) are empty here; lexical attributes such as is_alpha, like_num and is_currency work without a trained model.

In [13]:
# Tokenize a new sentence; this rebinds `doc`, replacing the earlier example.
doc = nlp("Tony gave two $ to Peter.")

In [14]:
# Keep a handle on the first token so its attributes can be explored below.
token0 = doc[0]
token0

Tony

In [15]:
# List every attribute and method name available on the Token object.
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [19]:
# like_num tells whether the token resembles a number; "Tony" does not.
token0.like_num

False

In [20]:
# Third token of "Tony gave two $ to Peter." (indices start at 0).
token2 = doc[2]
token2

two

In [21]:
# A number written out as a word ("two") is still reported as number-like.
token2.like_num

True

In [22]:
# Fourth token of the sentence: the "$" symbol.
token3 = doc[3]
token3

$

In [23]:
# is_currency flags currency symbols such as "$".
token3.is_currency

True

In [24]:
# Print one summary line per token: its position in the Doc followed by a few
# lexical flags, each preceded by its label.
for tok in doc:
    labelled_flags = (
        "is_alpha:", tok.is_alpha,
        "is_punct:", tok.is_punct,
        "like_num:", tok.like_num,
        "is_currency:", tok.is_currency,
    )
    print(tok, "==>", "index: ", tok.i, *labelled_flags)

Tony ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
gave ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
two ==> index:  2 is_alpha: True is_punct: False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
to ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
Peter ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
. ==> index:  6 is_alpha: False is_punct: True like_num: False is_currency: False


Information retrieval from a file:

In [25]:
# Read the roster as a list of lines. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default locale encoding.
with open("students.txt", "r", encoding="utf-8") as file:
    text = file.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

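We need to join the lines of the text file together into a single string before passing it to `nlp`. Once the text is tokenized, the `like_email` token attribute lets us pick out the email addresses. (Sentence tokenization with the sentencizer component of spaCy is covered further below.)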

In [26]:
# Concatenate the list of lines into one string. Each line already ends with its
# own newline, so an empty separator is used. Note that `text` changes from a
# list to a str here.
text = "".join(text)
text



In [27]:
# Tokenize the whole file content as a single document.
doc = nlp(text)
# Keep the Token objects (not just their text) that look like email addresses.
emails = [token for token in doc if token.like_email]
emails

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

In [28]:
# Tokenize with the default rules: "gimme" comes out as a single token.
doc = nlp("gimme double cheese extra large healthy pizza")

tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In the example above "gimme" is a single token. It is a contraction of "give" and "me". We need to customize the tokenizer to split such contractions. However, a special case is not allowed to change the actual text: the pieces must join back into the original string. So we will split it into "gim" and "me".

In [30]:
# ORTH is the attribute key used to give the exact text of each piece.
from spacy.symbols import ORTH

# Register a tokenizer rule: whenever "gimme" is seen, emit "gim" and "me".
# This mutates the shared `nlp` object, so it affects every later nlp(...) call.
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

In [31]:
# Same sentence as before, tokenized again now that the special case is registered.
doc = nlp("gimme double cheese extra large healthy pizza")

tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

Since a blank model provides only a tokenizer, we need to add the sentencizer component to it before we can split text into sentences. We can use nlp.add_pipe() to add a component to the pipeline.

In [32]:
# add_pipe raises ValueError when a component with the same name is already in
# the pipeline, so guard the call to keep this cell safe to re-run.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
# Display the component either way, as the unguarded add_pipe call did.
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1917fd8ec00>

In [33]:
# Names of the components currently in the pipeline (the tokenizer is not listed).
nlp.pipe_names

['sentencizer']

In [34]:
doc = nlp("This is the first sentence. This is another sentence. This is the last sentence.")

# doc.sents yields the sentences in order; unpack them so each is printed on its own line.
print(*doc.sents, sep="\n")

This is the first sentence.
This is another sentence.
This is the last sentence.


Exercises:

In [37]:
# Exercise 1: extract the URLs from a paragraph.
# The sample contains URLs followed directly by sentence punctuation and one URL
# that is broken across a line break ("http://www.science." / "gov/").
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc = nlp(text)
# Keep the text of every token flagged as URL-like. The check is made per token,
# so (as the recorded output shows) a match can keep a trailing "." and the URL
# split across two lines is only captured up to the line break.
urls = [token.text for token in doc if token.like_url]
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [40]:
# Exercise 2: print every amount that is immediately followed by a currency symbol.
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

doc = nlp(transactions)

# Walk the tokens while looking one token ahead. The loop stops at the
# second-to-last token so that doc[i + 1] is always a valid index; otherwise a
# text ending in a number would raise IndexError.
i = 0
while i < len(doc) - 1:
    token = doc[i]
    next_token = doc[i + 1]
    if token.like_num and next_token.is_currency:
        print(token.text, next_token.text)
        # Skip past the currency symbol that was just consumed.
        i += 2
    else:
        i += 1

two $
500 €
