***Importing Libraries***

In [1]:
import spacy 

***Create blank language object and tokenize words in a sentence***

In [2]:
# Blank English pipeline: it tokenizes text but has no pipeline components added yet.
nlp = spacy.blank("en")

In [6]:
# Tokenize a sample sentence and print each token on its own line.
sentence = "Mr. Ahmed visited lahore and there he love the digital museum and the entrance fee was 150 for Pakistani's and 500 for other nationalities"
doc = nlp(sentence)

for word in doc:
    print(word)

Mr.
Ahmed
visited
lahore
and
there
he
love
the
digital
museum
and
the
entrance
fee
was
150
for
Pakistani
's
and
500
for
other
nationalities


***Indexing for Grabbing Tokens***

In [7]:
# A Doc can be indexed like a list; position 3 is the fourth token.
doc[3]

lahore

In [8]:
# Token at position 11 (zero-based).
doc[11]

and

In [56]:
# Tokenize a quoted sentence: the quotes, the contraction and the exclamation
# mark each come out as separate tokens.
quoted_sentence = '''"Let's go to the Zoo!"'''
doc = nlp(quoted_sentence)

for word in doc:
    print(word)

"
Let
's
go
to
the
Zoo
!
"


In [57]:
# Negative indices count from the end of the Doc; -1 is the last token.
doc[-1]

"

In [58]:
# Slicing selects a contiguous run of tokens (end index exclusive).
doc[2:7]

's go to the Zoo

In [59]:
# The pipeline object is an instance of the English language class.
type(nlp)

spacy.lang.en.English

In [60]:
# Calling the pipeline on a string produces a Doc.
type(doc)

spacy.tokens.doc.Doc

***Spanning***

In [61]:
# Take the first six tokens of the Doc as a slice and display it.
span = doc[:6]
span

"Let's go to the

In [62]:
# A slice of a Doc is a Span object.
type(span)

spacy.tokens.span.Span

***Token Attributes***

In [20]:
# Sample sentence containing a currency symbol, used below to explore token attributes.
doc = nlp("Ahmed asked for $ from me.")

In [21]:
# Keep a handle on the first token so its attributes can be inspected.
token1 = doc[0]
token1

Ahmed

In [23]:
# Individual elements of a Doc are Token objects.
type(token1)

spacy.tokens.token.Token

In [22]:
# List the names of all attributes and methods the token exposes.
dir(token1)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [24]:
# is_alpha flag of the first token ("Ahmed").
token1.is_alpha

True

In [25]:
# like_num flag of the first token ("Ahmed").
token1.like_num

False

In [27]:
# .text gives the token's string form; note that token2 holds the token at index 2.
token2 = doc[2]
token2.text

'for'

In [28]:
# Token at index 5 and its string form.
token5 = doc[5]
token5.text

'me'

In [30]:
# is_currency flag of the token at index 5 ("me").
token5.is_currency

False

In [31]:
# Print a one-line summary of selected lexical flags for every token.
for tok in doc:
    flags = (
        "index: ", tok.i,
        "is_alpha:", tok.is_alpha,
        "is_punct:", tok.is_punct,
        "like_num:", tok.like_num,
        "is_currency:", tok.is_currency,
    )
    print(tok, "==>", *flags)

Ahmed ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
asked ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
for ==> index:  2 is_alpha: True is_punct: False like_num: False is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
from ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
me ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
. ==> index:  6 is_alpha: False is_punct: True like_num: False is_currency: False


***Collecting emails from Student information sheet***

In [33]:
# Read the student information sheet as a list of lines.
# An explicit encoding keeps the result independent of the platform's default codec.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of student.txt before running.
with open("C:/Users/user/Desktop/student.txt", encoding="utf-8") as f:
    text = f.readlines()
text

['\n',
 'Dayton high school, 8th grade students information\n',
 '\n',
 ' Name      birth day           email\n',
 ' -------   ----------          ------\n',
 ' Virat     5 June, 1882      virat@kohli.com\n',
 ' Maria     12 April, 2001    maria@sharapova.com\n',
 ' Serena    24 June, 1998     serena@williams.com\n',
 ' Joe        1 May, 1997      joe@root.com']

In [34]:
# Collapse the list of lines into a single string for the tokenizer.
# Guarded so that re-running the cell does not re-join an already-joined string
# (str.join over a str would insert a space between every character).
if not isinstance(text, str):
    text = ' '.join(text)
text



In [37]:
# Gather the text of every token whose like_email flag is set.
doc = nlp(text)
emails = [tok.text for tok in doc if tok.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

***Supporting Urdu Language***

In [41]:
# Build a blank Urdu pipeline. The language code for Urdu is "ur"; "hi" is Hindi,
# so the previous code applied Hindi language data to Urdu text.
nlp = spacy.blank("ur")

# Tokenize an Urdu sentence and show the currency / number-like flags per token.
doc = nlp("کیا آپ مہربانی کر کے مجھے 500 روپے قرض دیں گے؟")
for token in doc:
    print(token, token.is_currency, token.like_num)

کیا False False
آپ False False
مہربانی False False
کر False False
کے False False
مجھے False False
500 False True
روپے False False
قرض False False
دیں False False
گے False False
؟ False False


***Customizing Tokenizer***

In [42]:
from spacy.symbols import ORTH

# Start again from a blank English pipeline and show its default tokenization.
nlp = spacy.blank("en")
order_text = "gimme double cheese extra large healthy pizza"
doc = nlp(order_text)
tokens = [word.text for word in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [43]:
# Register a tokenizer special case that splits "gimme" into "gim" + "me",
# then re-tokenize the same sentence to see the effect.
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [word.text for word in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

***Sentence Tokenization***

In [45]:
# Names of the components currently in the pipeline.
nlp.pipe_names

[]

In [46]:
# Add the sentencizer only if it is not already present, so the cell can be
# re-run without add_pipe raising on a duplicate component name.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
# Display the component, as the bare add_pipe call did.
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x106b1ff77d0>

In [48]:
# Run the pipeline (now including the sentencizer) and print one sentence per line.
paragraph = "Mr. Ahmed visited lahore and there he love the digital museum. The entrance fee was 150 for Pakistani's and 500 for other nationalities"
doc = nlp(paragraph)

for sent in doc.sents:
    print(sent)

Mr. Ahmed visited lahore and there he love the digital museum.
The entrance fee was 150 for Pakistani's and 500 for other nationalities


# Collect all the dataset website URLs from the text using spaCy

In [49]:
# Sample passage containing several dataset URLs; one of them is split across a line break.
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [51]:
# Keep the text of every token whose like_url flag is set.
doc = nlp(text)
url = [tok.text for tok in doc if tok.like_url]
url

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

# Extract all money transactions from the sentence below along with their currency. The output should be: two $, 500 €.

In [53]:
# Show, token by token, which ones are flagged as currency symbols.
payments = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(payments)

for tok in doc:
    print(tok, tok.is_currency)

Tony False
gave False
two False
$ True
to False
Peter False
, False
Bruce False
gave False
500 False
€ True
to False
Steve False


In [54]:
# Print each number-like token that is immediately followed by a currency symbol.
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)
for token in doc:
    # The last token has no successor; checking this first avoids an IndexError
    # from doc[token.i + 1] when the text ends with a number-like token.
    is_last = token.i + 1 >= len(doc)
    if token.like_num and not is_last and doc[token.i + 1].is_currency:
        print(token.text, doc[token.i + 1].text)

two $
500 €
