In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("""Dr. strange ordered samosas, ravioli etc. for his lunch.""")
for sentences in doc.sents:
  print(sentences)

Dr. strange ordered samosas, ravioli etc.
for his lunch.


`spacy.load("en_core_web_sm")` *loads a complete pipeline that can work around english language and that contains tokenizer(sentence and word),lemmatizer,stemmer i.e. complete package........*

In [3]:
nlp1 = spacy.blank('en')
doc1 = nlp1("""Dr. strange ordered samosas, ravioli etc. for his lunch.""")
for sentences in doc1.sents:
  print(sentences)

ValueError: ignored

`spacy.blank('en')` *loads a simple class that can only do word tokenizer and for other purposes you have to explicitly download some other package.*

In [4]:
for token in doc:
  print(token)

Dr.
strange
ordered
samosas
,
ravioli
etc
.
for
his
lunch
.


*see it only has word tokenizer*

In [5]:
text = "Tesla's gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.BMW's gross cost of operating vehicles in FY2021 S1 was $8 billion."

doc2 = nlp(text)
wd = doc2[0] #Accessing the token by index numbers

In [6]:
print(doc2[0])

Tesla


In [7]:
dir(wd) #To see what are the functions possible to perform in the token

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_extension',
 'has_vector',
 'head',
 'i',
 'idx',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'left_edge',
 'lefts',
 'lemma',
 'lemma_',
 'lex_id',
 'like_email',
 'li

In [8]:
doc3 = nlp("There are 2 oranges and three apples")
for token in doc2:
  print(token)

Tesla
's
gross
cost
of
operating
lease
vehicles
in
FY2021
Q1
was
$
4.85
billion
.
BMW
's
gross
cost
of
operating
vehicles
in
FY2021
S1
was
$
8
billion
.


In [9]:
e = doc3[2] #Assigning the token 2 to e
f = doc3[5] #Assigning the token three to f

In [10]:
e.like_num #Checking whether the token e is a number

True

In [11]:
e.is_alpha #Checking whether the token e is a alphabet

False

In [12]:
f.is_alpha #Checking whether the token f is a alphabet

True

In [13]:
f.like_num #Checking whether the token f is a number (since three also denotes the number it returns true in both cases)

True

In [14]:
text = '''
Follow our leader Elon musk on twitter here: https://twitter.com/elonmusk, more information 
on Tesla's products can be found at https://www.tesla.com/. Also here are leading influencers 
for tesla related news,
https://twitter.com/teslarati
https://twitter.com/dummy_tesla
https://twitter.com/dummy_2_tesla
'''

doc4 = nlp(text)
url = []

for token in doc4:
  if token.like_url: #Checking whether the token is a url and appending into the list
    url.append(token.text)

url

['https://twitter.com/elonmusk',
 'https://www.tesla.com/.',
 'https://twitter.com/teslarati',
 'https://twitter.com/dummy_tesla',
 'https://twitter.com/dummy_2_tesla']

Q1) Think stats is a free book to study statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf)
This book has references to many websites from where you can download free datasets. You are an NLP engineer working for some company and you want to collect all dataset websites from this book. To keep exercise simple you are given a paragraph from this book and you want to grab all urls from this paragraph using spacy

In [15]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

nlp = spacy.load("en_core_web_sm")
doc5 = nlp(text)
websites = []
for token in doc5:
  if token.like_url:
    websites.append(token.text)

websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

(2) Extract all money transaction from below sentence along with currency. Output should be,

two $

500 €

In [16]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc6 = nlp(transactions)

for i in range(len(doc6)):
  if doc6[i].is_currency:
    print(doc6[i-1].text , doc6[i].text) 

two $
500 €


In [17]:
email = []

mail = "There are 2 mails you can contact me by one is haarks2019@gmail.com and another one is my personal mail abs@ghi.com"
doc7 = nlp(mail)

for token in doc7:
  if token.like_email:  #token.like_email to check whether the token is a mail
    email.append(token.text)

email

['haarks2019@gmail.com', 'abs@ghi.com']

In [20]:
lem = "flying fly flew playing played studying study studied studies"
doc8 = nlp(lem)

for token in doc8:
  print(token.text,"==>",token.lemma_)  #token.lemma_ to lemmatize the token

flying ==> fly
fly ==> fly
flew ==> fly
playing ==> play
played ==> play
studying ==> study
study ==> study
studied ==> study
studies ==> study


In [21]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc9 = nlp(text)
for ent in doc9.ents:
  print(ent, "||", ent.label_,"||", spacy.explain(ent.label_)) #Named Entity Recognition (NER)

the United Kingdom || GPE || Countries, cities, states
Two || CARDINAL || Numerals that do not fall under another type
the General Social Survey || ORG || Companies, agencies, institutions, etc.
the European Social Survey || ORG || Companies, agencies, institutions, etc.


In [22]:
doc10 = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc10:
  print(token, "||", token.pos_, "||",spacy.explain(token.pos_)) #Part of Speech tagging 

Captain || PROPN || proper noun
america || PROPN || proper noun
ate || VERB || verb
100 || NUM || numeral
$ || SYM || symbol
of || ADP || adposition
samosa || NOUN || noun
. || PUNCT || punctuation
Then || ADV || adverb
he || PRON || pronoun
said || VERB || verb
I || PRON || pronoun
can || VERB || verb
do || AUX || auxiliary
this || DET || determiner
all || DET || determiner
day || NOUN || noun
. || PUNCT || punctuation
