# **INTRO TO SPACY**

In [36]:
%pip install spacy



In [37]:
import spacy #faster than nltk

In [38]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m520.3 kB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [39]:
nlp = spacy.load('en_core_web_lg')

In [40]:
text = nlp('GFG is looking for Data Science Interns')
text

GFG is looking for Data Science Interns

## Tokenization

In [41]:
for token in text:
  print(token,type(token),token.text)

GFG <class 'spacy.tokens.token.Token'> GFG
is <class 'spacy.tokens.token.Token'> is
looking <class 'spacy.tokens.token.Token'> looking
for <class 'spacy.tokens.token.Token'> for
Data <class 'spacy.tokens.token.Token'> Data
Science <class 'spacy.tokens.token.Token'> Science
Interns <class 'spacy.tokens.token.Token'> Interns


In [42]:
text = nlp('The cost of Iphone in U.K is 699$') # notice 699$
for token in text:
  print(token,type(token),token.text)

The <class 'spacy.tokens.token.Token'> The
cost <class 'spacy.tokens.token.Token'> cost
of <class 'spacy.tokens.token.Token'> of
Iphone <class 'spacy.tokens.token.Token'> Iphone
in <class 'spacy.tokens.token.Token'> in
U.K <class 'spacy.tokens.token.Token'> U.K
is <class 'spacy.tokens.token.Token'> is
699 <class 'spacy.tokens.token.Token'> 699
$ <class 'spacy.tokens.token.Token'> $


## POS (Parts Of Speech)

In [43]:
for token in text:
  print(token.text,token.pos,token.pos_)

The 90 DET
cost 92 NOUN
of 85 ADP
Iphone 96 PROPN
in 85 ADP
U.K 96 PROPN
is 87 AUX
699 93 NUM
$ 99 SYM


## Sentence Tokenization

In [44]:
txt = nlp('This is sentence one.How are you? This is second sentence. This is last one. Lets study now')
for sent in txt.sents:
  print(sent)

This is sentence one.
How are you?
This is second sentence.
This is last one.
Lets study now


Difference with nltk

In [45]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize,word_tokenize
for sent in sent_tokenize('This is sentence one.How are you? This is second sentence. This is last one. Lets study now'): # it couldn't differentiate one.How
  print(sent)

This is sentence one.How are you?
This is second sentence.
This is last one.
Lets study now


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
word_tokenize('This is sentence one.How are you? This is second sentence. This is last one. Lets study now')

['This',
 'is',
 'sentence',
 'one.How',
 'are',
 'you',
 '?',
 'This',
 'is',
 'second',
 'sentence',
 '.',
 'This',
 'is',
 'last',
 'one',
 '.',
 'Lets',
 'study',
 'now']

# Stop words

In [47]:
nlp = spacy.load('en_core_web_sm')

In [48]:
s='''
Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions. Data is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs.
It is a process, not an event. It is the process of using data to understand too many different things, to understand the world. Let Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data.
It is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story. So use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution.
                 We can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the data is unstructured or structured.
The definition and the name came up in the 1980s and 1990s when some professors, IT Professionals, scientists were looking into the statistics curriculum, and they thought it would be better to call it data science and then later on data analytics derived.
'''

In [49]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [50]:
len(nlp.Defaults.stop_words)

326

## checking a word is stop word or not

In [51]:
print(nlp.vocab['the'].is_stop,nlp.vocab['like'].is_stop)

True False


## Adding custom words in stop_words

In [66]:
nlp.Defaults.stop_words.add('i.e')
len(nlp.Defaults.stop_words)

327

In [67]:
print(nlp.vocab['i.e'].is_stop)
nlp.vocab['i.e'].is_stop = True
print(nlp.vocab['i.e'].is_stop)

False
True


## Removing custom words from stop_words

In [68]:
nlp.Defaults.stop_words.remove('i.e')
nlp.vocab['i.e'].is_stop = False
nlp.vocab['i.e'].is_stop

False

## Removing stop words from corpus

In [69]:
s

'Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions. Data is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs. It is a process, not an event. It is the process of using data to understand too many different things, to understand the world. Let Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data. It is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story. So use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution.We can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the data is unstructured or

In [70]:
s = s.replace('\n','')
s = s.replace('  ','')
s = s.strip()
s

'Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions. Data is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs. It is a process, not an event. It is the process of using data to understand too many different things, to understand the world. Let Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data. It is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story. So use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution.We can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the data is unstructured or

In [71]:
corp = nlp(s)
corp

Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions. Data is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs. It is a process, not an event. It is the process of using data to understand too many different things, to understand the world. Let Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data. It is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story. So use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution.We can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the data is unstructured or 

In [74]:
stop_word = []
for token in corp:
  if token.is_stop:
    stop_word.append(token.text)
print(stop_word)

['is', 'the', 'of', 'is', 'a', 'of', 'it', '’s', 'the', 'of', 'is', 'has', 'and', 'we', 'to', 'them', 'if', 'we', '’re', 'to', 'on', 'them', 'and', 'some', 'It', 'is', 'a', 'not', 'an', 'It', 'is', 'the', 'of', 'using', 'to', 'too', 'many', 'to', 'the', 'when', 'you', 'have', 'a', 'or', 'of', 'a', 'and', 'you', 'to', 'that', 'or', 'with', 'your', 'It', 'is', 'the', 'of', 'the', 'and', 'that', 'are', 'or', 'behind', 'It', '’s', 'when', 'you', 'into', 'a', 'So', 'to', 'And', 'with', 'these', 'you', 'can', 'make', 'for', 'a', 'or', 'an', 'We', 'can', 'also', 'as', 'a', 'that', 'is', 'about', 'and', 'to', 'of', 'various', 'and', 'from', 'various', 'whether', 'the', 'is', 'or', 'The', 'and', 'the', 'name', 'up', 'in', 'the', 'and', 'when', 'some', 'IT', 'were', 'into', 'the', 'and', 'they', 'it', 'would', 'be', 'to', 'call', 'it', 'and', 'then', 'on']


In [75]:
print(len(stop_word),len(set(stop_word)))

125 55


In [76]:
stop_word = set(stop_word)

In [79]:
' '.join([token.text for token in corp if not token.is_stop])

'Data science study data . Like biological sciences study biology , physical sciences , study physical reactions . Data real , data real properties , need study going work . Data Science involves data signs . process , event . process data understand different things , understand world . Let Suppose model proposed explanation problem , try validate proposed explanation model data . skill unfolding insights trends hiding ( abstract ) data . translate data story . use storytelling generate insight . insights , strategic choices company institution . define data science field processes systems extract data forms resources data unstructured structured . definition came 1980s 1990s professors , Professionals , scientists looking statistics curriculum , thought better data science later data analytics derived .'

# **WordNet (Synonyms and Antonyms)**

In [82]:
# WordNet is the lexical database i.e. dictionary for the English language, specifically designed for natural language processing.
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [83]:
wordnet.synsets('Book')

[Synset('book.n.01'),
 Synset('book.n.02'),
 Synset('record.n.05'),
 Synset('script.n.01'),
 Synset('ledger.n.01'),
 Synset('book.n.06'),
 Synset('book.n.07'),
 Synset('koran.n.01'),
 Synset('bible.n.01'),
 Synset('book.n.10'),
 Synset('book.n.11'),
 Synset('book.v.01'),
 Synset('reserve.v.04'),
 Synset('book.v.03'),
 Synset('book.v.04')]

In [87]:
syn = wordnet.synsets('Book')
print(syn[2])
print(syn[2].definition())

Synset('record.n.05')
a compilation of the known facts regarding something or someone


## Printing synonymns

In [92]:
synonyms = []
for syn in wordnet.synsets('Happy'):
  for lemma in syn.lemmas():
    synonyms.append(lemma.name())
print(synonyms)

['happy', 'felicitous', 'happy', 'glad', 'happy', 'happy', 'well-chosen']


## Printing Antonyms

In [96]:
antonyms = []

for ant in wordnet.synsets('Good'):
  for lemma in ant.lemmas():
    if lemma.antonyms():
      antonyms.append(lemma.antonyms()[0].name())
print(antonyms)

['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']
