# 1)Getting Started With Spacy Library

In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [5]:
text = nlp('GFG is looking for Data Science Interns')

In [6]:
text

GFG is looking for Data Science Interns

###**1.1 Tokenization**

In [8]:
for token in text:
  print(token)

GFG
is
looking
for
Data
Science
Interns


In [9]:
for token in text:
  print(type(token))

<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>


In [10]:
text = nlp('The cost of Iphone in U.K is 699$')

In [11]:
for token in text:
  print(token.text)

The
cost
of
Iphone
in
U.K
is
699
$


### **1.2 POS (parts Of Speech)**

In [16]:
for token in text:
  print(token.text, token.pos)

The 90
cost 92
of 85
Iphone 96
in 85
U.K 96
is 87
699 93
$ 99


In [18]:
for token in text:
  print(token.text, token.pos_)#pos_ indiactes what each word represent like noun, pronoun,symbol etc

The DET
cost NOUN
of ADP
Iphone PROPN
in ADP
U.K PROPN
is AUX
699 NUM
$ SYM


### **1.3 Sentence Tokenization**

In [19]:
text = nlp('This is sentence one. This is second sentence. This is last one. Lets study now')

In [27]:
for sent in text.sents:
  print(sent)

This is sentence one.
This is second sentence.
This is last one.
Lets study now


In [28]:
#using nltk we will see
import nltk

In [29]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [30]:
sent_tokenize('This is sentence one. This is second sentence. This is last one. Lets study now')

['This is sentence one.',
 'This is second sentence.',
 'This is last one.',
 'Lets study now']

In [31]:
word_tokenize('This is sentence one. This is second sentence. This is last one. Lets study now')

['This',
 'is',
 'sentence',
 'one',
 '.',
 'This',
 'is',
 'second',
 'sentence',
 '.',
 'This',
 'is',
 'last',
 'one',
 '.',
 'Lets',
 'study',
 'now']

# 2)StopWords Removal with Spacy

In [33]:
#A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore,
#both when indexing entries for searching and when retrieving them as the result of a search query.

In [34]:
import spacy

In [35]:
nlp=spacy.load('en_core_web_sm')

### 2.1 Printing all the stopwords

In [36]:
print(nlp.Defaults.stop_words)

{'due', 'just', 'because', 'meanwhile', 'on', 'some', 'anyone', 'seem', 'rather', 'nine', 'many', '‘ve', 'two', 'also', 'in', 're', 'put', 'third', 'bottom', 'may', 'further', 'n’t', 'me', 'them', 'everywhere', 'must', 'nowhere', 'should', 'except', 'am', 'less', 'whatever', 'even', 'you', 'is', 'really', 'or', 'a', 'last', 'quite', 'full', 'this', 'anyhow', 'front', 'per', 'cannot', '‘d', 'move', 'under', 'everyone', 'been', 'nobody', 'thus', 'together', "n't", 'throughout', 'whereupon', 'never', 'though', 'once', 'along', 'one', 'yours', 'although', 'side', 'using', 'what', 'however', 'beside', 'ten', 'why', 'becoming', 'are', 'noone', 'now', 'anything', '‘m', 'eleven', 'too', 'hereupon', 'whither', 'made', 'keep', 'moreover', 'off', 'n‘t', 'into', 'but', 'for', 'amount', 'whereby', 'those', 'ever', 'whenever', 'five', 'three', 'across', 'we', 'anyway', "'s", '’re', 'whoever', 'doing', 'of', 'thereafter', 'during', 'via', 'seems', "'ve", 'become', 'mine', 'as', 'none', 'ca', 'beyond'

In [42]:
len(nlp.Defaults.stop_words)

326

### 2.2 Checking the word is a stopword or not

In [38]:
nlp.vocab['is'].is_stop#to check whether it is stop word or not

True

In [39]:
nlp.vocab['GFG'].is_stop

False

In [40]:
nlp.vocab['data'].is_stop

False

In [41]:
nlp.vocab['the'].is_stop

True

### 2.3 Adding our own stopwords

In [43]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [44]:
nlp.Defaults.stop_words.add('data')

In [46]:
nlp.vocab['data'].is_stop = True

In [48]:
len(nlp.Defaults.stop_words)#before it is 326 now it is 327 ,stop word is added

327

### 2.4 To remove a stopword(Custom Stopword)

In [49]:
nlp.vocab['data'].is_stop

True

In [50]:
nlp.Defaults.stop_words.remove('data')

In [51]:
nlp.vocab['data'].is_stop = False

In [52]:
nlp.vocab['data'].is_stop

False

### 2.5 Removing StopWords from Corpus

In [61]:
text ='''
Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions.
Data is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs.
It is a process, not an event. It is the process of using data to understand too many different things, to understand the world.
Let Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data.
It is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story.
So use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution.
We can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the
data is unstructured or structured. The definition and the name came up in the 1980s and 1990s when some professors, IT Professionals,
scientists were looking into the statistics curriculum, and they thought it would be better to call it data science and then later on data analytics derived.
'''

In [68]:
text.replace('\n','')
text.replace('  ','')
text = text.strip()


In [69]:
text

'Data science is the study of data. Like biological sciences is a study of biology, physical sciences, it’s the study of physical reactions. \nData is real, data has real properties, and we need to study them if we’re going to work on them. Data Science involves data and some signs. \nIt is a process, not an event. It is the process of using data to understand too many different things, to understand the world. \nLet Suppose when you have a model or proposed explanation of a problem, and you try to validate that proposed explanation or model with your data. \nIt is the skill of unfolding the insights and trends that are hiding (or abstract) behind data. It’s when you translate data into a story. \nSo use storytelling to generate insight. And with these insights, you can make strategic choices for a company or an institution. \nWe can also define data science as a field that is about processes and systems to extract data of various forms and from various resources whether the \ndata is 

In [70]:
corp = nlp(text)

In [82]:
stopword = []
for token in corp:
  if token.is_stop:
    stopword.append(token)
print(stopword)

[is, the, of, is, a, of, it, ’s, the, of, is, has, and, we, to, them, if, we, ’re, to, on, them, and, some, It, is, a, not, an, It, is, the, of, using, to, too, many, to, the, when, you, have, a, or, of, a, and, you, to, that, or, with, your, It, is, the, of, the, and, that, are, or, behind, It, ’s, when, you, into, a, So, to, And, with, these, you, can, make, for, a, or, an, We, can, also, as, a, that, is, about, and, to, of, various, and, from, various, whether, the, is, or, The, and, the, name, up, in, the, and, when, some, IT, were, into, the, and, they, it, would, be, to, call, it, and, then, on]


In [83]:
len(stopword)

125

In [85]:
unique_stopwords = set([token.text for token in stopword])
len(unique_stopwords)

55

### 6)Printing the words which are not in stopwords in corpus

In [92]:
notstopwords = []
for token in corp:
  if token.is_stop == False:
    notstopwords.append(token)
len(notstopwords)

133

In [94]:
tokens=[token for token in corp if not token.is_stop]#another way of doing it

In [95]:
len(tokens)

133

# 3)Synonyms and Antonyms

In [97]:
#A synonym is a word or phrase with the same or nearly the same meaning as another word or phrase. The words that are similar in meaning are called synonyms.
#An antonym is a term or phrase that has the opposite meaning to another.

In [98]:
#wordnet = whole dictionary of english words.The definition of words like what it represents and all that present in wordnet

In [101]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [103]:
from nltk.corpus import wordnet

In [107]:
synonym = wordnet.synsets('Book')# all the words we see are synonyms of book

In [108]:
synonym

[Synset('book.n.01'),
 Synset('book.n.02'),
 Synset('record.n.05'),
 Synset('script.n.01'),
 Synset('ledger.n.01'),
 Synset('book.n.06'),
 Synset('book.n.07'),
 Synset('koran.n.01'),
 Synset('bible.n.01'),
 Synset('book.n.10'),
 Synset('book.n.11'),
 Synset('book.v.01'),
 Synset('reserve.v.04'),
 Synset('book.v.03'),
 Synset('book.v.04')]

In [113]:
synonym[5].definition()

'having the normally expected amount'

In [109]:
#now we will see synonym of good
synonym = wordnet.synsets('good')

In [110]:
synonym

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [111]:
#we will take first synonyms and we will identify the defintion of it
synonym[1].definition()

'moral excellence or admirableness'

In [116]:
synonym = wordnet.synsets('Car')

In [118]:
print(synonym[0])
synonym[0].definition()

Synset('car.n.01')


'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

### 3.1 Printing the Synonyms

In [129]:
for syn in wordnet.synsets('Car'):
    print(syn.lemmas())

[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
[Lemma('car.n.02.car'), Lemma('car.n.02.railcar'), Lemma('car.n.02.railway_car'), Lemma('car.n.02.railroad_car')]
[Lemma('car.n.03.car'), Lemma('car.n.03.gondola')]
[Lemma('car.n.04.car'), Lemma('car.n.04.elevator_car')]
[Lemma('cable_car.n.01.cable_car'), Lemma('cable_car.n.01.car')]


In [130]:
for syn in wordnet.synsets('Car'):
  for lemma in syn.lemmas():
    print(lemma.name())


car
auto
automobile
machine
motorcar
car
railcar
railway_car
railroad_car
car
gondola
car
elevator_car
cable_car
car


In [132]:
synonyms = []
for syn in wordnet.synsets('Car'):
  for lemma in syn.lemmas():
    synonyms.append(lemma.name())
print(synonyms)#print all the synonyms of car


['car', 'auto', 'automobile', 'machine', 'motorcar', 'car', 'railcar', 'railway_car', 'railroad_car', 'car', 'gondola', 'car', 'elevator_car', 'cable_car', 'car']


In [137]:
synonyms = []
for syn in wordnet.synsets('Healthy'):
  for lemma in syn.lemmas():
    synonyms.append(lemma.name())
print(synonyms)

['healthy', 'healthy', 'healthy', 'salubrious', 'good_for_you', 'healthy', 'intelligent', 'levelheaded', 'level-headed', 'sound', 'goodly', 'goodish', 'healthy', 'hefty', 'respectable', 'sizable', 'sizeable', 'tidy']


### 3.2 Printing the Antonyms

In [134]:
antonym=[]
for ant in wordnet.synsets('Good'):
    for lemma in ant.lemmas():
        if lemma.antonyms():
            antonym.append(lemma.antonyms()[0].name())

print(antonym)

['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']


In [135]:
antonym=[]
for ant in wordnet.synsets('Car'):
    for lemma in ant.lemmas():
        if lemma.antonyms():
            antonym.append(lemma.antonyms()[0].name())

print(antonym)#we did not have antonym for car that why not printing

[]


In [136]:
antonym=[]
for ant in wordnet.synsets('Healthy'):
    for lemma in ant.lemmas():
        if lemma.antonyms():
            antonym.append(lemma.antonyms()[0].name())

print(antonym)

['unhealthy']
