In [0]:
import spacy
sp = spacy.load('en_core_web_sm') # starting = 'en' means English, last 'sm' means small
# In the script above we use the load function from the spacy library to load the core English language model. The model is stored in the sp variable.

In [3]:
# Word tokenizer:-
doc = sp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc: ## SpaCy automatically breaks your document into tokens when a document is created using the model.
  print(token)    ## print(token.text)

## doc = "Apple is looking at buying U.K. startup for $1 billion"
## for token in doc: 
##   print(token)
print('#######')
# Sentence tokenizer:-
corpus = sp('hey anurag. how are you. can we meet, again')
for sentence in corpus.sents:
  print(sentence)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
#######
hey anurag.
how are you.
can we meet, again


In [0]:
#############################################################

In [8]:
## Stop Words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
len(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


179

In [9]:
len(set(stopwords.words('english')))

179

In [10]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

326

In [13]:
# Remove stopwords bt using NLTK and spacy:-
sent = 'The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.'
print(sent)
nltk.download('punkt')
print('By NLTK:-')
print(' '.join([word for word in nltk.word_tokenize(sent) if word not in stopwords.words('english')]))

print('By SPACY:-')
sent = sp('The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.')
print(' '.join([str(word.text) for word in sent if word.text not in spacy_stopwords]))

The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
By NLTK:-
The university founded 1885 Leland Jane Stanford memory child , Leland Stanford Jr. , died typhoid fever age 15 previous year . Stanford former Governor California U.S . Senator ; made fortune railroad tycoon . The school admitted first students October 1 , 1891 , [ 2 ] [ 3 ] coeducational non-denominational institution .
By SPACY:-
The university founded 1885 Leland Jane Stanford memory child , Leland Stanford Jr. , died typhoid fever age 15 previous year . Stanford Governor Cali

In [14]:
for word in sp('Hey are you coming Anurag'):
  if word.text not in spacy_stopwords:
    print(word)

Hey
coming
Anurag


In [15]:
'are' not in spacy_stopwords

False

In [0]:
###################################################################

In [17]:
# POS tagging:-
doc[1].lemma_, doc[1].pos_

('be', 'AUX')

In [18]:
for word in doc:
  print(word, word.pos_)

Apple PROPN
is AUX
looking VERB
at ADP
buying VERB
U.K. PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM


In [21]:
# Explain tag:-
## tag_ lists the fine-grained part of speech.
## pos_ lists the coarse-grained part of speech.

for word in doc:
  print(word,',', word.tag_, ',', word.pos_, ',', spacy.explain(word.tag_))

Apple , NNP , PROPN , noun, proper singular
is , VBZ , AUX , verb, 3rd person singular present
looking , VBG , VERB , verb, gerund or present participle
at , IN , ADP , conjunction, subordinating or preposition
buying , VBG , VERB , verb, gerund or present participle
U.K. , NNP , PROPN , noun, proper singular
startup , NN , NOUN , noun, singular or mass
for , IN , ADP , conjunction, subordinating or preposition
$ , $ , SYM , symbol, currency
1 , CD , NUM , cardinal number
billion , CD , NUM , cardinal number


In [22]:
# Explain POS:-
for word in doc:
  print(word,',', word.tag_, ',', word.pos_, ',', spacy.explain(word.pos_))

Apple , NNP , PROPN , proper noun
is , VBZ , AUX , auxiliary
looking , VBG , VERB , verb
at , IN , ADP , adposition
buying , VBG , VERB , verb
U.K. , NNP , PROPN , proper noun
startup , NN , NOUN , noun
for , IN , ADP , adposition
$ , $ , SYM , symbol
1 , CD , NUM , numeral
billion , CD , NUM , numeral


In [0]:
####################################################################

In [0]:
# NER:-
doc = sp("Apple is looking at buying U.K. startup for $1 billion")
for entity in doc.ents:
  print(entity)  ## These are the entity

for entity in doc.ents:
  print(entity,':-', entity.label_)

Apple
U.K.
$1 billion
Apple :- ORG
U.K. :- GPE
$1 billion :- MONEY


In [0]:
sent = sp('The university was founded in 1885 by Leland and Jane Stanford in memory of their only child, Leland Stanford Jr., who had died of typhoid fever at age 15 the previous year. Stanford was a former Governor of California and U.S. Senator; he made his fortune as a railroad tycoon. The school admitted its first students on October 1, 1891,[2][3] as a coeducational and non-denominational institution.')
for entity in sent.ents:
  print((entity, entity.label_))

(1885, 'DATE')
(Leland, 'ORG')
(Jane Stanford, 'PERSON')
(Leland Stanford Jr., 'PERSON')
(age 15 the previous year, 'DATE')
(Stanford, 'ORG')
(California, 'GPE')
(U.S., 'GPE')
(first, 'ORDINAL')
(October 1, 1891,[2][3, 'DATE')


In [0]:
# https://stackabuse.com/python-for-nlp-tokenization-stemming-and-lemmatization-with-spacy-library/
# https://www.datacamp.com/community/blog/spacy-cheatsheet
# https://realpython.com/natural-language-processing-spacy-python/