In [1]:
import spacy

In [4]:
sp = spacy.load('en_core_web_sm')

In [12]:
sentence = sp('Manchester United isn\'t looking to sign a forward for $90 million')

In [13]:
for word in sentence:
    print(word.text, word.pos_, word.dep_)

Manchester PROPN compound
United PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
to PART aux
sign VERB xcomp
a DET det
forward NOUN dobj
for ADP prep
$ SYM quantmod
90 NUM compound
million NUM pobj


In [16]:
for i in sentence.sentiment:
    print(i)

TypeError: 'float' object is not iterable

In [20]:
document = sp('Hello from Stackabuse. The site with the best Python Tutorials. What are you looking for?')

In [21]:
for word in document.sents:
    print(word)

Hello from Stackabuse.
The site with the best Python Tutorials.
What are you looking for?


In [25]:
document[4], document[4].is_sent_start

(The, True)

# Tokenization

### tokenization is the process of breaking a document down into words, punctuation marks, numeric digits, etc.

In [27]:
sentence3 = sp(u'"They\'re leaving U.K. for U.S.A."')
print(sentence3)

"They're leaving U.K. for U.S.A."


In [28]:
for word in sentence3:
    print(word.text)

"
They
're
leaving
U.K.
for
U.S.A.
"


In [29]:
sentence4 = sp(u"Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com")
print(sentence4)

Hello, I am non-vegetarian, email me the menu at abc-xyz@gmai.com


In [30]:
for word in sentence4:
    print(word.text)

Hello
,
I
am
non
-
vegetarian
,
email
me
the
menu
at
abc-xyz@gmai.com


In [32]:
len(sentence4)

14

# Detecting Entities


In [33]:
sentence5 = sp(u'Manchester United is looking to sign Harry Kane for $90 million')  

for word in sentence5:
    print(word.text)

Manchester
United
is
looking
to
sign
Harry
Kane
for
$
90
million


In [38]:
for entity in sentence5.ents:
    print(entity.text +" - "+ entity.label_ +" ---- "+ str(spacy.explain(entity.label_)) )

Manchester United - ORG ---- Companies, agencies, institutions, etc.
Harry Kane - PERSON ---- People, including fictional
$90 million - MONEY ---- Monetary values, including unit


# Detecting Nouns

In [40]:
for nouns in sentence5.noun_chunks:
    print(nouns.text)

Manchester United
Harry Kane


# Stemming

#### Stemming refers to reducing a word to its root form. 

In [41]:
# It might be surprising to you but spaCy doesn't contain any 
#function for stemming as it relies on lemmatization only. 
#Therefore, we will use NLTK for stemming.

### Porter Stemmer


In [42]:
import nltk
from nltk.stem.porter import *

In [47]:
stemmer = PorterStemmer()
#stemmer = SnowballStemmer(language='english')


In [44]:
tokens = ['compute', 'computer', 'computed', 'computing']

In [46]:
for token in tokens:
    print(token +"------>"+ stemmer.stem(token))

compute------>comput
computer------>comput
computed------>comput
computing------>comput


### comput is not actually a word.

### This is where lemmatization comes handy. 

### Lemmatization reduces the word to its stem as it appears in the dictionary. 


### The stems returned through lemmatization are actual dictionary words and are semantically complete unlike the words returned by stemmer.

## Lemmatization

In [48]:
sentence7 = sp(u'A letter has been written, asking him to be released')

for word in sentence7:
    print(word.text + '  ===>', word.lemma_)

A  ===> a
letter  ===> letter
has  ===> have
been  ===> be
written  ===> write
,  ===> ,
asking  ===> ask
him  ===> -PRON-
to  ===> to
be  ===> be
released  ===> release


In [50]:
from spacy.lang.en import English

nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

In [54]:
for i in my_doc:
    print(type(i))

<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens.token.Token'>
<class 'spacy.tokens