In [1]:
import spacy
# Load the `en_core_web_sm` spaCy pipeline; `nlp` is used below to parse the sample text.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Sample two-sentence input analysed throughout the notebook.
text = "I am the happiest person. I have a cat."
# Run the pipeline once; the cells below read tokens, sentences, lemmas and POS tags from `doc`.
doc = nlp(text)

In [3]:
## WORD TOKENIZATION
## Break the text into its individual tokens (words and punctuation marks).
from collections import Counter  # used by the word-frequency cell further down
words = [tok.text for tok in doc]
words

['I', 'am', 'the', 'happiest', 'person', '.', 'I', 'have', 'a', 'cat', '.']

In [4]:
## SENTENCE TOKENIZATION
## Split the document into its sentences; `doc.sents` is collected into a list so the notebook displays every sentence.
list(doc.sents)

[I am the happiest person., I have a cat.]

In [5]:
## STOP WORDS REMOVAL
## Drop stop words (e.g. "is", "the", "a") and punctuation using spaCy's
## per-token `is_stop` / `is_punct` flags, since they carry little information.
words = [token.text for token in doc if not (token.is_stop or token.is_punct)]
words

['happiest', 'person', 'cat']

In [6]:
## LEMMATIZATION
## Reduce each token to its root (dictionary) form, e.g. "happiest" -> "happy".
## Columns printed per token: token, lemma, lowercased text, lowercased + stripped lemma.
for token in doc:
    normalized_lemma = token.lemma_.lower().strip()
    print(token, token.lemma_, token.lower_, normalized_lemma)

I I i
am be am
the the the
happiest happy happiest
person person person
. . .
I I i
have have have
a a a
cat cat cat
. . .


In [7]:
## WORD FREQUENCY
## Count how often each word occurs with `collections.Counter` (imported in the
## word-tokenize cell above). A word's frequency is a rough indicator of how
## important it is in the document.

# Keep only content words: skip stop words and punctuation.
words = [token.text for token in doc if not (token.is_stop or token.is_punct)]
word_freq = Counter(words)
# Show at most the 5 most frequent words.
common_words = word_freq.most_common(5)
for key, value in common_words:
    print(f"{key}: {value}")

happiest: 1
person: 1
cat: 1


In [8]:
## PART-OF-SPEECH TAGS
## Show every token next to its part-of-speech tag (noun, adjective, verb, ...).
for w in doc:
    print(f"{w.text} {w.pos_}")

I PRON
am AUX
the DET
happiest ADJ
person NOUN
. PUNCT
I PRON
have VERB
a DET
cat NOUN
. PUNCT
