In [28]:
import nltk

In [29]:
sent = "The curious cat cat jumped jumped gracefully over the fence, exploring the unknown."
sent

'The curious cat cat jumped jumped gracefully over the fence, exploring the unknown.'

# TOKENIZING SENTENCE

In [30]:
from nltk.tokenize import word_tokenize
token_sent = word_tokenize(sent)
print(token_sent)

['The', 'curious', 'cat', 'cat', 'jumped', 'jumped', 'gracefully', 'over', 'the', 'fence', ',', 'exploring', 'the', 'unknown', '.']


# REMOVING PUNCTUATIONS AND STOPWORDS FROM THE TOKENIZED SENTENCE

In [31]:
import string
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
new_sent = [c for c in token_sent if c not in punc]
new_sent

['The',
 'curious',
 'cat',
 'cat',
 'jumped',
 'jumped',
 'gracefully',
 'over',
 'the',
 'fence',
 'exploring',
 'the',
 'unknown']

In [33]:
from nltk.corpus import stopwords

In [34]:
new_sent1 = [c for c in new_sent if c not in stopwords.words("english")]
new_sent1

['The',
 'curious',
 'cat',
 'cat',
 'jumped',
 'jumped',
 'gracefully',
 'fence',
 'exploring',
 'unknown']

# PARTS OF SPEECH TAGGING

In [35]:
nltk.pos_tag(new_sent1)

[('The', 'DT'),
 ('curious', 'JJ'),
 ('cat', 'NN'),
 ('cat', 'NN'),
 ('jumped', 'VBD'),
 ('jumped', 'VBD'),
 ('gracefully', 'RB'),
 ('fence', 'JJ'),
 ('exploring', 'VBG'),
 ('unknown', 'JJ')]

# LEMMATIZATION AND STEMMING 

In [36]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
for w in new_sent1:
    print(ps.stem(w))

the
curiou
cat
cat
jump
jump
grace
fenc
explor
unknown


In [37]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
for w in new_sent1:
    print(wl.lemmatize(w))

The
curious
cat
cat
jumped
jumped
gracefully
fence
exploring
unknown


# TFIDF VECTORIZER

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x=tfidf.fit_transform(new_sent1)
print(x)
print()
print(x.toarray())

  (0, 6)	1.0
  (1, 1)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 5)	1.0
  (5, 5)	1.0
  (6, 4)	1.0
  (7, 3)	1.0
  (8, 2)	1.0
  (9, 7)	1.0

[[0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]


In scikit-learn's TfidfVectorizer, the norm parameter controls the normalization of the TF-IDF matrix. Setting norm="none" disables normalization, meaning that the TF-IDF values will not be normalized.

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(norm=None)
x=tfidf.fit_transform(new_sent1)
print(x)
print()
print(x.toarray())

  (0, 6)	2.7047480922384253
  (1, 1)	2.7047480922384253
  (2, 0)	2.2992829841302607
  (3, 0)	2.2992829841302607
  (4, 5)	2.2992829841302607
  (5, 5)	2.2992829841302607
  (6, 4)	2.7047480922384253
  (7, 3)	2.7047480922384253
  (8, 2)	2.7047480922384253
  (9, 7)	2.7047480922384253

[[0.         0.         0.         0.         0.         0.
  2.70474809 0.        ]
 [0.         2.70474809 0.         0.         0.         0.
  0.         0.        ]
 [2.29928298 0.         0.         0.         0.         0.
  0.         0.        ]
 [2.29928298 0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         2.29928298
  0.         0.        ]
 [0.         0.         0.         0.         0.         2.29928298
  0.         0.        ]
 [0.         0.         0.         0.         2.70474809 0.
  0.         0.        ]
 [0.         0.         0.         2.70474809 0.         0.
  0.         0.        ]
 [0.         0.        

In [48]:
tfidf.inverse_transform(x)


[array(['the'], dtype='<U10'),
 array(['curious'], dtype='<U10'),
 array(['cat'], dtype='<U10'),
 array(['cat'], dtype='<U10'),
 array(['jumped'], dtype='<U10'),
 array(['jumped'], dtype='<U10'),
 array(['gracefully'], dtype='<U10'),
 array(['fence'], dtype='<U10'),
 array(['exploring'], dtype='<U10'),
 array(['unknown'], dtype='<U10')]