Count Vectorizer


In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Three short documents used as the toy corpus for the vectorizer cells.
# Spellings are kept exactly as written: the vocabulary shown in the recorded
# outputs is derived from these strings.
corpus1 = [
    "Tiger hunts to eat",
    "wulf and Lion hunt to feed their family",
    "Leopord hunts to eat first and feed his family",
]

In [13]:
# Learn the vocabulary from corpus1 and build the document-term count matrix
# in a single fit_transform call.
vectorizer = CountVectorizer()
c1 = vectorizer.fit_transform(corpus1)

# Name the two things being displayed: the learned feature names (one per
# matrix column) and the dense per-document count matrix.
feature_names = vectorizer.get_feature_names_out()
count_matrix = c1.toarray()

print(f'Corpus used is \n{feature_names} \nresulting as \n{count_matrix}')

Corpus used is 
['and' 'eat' 'family' 'feed' 'first' 'his' 'hunt' 'hunts' 'leopord' 'lion'
 'their' 'tiger' 'to' 'wulf'] 
resulting as 
[[0 1 0 0 0 0 0 1 0 0 0 1 1 0]
 [1 0 1 1 0 0 1 0 0 1 1 0 1 1]
 [1 1 1 1 1 1 0 1 1 0 0 0 1 0]]


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Extract words in order of first appearance.
# Tokenize with CountVectorizer's own analyzer instead of str.split(): the
# vectorizer lowercases and applies its token pattern when counting, so a
# vocabulary built from raw whitespace tokens (e.g. "eat." with punctuation
# attached, or a one-letter word) could contain entries it never produces,
# leaving columns that are always zero.
analyzer = CountVectorizer().build_analyzer()

# dict.fromkeys keeps first-seen order and drops repeats without rescanning a
# list for every token.
ordered_vocab = list(dict.fromkeys(
    token
    for sentence in corpus1
    for token in analyzer(sentence)
))

# Create Vectorizer with a fixed vocabulary: column i counts ordered_vocab[i].
vectorizer = CountVectorizer(vocabulary=ordered_vocab)

# The vocabulary is supplied up front, so transform() is called without fit().
X = vectorizer.transform(corpus1)

print("Ordered Vocabulary:", ordered_vocab)
print("\nVectorized Output:\n", X.toarray())


Ordered Vocabulary: ['tiger', 'hunts', 'to', 'eat', 'wulf', 'and', 'lion', 'hunt', 'feed', 'their', 'family', 'leopord', 'first', 'his']

Vectorized Output:
 [[1 1 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 1 1 1 1 1 1 0 0 0]
 [0 1 1 1 0 1 0 0 1 0 1 1 1 1]]


Stemming

In [26]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

# Words with a mix of suffixes and capitalisation; each stem is printed on its
# own line, in this order.
words_to_stem = [
    'Fighting',
    'Defence',
    'running',
    'missed',
    'dealed',
    'having',
    'Adorable',
    'Best',
    'Was',
]
for raw_word in words_to_stem:
    print(ps.stem(raw_word))

fight
defenc
run
miss
deal
have
ador
best
wa


Lemmatization

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Fetch the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('wordnet')
# Single lemmatizer instance reused by every later cell.
lem = WordNetLemmatizer()
# Bare last expression: its value is shown as the cell output.
lem.lemmatize('mice')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'mouse'

In [3]:
# Same word with and without a part-of-speech hint: with pos=VERB 'going'
# reduces to its base form, without a hint it comes back unchanged.
print(lem.lemmatize('going', pos=wordnet.VERB))
print(lem.lemmatize('going'))
# Capitalised 'Best' is returned as-is; lowercase 'best' with an adjective
# hint also stays 'best' in the recorded output.
print(lem.lemmatize('Best'))
print(lem.lemmatize('best', pos=wordnet.ADJ))
# 'was' needs the verb hint to map to 'be'; without it the trailing 's' is
# stripped, giving 'wa'.
print(lem.lemmatize('was', pos='v'))
# The extra '\n' argument leaves a blank line before the next group.
print(lem.lemmatize('was'),'\n')

# Irregular and regular plurals resolve without any hint.
print(lem.lemmatize('feet'))
print(lem.lemmatize('cars'))

go
going
Best
best
be
wa 

foot
car


In [4]:

# (word, part-of-speech hint) pairs; every call passes an explicit hint.
hinted_examples = [
    ("running", "v"),
    ("better", "a"),
    ("geese", "n"),
    ("was", "v"),
    ("studies", "n"),
]
for example_word, example_pos in hinted_examples:
    print(lem.lemmatize(example_word, pos=example_pos))

run
good
goose
be
study


In [5]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag (including an empty string) falls back
    to wordnet.NOUN.

    Args:
        tag: POS tag as a string, e.g. the second item of the (word, tag)
            pairs produced by nltk.pos_tag in this notebook.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN, wordnet.ADV.
    """
    # Checked in the same order as the original branch chain.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Pronouns, modals, conjunctions, etc. have no entry above: treat as noun.
    return wordnet.NOUN
    

In [6]:
# Tagger model needed by nltk.pos_tag in the cells below.
# NOTE(review): newer NLTK releases reportedly look for the resource
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# Whitespace-tokenise the sample sentence; it has no punctuation, so every
# item in the resulting list is a bare word.
sent = "I will surely fulfill my trusted destiny with my enormous blessings and learning".split()

In [8]:
# Tag every token with its part of speech; the result is a list of
# (word, tag) pairs.
words_and_tags = nltk.pos_tag(sent)
# Bare last expression: displays the pairs as the cell output.
words_and_tags

[('I', 'PRP'),
 ('will', 'MD'),
 ('surely', 'RB'),
 ('fulfill', 'VB'),
 ('my', 'PRP$'),
 ('trusted', 'JJ'),
 ('destiny', 'NN'),
 ('with', 'IN'),
 ('my', 'PRP$'),
 ('enormous', 'JJ'),
 ('blessings', 'NNS'),
 ('and', 'CC'),
 ('learning', 'VBG')]

In [11]:
# POS-aware lemmatization: each word is lemmatized using the WordNet POS
# derived from its tag, and results are printed on one line separated by spaces.
for tagged_word, pos_label in words_and_tags:
    print(lem.lemmatize(tagged_word, pos=get_wordnet_pos(pos_label)), end=' ')
    

I will surely fulfill my trusted destiny with my enormous blessing and learn 

In [12]:
# Baseline for comparison: lemmatize every token with no POS hint at all,
# printing the results on one line separated by spaces.
for plain_word in sent:
    print(lem.lemmatize(plain_word), end=' ')

I will surely fulfill my trusted destiny with my enormous blessing and learning 

In [14]:
# Example sentences for the POS-aware lemmatization loop. Each one ends with a
# period and contains inflected forms (plurals, -ing verbs, forms of "be").
sentences = [
    "The children are playing in the garden.",
    "She is running faster than her friends.",
    "He was reading an interesting book.",
    "They have been watching movies all day.",
    "The cats are chasing the mice around the house."
]

In [34]:
# Lemmatize each example sentence using POS tags and print it next to the
# original.
# enumerate() supplies the 1-based label directly. Looking the sentence up with
# sentences.index() rescans the list on every iteration and always reports the
# first match, so a repeated sentence would be numbered incorrectly.
for sentence_number, one_sent in enumerate(sentences, start=1):
    # Stored under its own name so the `words_and_tags` result computed for
    # `sent` in the earlier cell is not overwritten by this loop.
    sentence_tags = nltk.pos_tag(one_sent.split())

    # NOTE: str.split() leaves punctuation attached to the neighbouring word
    # (e.g. "friends."), and such tokens come back from the lemmatizer
    # unchanged; a punctuation-aware tokenizer would be needed to handle them.
    lemmatized_sentence = ' '.join(
        lem.lemmatize(word, pos=get_wordnet_pos(tag))
        for word, tag in sentence_tags
    )

    print(sentence_number, "Original sentence:", one_sent)
    print("Lemmatized sentence:", lemmatized_sentence, '\n')

    

1 Original sentence: The children are playing in the garden.
Lemmatized sentence: The child be play in the garden. 

2 Original sentence: She is running faster than her friends.
Lemmatized sentence: She be run faster than her friends. 

3 Original sentence: He was reading an interesting book.
Lemmatized sentence: He be read an interesting book. 

4 Original sentence: They have been watching movies all day.
Lemmatized sentence: They have be watch movie all day. 

5 Original sentence: The cats are chasing the mice around the house.
Lemmatized sentence: The cat be chase the mouse around the house. 

