<a href="https://colab.research.google.com/github/GaborVxxx/ml_notes/blob/main/VectorModelsAndTextProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stemming and Lemmatization

* Stemming: It cuts the word simply to try to get the root word
* Lemmatization: Its a dict to map words to its base word (more complex and sofistication)

In [48]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

In [49]:
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

In [50]:
poter_stem = porter.stem('running')
print(poter_stem)
lancaster_stem = lancaster.stem('bosses')
print(lancaster_stem)
snowball_stem = snowball.stem('ran') # stemming is not get the right base word in this case
print(snowball_stem)

run
boss
ran


In [51]:
sentence = "Lemmatization is more sophisticated than stemming".split()
for word in sentence:
    print(porter.stem(word), end=" ") # see the outcome

lemmat is more sophist than stem 

# Looking at Lemmatization

In [52]:
from nltk.stem import WordNetLemmatizer

In [53]:
nltk.download('wordnet') # import / download the dictionary to map the words

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
from nltk.corpus import wordnet

In [55]:
lemmatizer = WordNetLemmatizer()

In [56]:
lemmatizer.lemmatize("walking")

'walking'

In [57]:
lemmatizer.lemmatize("walking", pos=wordnet.VERB)

'walk'

In [58]:
lemmatizer.lemmatize("going")

'going'

In [59]:
lemmatizer.lemmatize("going", pos=wordnet.VERB)

'go'

In [60]:
lemmatizer.lemmatize("better", pos=wordnet.ADJ)

'good'

# Mapping the right pos

In [69]:
# Define a mapping from NLTK POS tags to WordNet POS
def get_wordnet_pos(treebank_tag):
    print(f"Tag: {treebank_tag}")
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if tag is not found

In [71]:
nltk.download('averaged_perceptron_tagger') # get the package

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [74]:
sentence = "Donald Trump has a devoted following".split()

In [75]:
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('Donald', 'NNP'),
 ('Trump', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('devoted', 'VBN'),
 ('following', 'NN')]

In [76]:
for word, tag in words_and_tags:
    lemma = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    print(lemma, end=" ")


Tag: NNP
Donald Tag: NNP
Trump Tag: VBZ
have Tag: DT
a Tag: VBN
devote Tag: NN
following 