# Lemmatization Sample

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer

In [7]:
# Module-level lemmatizer instance; lemmatize() further down reads this global.
lemmatizer = WordNetLemmatizer()

# The pos argument is the part-of-speech hint passed to the lemmatizer
# ("a" and 'v' are used below; the output underneath shows the results).
print(lemmatizer.lemmatize("better", pos="a")) # pos -> Part of Speech parameter
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))  # no pos hint: the lemmatizer's default is used
print(lemmatizer.lemmatize("run",'v'))

good
best
run
run


In [89]:
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
# mapping picked up from https://github.com/pararthshah/qa-memnn/blob/master/nltk_utils.py

def is_noun(tag):
    """Return True when ``tag`` is one of the noun tags NN, NNS, NNP, NNPS."""
    return tag in ('NN', 'NNS', 'NNP', 'NNPS')

def is_verb(tag):
    """Return True when ``tag`` is one of VB, VBD, VBG, VBN, VBP, VBZ."""
    return tag in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

def is_adverb(tag):
    """Return True when ``tag`` is one of the adverb tags RB, RBR, RBS."""
    return tag in ('RB', 'RBR', 'RBS')

def is_adjective(tag):
    """Return True when ``tag`` is one of the adjective tags JJ, JJR, JJS."""
    return tag in ('JJ', 'JJR', 'JJS')

def penn_to_wn(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The tag is tested against the adjective, noun, adverb and verb tag
    families, in that order. Returns wn.ADJ, wn.NOUN, wn.ADV or wn.VERB
    for the first family that matches, and None when none of them does.
    """
    if is_adjective(tag):
        return wn.ADJ
    if is_noun(tag):
        return wn.NOUN
    if is_adverb(tag):
        return wn.ADV
    if is_verb(tag):
        return wn.VERB
    # Tag belongs to none of the four families.
    return None


def get_pos(tokens):
    """Pair every token with the WordNet part of speech derived from its tag.

    Takes in a list of tokens of length > 0. The tokens are tagged with
    nltk.pos_tag, the tagged list is printed, and each tag is converted
    with penn_to_wn.

    Returns a list of (token, wordnet_pos) tuples in input order, where
    wordnet_pos is whatever penn_to_wn returned for that token's tag.
    """
    tagged = nltk.pos_tag(tokens)
    print(tagged)
    wordnet_tags = [penn_to_wn(penn_tag) for _, penn_tag in tagged]
    return list(zip(tokens, wordnet_tags))

def lemmatize(pos_tagged_tokens):
    """Lemmatize (token, pos) pairs with the module-level ``lemmatizer``.

    Each pair is printed before it is processed. A pos of None is replaced
    by 'n' before the lemmatizer is called.

    Returns the list of lemmatized tokens, in input order.
    """
    results = []
    for token, pos in pos_tagged_tokens:
        print(token, pos)
        # The lemmatizer is never called with None; fall back to 'n'.
        wordnet_pos = 'n' if pos is None else pos
        results.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return results

#     print(lemmatizer.lemmatize("better", pos="a")) # pos -> Part of Speech parameter
# Demo run: tag a small mixed sample (words, a currency symbol, an amount
# and a determiner), lemmatize the tagged tokens and print the result.
pos_tagged_tokens = get_pos(['apples', 'greenish', 'tallest', 'run', 'quickly', '$', '$400', 'the'])
lemmatized_tokens = lemmatize(pos_tagged_tokens)
print(lemmatized_tokens)

# lemmatizer.lemmatize(text, morphy_tag['VB'])

[('apples', 'NNS'), ('greenish', 'JJ'), ('tallest', 'JJS'), ('run', 'NN'), ('quickly', 'RB'), ('$', '$'), ('$400', 'CD'), ('the', 'DT')]
apples n
greenish a
tallest a
run n
quickly r
$ None
$400 None
the None
['apple', 'greenish', 'tall', 'run', 'quickly', '$', '$400', 'the']


In [90]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

def is_noun(tag):
    """Tell whether ``tag`` is a noun tag (NN, NNS, NNP or NNPS)."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Tell whether ``tag`` is a verb tag (VB, VBD, VBG, VBN, VBP or VBZ)."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Tell whether ``tag`` is an adverb tag (RB, RBR or RBS)."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Tell whether ``tag`` is an adjective tag (JJ, JJR or JJS)."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags

def penn_to_wn(tag):
    """Translate a POS tag into a WordNet POS constant, defaulting to noun.

    The tag is tested against the adjective, noun, adverb and verb tag
    families, in that order. Returns wn.ADJ, wn.NOUN, wn.ADV or wn.VERB
    for the first family that matches; any other tag yields wn.NOUN.
    """
    if is_adjective(tag):
        return wn.ADJ
    if is_noun(tag):
        return wn.NOUN
    if is_adverb(tag):
        return wn.ADV
    if is_verb(tag):
        return wn.VERB
    # Unrecognised tags are treated as nouns.
    return wn.NOUN

def memoize1(f):
    """Return a caching wrapper around the one-argument function ``f``.

    The wrapper calls ``f(x)`` the first time it sees a given ``x`` and
    returns the stored result on every later call with an equal argument.
    The argument is used as a dict key, so it must be hashable. The cache
    is unbounded and lives as long as the returned wrapper does.

    The wrapper keeps ``f``'s ``__name__`` and ``__doc__`` (via
    functools.wraps), so the memoized function stays recognisable in
    tracebacks and in help().
    """
    import functools  # local import so this definition is self-contained

    memo = {}

    @functools.wraps(f)
    def helper(x):
        if x not in memo:
            memo[x] = f(x)
        return memo[x]

    return helper

def memoize2(f):
    """Return a caching wrapper around the two-argument function ``f``.

    The wrapper calls ``f(x, y)`` the first time it sees a given pair and
    returns the stored result on every later call with an equal pair. The
    pair ``(x, y)`` is used as a dict key, so both arguments must be
    hashable. The cache is unbounded and lives as long as the returned
    wrapper does.

    The wrapper keeps ``f``'s ``__name__`` and ``__doc__`` (via
    functools.wraps), so the memoized function stays recognisable in
    tracebacks and in help().
    """
    import functools  # local import so this definition is self-contained

    memo = {}

    @functools.wraps(f)
    def helper(x, y):
        key = (x, y)
        if key not in memo:
            memo[key] = f(x, y)
        return memo[key]

    return helper

@memoize1
def stem_word(word):
    """Return the Snowball (English) stem of ``word``.

    Results are cached per word by memoize1, so the stemmer is only
    constructed and run the first time a given word is seen.
    """
    stemmer = nltk.stem.snowball.EnglishStemmer()
    return stemmer.stem(word)

@memoize2
def get_lemma(word, tag):
    """Return the WordNet lemma of ``word`` for the part of speech ``tag``.

    Results are cached per (word, tag) pair by memoize2, so a new
    WordNetLemmatizer is only built the first time a pair is seen.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return wordnet_lemmatizer.lemmatize(word, tag)

def canonicalize_tokens(tokens):
    """Reduce each token to a canonical form: lemmatize it, then stem it.

    The tokens are tagged with nltk.pos_tag. Each tag is converted with
    penn_to_wn, the word is lemmatized with get_lemma under that part of
    speech, and the lemma is stemmed with stem_word.

    Returns the list of canonical tokens, in input order.
    """
    return [
        stem_word(get_lemma(word, penn_to_wn(penn_tag)))
        for word, penn_tag in nltk.pos_tag(tokens)
    ]

In [91]:
# Demo run on the same sample tokens used for the lemmatize() example above;
# the cell's value (shown below) is the list of lemmatized-then-stemmed tokens.
canonicalize_tokens(['apples', 'greenish', 'tallest', 'run', 'quickly', '$', '$400', 'the'])

['appl', 'greenish', 'tall', 'run', 'quick', '$', '$400', 'the']