[Reference](https://towardsdatascience.com/introduction-to-nlp-part-2-difference-between-lemmatisation-and-stemming-3789be1c55bc)

In [1]:
import nltk
# Download the 'wordnet' corpus package into the local nltk_data directory;
# presumably this is the data WordNetLemmatizer looks lemmas up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# Import packages
import pandas as pd
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

# Instantiate stemmers and lemmatiser once at notebook level; normalise_text
# reads these three names as globals rather than taking them as arguments.
porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatiser = WordNetLemmatizer()

# Create function that normalises text using all three techniques
def normalise_text(words, pos='v'):
    """Stem and lemmatise each word, returning the results side by side.

    Parameters
    ----------
    words : iterable of str
        Words to normalise. They become the index of the result, in the
        order given. One-shot iterables (e.g. generators) are accepted.
    pos : str, default 'v'
        Part-of-speech tag forwarded to ``lemmatiser.lemmatize``.

    Returns
    -------
    pandas.DataFrame
        One row per word with the columns 'Porter', 'Lancaster' and
        'Lemmatiser' holding the output of the respective normaliser.
    """
    # Materialise once: the words are needed both as the index and as the
    # input of three passes, which would exhaust a generator.
    words = list(words)
    return pd.DataFrame(
        {
            'Porter': [porter.stem(word) for word in words],
            'Lancaster': [lancaster.stem(word) for word in words],
            'Lemmatiser': [lemmatiser.lemmatize(word, pos=pos) for word in words],
        },
        index=words,
        columns=['Porter', 'Lancaster', 'Lemmatiser'],
        # Keep plain object columns, as the cell-by-cell filled frame had.
        dtype=object,
    )

In [3]:
# Plural and derived nouns, lemmatised with the noun tag.
normalise_text(
    [
        'apples', 'pears', 'tasks', 'children', 'earrings',
        'dictionary', 'marriage', 'connections', 'universe', 'university',
    ],
    pos='n',
)

Unnamed: 0,Porter,Lancaster,Lemmatiser
apples,appl,appl,apple
pears,pear,pear,pear
tasks,task,task,task
children,children,childr,child
earrings,ear,ear,earring
dictionary,dictionari,dict,dictionary
marriage,marriag,marry,marriage
connections,connect,connect,connection
universe,univers,univers,universe
university,univers,univers,university


In [4]:
# Singular nouns that all end in 'e', lemmatised with the noun tag.
normalise_text(
    [
        'pie', 'globe', 'house', 'knee', 'angle',
        'acetone', 'time', 'brownie', 'climate', 'independence',
    ],
    pos='n',
)

Unnamed: 0,Porter,Lancaster,Lemmatiser
pie,pie,pie,pie
globe,globe,glob,globe
house,hous,hous,house
knee,knee,kne,knee
angle,angl,angl,angle
acetone,aceton,aceton,acetone
time,time,tim,time
brownie,browni,browny,brownie
climate,climat,clim,climate
independence,independ,independ,independence


In [5]:
# Inflected verb forms, lemmatised with the verb tag.
normalise_text(
    [
        'wrote', 'thinking', 'remembered', 'relies', 'ate',
        'gone', 'won', 'ran', 'swimming', 'mistreated',
    ],
    pos='v',
)

Unnamed: 0,Porter,Lancaster,Lemmatiser
wrote,wrote,wrot,write
thinking,think,think,think
remembered,rememb,rememb,remember
relies,reli,rely,rely
ate,ate,at,eat
gone,gone,gon,go
won,won,won,win
ran,ran,ran,run
swimming,swim,swim,swim
mistreated,mistreat,mist,mistreat


In [18]:
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')
from nltk.tokenize import RegexpTokenizer

# Import data: one (tag, raw text) pair per corpus file. File ids have the
# form '<tag>/<filename>'; only the tag part is kept.
reviews = [
    (fileid.split('/')[0], movie_reviews.raw(fileid))
    for fileid in movie_reviews.fileids()
]
sample = pd.DataFrame(reviews, columns=['target', 'document'])

# Prepare one giant string by joining every document with a single space
sample_string = " ".join(sample['document'].values)

# Tokenise data: each token is a maximal run of word characters (\w+)
tokeniser = RegexpTokenizer(r'\w+')
tokens = tokeniser.tokenize(sample_string)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [19]:
%%timeit
# Times lemmatising every entry of `tokens` with the verb tag ('v').
# The lemmatiser is constructed inside the timed body, so its creation is
# part of the measurement.
lemmatiser = WordNetLemmatizer()
[lemmatiser.lemmatize(token, 'v') for token in tokens]

1 loop, best of 3: 4.73 s per loop


In [11]:
%%timeit 
# Times Porter-stemming every entry of `tokens`.
# The stemmer is constructed inside the timed body, so its creation is
# part of the measurement.
porter = PorterStemmer()
[porter.stem(token) for token in tokens]

1 loop, best of 3: 20.4 s per loop


In [12]:
%%timeit 
# Times Lancaster-stemming every entry of `tokens`.
# The stemmer is constructed inside the timed body, so its creation is
# part of the measurement.
lancaster = LancasterStemmer()
[lancaster.stem(token) for token in tokens]

1 loop, best of 3: 15.6 s per loop


# Part-of-speech tag 

In [13]:
lemmatiser = WordNetLemmatizer()
# Lemmatise each word once as a verb ('v') and once as a noun ('n') to show
# that the result depends on the pos tag. A blank line separates the words.
for position, word in enumerate(('remembered', 'universities')):
    if position:
        print()
    for pos in ('v', 'n'):
        print(f"Lemmatising '{word}' with pos='{pos}' results in: {lemmatiser.lemmatize(word, pos)}")

Lemmatising 'remembered' with pos='v' results in: remember
Lemmatising 'remembered' with pos='n' results in: remembered

Lemmatising 'universities' with pos='v' results in: universities
Lemmatising 'universities' with pos='n' results in: university


In [14]:
# Same comparison with capitalised input, once as a verb ('v') and once as a
# noun ('n'). In the recorded output the words come back unchanged.
for position, word in enumerate(('Remembered', 'Universities')):
    if position:
        print()
    for pos in ('v', 'n'):
        print(f"Lemmatising '{word}' with pos='{pos}' results in: {lemmatiser.lemmatize(word, pos)}")

Lemmatising 'Remembered' with pos='v' results in: Remembered
Lemmatising 'Remembered' with pos='n' results in: Remembered

Lemmatising 'Universities' with pos='v' results in: Universities
Lemmatising 'Universities' with pos='n' results in: Universities
