# NLTK - Corpora, Stemming, & Lemmatization (with POS)
* NLTK Text Corpora - a large collection of ready-made bodies of text (each one is a corpus) that can be used in NLP tasks

## Use a Corpus, and Stem the Document

In [10]:
import nltk

# With no arguments, download() opens NLTK's interactive downloader, where the
# corpora used below (e.g. Gutenberg, WordNet) can be selected and installed.
nltk.download()

showing info http://nltk.org/nltk_data/ 


True

In [4]:
# List the file ids of the texts bundled in the Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

### The Text above can be used for Stemming

In [5]:
# Load "Moby Dick" from the Gutenberg corpus as a sequence of tokens.
text_file = nltk.corpus.gutenberg.words('melville-moby_dick.txt')

# Copy the tokens into a plain list. Despite the variable name, each element
# is a single word or punctuation token, not a line of text.
my_lines_list = list(text_file)

my_lines_list

['[',
 'Moby',
 'Dick',
 'by',
 'Herman',
 'Melville',
 '1851',
 ']',
 'ETYMOLOGY',
 '.',
 '(',
 'Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar',
 'School',
 ')',
 'The',
 'pale',
 'Usher',
 '--',
 'threadbare',
 'in',
 'coat',
 ',',
 'heart',
 ',',
 'body',
 ',',
 'and',
 'brain',
 ';',
 'I',
 'see',
 'him',
 'now',
 '.',
 'He',
 'was',
 'ever',
 'dusting',
 'his',
 'old',
 'lexicons',
 'and',
 'grammars',
 ',',
 'with',
 'a',
 'queer',
 'handkerchief',
 ',',
 'mockingly',
 'embellished',
 'with',
 'all',
 'the',
 'gay',
 'flags',
 'of',
 'all',
 'the',
 'known',
 'nations',
 'of',
 'the',
 'world',
 '.',
 'He',
 'loved',
 'to',
 'dust',
 'his',
 'old',
 'grammars',
 ';',
 'it',
 'somehow',
 'mildly',
 'reminded',
 'him',
 'of',
 'his',
 'mortality',
 '.',
 '"',
 'While',
 'you',
 'take',
 'in',
 'hand',
 'to',
 'school',
 'others',
 ',',
 'and',
 'to',
 'teach',
 'them',
 'by',
 'what',
 'name',
 'a',
 'whale',
 '-',
 'fish',
 'is',
 'to',
 'be',
 

### Non-English Stemmers are also available
* SnowballStemmer supports many languages

In [6]:
from nltk.stem.snowball import SnowballStemmer

# Create a Snowball stemmer for English.
englishStemmer=SnowballStemmer("english")
# By default a stopword is stemmed like any other word ('having' -> 'have').
englishStemmer.stem("having")

'have'

* pass `ignore_stopwords=True` to leave stopwords unstemmed - the stemmer then returns 'having' rather than 'have'

In [8]:
# ignore_stopwords=True tells the stemmer to leave stopwords unstemmed.
englishStemmer2=SnowballStemmer("english", ignore_stopwords=True)

# The stopword 'having' now comes back unchanged.
englishStemmer2.stem("having")

'having'

* repeat in Spanish

In [9]:
# Same setup for Spanish: the language is selected by name.
spanishStemmer=SnowballStemmer("spanish", ignore_stopwords=True)

# The result is a lower-cased stem, not a dictionary word ('Corriendo' -> 'corr').
spanishStemmer.stem("Corriendo")

'corr'

## Lemmatization
* reduces words to roots (Lemmas) - that DO belong to the language (canonical form)

#### Use the WordNet Lemmatizer
* Download the WordNet corpus from the NLTK downloader

In [11]:
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"

# Build a new list instead of calling remove() on the list being iterated:
# removing an item during iteration shifts the rest left, so the token right
# after each removed one is never checked (e.g. the "!" in "?!" would survive).
# Membership is tested against a set of the individual characters so only
# exact single-character tokens match; `in` on the string itself is a
# substring test and would also match tokens such as "?:".
punctuation_marks = set(punctuations)
sentence_words = [
    word
    for word in nltk.word_tokenize(sentence)
    if word not in punctuation_marks
]

print("{0:20}{1:20}".format("Word", "Lemma"))

# lemmatize() is called without a pos argument here, so each word is looked up
# with the lemmatizer's default part of speech (noun).
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
He                  He                  
was                 wa                  
running             running             
and                 and                 
eating              eating              
at                  at                  
same                same                
time                time                
He                  He                  
has                 ha                  
bad                 bad                 
habit               habit               
of                  of                  
swimming            swimming            
after               after               
playing             playing             
long                long                
hours               hour                
in                  in                  
the                 the                 
Sun                 Sun                 


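* most roots above have Not been produced (e.g. 'running' is unchanged and 'was' became 'wa') - because Context is Needed: without a part of speech, the lemmatizer treats every word as a noun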

### Provide Context, in which to Lemmatize - the Parts-of-Speech (POS)
* using the parameter, pos - in wordnet_lemmatizer.lemmatize

In [12]:
# Lemmatize every token as a verb (pos="v") and print it next to the original
# word in two 20-character columns.
row_format = "{0:20}{1:20}"
for token in sentence_words:
    verb_lemma = wordnet_lemmatizer.lemmatize(token, pos="v")
    print(row_format.format(token, verb_lemma))

He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 


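* the Roots (Lemmas) of the verbs are now provided ('was' -> 'be', 'running' -> 'run') - but because every word is treated as a verb, the noun 'hours' is no longer reduced to 'hour'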