## NLP - Machine learning in taggers

#### Import NLTK and the Spanish CESS corpus

In [2]:
import nltk 
from nltk.corpus import cess_esp

# Load every tagged sentence of the Spanish CESS corpus.
# Later cells index `sents` by position to build the training/test split.
sents = cess_esp.tagged_sents()

#### Make a training and test dataset

In [4]:
# Deterministic 90 % / 10 % split:
#   - sentences at positions 0, 10, 20, ... are held out for testing
#   - every other sentence is used for training
training, test = [], []
for position, tagged_sent in enumerate(sents):
    # A position that is a multiple of 10 goes to the test set.
    bucket = test if position % 10 == 0 else training
    bucket.append(tagged_sent)

#### Import the four types of morphological analyzers (taggers)

In [6]:
# UnigramTagger -> learns, for each word, its tag statistics in the CESS corpus (it assigns the word's most frequent tag)
# BigramTagger -> learns each word's tag statistics given the tag of the previous word
# TrigramTagger -> learns each word's tag statistics given the tags of the two previous words
# HiddenMarkovModelTagger -> the most complete model of the four
from nltk import UnigramTagger, BigramTagger, TrigramTagger
from nltk.tag.hmm import HiddenMarkovModelTagger

#### Training the taggers

In [8]:
# Baseline tagger: trained first because the n-gram taggers below use it as their backoff.
unigram_tagger = UnigramTagger(train=training)

# When a context-aware tagger cannot tag a word itself, it hands the word
# over to its backoff tagger; here both of them fall back to unigram_tagger.
# NOTE(review): the trigram tagger backs off straight to the unigram tagger rather
# than through bigram_tagger. Chaining trigram -> bigram -> unigram is worth
# evaluating, but it would change the scores recorded further down.
bigram_tagger = BigramTagger(train=training, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(train=training, backoff=unigram_tagger)

# The HMM tagger is built through its `train` class method rather than the constructor.
hmm_tagger = HiddenMarkovModelTagger.train(training)


#### Evaluate the taggers

In [10]:
# Score every trained tagger on the held-out test sentences and report the
# result as a percentage, in the order: unigram, bigram, trigram, HMM.
# NOTE(review): recent NLTK releases may flag `evaluate` as deprecated in favour
# of `accuracy` - confirm against the installed NLTK version before switching.
for message, model in (
    ('Success with unigrams:', unigram_tagger),
    ('Success with bigrams:', bigram_tagger),
    ('Success with trigrams:', trigram_tagger),
    ('Success with HMMs:', hmm_tagger),
):
    print(message, model.evaluate(test) * 100)

Success with unigrams: 87.65970871234029
Success with bigrams: 89.42636311057363
Success with trigrams: 89.01624691098375
Success with HMMs: 89.88905831011094


#### Words with None tag
If a tagger cannot tag a word, either because the word never appears in the training corpus or because it never appears in the context being examined, and no backoff tagger can tag it either, the word is tagged with `None`. Let's see an example:

In [12]:
# "chuchetes" is not in the CESS corpus, so no tagger saw it during training.
sentence = "Los perros son buenos chuchetes"

# The sentence is Spanish, so ask for the Spanish tokenizer model explicitly
# instead of relying on word_tokenize's English default.
tokens = nltk.word_tokenize(sentence, language='spanish')
tagged = trigram_tagger.tag(tokens)

# Each token is printed as a (word, tag) pair; the unknown word is paired with None.
print (tagged)

[('Los', 'da0mp0'), ('perros', 'ncmp000'), ('son', 'vsip3p0'), ('buenos', 'aq0mp0'), ('chuchetes', None)]
