##### Exercise 1

- Extend the rule set of RegexpTagger to handle closed-class words (similar to punctuation & DET):

    - prepositions (ADP)
        - in, among, of, above, etc. (add as many as you want)
    - particles (PRT)
        - to, well, up, now, not (add as many as you want)
    - pronouns (PRON)
        - I, you, he, she, it, they, we (add as many as you want)
    - conjunctions (CONJ)
        - and, or, but, while, when, since (add as many as you want)

- Evaluate the accuracy of the extended tagger on the test set

In [None]:
# Rule set for the RegexpTagger: (pattern, tag) pairs. The patterns are tried
# in the order listed and the first one that matches a token decides its tag,
# so the specific closed-class word lists are placed before the generic
# suffix rules, and the catch-all default stays last.
aug_rules = [
    (r'^-?[0-9]+(\.[0-9]+)?$', 'NUM'),                   # cardinal numbers (literal decimal point)
    (r'^(The|the|A|a|An|an)$', 'DET'),                   # articles
    # Closed-class words: whole-token matches; (?i) also accepts capitalised,
    # sentence-initial forms such as "In", "He" or "But".
    (r'(?i)^(in|among|of|above)$', 'ADP'),               # prepositions
    (r'(?i)^(to|well|up|now|not)$', 'PRT'),              # particles
    (r'(?i)^(I|you|he|she|it|they|we)$', 'PRON'),        # pronouns
    (r'(?i)^(and|or|but|while|when|since)$', 'CONJ'),    # conjunctions
    (r'.*able$', 'ADJ'),                                 # adjectives
    (r'.*ness$', 'NOUN'),                                # nouns formed from adjectives
    (r'.*ly$', 'ADV'),                                   # adverbs
    (r'.*s$', 'NOUN'),                                   # plural nouns
    (r'.*ing$', 'VERB'),                                 # gerunds
    (r'.*ed$', 'VERB'),                                  # past tense verbs
    (r'[\.,!\?:;\'"]', '.'),                             # punctuation (extension)
    (r'.*', 'NOUN'),                                     # nouns (default)
]
aug_re_tagger = RegexpTagger(aug_rules)

# Show the tagger output for a single sentence of the test split.
# NOTE(review): assumes train_indx is the train/test boundary, i.e. sentences
# from train_indx onwards are the held-out ones -- confirm against the cell
# that defines it. The previous slice [:train_indx] printed a training sentence.
for s in treebank.sents()[train_indx:]:
    print("INPUT: {}".format(s))
    print("TAG  : {}".format(aug_re_tagger.tag(s)))
    break

# Token-level accuracy on the held-out tagged sentences.
accuracy = aug_re_tagger.accuracy(tst_data)
# Alternative on older NLTK releases: aug_re_tagger.evaluate(tst_data)
print("Accuracy: {:6.4f}".format(accuracy))

### Exercise 2 
Evaluate the spaCy NER model on the conll2002 corpus and compare the results with the NLTK-trained model.

To do this you have to:

- Load the spaCy model for the Spanish language (`es_core_news_sm`), or try one of the larger models
- Retrieve spaCy's predictions in the IOB scheme
- Evaluate the model with the conll script

In [None]:
from spacy.tokenizer import Tokenizer

def iobify(w):
    """Return the IOB-style entity label of a token, e.g. ``"B-PER"`` or ``"O"``.

    Args:
        w: An object exposing ``ent_iob_`` (IOB code) and ``ent_type_``
            (entity type) string attributes, such as a spaCy token.

    Returns:
        ``"<iob>-<type>"`` for tokens inside an entity, ``"O"`` otherwise.
    """
    iob = w.ent_iob_
    # An empty IOB code means no entity tag was set on the token; treat it as
    # outside any entity instead of producing the malformed label "-".
    if not iob or iob == "O":
        return "O"
    return iob + "-" + w.ent_type_

import spacy  # needed for spacy.load below; harmless if already imported earlier

nlp = spacy.load("es_core_news_sm")
# We overwrite the spaCy tokenizer with a bare one that splits on whitespace
# only, so the tokens of a space-joined sentence map one-to-one onto the
# corpus tokens. This is a workaround rather than an ideal solution.
nlp.tokenizer = Tokenizer(nlp.vocab)

# Getting references (try to replace testa with testb). The corpus is read
# once and the predictions below are made on exactly these token sequences,
# which keeps refs and hyps aligned sentence by sentence.
refs = [[(text, iob) for text, pos, iob in sent] for sent in conll2002.iob_sents('esp.testa')]

# Use the spaCy model for predicting the Named Entities; nlp.pipe processes
# the sentences as a batched stream and yields the docs in input order.
texts = (" ".join(text for text, _ in sent) for sent in refs)
hyps = [[(w.text, iobify(w)) for w in doc] for doc in nlp.pipe(texts)]

results = evaluate(refs, hyps)

# The total F1 is a micro-F1
pd_tbl = pd.DataFrame.from_dict(results, orient='index')
pd_tbl.round(decimals=3)