In [1]:
import nltk
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.tag import brill, brill_trainer 
from nltk.tag import BrillTaggerTrainer, RegexpTagger
from nltk.tag.perceptron import PerceptronTagger
from nltk.tbl.template import Template
from nltk.tag.brill import Pos, Word
from nltk.corpus import brown
from nltk.tag import CRFTagger

http://www.nltk.org/api/nltk.tag.html

In [2]:
def export_to_csv(pos_tagged_text, filename):
    string_pos_text = [ nltk.tag.tuple2str(tup) for tup in pos_tagged_text]
    one_line = ' '.join(string_pos_text)
    wfp = open(filename, 'w')
    wfp.write(one_line)
    wfp.close()
    print("Exported Data to: {}".format(filename))

In [3]:
# Load data
filename = 'data/feedback_cs2012_1.txt'
with open(filename, 'r')as fp:
    all_content = fp.read()
text = nltk.word_tokenize(all_content)

In [4]:
# Default Tagger
pos_tagged_text = nltk.pos_tag(text)
export_to_csv(pos_tagged_text, 'data/default_tagger.csv')

Exported Data to: data/default_tagger.csv


In [5]:
# Prepare Test Data 
test_data = []
test_data.append(pos_tagged_text)

In [6]:
# Prepare Train data
train_data = brown.tagged_sents(categories='news')[:1000]

In [7]:
#PerceptronTagger
tagger2 = PerceptronTagger(load=False)
tagger2.train(train_data)
perceptron_tagged_data = tagger2.tag(text)
accuracy = tagger2.evaluate(test_data)
export_to_csv(tagger2.tag(text), 'data/perception_tagger.csv')
print("Perception Tagger Accuracy: {}".format(accuracy))

Exported Data to: data/perception_tagger.csv
Perception Tagger Accuracy: 0.5556510528577567


In [8]:
# http://www.nltk.org/api/nltk.tag.html?highlight=hmm#module-nltk.tag.brill_trainer
#RegexpTagger
baseline = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'),   # articles
(r'.*able$', 'JJ'),                # adjectives
(r'.*ness$', 'NN'),                # nouns formed from adjectives
(r'.*ly$', 'RB'),                  # adverbs
(r'.*s$', 'NNS'),                  # plural nouns
(r'.*ing$', 'VBG'),                # gerunds
(r'.*ed$', 'VBD'),                 # past tense verbs
(r'.*', 'NN')                      # nouns (default)
])

regex_tagged_data = baseline.tag(text)
baseline.evaluate(test_data) 

export_to_csv(regex_tagged_data, 'data/regrex_tagger.csv')
print("Regrex Tagger Accuracy: {}".format(accuracy))

Exported Data to: data/regrex_tagger.csv
Regrex Tagger Accuracy: 0.5556510528577567


In [9]:
# http://www.nltk.org/api/nltk.tag.html?highlight=hmm#module-nltk.tag.brill_trainer
#BrillTaggerTrainer
templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
tt = BrillTaggerTrainer(baseline, templates, trace=3)
tagger1 = tt.train(train_data, max_rules=10)
brill_tagged_data = tagger1.tag(text)
accuracy = tagger1.evaluate(test_data) 

export_to_csv(tagger1.tag(text), 'data/brill_tagger.csv')
print("Brill Tagger Accuracy: {}".format(accuracy))

TBL train (fast) (seqs: 1000; tokens: 22079; tpls: 2; min score: 2; min acc: None)
Finding initial useful rules...
    Found 4286 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
 613 613   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
 605 605   0  20  | NN->, if Pos:NN@[-1] & Word:,@[0]
 501 501   0  35  | NN->IN if Pos:NN@[-1] & Word:of@[0]
 256 256   0   2  | NN->IN if Pos:NN@[-1] & Word:in@[0]
 221 221   0  10  | NN->CC if Pos:NN@[-1] & Word:and@[0]
 189 189   0 123  | NN->TO if Pos:NN@[-1] & Word:to@[0]
 173 173   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
 172 404 2321115  | NN->IN if Pos:NNS@[-1]
 167 206  39 345  | NN->IN if Pos:

In [10]:
# http://www.nltk.org/api/nltk.tag.html?highlight=hmm#module-nltk.tag.crf
# CRF Tagger
ct = CRFTagger()
ct.train(train_data,'model.crf.tagger')
accuracy = ct.evaluate(test_data)

export_to_csv(ct.tag_sents([text])[0], 'data/crf_tagger.csv')
print("CRF Tagger Accuracy: {}".format(accuracy))

Exported Data to: data/crf_tagger.csv
CRF Tagger Accuracy: 0.5612376450365277
