In [1]:
import re
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
with open('entities-bm.txt','r') as fopen:
    texts= list(filter(None, fopen.read().split('\n')))
len(texts)

12194

In [3]:
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0}
tag_idx = 1
char_idx = 1

In [4]:
def process_word(word, lower=True):
    if lower:
        word = word.lower()
    else:
        if word.isupper():
            word = word.title()
    word = re.sub('[^A-Za-z0-9\- ]+', '', word)
    if word.isdigit():
        word = 'NUM'
    return word

def read_file(f):
    global tag_idx, char_idx
    words, tags, X, Y = [], [], [], []
    for line in f:
        line = line.strip()
        if (len(line) == 0 or line.startswith("-DOCSTART-")):
            continue
        else:
            ls = line.split(' ')
            if len(ls) > 1:
                word, tag = ls[0],ls[-1]
            else:
                word = ls[0]
                tag = 'O'
            word = process_word(word)
            if len(word) < 1:
                continue
            char_ids = []
            for c in word:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
                char_ids.append(char2idx[c])
            words += [word]
            tags += [tag]
            X.append(char_ids)
            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            Y.append(tag2idx[tag])
                        
    return words, tags, X, np.array(Y)

In [5]:
words, _, _, Y = read_file(texts)

In [6]:
bow = CountVectorizer(ngram_range=(1, 1), analyzer='char').fit(words)
tfidf = TfidfVectorizer(ngram_range=(1, 1), analyzer='char').fit(words)

In [7]:
X = np.hstack([bow.transform(words).todense(),tfidf.transform(words).todense()])

In [8]:
bayes = MultinomialNB().fit(X, Y)
predicted = bayes.predict(X)
print('accuracy validation set: ', np.mean(predicted == Y))

# print scores
print(classification_report(Y, predicted, target_names=tag2idx.keys()))

accuracy validation set:  0.7554972103708566
             precision    recall  f1-score   support

       NORP       0.00      0.00      0.00       405
        FAC       0.79      0.95      0.86      9275
        ART       0.52      0.18      0.26      1190
        ORG       0.10      0.10      0.10       130
       TIME       0.33      0.34      0.33       304
        PAD       0.25      0.08      0.13       106
        LAW       0.33      0.09      0.15       106
        PRN       0.31      0.04      0.07       514
        DOC       0.33      0.05      0.08        64
        LOC       0.00      0.00      0.00        93
      EVENT       0.00      0.00      0.00         1

avg / total       0.68      0.76      0.70     12188



  .format(len(labels), len(target_names))


In [9]:
results = []
for i in range(1,5):
    results.append(CountVectorizer(ngram_range=(1, i), analyzer='char').fit_transform(words).todense())
    
X = np.hstack(results)
X.shape

(12188, 12025)

In [10]:
bayes = MultinomialNB().fit(X, Y)
predicted = bayes.predict(X)
print('accuracy validation set: ', np.mean(predicted == Y))

# print scores
print(classification_report(Y, predicted, target_names=tag2idx.keys()))

accuracy validation set:  0.8108795536593371
             precision    recall  f1-score   support

       NORP       0.49      0.54      0.51       405
        FAC       0.91      0.89      0.90      9275
        ART       0.59      0.66      0.63      1190
        ORG       0.71      0.44      0.54       130
       TIME       0.44      0.57      0.50       304
        PAD       0.52      0.28      0.37       106
        LAW       0.61      0.18      0.28       106
        PRN       0.43      0.57      0.49       514
        DOC       0.69      0.17      0.28        64
        LOC       0.76      0.30      0.43        93
      EVENT       0.00      0.00      0.00         1

avg / total       0.82      0.81      0.81     12188



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [11]:
results = []
for i in range(1,5):
    results.append(TfidfVectorizer(ngram_range=(1, i), analyzer='char').fit_transform(words).todense())
    
X = np.hstack(results)
X.shape

(12188, 12025)

In [12]:
bayes = MultinomialNB().fit(X, Y)
predicted = bayes.predict(X)
print('accuracy validation set: ', np.mean(predicted == Y))

# print scores
print(classification_report(Y, predicted, target_names=tag2idx.keys()))

accuracy validation set:  0.8188382015096817
             precision    recall  f1-score   support

       NORP       0.68      0.29      0.41       405
        FAC       0.85      0.97      0.91      9275
        ART       0.64      0.52      0.57      1190
        ORG       0.95      0.15      0.25       130
       TIME       0.42      0.32      0.36       304
        PAD       0.38      0.11      0.17       106
        LAW       0.55      0.10      0.17       106
        PRN       0.64      0.28      0.39       514
        DOC       0.67      0.03      0.06        64
        LOC       0.90      0.10      0.17        93
      EVENT       0.00      0.00      0.00         1

avg / total       0.80      0.82      0.79     12188



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
