In [1]:
import re
import numpy as np
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier

In [2]:
with open('entities-bm.txt','r') as fopen:
    texts= list(filter(None, fopen.read().split('\n')))
len(texts)

12194

In [3]:
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0}
tag_idx = 1
char_idx = 1

In [4]:
def process_word(word, lower=True):
    if lower:
        word = word.lower()
    else:
        if word.isupper():
            word = word.title()
    word = re.sub('[^A-Za-z0-9\- ]+', '', word)
    if word.isdigit():
        word = 'NUM'
    return word

def read_file(f):
    global tag_idx, char_idx
    words, tags, X, Y = [], [], [], []
    for line in f:
        line = line.strip()
        if (len(line) == 0 or line.startswith("-DOCSTART-")):
            continue
        else:
            ls = line.split(' ')
            if len(ls) > 1:
                word, tag = ls[0],ls[-1]
            else:
                word = ls[0]
                tag = 'O'
            word = process_word(word)
            if len(word) < 1:
                continue
            char_ids = []
            for c in word:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
                char_ids.append(char2idx[c])
            words += [word]
            tags += [tag]
            X.append(char_ids)
            if tag not in tag2idx:
                tag2idx[tag] = tag_idx
                tag_idx += 1
            Y.append(tag2idx[tag])
                        
    return words, tags, X, np.array(Y)

In [5]:
words, _, _, Y = read_file(texts)

In [6]:
bow = CountVectorizer(ngram_range=(1, 1), analyzer='char').fit(words)
tfidf = TfidfVectorizer(ngram_range=(1, 1), analyzer='char').fit(words)

In [7]:
X = np.hstack([bow.transform(words).todense(),tfidf.transform(words).todense()])

In [8]:
mod_huber = SGDClassifier(loss = 'modified_huber', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(X, Y)
predicted = mod_huber.predict(X)
print('accuracy validation set: ', np.mean(predicted == Y))

# print scores
print(classification_report(Y, predicted, target_names=tag2idx.keys()))



accuracy validation set:  0.7716606498194946
             precision    recall  f1-score   support

        FAC       0.00      0.00      0.00       405
        PRN       0.78      0.99      0.87      9275
        ART       0.56      0.13      0.21      1190
        DOC       0.00      0.00      0.00       130
       NORP       0.00      0.00      0.00       304
          O       0.75      0.03      0.05       106
        PAD       0.89      0.08      0.14       106
        LAW       0.73      0.04      0.08       514
      EVENT       0.00      0.00      0.00        64
       TIME       0.00      0.00      0.00        93
        ORG       0.00      0.00      0.00         1

avg / total       0.69      0.77      0.69     12188



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [10]:
results = []
for i in range(1,5):
    results.append(CountVectorizer(ngram_range=(1, i), analyzer='char').fit_transform(words).todense())
    
X = np.hstack(results)
X.shape

(12188, 12025)

In [11]:
mod_huber = SGDClassifier(loss = 'modified_huber', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(X, Y)
predicted = mod_huber.predict(X)
print('accuracy validation set: ', np.mean(predicted == Y))

# print scores
print(classification_report(Y, predicted, target_names=tag2idx.keys()))



accuracy validation set:  0.8959632425336397
             precision    recall  f1-score   support

        FAC       0.76      0.74      0.75       405
        PRN       0.93      0.96      0.95      9275
        ART       0.85      0.76      0.81      1190
        DOC       0.95      0.80      0.87       130
       NORP       0.52      0.64      0.57       304
          O       0.97      0.54      0.69       106
        PAD       0.71      0.39      0.50       106
        LAW       0.65      0.61      0.63       514
      EVENT       0.85      0.53      0.65        64
       TIME       0.89      0.54      0.67        93
        ORG       0.00      0.00      0.00         1

avg / total       0.90      0.90      0.89     12188



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)


In [12]:
results = []
for i in range(1,5):
    results.append(TfidfVectorizer(ngram_range=(1, i), analyzer='char').fit_transform(words).todense())
    
X = np.hstack(results)
X.shape

(12188, 12025)

In [13]:
mod_huber = SGDClassifier(loss = 'modified_huber', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(X, Y)
predicted = mod_huber.predict(X)
print('accuracy validation set: ', np.mean(predicted == Y))

# print scores
print(classification_report(Y, predicted, target_names=tag2idx.keys()))



accuracy validation set:  0.8840662947161142
             precision    recall  f1-score   support

        FAC       0.74      0.61      0.67       405
        PRN       0.89      0.99      0.94      9275
        ART       0.86      0.72      0.78      1190
        DOC       0.96      0.65      0.78       130
       NORP       0.76      0.26      0.38       304
          O       0.96      0.41      0.57       106
        PAD       0.89      0.24      0.37       106
        LAW       0.78      0.46      0.57       514
      EVENT       0.88      0.36      0.51        64
       TIME       0.88      0.47      0.62        93
        ORG       0.00      0.00      0.00         1

avg / total       0.88      0.88      0.87     12188



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
