In [2]:
import os
from urllib.request import urlretrieve
import zipfile
import glob

# Make sure the download directory exists (a no-op when it is already there).
os.makedirs('data', exist_ok=True)

# Download data
# NOTE(review): this is a signed URL carrying an `Expires` query parameter, so it
# presumably stops working after that time and a fresh link must be pasted here.
url ='https://storage.googleapis.com/kaggle-datasets/1014/4361/entity-annotated-corpus.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1567749367&Signature=X%2FhIcHdTAMR%2F1SVt%2F3uiJN4deGEMy82PuQaEuezIVz1AahALWq47CS1m06eSlYClHYIc5SvmLCid6pb%2FzRr%2BTg1E2ogs2oi85EdqpUXVgk9G0boIFMIhZGSfUe%2Bg8eWjjGSGEp%2FKZbxd75myX3BgHInMYnr5IYl%2FOTbk%2BLvEpDbzIApolLprkeFryaX19yiw%2B9r0KfjdraczmSK0UTbXAaNYwYHjvX3CFW6ExYJwzT0zGK0i5PAYcFxp68hwTMrGJN6jiFJqMrqHpO6tR4DyIYd1pn79JwpbpAiE2SukZK1vddD3SNAOVV9VlBBakvcNZLCxUaL3%2BtncYWpmCgtBog%3D%3D'

# Only hit the network when the archive is not already cached locally, so the
# cell can be re-run without downloading again.
zip_path = 'data/kaggle_ner.zip'
if not os.path.exists(zip_path):
    # Download to a temporary name first: an interrupted transfer must not
    # leave a truncated file under the final name that later runs would trust.
    partial_path = zip_path + '.part'
    urlretrieve(url, partial_path)
    os.replace(partial_path, zip_path)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('data/')

# Show what ended up in the data directory.
glob.glob('data/*')


['data/ner.csv', 'data/ner_dataset.csv', 'data/kaggle_ner.zip']

In [6]:
# !pip install pandas
import pandas as pd    
import numpy as np

# Read the extracted corpus CSV into a DataFrame; the encoding is given
# explicitly as latin1 rather than relying on the pandas default.
data = pd.read_csv(
    "data/ner_dataset.csv",
    encoding="latin1",
)

In [15]:
# Preview the first 30 rows of the loaded frame.
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [16]:
# Carry the last seen value forward into missing cells, so every token row gets
# the "Sentence #" label that was only written on the first row of its sentence.
# `DataFrame.ffill()` is the direct equivalent of `fillna(method="ffill")`,
# whose `method` argument is deprecated in recent pandas releases.
data = data.ffill()
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [17]:
# Collapse the "Word" column into its distinct values and count them.
words = list({token for token in data["Word"].values})
len(words)

35178

So the corpus contains 35178 distinct words.

In [18]:
class SentenceGetter(object):
    """Hand out the sentences of a token-level frame one at a time.

    ``data`` must provide the columns "Sentence #", "Word", "POS" and "Tag".
    Sentences are looked up by the label "Sentence: <n>", starting at n = 1.
    """

    def __init__(self, data):
        self.n_sent = 1      # number of the sentence the next call will return
        self.data = data
        self.empty = False   # set to True once a requested sentence has no rows

    def get_next(self):
        """Return ``(words, pos_tags, tags)`` for the next sentence.

        Each element is a plain list taken from the "Word", "POS" and "Tag"
        columns. When no row carries the requested sentence label,
        ``self.empty`` is set and ``(None, None, None)`` is returned.
        """
        label = "Sentence: {}".format(self.n_sent)
        s = self.data[self.data["Sentence #"] == label]
        # A label without rows gives an empty selection, not an exception, so
        # exhaustion has to be detected explicitly instead of via try/except.
        if len(s) == 0:
            self.empty = True
            return None, None, None
        self.n_sent += 1
        return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()

# Wrap the corpus and pull out the first sentence as three parallel lists.
getter = SentenceGetter(data)

sent, pos, tag = getter.get_next()

In [19]:
# Display the words, POS values and tags of the sentence fetched above.
sent, pos, tag

(['Thousands',
  'of',
  'demonstrators',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.'],
 ['NNS',
  'IN',
  'NNS',
  'VBP',
  'VBN',
  'IN',
  'NNP',
  'TO',
  'VB',
  'DT',
  'NN',
  'IN',
  'NNP',
  'CC',
  'VB',
  'DT',
  'NN',
  'IN',
  'JJ',
  'NNS',
  'IN',
  'DT',
  'NN',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O'])

### A first idea: Memorization

The first simple idea, and our baseline, is to just remember the most common named-entity tag for every word and predict that. If we don’t know a word, we just predict ‘O’. The following class does that. I implement it by inheriting from scikit-learn base classes so that it can be used with the built-in cross-validation.

In [21]:
# !pip install scikit-learn
from sklearn.base import BaseEstimator, TransformerMixin


class MemoryTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that replays the most frequent tag seen for each word."""

    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.

        Stores the distinct tags (in first-seen order) in ``self.tags`` and the
        most frequent tag of every word in ``self.memory``. Returns ``self`` so
        the call can be chained, as scikit-learn estimators conventionally do.
        '''
        counts = {}      # word -> {tag: number of occurrences}
        seen_tags = {}   # dict used as an insertion-ordered set of tags
        for x, t in zip(X, y):
            seen_tags.setdefault(t, None)
            per_word = counts.setdefault(x, {})
            per_word[t] = per_word.get(t, 0) + 1
        self.tags = list(seen_tags)
        # On a tie, max() keeps the tag that was recorded first for that word.
        self.memory = {word: max(d, key=d.get) for word, d in counts.items()}
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag from memory. If word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]
    
# Smoke test: fit on the single example sentence and tag that same sentence.
tagger = MemoryTagger()
tagger.fit(sent, tag)
print(tagger.predict(sent))

Collecting sklearn
Collecting scikit-learn (from sklearn)
  Using cached https://files.pythonhosted.org/packages/e9/57/8a9889d49d0d77905af5a7524fb2b468d2ef5fc723684f51f5ca63efed0d/scikit_learn-0.21.3-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl
Collecting scipy>=0.17.0 (from scikit-learn->sklearn)
  Using cached https://files.pythonhosted.org/packages/d5/06/1a696649f4b2e706c509cb9333fdc6331fbe71251cede945f9e1fa13ea34/scipy-1.3.1-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl
Collecting joblib>=0.11 (from scikit-learn->sklearn)
  Using cached https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl
Installing collected packages: scipy, joblib, scikit-learn, sklearn
Successfully installed joblib-0.13.2 scikit-learn-0.21.3 scipy-1.3.1 sklearn-0.0
['O', 'O', 'O', 'O', 'O', 'O', 'B-g

Looks like it works. Now we do a 5-fold cross-validation.

In [23]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Flatten the corpus into two parallel token-level lists.
words = data["Word"].values.tolist()
tags = data["Tag"].values.tolist()

# 5-fold cross-validated predictions of the memory baseline, one per token.
pred = cross_val_predict(MemoryTagger(), words, tags, cv=5)

# Per-tag precision / recall / F1 against the gold tags.
report = classification_report(tags, pred)
print(report)

              precision    recall  f1-score   support

       B-art       0.20      0.05      0.09       402
       B-eve       0.54      0.25      0.34       308
       B-geo       0.78      0.85      0.81     37644
       B-gpe       0.94      0.93      0.94     15870
       B-nat       0.42      0.28      0.33       201
       B-org       0.67      0.49      0.56     20143
       B-per       0.78      0.65      0.71     16990
       B-tim       0.87      0.77      0.82     20333
       I-art       0.04      0.01      0.01       297
       I-eve       0.39      0.12      0.18       253
       I-geo       0.73      0.58      0.65      7414
       I-gpe       0.62      0.45      0.52       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.69      0.53      0.60     16784
       I-per       0.73      0.65      0.69     17251
       I-tim       0.58      0.13      0.21      6528
           O       0.97      0.99      0.98    887908

    accuracy              

### A simple machine learning approach

To do machine learning, we convert the data to a simple feature vector for every word and then use a random forest to classify the words.


In [24]:
from sklearn.ensemble import RandomForestClassifier

def feature_map(word):
    '''Encode one token as a vector of six surface-form features.

    Order of the entries: title-case, all-lower, all-upper, length,
    all-digits, all-letters.
    '''
    is_title = word.istitle()
    is_lower = word.islower()
    is_upper = word.isupper()
    length = len(word)
    is_digit = word.isdigit()
    is_alpha = word.isalpha()
    features = [is_title, is_lower, is_upper, length, is_digit, is_alpha]
    return np.array(features)

# One six-entry feature vector per token, in corpus order.
words = [feature_map(w) for w in data["Word"].values.tolist()]

# A random forest is stochastic; pinning random_state makes the report below
# reproducible from one run of the notebook to the next.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20, random_state=42),
                         X=words, y=tags, cv=5)

report = classification_report(y_pred=pred, y_true=tags)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.79      0.40     37644
       B-gpe       0.26      0.07      0.11     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.97      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.46      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

    accuracy              

Wow, that looks really bad. This is expected, since the features lack a lot of information necessary for the decision. So now we enhance our simple features on the one hand by memory and on the other hand by using context information.

In [25]:
from sklearn.preprocessing import LabelEncoder

class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Turn the rows of a token frame into numeric feature vectors.

    Each token gets its six surface-form features, the encoded memory-tagger
    prediction and encoded POS of the token itself, and the same two values
    for the following and the preceding token.
    """

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()

    def fit(self, X, y=None):
        """Fit the memory tagger and both label encoders.

        X is a frame with "Word", "POS" and "Tag" columns; the tags are read
        from X itself, so ``y`` is accepted only for pipeline compatibility.
        Returns ``self``.
        """
        words = X["Word"].values.tolist()
        self.pos = X["POS"].values.tolist()
        tags = X["Tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        self.pos_encoder.fit(self.pos)
        return self

    def transform(self, X, y=None):
        """Return a list with one 12-entry numpy array per row of X.

        Entry order: istitle, islower, isupper, len, isdigit, isalpha,
        memory tag code, POS code, next memory tag code, previous memory tag
        code, next POS code, previous POS code. POS values not seen during
        ``fit`` are encoded as -1.
        """
        pos = X["POS"].values.tolist()
        words = X["Word"].values.tolist()
        if not words:
            return []

        # LabelEncoder codes are positions in `classes_`; a dict lookup replaces
        # both the linear scan of the full training POS list and a one-element
        # `transform` call per token.
        pos_codes = {p: code for code, p in enumerate(self.pos_encoder.classes_)}

        def pos_default(p):
            return pos_codes.get(p, -1)

        # Tag every word from memory and encode all predictions in one call.
        memory_codes = self.tag_encoder.transform(self.memory_tagger.predict(words))
        outside_code = self.tag_encoder.transform(['O'])[0]
        boundary_pos = pos_default(".")

        last = len(words) - 1
        out = []
        for i, (w, p) in enumerate(zip(words, pos)):
            # Following token; past the end of the data use 'O' and "." instead.
            if i < last:
                wp = memory_codes[i + 1]
                posp = pos_default(pos[i + 1])
            else:
                wp = outside_code
                posp = boundary_pos
            # Preceding token; at the start of the data, or right after a ".",
            # use 'O' and "." instead.
            if i > 0 and words[i - 1] != ".":
                wm = memory_codes[i - 1]
                posm = pos_default(pos[i - 1])
            else:
                wm = outside_code
                posm = boundary_pos
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                                 memory_codes[i],
                                 pos_default(p), wp, wm, posp, posm]))
        return out

In [26]:
from sklearn.pipeline import Pipeline

# Cross-validate the full pipeline: feature extraction followed by a random
# forest. The whole frame is passed as X because the transformer reads the
# "Word", "POS" and "Tag" columns itself.
pred = cross_val_predict(
    Pipeline([
        ("feature_map", FeatureTransformer()),
        ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3)),
    ]),
    X=data,
    y=tags,
    cv=5,
)

report = classification_report(y_true=tags, y_pred=pred)
print(report)

exception calling callback for <Future at 0x160fdcdd0 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/Volumes/Data/ethan/.env_tf20_p37/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/Volumes/Data/ethan/.env_tf20_p37/lib/python3.7/site-packages/joblib/parallel.py", line 309, in __call__
    self.parallel.dispatch_next()
  File "/Volumes/Data/ethan/.env_tf20_p37/lib/python3.7/site-packages/joblib/parallel.py", line 731, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/Volumes/Data/ethan/.env_tf20_p37/lib/python3.7/site-packages/joblib/parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Volumes/Data/ethan/.env_tf20_p37/lib/python3.7/site-packages/joblib/parallel.py", line 716, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Volumes/Data/ethan/.env_tf20_p37/lib/python3.7/site-package

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {EXIT(1), EXIT(1)}