### Named Entity Recognition and Classification with Scikit-Learn

https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('dataframe_ADE.csv', encoding = "ISO-8859-1")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence_ID,Token_ID,BIO,POS,Token,ID
0,0,Sentence: 1,1,O,JJ,Gemcitabine-related,0
1,1,Sentence: 1,2,B_AdverseReaction,NN,radiation,1
2,2,Sentence: 1,3,I_AdverseReaction,NN,recall,2
3,3,Sentence: 1,4,O,IN,in,3
4,4,Sentence: 1,5,O,DT,a,4


In [4]:
df.isnull().sum()

Unnamed: 0     0
Sentence_ID    0
Token_ID       0
BIO            0
POS            0
Token          0
ID             0
dtype: int64

In [5]:
df['Sentence_ID'].nunique(), df.Token.nunique(), df.BIO.nunique()

(4271, 9211, 3)

In [6]:
df.groupby('BIO').size().reset_index(name='counts')

Unnamed: 0,BIO,counts
0,B_AdverseReaction,5749
1,I_AdverseReaction,6529
2,O,73832


In [7]:
X = df.drop('BIO', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.BIO.values


classes = np.unique(y)
classes = classes.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
X_train.shape, y_train.shape

((57693, 13527), (57693,))

In [9]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


-- Epoch 1-- Epoch 1-- Epoch 1


Norm: 7027.99, NNZs: 5199, Bias: 71.000000, T: 57693, Avg. loss: 336213515.283709Norm: 59866.79, NNZs: 5242, Bias: 42.000000, T: 57693, Avg. loss: 310400211.228624
Total training time: 1.73 seconds.

Total training time: 1.72 seconds.
Norm: 66149.12, NNZs: 6908, Bias: -122.000000, T: 57693, Avg. loss: 599241534.155842
Total training time: 1.93 seconds.


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s finished


Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=5, n_iter=None, n_iter_no_change=5,
      n_jobs=-1, penalty=None, random_state=0, shuffle=True, tol=None,
      validation_fraction=0.1, verbose=10, warm_start=False)

Because tag “O” (outside) is the most common tag and it will make our results look much better than they actual are. So we remove tag “O” when we evaluate classification metrics.

In [16]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B_AdverseReaction', 'I_AdverseReaction']

In [11]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

                   precision    recall  f1-score   support

B_AdverseReaction       0.00      0.00      0.00      1857
I_AdverseReaction       0.08      1.00      0.14      2202

        micro avg       0.08      0.54      0.14      4059
        macro avg       0.04      0.50      0.07      4059
     weighted avg       0.04      0.54      0.08      4059



  'precision', 'predicted', average, warn_for)


In [8]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [9]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Token'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['BIO'].values.tolist())]
        self.grouped = self.data.groupby('Sentence_ID').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [10]:
getter = SentenceGetter(df)
sentences = getter.sentences

In [11]:
sentences[0]

[('Gemcitabine-related', 'JJ', 'O'),
 ('radiation', 'NN', 'B_AdverseReaction'),
 ('recall', 'NN', 'I_AdverseReaction'),
 ('in', 'IN', 'O'),
 ('a', 'DT', 'O'),
 ('patient', 'NN', 'O'),
 ('with', 'IN', 'O'),
 ('pancreatic', 'JJ', 'O'),
 ('cancer', 'NN', 'O'),
 ('.', '.', 'O')]

Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to sklearn-crfsuite format — each sentence should be converted to a list of dicts. The following code were taken from sklearn-crfsuites official site.

In [12]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, label in sent]

Split train and test sets

In [13]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Train a CRF model

In [14]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

Evaluation

In [17]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

                   precision    recall  f1-score   support

B_AdverseReaction       0.77      0.73      0.75      1139
I_AdverseReaction       0.79      0.73      0.76      1250

        micro avg       0.78      0.73      0.76      2389
        macro avg       0.78      0.73      0.76      2389
     weighted avg       0.78      0.73      0.76      2389



What our classifier learned?

In [22]:
def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(4))
print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-5:])

Top likely transitions:
O      -> O       2.260939
I_AdverseReaction -> I_AdverseReaction 1.529851
B_AdverseReaction -> I_AdverseReaction 1.017486
O      -> B_AdverseReaction 0.142104

Top unlikely transitions:
I_AdverseReaction -> O       -1.182713
B_AdverseReaction -> O       -1.702729
I_AdverseReaction -> B_AdverseReaction -4.136578
B_AdverseReaction -> B_AdverseReaction -5.086790
O      -> I_AdverseReaction -9.109015


Interpretation: It is very likely that the beginning of a geographical entity (B-geo) will be followed by a token inside geographical entity (I-geo), but transitions to inside of an organization name (I-org) from tokens with other labels are penalized hugely.

Check the state features

In [23]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.122251 B_AdverseReaction word.lower():eruption
4.989512 O        BOS
4.693180 B_AdverseReaction word.lower():teratogenic
4.550998 B_AdverseReaction -1:word.lower():insulin-induced
4.441654 B_AdverseReaction word.lower():vomiting
4.257753 B_AdverseReaction word.lower():delirium
4.172843 B_AdverseReaction +1:word.lower():formation
4.006873 B_AdverseReaction word.lower():agitated
3.894396 B_AdverseReaction word.lower():asystole
3.886085 B_AdverseReaction word.lower():hepatotoxicity
3.842604 B_AdverseReaction BOS
3.823347 B_AdverseReaction word.lower():tics
3.763621 B_AdverseReaction word.lower():nephrotoxicity
3.741356 B_AdverseReaction word.lower():confusion
3.541060 O        -1:word.lower():treat
3.457861 B_AdverseReaction word.lower():discomfort
3.389945 B_AdverseReaction word.lower():tremor
3.385782 B_AdverseReaction word.lower():jaundice
3.380567 B_AdverseReaction word.lower():cardiotoxicity
3.336620 O        word.lower():leukemia
3.287390 B_AdverseReaction word.lower

3.370614 B-per word.lower():president The model learns that token "president" is likely to be at the beginning of a person name.

In [24]:
import eli5
eli5.show_weights(crf, top=10)