# Synonym-based, Rule-based and CRF approaches for NER 

In [1]:
from collections import Counter

import nltk

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import datasets

## 0. Load the data

In [2]:
# Parse the NCBI disease corpus train and test files with the project-local
# `datasets` helper, then print how many parsed entries each split contains.
train_data = datasets.parse_NCBI_disease_corpus('data/NCBItrainset_corpus.txt')
test_data = datasets.parse_NCBI_disease_corpus('data/NCBItestset_corpus.txt')
print (len(train_data), len(test_data))

593 100


In [3]:
# Display the first parsed training entry to inspect its structure.
train_data[0]

('A common human skin tumour is caused by activating mutations in beta-catenin.\nWNT signalling orchestrates a number of developmental programs. In response to this stimulus, cytoplasmic beta-catenin (encoded by CTNNB1) is stabilized, enabling downstream transcriptional activation by members of the LEF/TCF family. One of the target genes for beta-catenin/TCF encodes c-MYC, explaining why constitutive activation of the WNT pathway can lead to cancer, particularly in the colon. Most colon cancers arise from mutations in the gene encoding adenomatous polyposis coli (APC), a protein required for ubiquitin-mediated degradation of beta-catenin, but a small percentage of colon and some other cancers harbour beta-catenin-stabilizing mutations. Recently, we discovered that transgenic mice expressing an activated beta-catenin are predisposed to developing skin tumours resembling pilomatricomas. Given that the skin of these adult mice also exhibits signs of de novo hair-follicle morphogenesis, we

## 1. Word tokenization

[Word tokenization refresher](https://github.com/wangz10/text-classification/blob/master/Main.ipynb)

In [4]:
# Tokenize the text of the first training entry (element 0 of its tuple) with NLTK.
tokens = nltk.word_tokenize(train_data[0][0])
print (tokens)

['A', 'common', 'human', 'skin', 'tumour', 'is', 'caused', 'by', 'activating', 'mutations', 'in', 'beta-catenin', '.', 'WNT', 'signalling', 'orchestrates', 'a', 'number', 'of', 'developmental', 'programs', '.', 'In', 'response', 'to', 'this', 'stimulus', ',', 'cytoplasmic', 'beta-catenin', '(', 'encoded', 'by', 'CTNNB1', ')', 'is', 'stabilized', ',', 'enabling', 'downstream', 'transcriptional', 'activation', 'by', 'members', 'of', 'the', 'LEF/TCF', 'family', '.', 'One', 'of', 'the', 'target', 'genes', 'for', 'beta-catenin/TCF', 'encodes', 'c-MYC', ',', 'explaining', 'why', 'constitutive', 'activation', 'of', 'the', 'WNT', 'pathway', 'can', 'lead', 'to', 'cancer', ',', 'particularly', 'in', 'the', 'colon', '.', 'Most', 'colon', 'cancers', 'arise', 'from', 'mutations', 'in', 'the', 'gene', 'encoding', 'adenomatous', 'polyposis', 'coli', '(', 'APC', ')', ',', 'a', 'protein', 'required', 'for', 'ubiquitin-mediated', 'degradation', 'of', 'beta-catenin', ',', 'but', 'a', 'small', 'percentage

## 2. Part-of-Speech (POS) Tagging

Label each token in a sentence (a sequence of words or tokens) with a part-of-speech tag such as ADJECTIVE, NOUN, PREPOSITION, VERB or ADVERB.

In [5]:
# POS-tag only the first 13 tokens (everything up to the first '.') to keep the
# displayed output short.
pos_tags = nltk.pos_tag(tokens[:13])
pos_tags

[('A', 'DT'),
 ('common', 'JJ'),
 ('human', 'JJ'),
 ('skin', 'FW'),
 ('tumour', 'NN'),
 ('is', 'VBZ'),
 ('caused', 'VBN'),
 ('by', 'IN'),
 ('activating', 'VBG'),
 ('mutations', 'NNS'),
 ('in', 'IN'),
 ('beta-catenin', 'NN'),
 ('.', '.')]

**POS tag list**:

```
CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when
```

In [6]:
# Convert each parsed document into a list of annotated tokens. Downstream
# cells unpack every item as a (token, POS tag, entity label) triple, where the
# label 'IR' marks tokens that are not part of an entity.
train_data_annot = datasets.convert_to_annot_tokens(train_data)
test_data_annot = datasets.convert_to_annot_tokens(test_data)

In [7]:
# Display the annotated tokens of the first training document.
train_data_annot[0]

[('A', 'DT', 'IR'),
 ('common', 'JJ', 'IR'),
 ('human', 'JJ', 'IR'),
 ('skin', 'FW', 'IR'),
 ('tumour', 'NN', 'Modifier'),
 ('is', 'VBZ', 'IR'),
 ('caused', 'VBN', 'IR'),
 ('by', 'IN', 'IR'),
 ('activating', 'VBG', 'IR'),
 ('mutations', 'NNS', 'IR'),
 ('in', 'IN', 'IR'),
 ('beta-catenin', 'NN', 'IR'),
 ('.', '.', 'IR'),
 ('WNT', 'NNP', 'IR'),
 ('signalling', 'VBG', 'IR'),
 ('orchestrates', 'VBZ', 'IR'),
 ('a', 'DT', 'IR'),
 ('number', 'NN', 'IR'),
 ('of', 'IN', 'IR'),
 ('developmental', 'JJ', 'IR'),
 ('programs', 'NNS', 'IR'),
 ('.', '.', 'IR'),
 ('In', 'IN', 'IR'),
 ('response', 'NN', 'IR'),
 ('to', 'TO', 'IR'),
 ('this', 'DT', 'IR'),
 ('stimulus', 'NN', 'IR'),
 (',', ',', 'IR'),
 ('cytoplasmic', 'JJ', 'IR'),
 ('beta-catenin', 'NN', 'IR'),
 ('(', '(', 'IR'),
 ('encoded', 'VBN', 'IR'),
 ('by', 'IN', 'IR'),
 ('CTNNB1', 'NNP', 'IR'),
 (')', ')', 'IR'),
 ('is', 'VBZ', 'IR'),
 ('stabilized', 'VBN', 'IR'),
 (',', ',', 'IR'),
 ('enabling', 'VBG', 'IR'),
 ('downstream', 'JJ', 'IR'),
 ('transc

## 1. Synonym dictionary for NER

Use a dictionary to store every entity token encountered in the training set together with its entity type. At prediction time, look each token up in this dictionary; tokens that are not found are labelled as non-entities (`IR`).

In [8]:
# Collect every (token, entity type) mention in the training set, skipping the
# non-entity label 'IR'. Comprehensions are used instead of a top-level `for`
# loop so the cell does not rebind the notebook-level `tokens` list created in
# the tokenization cell (re-running the POS-tagging cell would otherwise
# operate on the wrong object).
entity_mentions = [
    (token, ent_type)
    for annot_doc in train_data_annot
    for token, _pos, ent_type in annot_doc
    if ent_type != 'IR'
]

# Lookup table: lowercased token -> entity type.
# NOTE: a token that occurs with several entity types keeps the type of its
# LAST occurrence in the training data (later entries overwrite earlier ones).
d_train = {token.lower(): ent_type for token, ent_type in entity_mentions}

# Mention frequencies, keyed by the original-case (token, entity type) pair.
counter = Counter(entity_mentions)

print(len(d_train))

369


In [9]:
# Show the 20 most frequent (token, entity type) pairs seen in training; the
# same token can appear under more than one entity type.
counter.most_common(20)

[(('APC', 'Modifier'), 117),
 (('DMD', 'Modifier'), 73),
 (('DM', 'Modifier'), 68),
 (('DM', 'SpecificDisease'), 57),
 (('tumors', 'DiseaseClass'), 53),
 (('cancer', 'Modifier'), 50),
 (('VHL', 'Modifier'), 49),
 (('PWS', 'SpecificDisease'), 48),
 (('cancer', 'DiseaseClass'), 46),
 (('ALD', 'SpecificDisease'), 46),
 (('ALD', 'Modifier'), 41),
 (('TSD', 'Modifier'), 34),
 (('WAS', 'SpecificDisease'), 34),
 (('tumor', 'Modifier'), 32),
 (('DMD', 'SpecificDisease'), 29),
 (('WAS', 'Modifier'), 29),
 (('aniridia', 'SpecificDisease'), 29),
 (('HD', 'SpecificDisease'), 28),
 (('FAP', 'Modifier'), 27),
 (('PKU', 'Modifier'), 27)]

In [10]:
# Extract the true labels for the test set: one list per document, holding the
# third field (entity label) of every annotated token.
y_test = [[item[2] for item in tokens] for tokens in test_data_annot]

In [11]:
# Make predictions using the synonym dictionary: look each lowercased token up
# in d_train and fall back to the non-entity label 'IR' when it is unknown.
# A nested comprehension is used instead of a top-level `for` loop so the cell
# neither rebinds the notebook-level `tokens` list nor assigns `_`, which
# IPython uses to hold the most recent cell output.
y_test_pred_synonyms = [
    [d_train.get(token.lower(), 'IR') for token, _pos, _label in annot_doc]
    for annot_doc in test_data_annot
]

In [12]:
# Score only the entity classes: d_train never stores 'IR', so `labels`
# leaves the non-entity tag out of the weighted F1.
labels = sorted(set(d_train.values()))
f1_score = metrics.flat_f1_score(y_test, y_test_pred_synonyms, 
                                 average='weighted', labels=labels)
print (f1_score)

0.36940441457101464


In [13]:
# Per-class precision / recall / F1 of the dictionary-lookup predictions,
# restricted to the entity classes in `labels`.
print(metrics.flat_classification_report(
    y_test, y_test_pred_synonyms, labels=labels, digits=4
))

                 precision    recall  f1-score   support

   DiseaseClass     0.2404    0.7213    0.3607        61
       Modifier     0.2148    0.4741    0.2957       251
SpecificDisease     0.5246    0.3887    0.4465       247

    avg / total     0.3545    0.4633    0.3694       559



## 2. Manually extract token features for rule-based and machine-learning-based NER

Features:
- word identity
- word suffix
- word shape 
- POS tag
- features from surrounding words 


In [14]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Each item of ``sent`` must expose the token text at index 0 and its POS
    tag at index 1 (any further fields are ignored).

    The result always contains features of the token itself (lowercased form,
    2- and 3-character suffixes, casing/digit flags, full POS tag and its
    first two characters) plus a constant ``'bias'``. Features of the previous
    and next tokens are added under ``'-1:'`` / ``'+1:'`` prefixes when those
    neighbours exist; otherwise ``'BOS'`` / ``'EOS'`` is set to ``True``.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features describing the current token.
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: previous token, or a beginning-of-sequence marker.
    if not i > 0:
        features['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()
        features['-1:postag'] = prev_tag
        features['-1:postag[:2]'] = prev_tag[:2]

    # Right context: next token, or an end-of-sequence marker.
    if not i < len(sent) - 1:
        features['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()
        features['+1:postag'] = next_tag
        features['+1:postag[:2]'] = next_tag[:2]

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third field) of every (token, postag, label) item."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the token text (first field) of every (token, postag, label) item."""
    return list(token for token, _postag, _label in sent)

In [15]:
# Inspect the features generated for the first token of the first training document.
sent2features(train_data_annot[0])[0]

{'bias': 1.0,
 'word.lower()': 'a',
 'word[-3:]': 'A',
 'word[-2:]': 'A',
 'word.isupper()': True,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'DT',
 'postag[:2]': 'DT',
 'BOS': True,
 '+1:word.lower()': 'common',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'JJ',
 '+1:postag[:2]': 'JJ'}

In [16]:
# Featurize every annotated document and pull out its label sequence:
# X_* holds one list of feature dicts per document, y_* the matching labels.
X_train = list(map(sent2features, train_data_annot))
y_train = list(map(sent2labels, train_data_annot))

X_test = list(map(sent2features, test_data_annot))
y_test = list(map(sent2labels, test_data_annot))

## 3. Rule-based NER

In [17]:
# Define some simple rules: 
def rule1(features):
    '''Label every token whose POS tag starts with 'NN' as a SpecificDisease.

    `features` is a list of per-token feature dicts for one document; each
    dict must contain a 'postag' entry. Returns a list of the same length
    holding 'SpecificDisease' for NN* tokens and 'IR' for all others.
    '''
    return [
        'SpecificDisease' if token_features['postag'].startswith('NN') else 'IR'
        for token_features in features
    ]

def rule2(features):
    '''Label an NN* token as SpecificDisease when the previous tag is DT, JJ or NN.

    `features` is a list of per-token feature dicts for one document. The
    previous token's tag is read from '-1:postag' and must equal one of
    'DT', 'JJ' or 'NN' exactly (tags such as 'NNS' or 'JJR' do not qualify).
    When '-1:postag' is absent the tag defaults to 'NN', so a noun with no
    recorded predecessor is still labelled SpecificDisease. Every other
    token is labelled 'IR'.
    '''
    preceding_tags = ('DT', 'JJ', 'NN')  # determiner, adjective, noun
    preds = []
    for token_features in features:
        is_noun = token_features['postag'].startswith('NN')
        if is_noun and token_features.get('-1:postag', 'NN') in preceding_tags:
            preds.append('SpecificDisease')
        else:
            preds.append('IR')
    return preds

In [18]:
# Apply rule1 to each test document's feature list and report per-class scores.
# rule1 only ever predicts 'SpecificDisease' or 'IR', so the other classes score 0.
y_pred_rule1 = [rule1(x_test) for x_test in X_test]
print(metrics.flat_classification_report(
    y_test, y_pred_rule1, labels=labels, digits=4
))

                 precision    recall  f1-score   support

   DiseaseClass     0.0000    0.0000    0.0000        61
       Modifier     0.0000    0.0000    0.0000       251
SpecificDisease     0.0325    0.9676    0.0629       247

    avg / total     0.0144    0.4275    0.0278       559



  'precision', 'predicted', average, warn_for)


In [19]:
# Apply rule2 to each test document's feature list and report per-class scores.
# rule2 only ever predicts 'SpecificDisease' or 'IR', so the other classes score 0.
y_pred_rule2 = [rule2(x_test) for x_test in X_test]
print(metrics.flat_classification_report(
    y_test, y_pred_rule2, labels=labels, digits=4
))

                 precision    recall  f1-score   support

   DiseaseClass     0.0000    0.0000    0.0000        61
       Modifier     0.0000    0.0000    0.0000       251
SpecificDisease     0.0165    0.2713    0.0312       247

    avg / total     0.0073    0.1199    0.0138       559



  'precision', 'predicted', average, warn_for)


## 4. Conditional random fields (CRFs)

- A type of discriminative undirected probabilistic graphical model
![](https://i.stack.imgur.com/khcnl.png)


(grey: X, white: Y)

- A sequence of Logistic Regression models that uses the features and labels from surrounding tokens to predict the label for a given token.

In [20]:
# Configure the CRF: L-BFGS training capped at 100 iterations. c1 and c2 are
# the L1 and L2 regularization coefficients (see the sklearn-crfsuite CRF docs).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Train on the per-document feature dicts (X_train) and label sequences (y_train).
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [21]:
# Predict a label sequence for every test document and compute the weighted F1
# over the entity classes in `labels` (the same label set used for the
# synonym-dictionary baseline).
y_pred_crf = crf.predict(X_test)
f1_score = metrics.flat_f1_score(y_test, y_pred_crf,
                                 average='weighted', labels=labels)
print (f1_score)

0.5161013019726128


In [22]:
# Per-class precision / recall / F1 of the CRF predictions, restricted to the
# entity classes in `labels`.
print(metrics.flat_classification_report(
    y_test, y_pred_crf, labels=labels, digits=4
))

                 precision    recall  f1-score   support

   DiseaseClass     0.4872    0.3115    0.3800        61
       Modifier     0.6462    0.5020    0.5650       251
SpecificDisease     0.5989    0.4291    0.5000       247

    avg / total     0.6079    0.4490    0.5161       559

