# `N`amed `E`ntity `R`ecognition

In [1]:
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')  # 不打印警告

# 数据

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
388568,,impact,NN,O
388569,,.,.,O
388570,Sentence: 47959,Indian,JJ,B-gpe
388571,,forces,NNS,O
388572,,said,VBD,O
388573,,they,PRP,O
388574,,responded,VBD,O
388575,,to,TO,O
388576,,the,DT,O
388577,,attack,NN,O


In [3]:
df_train = df_train.fillna(method='ffill')
df_test = df_test.fillna(method='ffill')
df_train.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
659992,Sentence: 30159,control,VB,O
659993,Sentence: 30159,the,DT,O
659994,Sentence: 30159,island,NN,O
659995,Sentence: 30159,chain,NN,O
659996,Sentence: 30159,.,.,O


In [5]:
labels = df_train.Tag.unique().tolist()
labels.remove('O')
df_train.groupby('Tag').size().reset_index(name='count')

Unnamed: 0,Tag,count
0,B-art,260
1,B-eve,218
2,B-geo,23605
3,B-gpe,10045
4,B-nat,121
5,B-org,12507
6,B-per,10681
7,B-tim,12756
8,I-art,197
9,I-eve,177


In [6]:
y_train, y_test = df_train.Tag.values, df_test.Tag.values
print(y_train.shape, y_test.shape)

(659997,) (388578,)


# 模型

**多数票决**

In [7]:
X_train = df_train.Word.tolist()
X_test = df_test.Word.tolist()

In [8]:
class Majority_vote: 

    def fit(self, X, y):
        counter = {}
        for w, t in zip(X, y):
            if w in counter:
                if t in counter[w]:
                    counter[w][t] += 1
                else:
                    counter[w][t] = 1
            else:
                counter[w] = {t: 1}
        self.vote = {}
        for w, t in counter.items():
            self.vote[w] = max(t, key=t.get)
        return self

    def predict(self, X):
        return [self.vote.get(x, 'O') for x in X]

In [9]:
clf = Majority_vote().fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [10]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred, labels)
print(report)

             precision    recall  f1-score   support

      B-geo       0.79      0.85      0.82     14039
      B-gpe       0.94      0.95      0.95      5825
      B-per       0.79      0.63      0.70      6309
      I-geo       0.70      0.61      0.65      2707
      B-org       0.68      0.48      0.57      7636
      I-org       0.71      0.52      0.60      6400
      B-tim       0.87      0.76      0.81      7577
      B-art       0.19      0.06      0.09       142
      I-art       0.03      0.01      0.01       100
      I-per       0.72      0.65      0.68      6251
      I-gpe       0.53      0.65      0.58        65
      I-tim       0.61      0.12      0.20      2403
      B-nat       0.45      0.46      0.45        80
      B-eve       0.61      0.22      0.33        90
      I-eve       0.33      0.09      0.14        76
      I-nat       0.00      0.00      0.00        16

avg / total       0.77      0.68      0.71     59716



**线性链条件随机场**

In [11]:
def groupby(s):
    f = lambda s: [(w, p, t) for w, p, t in zip(
        s.Word.values, s.POS.values, s.Tag.values)]
    return list(s.groupby('Sentence #').apply(f))

In [12]:
df_train = groupby(df_train)
df_test = groupby(df_test)
df_train[:2]

[[('Thousands', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('demonstrators', 'NNS', 'O'),
  ('have', 'VBP', 'O'),
  ('marched', 'VBN', 'O'),
  ('through', 'IN', 'O'),
  ('London', 'NNP', 'B-geo'),
  ('to', 'TO', 'O'),
  ('protest', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('war', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('Iraq', 'NNP', 'B-geo'),
  ('and', 'CC', 'O'),
  ('demand', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('withdrawal', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('British', 'JJ', 'B-gpe'),
  ('troops', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('that', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Iranian', 'JJ', 'B-gpe'),
  ('officials', 'NNS', 'O'),
  ('say', 'VBP', 'O'),
  ('they', 'PRP', 'O'),
  ('expect', 'VBP', 'O'),
  ('to', 'TO', 'O'),
  ('get', 'VB', 'O'),
  ('access', 'NN', 'O'),
  ('to', 'TO', 'O'),
  ('sealed', 'JJ', 'O'),
  ('sensitive', 'JJ', 'O'),
  ('parts', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('plant', 'NN', 'O'),
  ('Wednesday', 'NNP', 'B-tim'),
  ('

In [13]:
class Processing:

    def __init__(self, tpl):
        self.tpl = tpl
        self.len = len(tpl)

    def get_features(self, i):
        word = self.tpl[i][0]
        postag = self.tpl[i][1]

        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
            'postag': postag,
            'postag[:2]': postag[:2],
        }
        if i > 0:
            word1 = self.tpl[i - 1][0]
            postag1 = self.tpl[i - 1][1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
                '-1:postag': postag1,
                '-1:postag[:2]': postag1[:2],
            })
        else:
            features['BOS'] = True
        if i < self.len - 1:
            word1 = self.tpl[i + 1][0]
            postag1 = self.tpl[i + 1][1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:postag': postag1,
                '+1:postag[:2]': postag1[:2],
            })
        else:
            features['EOS'] = True

        return features

    def to_labels(self):
        return [l for t, p, l in self.tpl]

    def to_features(self):
        return [self.get_features(i) for i in range(self.len)]

In [14]:
X_train, y_train = [], []
for tpl in df_train:
    tpl = Processing(tpl)
    X_train.append(tpl.to_features())
    y_train.append(tpl.to_labels())
X_test, y_test = [], []
for tpl in df_train:
    tpl = Processing(tpl)
    X_test.append(tpl.to_features())
    y_test.append(tpl.to_labels())

In [15]:
from sklearn_crfsuite import CRF
clf = CRF(
    algorithm='lbfgs',
    c1=.1,
    c2=.1,
    max_iterations=100,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [16]:
from sklearn_crfsuite.metrics import flat_classification_report
report = flat_classification_report(y_test, y_pred, labels)
print(report)

             precision    recall  f1-score   support

      B-geo       0.91      0.96      0.93     23605
      B-gpe       0.98      0.95      0.96     10045
      B-per       0.96      0.93      0.94     10681
      I-geo       0.91      0.93      0.92      4707
      B-org       0.92      0.87      0.89     12507
      I-org       0.95      0.94      0.94     10384
      B-tim       0.97      0.93      0.95     12756
      B-art       0.94      0.77      0.85       260
      I-art       0.92      0.81      0.86       197
      I-per       0.95      0.96      0.95     11000
      I-gpe       0.97      0.66      0.79       133
      I-tim       0.94      0.90      0.92      4125
      B-nat       0.84      0.64      0.72       121
      B-eve       0.88      0.78      0.82       218
      I-eve       0.89      0.74      0.81       177
      I-nat       0.89      0.71      0.79        35

avg / total       0.94      0.93      0.94    100951

