In [1]:
import re
from zhon.hanzi import punctuation #chinese punctuation

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Preprocessing

In [2]:
def _parse_tagged_line(line):
    """Parse one corpus line into a list of ``[word, pos_tag]`` pairs.

    Each whitespace-separated token has the form ``word/tag``; tokens that
    open or close a bracketed named-entity group look like ``[word/tag`` or
    ``word/tag]ner``. The brackets and the trailing NER label are removed so
    only the POS tag remains. The first token of the line (the date stamp)
    is dropped.

    Raises:
        ValueError: if a token has no ``/`` separating word and tag.
    """
    pairs = []
    for token in line.strip().split():
        # Split on the LAST '/', so a word that itself contains '/' (for
        # example the token '//w') keeps its text and its real tag.
        word, sep, tag = token.strip('[]').rpartition('/')
        if not sep:
            raise ValueError("token {!r} has no '/tag' suffix".format(token))
        # 'n]nt' -> 'n': drop the NER label glued to the POS tag
        pairs.append([word, tag.split(']')[0]])
    return pairs[1:]  # delete the date


def load_sents(path):
    """Read a tagged corpus file and return one ``[word, pos_tag]`` list per line."""
    with open(path, 'r', encoding='UTF-8') as f:
        return [_parse_tagged_line(line) for line in f]


train_sents = load_sents('train.txt')
test_sents = load_sents('test.txt')

# Feature function

T(·) is a multi-valued function that classifies a character into one of four classes: number, date, English letter, and other (returning '1', '2', '3' and '4', respectively) (Jiang et al., 2009).

In [3]:
def T(word):
    """Return the character-class code of ``word`` as a one-character string.

    '1' -- number: ``str.isdigit()`` is true or it is a single Chinese numeral
    '2' -- date/time marker: one of - / : 年 月 日 时 分 秒
    '3' -- made up of ASCII letters only
    '4' -- anything else
    """
    chinese_numerals = ('零', '一', '二', '三', '四', '五', '六', '七', '八', '九', '〇')
    date_markers = ('-', '/', ':', '年', '月', '日', '时', '分', '秒')

    is_number = word.isdigit() or word in chinese_numerals
    if is_number:
        return '1'
    if word in date_markers:
        return '2'
    if re.match("^[A-Za-z]+$", word) is not None:
        return '3'
    return '4'

In [4]:
# Example: classify every character of a mixed string and join the codes
word = '九〇年代R'
result = [T(w) for w in word]
result_str = "".join(result)
result_str

'11243'

In [5]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of ``[word, tag]`` pairs; only the word is used.
    For the current token and for each neighbour at offsets -1, -2, +1, +2
    that exists, the dict records the word itself, its length, and whether
    it occurs in the ``punctuation`` string. ``BOS`` / ``EOS`` are set when
    there is no previous / next token. ``multi_value`` is the concatenation
    of the ``T`` codes of the current token followed by those of the
    existing neighbours, in the order -1, -2, +1, +2.
    """
    def describe(token, prefix):
        # The three per-token features, with keys namespaced by offset prefix.
        return {
            prefix + 'word': token,
            prefix + 'len(word)': len(token),
            prefix + 'word.ispunctuation': (token in punctuation),
        }

    sent_len = len(sent)
    current = sent[i][0]
    features = describe(current, '')
    type_codes = [T(current)]

    # (offset, key prefix, flag to set when that neighbour does not exist)
    window = (
        (-1, '-1:', 'BOS'),
        (-2, '-2:', None),
        (1, '+1:', 'EOS'),
        (2, '+2:', None),
    )
    for offset, prefix, boundary_flag in window:
        j = i + offset
        # Look-behind only needs the lower bound, look-ahead only the upper.
        exists = j >= 0 if offset < 0 else j < sent_len
        if exists:
            neighbour = sent[j][0]
            type_codes.append(T(neighbour))
            features.update(describe(neighbour, prefix))
        elif boundary_flag is not None:
            features[boundary_flag] = True

    features.update({'multi_value': "".join(type_codes)})

    return features


def sent2features(sent):
    """Return the list of feature dicts for ``sent``, one per token position."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the tag (second element) of every ``[word, tag]`` pair in ``sent``."""
    tags = []
    for pair in sent:
        tags.append(pair[1])
    return tags

def sent2tokens(sent):
    """Return the word (first element) of every ``[word, tag]`` pair in ``sent``."""
    words = []
    for pair in sent:
        words.append(pair[0])
    return words

In [6]:
# Per-token feature dicts (X) and gold tags (y) for both splits
Xtrain = list(map(sent2features, train_sents))
ytrain = list(map(sent2labels, train_sents))

Xtest = list(map(sent2features, test_sents))
ytest = list(map(sent2labels, test_sents))

# Training the Model

In [7]:
%%time                                  
# Fit the CRF tagger on the training features/labels. `%%time` must remain the
# first line of the cell; the recorded run below took roughly 8.5 minutes.
# Per the sklearn-crfsuite API, c1 and c2 are the L1 and L2 regularisation
# coefficients used with the 'lbfgs' training algorithm.
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.25,
    c2 = 0.3,
    max_iterations = 100,
    # also generate transition features for label pairs absent from the training data
    all_possible_transitions=True
)
crf.fit(Xtrain, ytrain)  

CPU times: total: 8min 26s
Wall time: 8min 27s


# Evaluation

In [8]:
# Weighted F1, token accuracy and the per-label report on the training set
labels = list(crf.classes_)
ypred = crf.predict(Xtrain)

train_f1 = metrics.flat_f1_score(ytrain, ypred, average='weighted', labels=labels)
print('F1 score on the train set = {}\n'.format(train_f1))

train_accuracy = metrics.flat_accuracy_score(ytrain, ypred)
print('Accuracy on the train set = {}\n'.format(train_accuracy))

train_report = metrics.flat_classification_report(
    ytrain, ypred, labels=labels, digits=3
)
print('Train set classification report: \n\n{}'.format(train_report))

F1 score on the train set = 0.9788501852849337

Accuracy on the train set = 0.9789962409404661



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train set classification report: 

              precision    recall  f1-score   support

           r      0.997     0.993     0.995     25929
          Ng      0.974     0.931     0.952      3604
           p      0.970     0.979     0.974     32083
          nt      0.994     0.977     0.985      2846
           w      1.000     1.000     1.000    138463
           u      0.997     0.999     0.998     60070
          vn      0.918     0.896     0.907     34511
           v      0.966     0.967     0.966    147983
           t      0.995     0.987     0.991     16656
           f      0.991     0.988     0.990     13791
           n      0.973     0.993     0.983    189403
           m      0.995     0.991     0.993     32917
           q      0.989     0.993     0.991     19237
           l      0.969     0.952     0.960      4849
           a      0.961     0.954     0.958     27723
           b      0.987     0.953     0.970      7018
           c      0.980     0.970     0.975   

  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Weighted F1, token accuracy and the per-label report on the test set
# (`labels` is the label list built in the train-set evaluation cell)
ypred = crf.predict(Xtest)

test_f1 = metrics.flat_f1_score(ytest, ypred, average='weighted', labels=labels)
print('F1 score on the test set = {}\n'.format(test_f1))

test_accuracy = metrics.flat_accuracy_score(ytest, ypred)
print('Accuracy on the test set = {}\n'.format(test_accuracy))

test_report = metrics.flat_classification_report(
    ytest, ypred, labels=labels, digits=3
)
print('Test set classification report: \n\n{}'.format(test_report))

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


F1 score on the test set = 0.9452930520884762

Accuracy on the test set = 0.9461458464076312



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test set classification report: 

              precision    recall  f1-score   support

           r      0.996     0.980     0.988      6398
          Ng      0.925     0.777     0.844       958
           p      0.932     0.949     0.940      7842
          nt      0.988     0.920     0.953       727
           w      1.000     1.000     1.000     34584
           u      0.993     0.997     0.995     14759
          vn      0.847     0.810     0.828      8223
           v      0.918     0.930     0.924     36781
           t      0.984     0.955     0.969      4024
           f      0.973     0.968     0.971      3410
           n      0.918     0.975     0.946     47407
           m      0.981     0.966     0.974      8453
           q      0.969     0.979     0.974      5007
           l      0.863     0.664     0.750      1166
           a      0.924     0.874     0.898      6750
           b      0.970     0.832     0.896      1697
           c      0.954     0.935     0.945    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Obtaining Transitions

In [10]:
#obtaining the most likely and the least likely transitions 
from collections import Counter

def print_transitions(transition_features):
    """Print one aligned ``from -> to weight`` line per transition.

    ``transition_features`` is an iterable of ``((label_from, label_to), weight)``
    items; the weight is shown with six decimal places.
    """
    row_template = "%-6s -> %-7s %0.6f"
    for transition, weight in transition_features:
        source_label, target_label = transition
        print(row_template % (source_label, target_label, weight))

# Rank every learned transition by weight once, then show both ends of the list
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top 10 likely transitions - \n")
print_transitions(ranked_transitions[:10])

print("\nTop 10 unlikely transitions - \n")
print_transitions(ranked_transitions[-10:])

Top 10 likely transitions - 

nr     -> nr      5.153766
m      -> q       3.674815
y      -> w       3.176700
Tg     -> Tg      3.157888
Ag     -> Ng      2.948003
ad     -> v       2.721086
Vg     -> Ng      2.692685
d      -> Vg      2.603138
Ag     -> Vg      2.601285
an     -> an      2.470922

Top 10 unlikely transitions - 

l      -> q       -2.203911
vd     -> vn      -2.246065
ad     -> m       -2.251487
i      -> t       -2.280980
y      -> m       -2.317202
d      -> an      -2.386700
i      -> q       -2.435754
ad     -> u       -2.682211
ad     -> vn      -3.345800
ad     -> n       -3.603297
