# Conditional Random Field example for position tagging
By LongGang Pang


In [1]:
import os

In [14]:
def construct_dataset(kind="training"):
    with open("%s_sentence.csv"%kind, "r") as fin:
        sentences = fin.readlines()
    with open("%s_labels.csv"%kind, "r") as fin:
        labels = fin.readlines()
    #print(sentences[-1])
    #print(labels[-1])
    docs = []
    for i in range(len(sentences)):
        texts = []
        istart, iend, _ = labels[i].split('|||')
        istart = int(istart)
        iend = int(iend)
        for j, w in enumerate(sentences[i]):
            is_speaker = "F"
            if j in range(istart, iend):
                is_speaker = 'T'
            texts.append((w, is_speaker))
        docs.append(texts)
    return docs

In [15]:
docs = construct_dataset()

In [16]:
docs[0]

[('凤', 'F'),
 ('姐', 'F'),
 ('因', 'F'),
 ('问', 'F'),
 ('何', 'F'),
 ('事', 'F'),
 ('．', 'F'),
 ('凤', 'T'),
 ('姐', 'T'),
 ('道', 'F'),
 ('：', 'F'),
 ('\n', 'F')]

In [20]:
import nltk
from tqdm import tqdm_notebook
data = []
for i, doc in tqdm_notebook(enumerate(docs)):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])


In [21]:
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [22]:
from sklearn.model_selection import train_test_split

# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=False)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,
     
    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

In [30]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [121]:
#X_test[0]

In [41]:
y_pred[2]
X_test[2][1][1]

'word.lower=幻'

In [73]:
from IPython.display import HTML as html_print, display

def cstr(s, color='black'):
    #return "<text style=color:{}>{}</text>".format(color, s)
    return "<span style=\"color: #ff0000\">{}</span>".format(s)
    #return "**{}**".format(s)

#left, word, right = 'foo' , 'abc' , 'bar'
#html_print(cstr(' '.join([left, cstr(word, color='red'), right]), color='black') )
from IPython.display import Markdown

In [101]:
# Let's take a look at a random sample in the testing set
def check_result(i=12):
    res = []
    for j, label in enumerate(y_pred[i]):
        word = X_test[i][j][1].split("=")[1]
        if label == 'T':
            res.append(cstr(word, color='red'))
        elif label == 'F':
            res.append(word)
    return len(y_pred[i]), ' '.join(res)

In [107]:
for i in range(100):
    length, res = check_result(i)
    if length > 10 and length < 20:
        display(Markdown(res))
        

<span style="color: #ff0000">周</span> <span style="color: #ff0000">瑞</span> <span style="color: #ff0000">家</span> <span style="color: #ff0000">的</span> 点 头 ． 又 道 ： 


乃 亲 斟 一 斗 为 贺 ． <span style="color: #ff0000">那</span> <span style="color: #ff0000">僧</span> 因 干 过 ， 叹 道 ： 


<span style="color: #ff0000">贾</span> <span style="color: #ff0000">政</span> 听 了 ， 便 和 丫 头 说 ： 


<span style="color: #ff0000">邢</span> <span style="color: #ff0000">夫</span> <span style="color: #ff0000">人</span> 只 怕 他 睡 出 病 来 ， 便 哄 他 道 ： 


<span style="color: #ff0000">尤</span> <span style="color: #ff0000">氏</span> <span style="color: #ff0000">秦</span> <span style="color: #ff0000">氏</span> 未 及 答 话 ， 地 下 <span style="color: #ff0000">贾</span> <span style="color: #ff0000">政</span> 先 就 笑 说 ： 


于 是 <span style="color: #ff0000">凤</span> <span style="color: #ff0000">姐</span> 就 吩 咐 媳 妇 婆 子 们 ： 


<span style="color: #ff0000">凤</span> <span style="color: #ff0000">姐</span> 不 待 说 完 ， 便 答 道 ： 


贾 母 正 说 着 ， 只 见 湘 云 走 来 ， 笑 道 ： 


<span style="color: #ff0000">凤</span> <span style="color: #ff0000">姐</span> 听 了 ， 沉 吟 了 半 日 ， 因 向 凤 姐 儿 说 ： 


未 及 说 完 ， <span style="color: #ff0000">水</span> <span style="color: #ff0000">溶</span> 喝 道 ： 


金 氏 去 后 ， 贾 政 方 过 来 坐 下 ， 问 尤 氏 道 ： 


<span style="color: #ff0000">宝</span> <span style="color: #ff0000">钗</span> 听 了 ， 低 头 想 了 半 日 道 ： 


<span style="color: #ff0000">周</span> <span style="color: #ff0000">瑞</span> <span style="color: #ff0000">家</span> <span style="color: #ff0000">的</span> 忙 携 手 垂 泪 道 ： 


<span style="color: #ff0000">尤</span> <span style="color: #ff0000">氏</span> <span style="color: #ff0000">秦</span> <span style="color: #ff0000">氏</span> 未 及 答 话 ， 地 下 <span style="color: #ff0000">宝</span> <span style="color: #ff0000">玉</span> 先 就 笑 说 ： 


未 及 说 完 ， <span style="color: #ff0000">贾</span> <span style="color: #ff0000">蔷</span> 气 的 喝 命 ： 


<span style="color: #ff0000">宝</span> <span style="color: #ff0000">玉</span> 听 了 ， 笑 向 贾 珍 道 ： 


<span style="color: #ff0000">贾</span> <span style="color: #ff0000">珍</span> 听 了 这 话 ， 便 发 了 兴 头 ， 说 道 ： 


<span style="color: #ff0000">周</span> <span style="color: #ff0000">瑞</span> <span style="color: #ff0000">家</span> <span style="color: #ff0000">的</span> 笑 嘻 嘻 的 说 ： 


连 问 几 声 ， <span style="color: #ff0000">宝</span> <span style="color: #ff0000">玉</span> 睁 眼 说 道 ： 


<span style="color: #ff0000">李</span> <span style="color: #ff0000">嬷</span> <span style="color: #ff0000">嬷</span> 听 了 ， 恍 惚 问 道 ： 


<span style="color: #ff0000">王</span> <span style="color: #ff0000">夫</span> <span style="color: #ff0000">人</span> 听 说 ， 便 回 去 了 ， 又 说 ： 


In [117]:
def construct_testing_dataset():
    from honglou import talks
    x_test = []
    for i, talk in tqdm_notebook(enumerate(talks)):
        ctx = talk['context']
        tokens = [t for t in ctx]
        
        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)

        # Take the word, POS tag, and its label
        doc = tagged
        
        x_test.append(extract_features(doc))

    return x_test

In [118]:
x_test = construct_testing_dataset()

In [120]:
#x_test[0]

In [122]:
y_pred_entire = [tagger.tag(xseq) for xseq in x_test]

In [125]:
y_pred_entire[2]

['T', 'T', 'T', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F']

In [158]:
import numpy as np
def convert_tags_to_indices(tags):
    left_ = False
    res = []
    for t in tags:
        if t == 'T': 
            res.append(1)
        else:
            res.append(0)
    res = np.array(res)
    edges = np.abs(res[1:] - res[:-1])
    a = edges.tolist()
    if len(res) > 0:
        a.insert(0, res[0])
        indices = []
        for i, t in enumerate(a):
            if t == 1:
                indices.append(i)
        pairs = []
        for i in range(len(indices)//2):
            pairs.append([indices[2*i], indices[2*i+1]])
    else:
        pairs = []
    return pairs

convert_tags_to_indices(y_pred_entire[6])

[[0, 2], [10, 12], [56, 58]]

In [162]:
from honglou import talks
with open("res_crf.txt", "w") as fout:
    for i, talk in tqdm_notebook(enumerate(talks)):
        ctx = talk['context']
        tags = y_pred_entire[i]
        indicies = convert_tags_to_indices(tags)
        fout.write(ctx)
        fout.write('  |||  ')
        try:
            for index in indicies:
                    istart = index[0]
                    iend = index[1]
                    fout.write("%s"%ctx[istart:iend])    
        except:
            print(indicies)
            pass

        fout.write('\n')
        #print(talk["context"], " ||| ", predictions["%s"%i])