In [1]:
import re

# Preprocessing

In [2]:
def _read_tagged_sentences(path):
    """Read a tagged corpus file into a list of sentences.

    Every line of the file is treated as one sentence. The line is split on
    whitespace; each resulting item has leading/trailing square brackets
    removed and is then split on '/' (word first, tagging after).

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list with one entry per line of the file. Each entry is a list
        holding the '/'-separated parts of every item on that line; a blank
        line yields an empty list.
    """
    with open(path, 'r', encoding='UTF-8') as f:
        # str.split() without arguments already discards surrounding
        # whitespace and never produces empty strings, so no filter is needed.
        # NOTE(review): an item whose word itself contains '/' splits into
        # more than two parts -- confirm the corpus never contains such items.
        return [
            [item.strip('[]').split('/') for item in line.split()]
            for line in f
        ]


train_sents = _read_tagged_sentences('train.txt')
test_sents = _read_tagged_sentences('test.txt')

# Feature function

T(·) is a multi-valued function that classifies a character into one of four classes: number, date, English letter, and other (returning '1', '2', '3', and '4', respectively) (Jiang et al., 2009).

In [3]:
from zhon.hanzi import punctuation

def T(word):
    """Classify a string into one of four character-type classes.

    Args:
        word: The string to classify; the accompanying example passes one
            character at a time.

    Returns:
        '1' if ``word.isdigit()`` is true or ``word`` is one of the listed
            Chinese numerals,
        '2' if ``word`` is one of the listed date/time markers or separators,
        '3' if ``word`` consists solely of ASCII letters,
        '4' otherwise (including the empty string).
    """
    if word.isdigit() or word in ['零','一','二','三','四','五','六','七','八','九','〇']:
        return '1'
    if word in ['-', '/', ':','年','月','日','时','分','秒']:
        return '2'
    # fullmatch instead of match with '^...$': '$' also matches right before
    # a trailing newline, which would let "R\n" pass as a pure-letter string.
    if re.fullmatch("[A-Za-z]+", word):
        return '3'
    return '4'

In [4]:
# Example: run every character of a sample string through T and
# concatenate the resulting class codes into one string.
word = '九〇年代R'
result = [T(w) for w in word]
result_str = "".join(result)
result_str

'11243'

In [67]:
def _token_features(token, prefix=''):
    """Return the three per-token features, with keys prefixed by ``prefix``."""
    return {
        prefix + 'word': token,
        prefix + 'len(word)': len(token),
        # `punctuation` is a string, so this is a substring test.
        prefix + 'word.ispunctuation': (token in punctuation),
    }


def word2features(sent, i):
    """Build the feature dict for the item at position ``i`` of ``sent``.

    The features describe the word itself and, where they exist, the words
    up to two positions to its left and right. No T(.) character-class
    feature is emitted.

    Args:
        sent: Sequence of items whose element 0 is the word string.
        i: Index into ``sent`` of the item to describe.

    Returns:
        A dict containing:
          * 'word', 'len(word)', 'word.ispunctuation' for the current word;
          * the same three keys prefixed with '-1:', '-2:', '+1:' or '+2:'
            for each neighbour that exists at that offset;
          * 'BOS': True when ``i`` is not greater than 0;
          * 'EOS': True when ``i`` is not less than ``len(sent) - 1``.
    """
    last = len(sent) - 1
    features = _token_features(sent[i][0])

    # Left context.
    if i > 0:
        features.update(_token_features(sent[i-1][0], '-1:'))
    else:
        features['BOS'] = True
    if i > 1:
        features.update(_token_features(sent[i-2][0], '-2:'))

    # Right context.
    if i < last:
        features.update(_token_features(sent[i+1][0], '+1:'))
    else:
        features['EOS'] = True
    if i < last - 1:
        features.update(_token_features(sent[i+2][0], '+2:'))

    return features


def sent2features(sent):
    """Return the list of word2features dicts, one per position in ``sent``."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return element 1 (the label) of every item in ``sent``, in order."""
    labels = []
    for item in sent:
        labels.append(item[1])
    return labels

def sent2tokens(sent):
    """Return element 0 (the word) of every item in ``sent``, in order."""
    tokens = []
    for item in sent:
        tokens.append(item[0])
    return tokens