安装 pycrfsuite: `pip install python-crfsuite` <br />

### 1. 预处理数据集

人民日报数据集词性标注样例

```
’/w  ９９/m  昆明/ns  世博会/n
```


In [13]:
 # 打开GB2312编码的输入文件并读取内容
def transform_dataset(input_text, output_text):
    with open(input_text, 'r', encoding='gb2312', errors="ignore") as input_file:
        content = input_file.read()

    # 将内容写入UTF-8编码的输出文件
    with open(output_text, 'w', encoding='utf-8') as output_file:
        output_file.write(content)

transform_dataset("train_pd.txt", "train_raw.txt")
transform_dataset("test_pd_src.txt", "test_src.txt")

In [34]:
# 转换训练文本为如下格式：每行一个字，字后面是标签（如词性标注），每个句子之间用空行隔开。
with open("train_raw.txt", 'r', encoding='utf-8') as train_file:
    train_lines = train_file.readlines()

train_data = []

for line in train_lines:
    line = line.strip().split()
    train_data.append(line)
    
for idx in range(0, 5):
    print(train_data[idx])

with open("train.txt", 'w', encoding='utf-8') as train_file, open("test.txt", 'w', encoding='utf-8') as test_file:
  for data in train_data[:1000]:
      for word in data:
          test_file.write(word + '\n')
      test_file.write('\n')

  for data in train_data[1000:]:
      for word in data:
          train_file.write(word + '\n')
      train_file.write('\n')


['’/w', '９９/m', '昆明/ns', '世博会/n', '组委会/j', '秘书长/n', '、/w', '云南省/ns', '副/b', '省长/n', '刘/nr', '京/nr', '介绍/v', '说/v', '，/w', '’/w', '９９/m', '世博会/j', '会址/n', '位于/v', '昆明/ns', '北郊/s', '植被/n', '茂密/a', '的/u', '金殿/ns', '风景/n', '名胜区/n', '，/w', '占地/v', '２０５/m', '公顷/q', '，/w', '总/b', '投资/vn', '１２．４亿/m', '元/q', '人民币/n', '，/w', '主要/d', '包括/v', '５/m', '大/a', '展馆/n', '、/w', '６/m', '个/q', '专题/n', '展园/n', '、/w', '３/m', '大/n', '室外/s', '展区/n', '。/w', '自/p', '去年/t', '５月/t', '’/w', '９９/m', '昆明/ns', '世博会/j', '场馆/n', '建设/vn', '开工/v', '以来/f', '，/w', '在/p', '建设者/n', '们/k', '的/u', '努力/an', '下/f', '，/w', '目前/t', '各项/r', '工程/n', '进展/v', '顺利/a', '。/w', '到/p', '目前/t', '为止/v', '，/w', '已/d', '有/v', '１３３/m', '个/q', '国家/n', '对/p', '昆明/ns', '世博会/j', '作出/v', '了/u', '反应/vn', '，/w', '原则/n', '上/f', '同意/v', '参展/v', '的/u', '国家/n', '有/v', '５３/m', '个/q', '。/w', '为/p', '办/v', '好/a', '’/w', '９９/m', '昆明/ns', '世界/n', '园艺/n', '博览会/n', '，/w', '中国/ns', '政府/n', '专门/d', '在/p', '昆明/ns', '召开/v', '了/u', '国内/s', '组展会/j', '，/w', '要求/v', '全国/

### 2. 统计词性标注

In [35]:
import re
from collections import Counter

def count_pos_tags(text):
    # 使用正则表达式匹配所有的词性标注
    pos_tags = re.findall(r'/([a-zA-Z]+)', text)
    
    # 使用Counter统计每个词性标注的出现次数
    pos_counts = Counter(pos_tags)
    
    return pos_counts

def format_output(pos_counts):
    # 格式化输出结果
    formatted_result = "\n".join([f"{tag}: {count}" for tag, count in pos_counts.items()])
    return formatted_result


with open("train.txt", 'r', encoding='utf-8') as f:
    train_data = f.read()

pos_counts = count_pos_tags(train_data)
formatted_result = format_output(pos_counts)
print(formatted_result)

nz: 20024
w: 919684
ns: 147162
n: 1287829
nx: 2043
j: 63422
vn: 258312
v: 999079
f: 93217
p: 227147
d: 266901
m: 226033
q: 130938
u: 411086
a: 185613
r: 180181
Ng: 25560
b: 47147
nt: 20168
s: 19585
c: 141827
t: 105885
nr: 170313
Vg: 8695
k: 4491
ad: 34676
y: 9868
i: 26148
l: 30415
an: 15615
z: 5873
Tg: 2229
vd: 4537
h: 216
Ag: 1685
o: 318
Dg: 406
e: 137
Rg: 49
Mg: 46
Bg: 10
Yg: 3


In [39]:
import pycrfsuite
from sklearn.metrics import classification_report
import json

# 加载数据集
def load_data(file_path, split='/'):
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():
                char, label = line.strip().split(split)
                if len(char) != 0 and len(label) != 0:
                    sentence.append((char, label))
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
    if sentence:
        sentences.append(sentence)
    return sentences

# 特征提取
def extract_features(sentence, i):
    word = sentence[i][0]
    try:
        features = {
            'bias': 1.0,
            'word': word,
            'is_first': i == 0,
            'is_last': i == len(sentence) - 1,
            'is_digit': word.isdigit(),
            'prefix-1': word[0],
            'suffix-1': word[-1],
        }
        if i > 0:
            word1 = sentence[i - 1][0]
            features.update({
                '-1:word': word1,
                '-1:is_digit': word1.isdigit(),
            })
        else:
            features['BOS'] = True

        if i < len(sentence) - 1:
            word1 = sentence[i + 1][0]
            features.update({
                '+1:word': word1,
                '+1:is_digit': word1.isdigit(),
            })
        else:
            features['EOS'] = True
    except Exception as e:
        print(i, ':', sentence[i])
        print(f"Error: {e}")

    return features

def sent2features(sentence):
    return [extract_features(sentence, i) for i in range(len(sentence))]

def sent2labels(sentence):
    return [label for token, label in sentence]

# 加载训练和测试数据
train_sents = load_data('train.txt')
test_sents = load_data('test.txt')
print(train_sents[:5])
# 打印数据集大小
print(f"训练集大小: {len(train_sents)}")
print(f"测试集大小: {len(test_sents)}")

# 提取特征和标签
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]


print(json.dumps(X_train[:5], indent=2, ensure_ascii=False))
print(json.dumps(y_train[:5], indent=2, ensure_ascii=False))

[[('光男', 'nz'), ('（', 'w'), ('湖南', 'ns'), ('）', 'w'), ('体育', 'n'), ('用品', 'n'), ('有限公司', 'n'), ('羽毛球拍', 'n'), ('ＫＥＮＮＥＸ８１５', 'nx'), ('网球拍', 'n'), ('ＰＲＯ―ＫＥＮＮＥＳ２７４', 'nx')], [('广', 'j'), ('深', 'j'), ('电气化', 'vn'), ('铁路', 'n'), ('全线', 'n'), ('建成', 'v')], [('广', 'j'), ('深', 'j'), ('铁路', 'n'), ('西', 'f'), ('起', 'v'), ('广州', 'ns'), ('东', 'f'), ('站', 'n'), ('，', 'w'), ('东', 'f'), ('至', 'v'), ('深圳', 'ns'), ('罗湖桥', 'ns'), ('，', 'w'), ('与', 'p'), ('广', 'j'), ('九', 'j'), ('铁路', 'n'), ('相', 'd'), ('连接', 'v'), ('，', 'w'), ('全长', 'n'), ('１３９．４６', 'm'), ('公里', 'q'), ('，', 'w'), ('途经', 'v'), ('下元', 'ns'), ('、', 'w'), ('茶山', 'ns'), ('、', 'w'), ('常平', 'ns'), ('、', 'w'), ('平湖', 'ns'), ('等', 'u'), ('２０', 'm'), ('个', 'q'), ('车站', 'n'), ('。', 'w'), ('设计', 'v'), ('旅客', 'n'), ('列车', 'n'), ('最高', 'a'), ('时速', 'n'), ('为', 'v'), ('２００', 'm'), ('公里', 'q'), ('，', 'w'), ('其中', 'r'), ('下元', 'ns'), ('至', 'p'), ('茶山', 'ns'), ('段', 'Ng'), ('有', 'v'), ('３０', 'm'), ('公里', 'q'), ('复线', 'n'), ('时速', 'n'), ('可', 'v'), ('达', 

### 2. 训练与评估CRF词性标注模型
- 接下来，我们将使用提取的特征和标签来训练CRF词性标注模型，并在测试集上评估其性能。

In [38]:
# 训练CRF模型
trainer = pycrfsuite.Trainer(verbose=True)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 1.0,  # L1 penalty
    'c2': 1.0,  # L2 penalty
    'max_iterations': 100,  # max number of iterations
    'feature.possible_transitions': True
})
trainer.train('crf.model')

# 加载模型
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')

# 预测并评估
y_pred = [tagger.tag(xseq) for xseq in X_test]

# 将y_test和y_pred展平
y_test_flat = [item for sublist in y_test for item in sublist]
y_pred_flat = [item for sublist in y_pred for item in sublist]

# 评估结果
labels = list(tagger.labels())
print(classification_report(y_test_flat, y_pred_flat, labels=labels))


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 968275
Seconds required: 9.165

L-BFGS optimization
c1: 1.000000
c2: 1.000000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 20634888.553230
Feature norm: 1.000000
Error norm: 2063941.049914
Active features: 520209
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 41.047

***** Iteration #2 *****
Loss: 16331011.101322
Feature norm: 9.661943
Error norm: 2747549.152657
Active features: 516358
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 39.262

***** Iteration #3 *****
Loss: 13867029.670123
Feature norm: 9.300754
Error norm: 1999442.501933
Active features: 504511
Line search trials: 1
Line search step: 1.000000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 3. 训练CRF分词模型
- 数据来源：人民日报2014版分词数据集
- `train.data` , `test.data`, `verification.data`

In [42]:
# 加载训练和测试数据
train_seg_sents = load_data('train.data', split='\t')  
test_seg_sents = load_data('test.data', split='\t')  

# 提取特征和标签
X_seg_train = [sent2features(s) for s in train_seg_sents]
y_seg_train = [sent2labels(s) for s in train_seg_sents]
X_seg_test = [sent2features(s) for s in test_seg_sents]
y_seg_test = [sent2labels(s) for s in test_seg_sents]

# 打印数据集大小
print(f"训练集大小: {len(train_seg_sents)}")
print(f"测试集大小: {len(test_seg_sents)}")

# 训练CRF分词模型
seg_trainer = pycrfsuite.Trainer(verbose=True)
for xseq, yseq in zip(X_seg_train, y_seg_train):
    seg_trainer.append(xseq, yseq)
    
seg_trainer.set_params({
    'c1': 1.0,  # L1 penalty
    'c2': 1.0,  # L2 penalty
    'max_iterations': 100,  # max number of iterations
    'feature.possible_transitions': True
})
seg_trainer.train('crf_seg.model')

# 加载模型
seg_tagger = pycrfsuite.Tagger()
seg_tagger.open('crf_seg.model')

# 预测并评估
y_seg_pred = [seg_tagger.tag(xseq) for xseq in X_seg_test]

# 将y_test和y_pred展平
y_seg_test_flat = [item for sublist in y_seg_test for item in sublist]
y_seg_pred_flat = [item for sublist in y_seg_pred for item in sublist]

# 评估结果
seg_labels = list(seg_tagger.labels())
print(classification_report(y_seg_test_flat, y_seg_pred_flat, labels=seg_labels))

训练集大小: 103466
测试集大小: 47884
Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 75084
Seconds required: 3.191

L-BFGS optimization
c1: 1.000000
c2: 1.000000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 4814462.003258
Feature norm: 1.000000
Error norm: 1064403.721610
Active features: 66363
Line search trials: 1
Line search step: 0.000001
Seconds required for this iteration: 2.555

***** Iteration #2 *****
Loss: 3362928.436084
Feature norm: 3.137351
Error norm: 669948.994075
Active features: 64356
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 1.299

***** Iteration #3 *****
Loss: 2703609.839371
Feature norm: 4.511968
Error norm: 774726.829917
Active features: 65977
Line search trials: 1
Line searc