In [1]:
# Dataset language: selects the ../NER/<Language> data folder and is embedded
# in the saved model and output file names.
# Language = "English"
Language = "Chinese"
# Run mode: "train" fits a model and predicts on the validation set;
# "test" loads the previously pickled model and predicts on the test set.
mode = "train"
# mode = "test"
# Index of the CRF configuration to use; it is also passed to sent2features.
param_num = 1

# To run a test, first put test.txt into the corresponding data folder, then change mode to "test" and re-run.

In [2]:
from pathlib import Path
import sys

sys.path.append(str(Path.cwd().parent))

from Part1.dataprocess import data_process, set_log, combine_data
from sklearn_crf import sent2features


set_log(None)
train_data, valid_data, test_data = data_process(f"../NER/{Language}", mode=mode)


def _split_features_and_labels(dataset):
    """Turn a list of (sentence, labels) pairs into (features, labels) lists.

    Features are produced by ``sent2features`` using the notebook-level
    ``param_num``; the label sequences are passed through unchanged.
    """
    features = [sent2features(tokens, param_num) for tokens, _ in dataset]
    labels = [tag_seq for _, tag_seq in dataset]
    return features, labels


x_train, y_train = _split_features_and_labels(train_data)
x_valid, y_valid = _split_features_and_labels(valid_data)
# The test split is only featurized when the notebook runs in test mode.
if mode == "test":
    x_test, y_test = _split_features_and_labels(test_data)

2023-11-26 17:25:24,355 P7004 INFO train dataset size: 3820
2023-11-26 17:25:24,356 P7004 INFO valid dataset size: 462


In [3]:
import pickle
from sklearn_crfsuite import CRF

# Fail fast on an unsupported mode. Previously neither branch ran, and the
# output file was opened (and truncated) before a NameError was raised.
if mode not in ("train", "test"):
    raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

# Configuration 2 is reserved but not implemented yet (in either mode).
if param_num == 2:
    raise NotImplementedError

# The fitted model is stored per language and per configuration.
model_path = f"crf_{Language}{param_num}.pkl"

if mode == "train":
    # Pick the hyperparameters for the selected configuration.
    if param_num == 0:
        # L-BFGS with light L1/L2 regularisation.
        crf_model = CRF(
            algorithm="lbfgs",
            c1=0.01,
            c2=0.01,
            max_iterations=200,
            all_possible_transitions=True,
            verbose=True,
        )
    elif param_num == 1:
        # Averaged perceptron.
        crf_model = CRF(
            algorithm="ap",
            max_iterations=300,
            all_possible_transitions=True,
            verbose=True,
        )
    else:
        # Without this guard an unknown value left crf_model undefined, or
        # silently reused a stale model from earlier kernel state.
        raise ValueError(f"unsupported param_num: {param_num!r}")

    crf_model.fit(x_train, y_train)
    with open(model_path, "wb") as f:
        pickle.dump(crf_model, f)

    # In train mode the model is evaluated on the validation split.
    eval_data, x_eval = valid_data, x_valid
else:
    # SECURITY: pickle.load can execute arbitrary code. Only load model files
    # produced by this notebook or obtained from a trusted source.
    with open(model_path, "rb") as f:
        crf_model = pickle.load(f)

    # In test mode the loaded model is applied to the test split.
    eval_data, x_eval = test_data, x_test


# Predict label sequences and merge them with the original sentences.
y_pred = crf_model.predict(x_eval)
combined_data = combine_data([sentence for sentence, _ in eval_data], y_pred)


# Write the predictions to a text file that is later compared against the gold data.
output_file = f"output_{Language}.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write(combined_data)

loading training data to CRFsuite: 100%|██████████| 3820/3820 [00:00<00:00, 9015.84it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 24925
Seconds required: 0.096

Averaged perceptron
max_iterations: 300
epsilon: 0.000000

Iter 1   time=0.14  loss=555.92   feature_norm=650.02
Iter 2   time=0.13  loss=291.97   feature_norm=797.79
Iter 3   time=0.13  loss=267.93   feature_norm=906.39
Iter 4   time=0.12  loss=244.72   feature_norm=1002.91
Iter 5   time=0.12  loss=227.90   feature_norm=1084.57
Iter 6   time=0.12  loss=194.15   feature_norm=1157.83
Iter 7   time=0.12  loss=186.78   feature_norm=1221.38
Iter 8   time=0.12  loss=171.89   feature_norm=1277.44
Iter 9   time=0.12  loss=148.34   feature_norm=1327.83
Iter 10  time=0.12  loss=158.30   feature_norm=1373.10
Iter 11  time=0.12  loss=151.62   feature_norm=1415.29
Iter 12  time=0.12  loss=157.66   feature_norm=1456.10
Iter 13  time=0.12  loss=138.66   feature_norm=1495.42
Iter 14 

In [4]:
from NER.check import check

# Choose the gold-standard file matching the current mode: the validation set
# after training, the test set when running in test mode.
if mode == "train":
    gold_path = f"../NER/{Language}/validation.txt"
elif mode == "test":
    gold_path = f"../NER/{Language}/test.txt"
else:
    gold_path = None

# Compare the written predictions against the gold file; any other mode
# performs no evaluation.
if gold_path is not None:
    report = check(
        language=Language,
        gold_path=gold_path,
        my_path=output_file,
    )

              precision    recall  f1-score   support

      B-NAME     0.9423    0.9608    0.9515       102
      M-NAME     0.9351    0.9600    0.9474        75
      E-NAME     0.9515    0.9608    0.9561       102
      S-NAME     1.0000    1.0000    1.0000         8
      B-CONT     1.0000    1.0000    1.0000        33
      M-CONT     1.0000    1.0000    1.0000        64
      E-CONT     1.0000    1.0000    1.0000        33
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9550    1.0000    0.9770       106
       M-EDU     0.9465    1.0000    0.9725       177
       E-EDU     0.9369    0.9811    0.9585       106
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.8929    0.8955    0.8942       689
     M-TITLE     0.8821    0.8905    0.8863      1479
     E-TITLE     0.9783    0.9826    0.9804       689
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.9363    0.9291    0.9327       522
       M-ORG     0.9314    