In [13]:
# Index of the feature / hyper-parameter configuration. It is forwarded to
# sent2features and is part of the saved model's file name.
param_num = 1

# Run mode handed to data_process when the dataset is loaded.
mode = "test"

# Corpus to work on; it selects the ../NER/<Language> data directory and the
# model / output file names. The other value used so far is "English".
Language = "Chinese"

In [14]:
from pathlib import Path
import sys

sys.path.append(str(Path.cwd().parent))

from Part1.dataprocess import data_process, set_log, combine_data
from sklearn_crf import sent2features


def _to_features_and_labels(dataset):
    """Split (sentence, label) pairs into model inputs and gold labels.

    Every sentence is converted with ``sent2features`` using the notebook-level
    ``Language`` and ``param_num`` settings; labels are passed through as-is.

    Returns:
        A ``(features, labels)`` tuple of two lists aligned with ``dataset``.
    """
    features, labels = [], []
    for sentence, label in dataset:
        features.append(sent2features(sentence, Language, param_num))
        labels.append(label)
    return features, labels


# NOTE(review): set_log(None) presumably sets up logging without a log file —
# confirm against Part1.dataprocess.
set_log(None)
train_data, valid_data, test_data = data_process(f"../NER/{Language}", mode=mode)

x_train, y_train = _to_features_and_labels(train_data)
x_valid, y_valid = _to_features_and_labels(valid_data)
x_test, y_test = _to_features_and_labels(test_data)

2023-11-28 21:16:24,336 P31300 INFO train dataset size: 3820
2023-11-28 21:16:24,336 P31300 INFO valid dataset size: 462


2023-11-28 21:16:24,337 P31300 INFO test dataset size: 476


In [15]:
import pickle
from sklearn_crfsuite import CRF


# Hyper-parameters behind each saved model, keyed by ``param_num``.
# This is a reference table only: the cell evaluates a model restored from
# disk, so instantiating a fresh (untrained) CRF here would be discarded
# immediately by the pickle.load below.
crf_configs = {
    0: dict(
        algorithm="lbfgs",
        c1=0.01,
        c2=0.01,
        max_iterations=200,
        all_possible_transitions=True,
        verbose=True,
    ),
    1: dict(
        algorithm="ap",
        max_iterations=300,
        all_possible_transitions=True,
        verbose=True,
    ),
}

# Fail fast for configurations that were never defined (param_num == 2 raised
# NotImplementedError before; other unknown values used to fall through).
if param_num not in crf_configs:
    raise NotImplementedError(
        f"no CRF configuration is defined for param_num={param_num}"
    )

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load model files that were produced by this project.
model_file = f"crf_{Language}{param_num}.pkl"
with open(model_file, "rb") as f:
    crf_model = pickle.load(f)

# Predict one label sequence per test sentence and merge the predictions with
# the original sentences via combine_data.
y_pred = crf_model.predict(x_test)
combined_data = combine_data([sentence for sentence, _ in test_data], y_pred)

# Persist the predictions; output_file is reused by the evaluation cell.
output_file = f"output_{Language}.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write(combined_data)

In [16]:
from NER.check import check

# Score the predictions written to output_file against the gold test
# annotations of the selected language.
gold_file = f"../NER/{Language}/test.txt"
report = check(language=Language, gold_path=gold_file, my_path=output_file)

              precision    recall  f1-score   support

      B-NAME     0.9649    0.9821    0.9735       112
      M-NAME     0.9518    0.9634    0.9576        82
      E-NAME     0.9735    0.9821    0.9778       112
      S-NAME     0.0000    0.0000    0.0000         0
      B-CONT     1.0000    1.0000    1.0000        28
      M-CONT     1.0000    1.0000    1.0000        53
      E-CONT     1.0000    1.0000    1.0000        28
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9732    0.9732    0.9732       112
       M-EDU     0.9609    0.9609    0.9609       179
       E-EDU     0.9821    0.9821    0.9821       112
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.9365    0.9390    0.9377       770
     M-TITLE     0.9544    0.9037    0.9283      1921
     E-TITLE     0.9818    0.9831    0.9825       770
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.9299    0.9620    0.9457       552
       M-ORG     0.9384    