In [1]:
# Notebook-wide configuration; every later cell reads these three names.
#
# Language: dataset to use. It selects the data directory "../NER/<Language>"
# and is embedded in the saved model and prediction file names.
Language = "English"
# Language = "Chinese"  # alternative dataset; swap with the line above
# mode: forwarded unchanged to data_process(..., mode=mode).
mode = "train"
# param_num: preset selector. It is passed to sent2features and picks the CRF
# hyper-parameters (0 = L-BFGS, 1 = averaged perceptron, 2 = not implemented).
param_num = 1

In [2]:
from pathlib import Path
import sys

sys.path.append(str(Path.cwd().parent))

from Part1.dataprocess import data_process, set_log, combine_data
from sklearn_crf import sent2features


# Set up logging through the project helper.
# NOTE(review): the meaning of the None argument is defined in
# Part1.dataprocess.set_log — confirm there.
set_log(None)

# Load the three dataset splits for the selected language.
train_data, valid_data, test_data = data_process(f"../NER/{Language}", mode=mode)


def _to_features_and_labels(dataset):
    """Split (sentence, label) pairs into parallel feature and label lists.

    Each sentence is converted with ``sent2features`` using the notebook-level
    ``Language`` and ``param_num`` settings; labels are passed through as-is.
    """
    features, labels = [], []
    for sentence, label in dataset:
        features.append(sent2features(sentence, Language, param_num))
        labels.append(label)
    return features, labels


x_train, y_train = _to_features_and_labels(train_data)
x_valid, y_valid = _to_features_and_labels(valid_data)

2023-11-26 17:56:22,294 P19552 INFO train dataset size: 14041
2023-11-26 17:56:22,295 P19552 INFO valid dataset size: 3250


In [3]:
import pickle
from sklearn_crfsuite import CRF

# --- 1. Build the CRF for the selected hyper-parameter preset ----------------
# Every branch either assigns ``crf_model`` or raises. Without the final
# ``else`` an unknown ``param_num`` would fall through and either crash with a
# NameError or, worse, silently refit a stale ``crf_model`` left in the kernel
# by an earlier run and save it under the wrong file name.
if param_num == 0:
    # L-BFGS training; c1 / c2 are the L1 / L2 regularisation coefficients.
    crf_model = CRF(
        algorithm="lbfgs",
        c1=0.01,
        c2=0.01,
        max_iterations=200,
        all_possible_transitions=True,
        verbose=True,
    )
elif param_num == 1:
    # Averaged perceptron training (takes no c1 / c2 here).
    crf_model = CRF(
        algorithm="ap",
        max_iterations=300,
        all_possible_transitions=True,
        verbose=True,
    )
elif param_num == 2:
    raise NotImplementedError(
        f"param_num={param_num} is reserved but has no CRF configuration yet"
    )
else:
    raise ValueError(
        f"Unknown param_num={param_num!r}; expected 0 (lbfgs) or 1 (ap)"
    )

# --- 2. Train and persist the model ------------------------------------------
crf_model.fit(x_train, y_train)
# The file name carries both the language and the preset, so models trained
# with different presets do not overwrite each other.
with open(f"crf_{Language}{param_num}.pkl", "wb") as f:
    pickle.dump(crf_model, f)

# --- 3. Predict on the validation split and write the predictions ------------
y_pred = crf_model.predict(x_valid)
combined_data = combine_data([sentence for sentence, _ in valid_data], y_pred)

# NOTE(review): unlike the model file, this name does not include param_num,
# so predictions from different presets overwrite each other.
output_file = f"output_{Language}.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write(combined_data)

loading training data to CRFsuite: 100%|██████████| 14041/14041 [00:00<00:00, 15566.24it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 108083
Seconds required: 0.212

Averaged perceptron
max_iterations: 300
epsilon: 0.000000

Iter 1   time=0.09  loss=2258.15  feature_norm=828.10
Iter 2   time=0.08  loss=1480.59  feature_norm=1099.65
Iter 3   time=0.08  loss=1229.77  feature_norm=1297.52
Iter 4   time=0.08  loss=1068.97  feature_norm=1458.81
Iter 5   time=0.07  loss=954.78   feature_norm=1598.40
Iter 6   time=0.07  loss=825.02   feature_norm=1719.88
Iter 7   time=0.07  loss=770.32   feature_norm=1828.63
Iter 8   time=0.07  loss=719.65   feature_norm=1926.93
Iter 9   time=0.07  loss=681.09   feature_norm=2017.71
Iter 10  time=0.07  loss=651.48   feature_norm=2102.26
Iter 11  time=0.07  loss=582.99   feature_norm=2181.32
Iter 12  time=0.07  loss=572.69   feature_norm=2255.51
Iter 13  time=0.07  loss=535.43   feature_norm=2325.31
Iter 

In [4]:
from NER.check import check

# Score the predictions written to ``output_file`` against the gold
# validation annotations for the selected language.
validation_gold_path = f"../NER/{Language}/validation.txt"
report = check(language=Language, gold_path=validation_gold_path, my_path=output_file)

              precision    recall  f1-score   support

       B-PER     0.9062    0.9175    0.9118      1842
       I-PER     0.9516    0.9472    0.9494      1307
       B-ORG     0.8672    0.8233    0.8447      1341
       I-ORG     0.8666    0.8389    0.8525       751
       B-LOC     0.9226    0.9020    0.9122      1837
       I-LOC     0.9231    0.8405    0.8798       257
      B-MISC     0.9130    0.8427    0.8765       922
      I-MISC     0.9055    0.7197    0.8019       346

   micro avg     0.9086    0.8789    0.8935      8603
   macro avg     0.9070    0.8540    0.8786      8603
weighted avg     0.9083    0.8789    0.8928      8603

