In [1]:
# Install the sklearn-crfsuite fork into the kernel's own environment (%pip, not !pip),
# pinned to the commit this notebook was last run against so re-runs are reproducible.
%pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git@675038761b4405f04691a83339d04903790e2b95

Collecting git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git
  Cloning https://github.com/MeMartijn/updated-sklearn-crfsuite.git to /tmp/pip-req-build-t25gm14m
  Running command git clone --filter=blob:none --quiet https://github.com/MeMartijn/updated-sklearn-crfsuite.git /tmp/pip-req-build-t25gm14m
  Resolved https://github.com/MeMartijn/updated-sklearn-crfsuite.git to commit 675038761b4405f04691a83339d04903790e2b95
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite==0.3.6)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sklearn-crfsuite
  Building wheel for sklearn-crfsuite (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn-crfsuite: filename=sklearn_crfsuite-0.3.6-py2.py3-none-any.whl si

In [2]:
# Fetch the CoNLL-2003 splits from a pinned commit of the dataset repository.
# -nc (no-clobber) skips files that already exist, so re-running this cell does not
# pile up test.txt.1, train.txt.1, ... copies next to the files the loaders read.
!wget -nc https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/test.txt
!wget -nc https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt
!wget -nc https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/valid.txt

--2024-03-30 15:53:14--  https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748093 (731K) [text/plain]
Saving to: ‘test.txt’


2024-03-30 15:53:15 (4.43 MB/s) - ‘test.txt’ saved [748093/748093]

--2024-03-30 15:53:15--  https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3283418 (3.1M) [text/plain]


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn_crfsuite import CRF, metrics

In [4]:
def load_conll_data(file_path):
    """Read a four-column, blank-line-separated CoNLL file into sentences.

    Every non-blank line must contain exactly four whitespace-separated
    columns (word, POS tag, chunk tag, NER tag). Blank lines mark sentence
    boundaries.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a non-empty list of
        ``(word, pos, chunk, ner)`` tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly four columns.
    """
    data = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if fields:
                word, pos, chunk, ner = fields
                sentence.append((word, pos, chunk, ner))
            elif sentence:
                # Only close a sentence that has tokens, so runs of blank
                # lines do not produce empty sentences.
                data.append(sentence)
                sentence = []
    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)
    return data

In [5]:
def word2features(sent, i):
    """Build the feature dict for the token at index ``i`` of ``sent``.

    Only the first element of each token tuple (the word) is used. The
    features describe the word itself (lower-cased form, 3- and 2-character
    suffixes, casing and digit flags) plus the lower-cased form and casing
    flags of the previous and next word when they exist.

    Args:
        sent: Sequence of token tuples whose first element is the word string.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to values. ``'BOS'`` is set when there is
        no previous token and ``'EOS'`` when there is no next token.
    """
    current = sent[i][0]

    features = {'bias': 1.0}
    features['word.lower()'] = current.lower()
    features['word[-3:]'] = current[-3:]
    features['word[-2:]'] = current[-2:]
    features['word.isupper()'] = current.isupper()
    features['word.istitle()'] = current.istitle()
    features['word.isdigit()'] = current.isdigit()

    # Left context: previous word, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        left = sent[i - 1][0]
        features['-1:word.lower()'] = left.lower()
        features['-1:word.istitle()'] = left.istitle()
        features['-1:word.isupper()'] = left.isupper()

    # Right context: next word, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        right = sent[i + 1][0]
        features['+1:word.lower()'] = right.lower()
        features['+1:word.istitle()'] = right.istitle()
        features['+1:word.isupper()'] = right.isupper()

    return features

def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the fourth element (the NER tag) of every 4-tuple token in ``sent``."""
    tags = []
    for _word, _pos, _chunk, ner_tag in sent:
        tags.append(ner_tag)
    return tags

In [6]:
# Parse the training and test splits from the working directory, in that order.
train_data, test_data = [
    load_conll_data(split_path) for split_path in ('train.txt', 'test.txt')
]

In [7]:
# Per sentence: X_* holds the list of token feature dicts, y_* the list of NER tags.
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))

X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

In [8]:
%%time
# Configure the CRF tagger. Per the sklearn-crfsuite docs, c1 and c2 are the
# L1 and L2 regularisation coefficients used with the L-BFGS trainer.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

# An AttributeError from fit() is reported instead of stopping the notebook.
# NOTE(review): presumably a workaround for sklearn-crfsuite incompatibilities
# with newer scikit-learn releases - confirm it is still needed with this fork.
try:
    crf.fit(X_train, y_train)
except AttributeError as e:
    print("Error occurred during training:", e)
else:
    print("Training successful!")

Training successful!
CPU times: user 30 s, sys: 230 ms, total: 30.2 s
Wall time: 30.3 s


In [9]:
# Every tag the model knows except the outside tag 'O', so the metrics below
# are averaged over entity tags only. Filtering (rather than list.remove)
# avoids a ValueError when the training data contained no 'O' tag.
labels = [label for label in crf.classes_ if label != 'O']
labels

['B-ORG', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']

Evaluation

In [10]:
# Predict tag sequences for the test sentences and score them with a
# weighted F1 restricted to the tags in `labels`.
y_pred = crf.predict(X_test)
f1 = metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average='weighted',
)
print(f"The F1 Score is {f1}")

The F1 Score is 0.8066155156394734


In [11]:
# Order tags by everything after the first character (e.g. "-LOC") and then by
# the first character, so the B-/I- tags of one entity type sit next to each other.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))
sorted_labels

['B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

In [12]:
from sklearn_crfsuite.utils import flatten

# Per-tag precision / recall / F1 on the test set, with rows in `sorted_labels` order.
print(
    metrics.flat_classification_report(
        y_test, y_pred, labels=sorted_labels, digits=3
    )
)

              precision    recall  f1-score   support

       B-LOC      0.857     0.829     0.843      1667
       I-LOC      0.785     0.712     0.747       257
      B-MISC      0.806     0.760     0.783       701
      I-MISC      0.639     0.645     0.642       214
       B-ORG      0.830     0.704     0.762      1660
       I-ORG      0.715     0.729     0.722       834
       B-PER      0.819     0.848     0.833      1616
       I-PER      0.852     0.955     0.900      1156

   micro avg      0.816     0.800     0.808      8105
   macro avg      0.788     0.773     0.779      8105
weighted avg      0.816     0.800     0.807      8105



In [13]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    Args:
        trans_features: Iterable of ``((label_from, label_to), weight)`` pairs.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        source, target = transition
        print(row_format % (source, target, weight))

# Show the 20 label-to-label transitions with the largest learned weights.
print("Top likely transitions:")
print_transitions(
    Counter(crf.transition_features_).most_common(20)
)

Top likely transitions:
B-ORG  -> I-ORG   6.590739
B-PER  -> I-PER   5.954990
I-ORG  -> I-ORG   5.506068
B-MISC -> I-MISC  4.819992
I-MISC -> I-MISC  4.812910
B-LOC  -> I-LOC   4.273021
I-PER  -> I-PER   3.835088
I-LOC  -> I-LOC   3.694272
O      -> O       2.582599
O      -> B-MISC  2.017460
O      -> B-PER   1.919751
O      -> B-LOC   1.811063
O      -> B-ORG   1.482501
B-ORG  -> O       0.243393
B-MISC -> O       0.020722
B-LOC  -> B-MISC  -0.287340
B-LOC  -> O       -0.328965
I-MISC -> B-MISC  -0.414566
B-MISC -> B-ORG   -0.438994
I-PER  -> O       -0.534442
