### CRF

In [1]:
# # install crf and nltk in python if not installed
# !pip install python-crfsuite
# !pip install nltk
# !pip install lxml
# !pip install html5lib

In [2]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read data file and parse the XML
with codecs.open("reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

docs = []
for elem in soup.find_all("document"):
    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        if type(c) == Tag:
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word
            for w in c.text.split(" "):
                if len(w) > 0:
                    texts.append((w, label))
    docs.append(texts)
    #Prepare labels as "N" representing part of a named entity and "I" for irrelevant word

docs[0]



[('Paxar', 'N'),
 ('Corp', 'N'),
 ('said', 'I'),
 ('it', 'I'),
 ('has', 'I'),
 ('acquired', 'I'),
 ('Thermo-Print', 'N'),
 ('GmbH', 'N'),
 ('of', 'I'),
 ('Lohn', 'N'),
 (',', 'I'),
 ('West', 'N'),
 ('Germany', 'N'),
 (',', 'I'),
 ('a', 'I'),
 ('distributor', 'I'),
 ('of', 'I'),
 ('Paxar', 'N'),
 ('products,', 'I'),
 ('for', 'I'),
 ('undisclosed', 'I'),
 ('terms.', 'I')]

In [3]:
import nltk
nltk.download('averaged_perceptron_tagger')

data = []
for i, doc in enumerate(docs):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
data[0]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Savi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Paxar', 'NNP', 'N'),
 ('Corp', 'NNP', 'N'),
 ('said', 'VBD', 'I'),
 ('it', 'PRP', 'I'),
 ('has', 'VBZ', 'I'),
 ('acquired', 'VBN', 'I'),
 ('Thermo-Print', 'NNP', 'N'),
 ('GmbH', 'NNP', 'N'),
 ('of', 'IN', 'I'),
 ('Lohn', 'NNP', 'N'),
 (',', ',', 'I'),
 ('West', 'NNP', 'N'),
 ('Germany', 'NNP', 'N'),
 (',', ',', 'I'),
 ('a', 'DT', 'I'),
 ('distributor', 'NN', 'I'),
 ('of', 'IN', 'I'),
 ('Paxar', 'NNP', 'N'),
 ('products,', 'NN', 'I'),
 ('for', 'IN', 'I'),
 ('undisclosed', 'JJ', 'I'),
 ('terms.', 'NN', 'I')]

In [4]:
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [5]:
from sklearn.model_selection import train_test_split

# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 12931
Seconds required: 0.058

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5427.853159
Feature norm: 1.000000
Error norm: 5979.287320
Active features: 12511
Line search trials: 1
Line search step: 0.000046
Seconds required for this iteration: 0.016

***** Iteration #2 *****
Loss: 4337.099677
Feature norm: 0.840684
Error norm: 5346.231302
Active features: 12543
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #3 *****
Loss: 3846.403885
Feature norm: 0.815975
Error norm: 11708.398549
Active features: 8637
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #51 *****
Loss: 219.113595
Feature norm: 41.512739
Error norm: 7.222340
Active features: 2185
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #52 *****
Loss: 218.493207
Feature norm: 41.751239
Error norm: 20.736111
Active features: 2129
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #53 *****
Loss: 218.044768
Feature norm: 41.894721
Error norm: 19.489186
Active features: 2096
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #54 *****
Loss: 217.697238
Feature norm: 42.041961
Error norm: 12.454193
Active features: 2068
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #55 *****
Loss: 217.325900
Feature norm: 42.162485
Error norm: 12.146059
Active features: 2059
Line search trials: 1
Line search step: 1.000000
Seconds required for this

***** Iteration #119 *****
Loss: 212.135890
Feature norm: 43.256186
Error norm: 9.808695
Active features: 1734
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #120 *****
Loss: 212.102099
Feature norm: 43.262005
Error norm: 5.744936
Active features: 1738
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #121 *****
Loss: 212.088378
Feature norm: 43.272397
Error norm: 8.187757
Active features: 1736
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #122 *****
Loss: 212.061066
Feature norm: 43.278413
Error norm: 5.653767
Active features: 1737
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #123 *****
Loss: 212.049741
Feature norm: 43.275676
Error norm: 8.311550
Active features: 1741
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #179 *****
Loss: 211.311460
Feature norm: 43.575270
Error norm: 3.500185
Active features: 1658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #180 *****
Loss: 211.306828
Feature norm: 43.582839
Error norm: 5.347658
Active features: 1658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #181 *****
Loss: 211.297531
Feature norm: 43.586942
Error norm: 3.377108
Active features: 1662
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #182 *****
Loss: 211.292899
Feature norm: 43.593504
Error norm: 5.149310
Active features: 1664
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #183 *****
Loss: 211.283403
Feature norm: 43.597933
Error norm: 3.976709
Active features: 1664
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [7]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 12
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))

helen (N)
of (I)
troy (N)
corp (N)
said (I)
it (I)
filed (I)
with (I)
the (I)
securities (N)
and (N)
exchange (N)
commission (N)
a (I)
registration (I)
statement (I)
covering (I)
a (I)
20 (I)
mln (I)
dlr (I)
issue (I)
of (I)
covertible (I)
subordinated (I)
debentures (I)
due (I)
2007. (I)
proceeds (I)
will (I)
be (I)
used (I)
for (I)
general (I)
corporate (I)
purposes, (I)
including (I)
possible (I)
repayment (I)
of (I)
bank (I)
debt, (I)
product (I)
development (I)
and (I)
possible (I)
acquisitions, (I)
helen (N)
of (I)
troy (N)
said. (I)
the (I)
company (I)
named (I)
drexel (N)
burnham (N)
lambert (N)
inc (N)
as (I)
sole (I)
underwriter (I)
of (I)
the (I)
offering. (I)


In [8]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

              precision    recall  f1-score   support

           I       0.98      0.99      0.99      3732
           N       0.89      0.86      0.88       416

    accuracy                           0.98      4148
   macro avg       0.94      0.93      0.93      4148
weighted avg       0.98      0.98      0.98      4148



In [9]:
# Set the first set of hyperparameters
trainer.set_params({
    'c1': 0.1,
    'c2': 0.01,
    'max_iterations': 200,
    'feature.possible_transitions': True,
})

# Train the model with the new hyperparameters
trainer.train('crf_model_1.model')

# Load and evaluate the model
tagger = pycrfsuite.Tagger()
tagger.open('crf_model_1.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Compute and print the classification report
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 12931
Seconds required: 0.064

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5427.853159
Feature norm: 1.000000
Error norm: 5979.287320
Active features: 12511
Line search trials: 1
Line search step: 0.000046
Seconds required for this iteration: 0.017

***** Iteration #2 *****
Loss: 4337.099677
Feature norm: 0.840684
Error norm: 5346.231302
Active features: 12543
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #3 *****
Loss: 3846.403885
Feature norm: 0.815975
Error norm: 11708.398549
Active features: 8637
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #48 *****
Loss: 220.934094
Feature norm: 41.245753
Error norm: 13.710706
Active features: 2304
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #49 *****
Loss: 220.245252
Feature norm: 41.268366
Error norm: 19.173781
Active features: 2241
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #50 *****
Loss: 219.683884
Feature norm: 41.408840
Error norm: 4.638756
Active features: 2224
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #51 *****
Loss: 219.113595
Feature norm: 41.512739
Error norm: 7.222340
Active features: 2185
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #52 *****
Loss: 218.493207
Feature norm: 41.751239
Error norm: 20.736111
Active features: 2129
Line search trials: 1
Line search step: 1.000000
Seconds required for this 

***** Iteration #92 *****
Loss: 212.963344
Feature norm: 42.965779
Error norm: 2.829985
Active features: 1773
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #93 *****
Loss: 212.927451
Feature norm: 42.984755
Error norm: 5.324860
Active features: 1765
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #94 *****
Loss: 212.909866
Feature norm: 42.981908
Error norm: 12.823984
Active features: 1766
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #95 *****
Loss: 212.855533
Feature norm: 43.006177
Error norm: 3.025217
Active features: 1774
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #96 *****
Loss: 212.823727
Feature norm: 43.023259
Error norm: 7.160203
Active features: 1773
Line search trials: 1
Line search step: 1.000000
Seconds required for this it

***** Iteration #136 *****
Loss: 211.792105
Feature norm: 43.357564
Error norm: 8.494735
Active features: 1714
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #137 *****
Loss: 211.769056
Feature norm: 43.366408
Error norm: 4.765600
Active features: 1714
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #138 *****
Loss: 211.763376
Feature norm: 43.375036
Error norm: 9.518544
Active features: 1712
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #139 *****
Loss: 211.738060
Feature norm: 43.384038
Error norm: 6.007385
Active features: 1712
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #140 *****
Loss: 211.733572
Feature norm: 43.396251
Error norm: 10.117921
Active features: 1706
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #194 *****
Loss: 211.207190
Feature norm: 43.662856
Error norm: 4.178583
Active features: 1662
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #195 *****
Loss: 211.201850
Feature norm: 43.667361
Error norm: 3.747798
Active features: 1658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #196 *****
Loss: 211.198267
Feature norm: 43.671271
Error norm: 4.672341
Active features: 1660
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #197 *****
Loss: 211.191847
Feature norm: 43.677601
Error norm: 3.309217
Active features: 1658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #198 *****
Loss: 211.188855
Feature norm: 43.681043
Error norm: 4.825530
Active features: 1655
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [10]:
# Set the second set of hyperparameters
trainer.set_params({
    'c1': 0.5,      # Change L1 regularization coefficient
    'c2': 0.05,     # Change L2 regularization coefficient
    'max_iterations': 300,  # Increase the number of training epochs
    'feature.possible_transitions': True,
})

# Train the model with the new hyperparameters
trainer.train('crf_model_2.model')

# Load and evaluate the model
tagger = pycrfsuite.Tagger()
tagger.open('crf_model_2.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Compute and print the classification report
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 12931
Seconds required: 0.062

L-BFGS optimization
c1: 0.500000
c2: 0.050000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5431.019004
Feature norm: 1.000000
Error norm: 5981.158259
Active features: 5499
Line search trials: 1
Line search step: 0.000046
Seconds required for this iteration: 0.016

***** Iteration #2 *****
Loss: 4339.667595
Feature norm: 0.840532
Error norm: 5346.601280
Active features: 6995
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #3 *****
Loss: 3851.400914
Feature norm: 0.814680
Error norm: 11711.073121
Active features: 3910
Line search trials: 2
Line search step: 0.500000
Seconds required for this it

***** Iteration #42 *****
Loss: 564.158637
Feature norm: 19.844616
Error norm: 45.327513
Active features: 1397
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.011

***** Iteration #43 *****
Loss: 562.044177
Feature norm: 20.023014
Error norm: 23.354632
Active features: 1346
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #44 *****
Loss: 560.514722
Feature norm: 20.241558
Error norm: 58.372011
Active features: 1306
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #45 *****
Loss: 559.335347
Feature norm: 20.219065
Error norm: 10.406000
Active features: 1303
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #46 *****
Loss: 557.926478
Feature norm: 20.246539
Error norm: 16.156644
Active features: 1282
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #88 *****
Loss: 547.420172
Feature norm: 20.640022
Error norm: 7.247356
Active features: 1024
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.014

***** Iteration #89 *****
Loss: 547.402636
Feature norm: 20.640932
Error norm: 16.821978
Active features: 1025
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #90 *****
Loss: 547.367486
Feature norm: 20.638593
Error norm: 18.858745
Active features: 1026
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #91 *****
Loss: 547.344546
Feature norm: 20.639087
Error norm: 23.792935
Active features: 1022
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #92 *****
Loss: 547.298204
Feature norm: 20.635914
Error norm: 22.687013
Active features: 1024
Line search trials: 1
Line search step: 1.000000
Seconds required for this

***** Iteration #140 *****
Loss: 546.165450
Feature norm: 20.931659
Error norm: 13.894530
Active features: 994
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #141 *****
Loss: 546.148573
Feature norm: 20.936363
Error norm: 3.185443
Active features: 993
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #142 *****
Loss: 546.141954
Feature norm: 20.937716
Error norm: 7.266713
Active features: 998
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #143 *****
Loss: 546.132313
Feature norm: 20.938490
Error norm: 2.978796
Active features: 996
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #144 *****
Loss: 546.121643
Feature norm: 20.939631
Error norm: 6.334256
Active features: 999
Line search trials: 1
Line search step: 1.000000
Seconds required for this it

***** Iteration #187 *****
Loss: 545.850248
Feature norm: 21.023698
Error norm: 2.957299
Active features: 989
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.017

***** Iteration #188 *****
Loss: 545.846214
Feature norm: 21.026254
Error norm: 5.907062
Active features: 989
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.017

***** Iteration #189 *****
Loss: 545.844559
Feature norm: 21.034517
Error norm: 9.451134
Active features: 985
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #190 *****
Loss: 545.836006
Feature norm: 21.040065
Error norm: 8.016725
Active features: 983
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #191 *****
Loss: 545.830501
Feature norm: 21.044093
Error norm: 2.381506
Active features: 983
Line search trials: 1
Line search step: 1.000000
Seconds required for this ite

***** Iteration #232 *****
Loss: 545.716053
Feature norm: 21.089997
Error norm: 5.947985
Active features: 973
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.015

***** Iteration #233 *****
Loss: 545.712597
Feature norm: 21.091474
Error norm: 2.492615
Active features: 973
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.015

***** Iteration #234 *****
Loss: 545.711112
Feature norm: 21.092034
Error norm: 4.964218
Active features: 975
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.015

***** Iteration #235 *****
Loss: 545.708353
Feature norm: 21.093288
Error norm: 2.350722
Active features: 977
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.019

***** Iteration #236 *****
Loss: 545.707190
Feature norm: 21.094144
Error norm: 5.584628
Active features: 977
Line search trials: 2
Line search step: 0.500000
Seconds required for this ite

***** Iteration #273 *****
Loss: 545.644831
Feature norm: 21.120179
Error norm: 1.955797
Active features: 967
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.015

***** Iteration #274 *****
Loss: 545.643595
Feature norm: 21.120310
Error norm: 2.438706
Active features: 967
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 0.023

***** Iteration #275 *****
Loss: 545.643384
Feature norm: 21.122076
Error norm: 6.560129
Active features: 967
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #276 *****
Loss: 545.637739
Feature norm: 21.121993
Error norm: 1.950380
Active features: 967
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.015

***** Iteration #277 *****
Loss: 545.637681
Feature norm: 21.122538
Error norm: 4.531704
Active features: 967
Line search trials: 2
Line search step: 0.500000
Seconds required for this ite

In [11]:
# Set the third set of hyperparameters
trainer.set_params({
    'c1': 0.1,
    'c2': 0.01,
    'max_iterations': 200,
    'feature.possible_transitions': True,
})

# Train the model with the new hyperparameters
trainer.train('crf_model_3.model')

# Load and evaluate the model
tagger = pycrfsuite.Tagger()
tagger.open('crf_model_3.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Compute and print the classification report
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 12931
Seconds required: 0.057

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5427.853159
Feature norm: 1.000000
Error norm: 5979.287320
Active features: 12511
Line search trials: 1
Line search step: 0.000046
Seconds required for this iteration: 0.015

***** Iteration #2 *****
Loss: 4337.099677
Feature norm: 0.840684
Error norm: 5346.231302
Active features: 12543
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #3 *****
Loss: 3846.403885
Feature norm: 0.815975
Error norm: 11708.398549
Active features: 8637
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #43 *****
Loss: 226.617174
Feature norm: 40.191815
Error norm: 35.641101
Active features: 2598
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #44 *****
Loss: 224.862525
Feature norm: 40.447207
Error norm: 20.309378
Active features: 2512
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #45 *****
Loss: 223.797209
Feature norm: 40.630199
Error norm: 14.877691
Active features: 2466
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #46 *****
Loss: 222.755920
Feature norm: 40.951965
Error norm: 11.868643
Active features: 2409
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #47 *****
Loss: 221.864172
Feature norm: 41.047961
Error norm: 17.984719
Active features: 2381
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #111 *****
Loss: 212.312286
Feature norm: 43.176423
Error norm: 9.192326
Active features: 1753
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #112 *****
Loss: 212.280095
Feature norm: 43.180976
Error norm: 5.582030
Active features: 1753
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #113 *****
Loss: 212.263320
Feature norm: 43.192253
Error norm: 7.904086
Active features: 1753
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #114 *****
Loss: 212.233866
Feature norm: 43.196564
Error norm: 4.610047
Active features: 1753
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #115 *****
Loss: 212.219872
Feature norm: 43.216136
Error norm: 8.066868
Active features: 1751
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #179 *****
Loss: 211.311460
Feature norm: 43.575270
Error norm: 3.500185
Active features: 1658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #180 *****
Loss: 211.306828
Feature norm: 43.582839
Error norm: 5.347658
Active features: 1658
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #181 *****
Loss: 211.297531
Feature norm: 43.586942
Error norm: 3.377108
Active features: 1662
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #182 *****
Loss: 211.292899
Feature norm: 43.593504
Error norm: 5.149310
Active features: 1664
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #183 *****
Loss: 211.283403
Feature norm: 43.597933
Error norm: 3.976709
Active features: 1664
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [13]:
# Set the first set of hyperparameters
trainer.set_params({
    'c1': 0.1,
    'c2': 0.01,
    'max_iterations': 200,
    'feature.possible_transitions': True,
    'calibration.eta': 0.001,  # Learning rate
})

# Train the model with the new hyperparameters
trainer.train('crf_model_1.model')

# Load and evaluate the model
tagger = pycrfsuite.Tagger()
tagger.open('crf_model_1.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Compute and print the classification report
# You can also repeat this process for the other hyperparameter sets
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))


ValueError: Parameter not found: calibration.eta = 0.001