# Named Entity Recognition (NER)

In [5]:
import nltk
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Fetch the CoNLL-2002 corpus (nltk reports it as up to date when already
# present) and read the IOB-tagged sentences of its 'esp.train' file.
nltk.download('conll2002')
dataset = nltk.corpus.conll2002.iob_sents('esp.train')

# Hold out 20% of the sentences for evaluation; the fixed random_state keeps
# the split identical from run to run.
train_data, test_data = train_test_split(dataset, random_state=42, test_size=0.2)

# Report the size of each split, then show one raw sentence: a list of
# (token, POS tag, IOB label) triples.
for caption, subset in (("Training set size:", train_data), ("Test set size:", test_data)):
    print(caption, len(subset))
print("Sample data:\n", train_data[0])

[nltk_data] Downloading package conll2002 to /home/omar/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


Training set size: 6658
Test set size: 1665
Sample data:
 [('Así', 'RG', 'O'), ('lo', 'PP', 'O'), ('manifestó', 'VMI', 'O'), ('hoy', 'RG', 'O'), ('el', 'DA', 'O'), ('portavoz', 'NC', 'O'), ('del', 'SP', 'O'), ('Grupo', 'NC', 'B-ORG'), ('Parlamentario', 'AQ', 'I-ORG'), ('Socialista', 'AQ', 'I-ORG'), (',', 'Fc', 'O'), ('Jaime', 'VMI', 'B-PER'), ('González', 'AQ', 'I-PER'), (',', 'Fc', 'O'), ('después', 'RG', 'O'), ('del', 'SP', 'O'), ('encuentro', 'NC', 'O'), ('que', 'PR', 'O'), ('mantuvieron', 'VMI', 'O'), ('hoy', 'RG', 'O'), ('en', 'SP', 'O'), ('Astorga', 'VMN', 'B-LOC'), ('una', 'DI', 'O'), ('treintena', 'NC', 'O'), ('de', 'SP', 'O'), ('representantes', 'NC', 'O'), ('del', 'SP', 'O'), ('PSOE', 'NC', 'B-ORG'), ('en', 'SP', 'O'), ('las', 'DA', 'O'), ('Cortes', 'NC', 'B-ORG'), (',', 'Fc', 'O'), ('para', 'SP', 'O'), ('analizar', 'VMN', 'O'), ('la', 'DA', 'O'), ('actualidad', 'NC', 'O'), ('política', 'AQ', 'O'), ('en', 'SP', 'O'), ('la', 'DA', 'O'), ('Autonomía', 'NC', 'B-ORG'), ('.', 'Fp'

## 1. Feature Extraction for CRFs 🛠️
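For every token in a sentence, build a feature dictionary (lower-cased form, suffixes, casing and digit flags, POS tag, plus the word and POS tag of the neighbouring tokens) to feed into the CRF model.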

In [6]:
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of token records; index 0 of each record is the word
            and index 1 its POS tag (any further fields are not read here).
        i: Position in ``sent`` of the token to describe.

    Returns:
        dict: Features of the token itself (bias term, lower-cased form,
        3- and 2-character suffixes, upper/title/digit flags, POS tag and its
        first two characters), followed by the lower-cased word and POS tag
        of the previous token (or ``'BOS': True`` when there is none) and of
        the next token (or ``'EOS': True`` when there is none).
    """
    token, tag = sent[i][0], sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context: flag the sentence start, otherwise describe the previous token.
    if i <= 0:
        feats['BOS'] = True
    else:
        left_token, left_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = left_token.lower()
        feats['-1:postag'] = left_tag
        feats['-1:postag[:2]'] = left_tag[:2]

    # Right context: flag the sentence end, otherwise describe the next token.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        right_token, right_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = right_token.lower()
        feats['+1:postag'] = right_tag
        feats['+1:postag[:2]'] = right_tag[:2]

    return feats

def sent2features(sent):
    """Return the list of per-token feature dicts for one sentence.

    Calls ``word2features(sent, i)`` for each position ``i`` of ``sent``, in
    order, and collects the results.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third field) of every record in ``sent``, in order.

    Each record must unpack into exactly three fields
    ``(token, postag, label)``.
    """
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every record in ``sent``, in order.

    Each record must unpack into exactly three fields
    ``(token, postag, label)``.
    """
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

# Turn every sentence of each split into a sequence of feature dicts (X)
# and the matching sequence of IOB labels (y)
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))

X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

# Show the feature dict of the first token of the first training sentence
print("Sample features:\n", X_train[0][0])

Sample features:
 {'bias': 1.0, 'word.lower()': 'así', 'word[-3:]': 'Así', 'word[-2:]': 'sí', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'RG', 'postag[:2]': 'RG', 'BOS': True, '+1:word.lower()': 'lo', '+1:postag': 'PP', '+1:postag[:2]': 'PP'}


## 2. Training the CRF Model 🏋️‍♂️
Train a CRF model using the extracted features and labels.

In [7]:
# Hyper-parameters gathered in one place so they are easy to tune.
# Per the sklearn-crfsuite documentation, c1 and c2 are the L1 and L2
# regularisation coefficients, and all_possible_transitions also generates
# transition features for label pairs that never occur in the training data.
crf_params = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_params)

# Fit on the per-sentence feature sequences and their label sequences
crf.fit(X_train, y_train)

print("Model trained successfully.")

Model trained successfully.


## 3. Model Evaluation 📊
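Evaluate the CRF model on the held-out sentences using token-level precision, recall, and F1-score for each entity label, plus overall accuracy. The `O` (outside) label is excluded from the per-label report, but overall accuracy still counts `O` tokens, which is why it is much higher than the entity-level scores.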

In [8]:
# Tag every held-out sentence with the trained model
y_pred = crf.predict(X_test)

# Report on entity labels only: 'O' is dropped from the label list so it
# does not appear in (or get averaged into) the per-label report
labels = list(crf.classes_)
labels.remove('O')

print("Classification Report:\n")
report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=labels,
    digits=3,
)
print(report)

# Token-level accuracy over all labels; unlike the report above, this
# still counts 'O' tokens
accuracy = metrics.flat_accuracy_score(y_test, y_pred)
print("Overall Accuracy:", accuracy)

Classification Report:

              precision    recall  f1-score   support

       B-ORG      0.860     0.866     0.863      1444
       I-ORG      0.741     0.839     0.787       933
       B-PER      0.877     0.918     0.897       768
       I-PER      0.901     0.955     0.927       713
       B-LOC      0.826     0.810     0.818       995
      B-MISC      0.794     0.590     0.677       439
      I-MISC      0.743     0.574     0.648       705
       I-LOC      0.816     0.686     0.745       420

   micro avg      0.826     0.807     0.816      6417
   macro avg      0.820     0.780     0.795      6417
weighted avg      0.824     0.807     0.812      6417

Overall Accuracy: 0.9731709637522595


## 4. Model Analysis and Error Inspection 🔍
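Inspect what the model has learned: the highest- and lowest-weighted label transitions and state features. These weights show which cues the model relies on and hint at where it may misclassify entities.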

In [9]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    Args:
        trans_features: Iterable of ``((label_from, label_to), weight)``
            pairs; the weight is printed with six decimal places.
    """
    for transition, weight in trans_features:
        label_from, label_to = transition
        print(f"{label_from:6} -> {label_to:7} {weight:.6f}")

def print_state_features(state_features):
    """Print one ``label weight attribute`` line per state feature.

    Args:
        state_features: Iterable of ``((attr, label), weight)`` pairs; the
            weight is printed with six decimal places.
    """
    for feature, weight in state_features:
        attr, label = feature
        print(f"{label:7} {weight:.6f} {attr}")

# Transitions with the highest learned weight (not the most frequent ones).
# The full ranking is computed once and sliced at both ends.
print("Top likely transitions:")
transitions_ranked = Counter(crf.transition_features_).most_common()
print_transitions(transitions_ranked[:10])

# Transitions with the lowest learned weight
print("\nTop unlikely transitions:")
print_transitions(transitions_ranked[-10:])

# State features with the highest learned weight
print("\nTop positive features:")
state_features_ranked = Counter(crf.state_features_).most_common()
print_state_features(state_features_ranked[:10])

# State features with the lowest learned weight
print("\nTop negative features:")
print_state_features(state_features_ranked[-10:])

Top likely transitions:
I-MISC -> I-MISC  6.187372
B-ORG  -> I-ORG   5.530408
B-PER  -> I-PER   5.527766
I-ORG  -> I-ORG   5.368968
B-LOC  -> I-LOC   5.051907
I-LOC  -> I-LOC   4.919433
B-MISC -> I-MISC  4.807263
I-PER  -> I-PER   4.235491
O      -> O       3.691834
O      -> B-MISC  2.260971

Top unlikely transitions:
I-ORG  -> B-LOC   -3.044190
B-ORG  -> B-ORG   -3.232873
B-MISC -> B-MISC  -3.324286
I-PER  -> B-ORG   -3.418804
B-ORG  -> B-MISC  -3.576592
B-PER  -> B-PER   -4.054614
O      -> I-PER   -5.490260
O      -> I-MISC  -5.520934
O      -> I-LOC   -6.062081
O      -> I-ORG   -6.212764

Top positive features:
B-ORG   8.867643 word.lower():efe-cantabria
B-ORG   8.466314 word.lower():psoe-progresistas
O       5.326065 BOS
B-ORG   4.875931 word.lower():xfera
I-LOC   4.673665 -1:word.lower():calle
B-ORG   4.609254 word[-2:]:-e
B-MISC  4.588559 word.lower():diversia
B-LOC   4.557490 word.lower():líbano
B-ORG   4.531936 word.lower():telefónica
B-PER   4.350310 -1:word.lower():según



## Conclusion
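Conditional Random Fields (CRFs) are a strong baseline for named entity recognition because they combine per-token features with label-to-label transition weights when predicting a label sequence. By extracting hand-crafted features from the text and training on labeled data, we built a tagger that reaches a micro-averaged F1 of 0.816 on the entity labels of the held-out sentences (0.973 token accuracy when `O` tokens are included), with `MISC` entities remaining the hardest class to identify.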