## Named Entity Recognition using Conditional Random Fields (CRF)

In [None]:
!pip install sklearn_crfsuite

In [None]:
!pip install datasets

In [None]:
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from datasets import load_dataset
import os

In [None]:
# Load spaCy for PoS tagging and rule-based NER
nlp = spacy.load('en_core_web_sm')


In [None]:
# Function to load WNUT-17 from files (fallback)
def load_wnut17_file(file_path):
    sentences = []
    current_sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):
                parts = line.split('\t')
                if len(parts) == 2:  # word, ner
                    current_sentence.append((parts[0], parts[1]))
            elif current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
        if current_sentence:
            sentences.append(current_sentence)
    return sentences


In [None]:
# Add PoS tags to WNUT-17 data using spaCy
def add_pos_tags(sentences):
    tagged_sentences = []
    for sent in sentences:
        words = [word for word, _ in sent]
        doc = nlp(' '.join(words))
        tagged_sent = [(word, token.pos_, ner) for (word, ner), token in zip(sent, doc)]
        tagged_sentences.append(tagged_sent)
    return tagged_sentences

In [None]:
# Feature extraction
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features

In [None]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for _, _, label in sent]

In [None]:
# Load WNUT-17 dataset
try:
    wnut = load_dataset("wnut_17")
    train_sents = [[(token, tag) for token, tag in zip(sent['tokens'], sent['ner_tags'])] for sent in wnut['train']][:1000]
    test_sents = [[(token, tag) for token, tag in zip(sent['tokens'], sent['ner_tags'])] for sent in wnut['test']][:200]
    print("Loaded WNUT-17 from Hugging Face")
except Exception as e:
    print(f"Failed to load WNUT-17 from Hugging Face: {e}")
    # Fallback: Load from files
    train_file = "wnut_17.train.conll"
    test_file = "wnut_17.test.conll"
    if os.path.exists(train_file) and os.path.exists(test_file):
        train_sents = load_wnut17_file(train_file)[:1000]
        test_sents = load_wnut17_file(test_file)[:200]
        print("Loaded WNUT-17 from files")
    else:
        raise FileNotFoundError("Download wnut_17.train.conll and wnut_17.test.conll from "
                               "https://github.com/leondz/wnut17 and place them in the project directory.")


Loaded WNUT-17 from Hugging Face


In [None]:
# Convert NER tags to string labels
ner_tags = ['O', 'B-person', 'I-person', 'B-organization', 'I-organization', 'B-location', 'I-location',
            'B-product', 'I-product', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group']
train_sents = [[(word, ner_tags[tag]) for word, tag in sent] for sent in train_sents]
test_sents = [[(word, ner_tags[tag]) for word, tag in sent] for sent in test_sents]


In [None]:
# Add PoS tags
train_sents = add_pos_tags(train_sents)
test_sents = add_pos_tags(test_sents)

In [None]:
# Prepare data
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [None]:

# Train CRF
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100)
crf.fit(X_train, y_train)


In [None]:
# Predict and evaluate CRF
y_pred = crf.predict(X_test)
print("\nCRF Metrics:")
labels = ['O', 'B-person', 'I-person', 'B-organization', 'I-organization', 'B-location', 'I-location']
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))



CRF Metrics:
                precision    recall  f1-score   support

             O       0.94      1.00      0.97      2706
      B-person       0.00      0.00      0.00        10
      I-person       0.00      0.00      0.00         4
B-organization       0.00      0.00      0.00         7
I-organization       0.00      0.00      0.00         7
    B-location       0.00      0.00      0.00        42
    I-location       0.00      0.00      0.00        16

     micro avg       0.93      0.97      0.95      2792
     macro avg       0.13      0.14      0.14      2792
  weighted avg       0.91      0.97      0.94      2792



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Rule-based NER with spaCy
def spacy_ner(sent):
    words = [token for token, _, _ in sent]
    doc = nlp(' '.join(words))
    spacy_labels = ['O'] * len(sent)
    for ent in doc.ents:
        # Find the starting token index by matching words
        ent_words = ent.text.split()
        for i, (token, pos, _) in enumerate(sent):
            if token == ent_words[0] and i < len(sent):
                ent_start = i
                for j, ent_token in enumerate(ent_words):
                    if i + j < len(sent) and sent[i + j][0] == ent_token:
                        label = ent.label_
                        if label == 'PERSON':
                            spacy_labels[i + j] = 'B-person' if j == 0 else 'I-person'
                        elif label == 'ORG':
                            spacy_labels[i + j] = 'B-organization' if j == 0 else 'I-organization'
                        elif label == 'GPE':
                            spacy_labels[i + j] = 'B-location' if j == 0 else 'I-location'
                break
    return spacy_labels

In [None]:
# Evaluate spaCy
spacy_preds = [spacy_ner(sent) for sent in test_sents]
print("\nSpaCy Metrics:")
flat_y_test = [label for sent in y_test for label in sent]
flat_spacy_preds = [label for sent in spacy_preds for label in sent]
print(classification_report(flat_y_test, flat_spacy_preds, labels=labels))


SpaCy Metrics:
                precision    recall  f1-score   support

             O       0.96      0.98      0.97      2706
      B-person       0.00      0.00      0.00        10
      I-person       0.00      0.00      0.00         4
B-organization       0.02      0.14      0.03         7
I-organization       0.03      0.14      0.05         7
    B-location       0.19      0.14      0.16        42
    I-location       0.00      0.00      0.00        16

     micro avg       0.91      0.96      0.93      2792
     macro avg       0.17      0.20      0.17      2792
  weighted avg       0.94      0.96      0.95      2792



In [None]:
# Sample input/output
sample_sent = train_sents[0]
sample_features = sent2features(sample_sent)
sample_pred = crf.predict([sample_features])[0]
print("\nSample Input:", [token for token, _, _ in sample_sent])
print("Predicted Tags:", sample_pred)


Sample Input: ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
Predicted Tags: ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B-product'
 'I-product' 'I-product' 'O' 'B-product' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O']
