In [2]:
# Use the %pip magic (not bare/shell pip) so the packages are installed into
# the environment of the running kernel. -q keeps the install log out of the notebook.
%pip install -q nltk scikit-learn



In [10]:
import nltk
import random
from nltk.corpus import conll2002
from nltk.tag import ClassifierBasedTagger
from nltk.classify import MaxentClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [4]:
# Fetch the NLTK resources into the local nltk_data directory; NLTK reports
# "already up-to-date" when a package is present.
# 'conll2002' supplies the Spanish NER corpus loaded further down.
# NOTE(review): 'punkt' (tokenizer models) is not used by any visible cell --
# confirm it is still needed.
nltk.download('conll2002')
nltk.download('punkt')

[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Feature Extraction Function

In [5]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Each item of ``sent`` is indexed with ``[0]`` to obtain the word string;
    any remaining fields of the item are ignored here.

    The result always holds a constant bias term plus the word's lowercase
    form, its last three and last two characters, and its upper/title/digit
    flags. The previous and next words contribute their lowercase form and
    title/upper flags under ``-1:`` and ``+1:`` keys. Where a neighbour is
    missing, ``BOS`` (no previous word) or ``EOS`` (no next word) is set to
    ``True`` instead.
    """
    token = sent[i][0]

    # Features of the current word itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context, or a beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        left = sent[i - 1][0]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right context, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        right = sent[i + 1][0]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats


# 2. Prepare Dataset

In [6]:
def prepare_dataset(tagged_sents):
    """Flatten tagged sentences into parallel per-token feature and label lists.

    For every token of every sentence in ``tagged_sents`` the features come
    from ``word2features(sentence, position)`` and the label is the token's
    element at index 2 (the named-entity tag).

    Returns a ``(features, labels)`` tuple of two lists of equal length, in
    sentence order and then token order.
    """
    feature_dicts, labels = [], []

    for sentence in tagged_sents:
        for position, token in enumerate(sentence):
            feature_dicts.append(word2features(sentence, position))
            labels.append(token[2])  # named-entity tag

    return feature_dicts, labels


# CoNLL-2002 Spanish splits: 'esp.train' is used for fitting, 'esp.testb' for evaluation.
train_sents, test_sents = (
    conll2002.iob_sents(fileid) for fileid in ('esp.train', 'esp.testb')
)

# Turn each split into per-token feature dicts and their entity tags.
X_train, y_train = prepare_dataset(train_sents)
X_test, y_test = prepare_dataset(test_sents)


# 3. Train Maximum Entropy Classifier

In [7]:
print("Training model... This may take few minutes.")

# MaxentClassifier.train takes a list of (featureset, label) pairs; the run is
# capped at 10 iterations of the 'iis' algorithm.
classifier = MaxentClassifier.train(
    [(feats, tag) for feats, tag in zip(X_train, y_train)],
    max_iter=10,
    algorithm='iis',
)

Training model... This may take few minutes.
  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.19722        0.876
             2          -0.21914        0.876
             3          -0.19771        0.881
             4          -0.17512        0.900
             5          -0.15658        0.919
             6          -0.14197        0.933
             7          -0.13038        0.943
             8          -0.12103        0.951
             9          -0.11334        0.956
         Final          -0.10691        0.960


# 4. Prediction

In [8]:
# Classify every test token independently, keeping the order of X_test.
y_pred = [classifier.classify(token_feats) for token_feats in X_test]

# 5. Evaluation Metrics

In [9]:
# Encode labels numerically.
# The encoder is fitted on the union of gold and predicted tags: fitting on
# y_test alone makes le.transform(y_pred) raise ValueError as soon as the
# classifier emits a tag that never occurs in the test labels. When every
# predicted tag also appears in y_test the encoding is unchanged.
le = LabelEncoder()
le.fit(list(y_test) + list(y_pred))
y_test_enc = le.transform(y_test)
y_pred_enc = le.transform(y_pred)

# Token-level accuracy. Most tokens carry the 'O' tag, so this number is
# dominated by the majority class; the per-class rows and the macro average
# in the report below are the better indicators of entity quality.
accuracy = accuracy_score(y_test_enc, y_pred_enc)

print("\nModel Evaluation Results")
print("="*40)
print(f"Accuracy: {accuracy:.4f}")
print("\nDetailed Classification Report:\n")
# le.classes_ lists exactly the tags present in y_test or y_pred, so
# target_names lines up one-to-one with the encoded labels in the report.
print(classification_report(y_test_enc, y_pred_enc, target_names=le.classes_))


Model Evaluation Results
Accuracy: 0.9416

Detailed Classification Report:

              precision    recall  f1-score   support

       B-LOC       0.74      0.63      0.68      1084
      B-MISC       0.74      0.26      0.38       339
       B-ORG       0.85      0.68      0.75      1400
       B-PER       0.91      0.71      0.80       735
       I-LOC       0.82      0.26      0.39       325
      I-MISC       0.68      0.09      0.16       557
       I-ORG       0.81      0.29      0.43      1104
       I-PER       0.87      0.82      0.85       634
           O       0.95      1.00      0.97     45355

    accuracy                           0.94     51533
   macro avg       0.82      0.53      0.60     51533
weighted avg       0.93      0.94      0.93     51533

