In [2]:
pip install sklearn-crfsuite


Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Collecting python-crfsuite>=0.9.7
  Downloading python_crfsuite-0.9.10-cp310-cp310-win_amd64.whl (155 kB)
     -------------------------------------- 155.5/155.5 kB 1.3 MB/s eta 0:00:00
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn-crfsuite-0.5.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
import re
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

# Sample training data
training_sentences = [
    "Apple is my favorite fruit",
    "I bought an Apple iPhone recently.",
    "President Trump is visiting India next year",
    "It is rumored Newton discovered gravity because of an apple",
    "Gravity is a movie starring Sandra Bullock",
    "Sandra Bullock is not one of my favorite actors, but she is really good as an actor",
    "Tom Cruise is an actor i would like to host at my home for dinner",
    "This sentence has no names in it.",
    "I have always dreamt of being on a cruise",
    "This course is really fantastic."
]

# Function to extract features from sentences
def sentence_features(sentence):
    words = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)
    features = []
    for i in range(len(words)):
        word = words[i]
        postag = pos_tags[i][1]
        features.append({
            'word': word,
            'postag': postag,
            'is_first': i == 0,
            'is_last': i == len(words) - 1,
            'is_capitalized': word[0].upper() == word[0],
            'is_all_caps': word.upper() == word,
            'is_all_lower': word.lower() == word,
            'prefix-1': word[0],
            'prefix-2': word[:2],
            'prefix-3': word[:3],
            'suffix-1': word[-1],
            'suffix-2': word[-2:],
            'suffix-3': word[-3:],
            'prev_word': words[i-1] if i > 0 else '',
            'next_word': words[i+1] if i < len(words)-1 else ''
        })
    return features

# Function to extract labels from sentences
def sentence_labels(sentence):
    labels = []
    words = nltk.word_tokenize(sentence)
    for word in words:
        labels.append('O')  # Default label 'O' (Outside) for each word
        # Label specific entities (people and organizations) in the training sentences
        if re.match(r'^[A-Z][a-z]*$', word):
            labels[-1] = 'B-PERSON'  # Beginning of a person's name
        elif word == 'Apple':
            labels[-1] = 'B-ORG'  # Beginning of an organization's name
    return labels

# Prepare training data with features and labels
X = [sentence_features(sent) for sent in training_sentences]
y = [sentence_labels(sent) for sent in training_sentences]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# Train the CRF model
crf.fit(X_train, y_train)

# Predict labels on test data
y_pred = crf.predict(X_test)

# Evaluate model performance
print("CRF Model Metrics:")
print(metrics.flat_classification_report(y_test, y_pred, labels=['B-PERSON', 'I-PERSON', 'B-ORG', 'I-ORG'], digits=3))

# Function to predict named entities in a new sentence
def predict_named_entities(sentence):
    features = sentence_features(sentence)
    return crf.predict_single(features)

# Test the model on a new sentence
test_sentence = "Tom Cruise uses an Apple iPhone which was manufactured in India"
predicted_labels = predict_named_entities(test_sentence)
print("\nNamed Entities Predicted:")
for word, label in zip(nltk.word_tokenize(test_sentence), predicted_labels):
    print(f"{word}: {label}")


CRF Model Metrics:
              precision    recall  f1-score   support

    B-PERSON      1.000     1.000     1.000         3
    I-PERSON      0.000     0.000     0.000         0
       B-ORG      0.000     0.000     0.000         0
       I-ORG      0.000     0.000     0.000         0

   micro avg      1.000     1.000     1.000         3
   macro avg      0.250     0.250     0.250         3
weighted avg      1.000     1.000     1.000         3


Named Entities Predicted:
Tom: B-PERSON
Cruise: B-PERSON
uses: O
an: O
Apple: B-PERSON
iPhone: O
which: O
was: O
manufactured: O
in: O
India: B-PERSON


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
