In [2]:
# NLTK, the Brown corpus reader and the word tokenizer used by the cells below.
# NOTE(review): `inaugural` is not referenced in any cell visible here -- confirm
# whether it is still needed.
import nltk
from nltk.corpus import inaugural
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
# Fetch the NLTK data packages: the Brown corpus, the universal tagset mapping
# (the corpus is later read with tagset='universal'), and 'punkt' (presumably the
# model backing word_tokenize -- confirm against the installed NLTK version).
nltk.download('brown')
nltk.download('universal_tagset')
nltk.download('punkt')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Brown corpus sentences, each a sequence of (word, tag) pairs, read with the
# 'universal' tagset (tags such as DET, NOUN, VERB).
tagged_sentences = brown.tagged_sents(tagset='universal')

In [4]:
# Feature extraction function
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of pairs whose first element is the word string; the
            second element (the tag) is never read here.
        i: Index of the token to describe.

    Returns:
        dict mapping feature names to values: lower-cased form, last three and
        last two characters and casing/digit flags of the token itself, the
        lower-cased form and casing flags of the previous and next tokens when
        they exist, and ``'BOS'`` / ``'EOS'`` markers when they do not.
    """
    token = sent[i][0]

    # Features of the token itself.
    feats = {}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context: flag the sentence start, otherwise describe the previous token.
    if i <= 0:
        feats['BOS'] = True  # Beginning of sentence
    else:
        prev_token = sent[i - 1][0]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()

    # Right context: flag the sentence end, otherwise describe the next token.
    if i >= len(sent) - 1:
        feats['EOS'] = True  # End of sentence
    else:
        next_token = sent[i + 1][0]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()

    return feats

# Extract features for an entire sentence
def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

# Extract the labels (tags) for an entire sentence
def sent2labels(sent):
    """Return the tag (second element) of every ``(word, tag)`` pair in ``sent``."""
    tags = []
    for _word, tag in sent:
        tags.append(tag)
    return tags


In [5]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Build the training set: one feature sequence and one tag sequence per sentence.
# Every sentence of tagged_sentences is used; nothing is held out here.
X_train = list(map(sent2features, tagged_sentences))
y_train = list(map(sent2labels, tagged_sentences))

# Configure the CRF (L-BFGS training, c1/c2 regularisation coefficients) ...
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
# ... and fit it on the full training set.
crf.fit(X_train, y_train)


In [6]:
# Tokenizer import and 'punkt' download for the prediction helper below.
# Both are also done in the notebook's first cell, so this repeats that setup.
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def predict_pos(sentence):
    """Tag a raw sentence string with the trained module-level ``crf``.

    Args:
        sentence: Text to tokenize with ``word_tokenize`` and tag.

    Returns:
        List of ``(token, predicted_tag)`` tuples in token order.
    """
    words = word_tokenize(sentence)

    # sent2features expects (word, tag) pairs but only reads the word, so the
    # tag slot is filled with the throwaway placeholder 'X'.
    placeholder_pairs = []
    for w in words:
        placeholder_pairs.append((w, 'X'))

    feature_seq = sent2features(placeholder_pairs)
    # predict() works on a batch of sequences: wrap ours, then unwrap the result.
    tags = crf.predict([feature_seq])[0]
    return [pair for pair in zip(words, tags)]

# Example: tag one raw sentence with the helper defined above.
sentence = "This is a test sentence for POS tagging."
result = predict_pos(sentence)

# Show the resulting list of (token, predicted tag) pairs.
print(result)

[('This', 'DET'), ('is', 'VERB'), ('a', 'DET'), ('test', 'NOUN'), ('sentence', 'NOUN'), ('for', 'ADP'), ('POS', 'NOUN'), ('tagging', 'VERB'), ('.', '.')]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
def predict_pos(sentence, pos_tagger):
    """Tag an already-tokenized sentence with the given tagger.

    Args:
        sentence: Sequence of ``(token, tag)`` pairs, e.g. one element of
            ``tagged_sentences``. Only the tokens are used; the existing tags
            are ignored.
        pos_tagger: Model exposing ``predict(list_of_feature_sequences)``,
            such as the trained ``crf``. ``None`` falls back to the
            module-level ``crf``.

    Returns:
        List of ``(token, predicted_tag)`` tuples in token order.
    """
    # Previously this parameter was ignored and the global ``crf`` was always
    # used; honour the argument, keeping ``crf`` only as the fallback.
    tagger = crf if pos_tagger is None else pos_tagger

    tokens = [token for token, _ in sentence]
    features = sent2features(sentence)
    # predict() takes a batch of sequences: wrap ours, then unwrap the result.
    predicted_labels = tagger.predict([features])[0]
    return list(zip(tokens, predicted_labels))

# predict_pos(tagged_sentences[0], crf)

def predict_pos_new(sentence):
    """Tag a raw sentence string with the trained module-level ``crf``.

    Args:
        sentence: Text to tokenize with ``word_tokenize`` and tag.

    Returns:
        List of ``(token, predicted_tag)`` tuples in token order; an empty
        list when the text yields no tokens.
    """
    tokens = word_tokenize(sentence)
    if not tokens:
        # Nothing to tag (e.g. an empty text box): skip the model call.
        return []

    # sent2features expects (word, tag) pairs but only reads the word. Use an
    # explicit placeholder tag instead of ``_``, which is IPython's
    # "last output" variable: hidden kernel state, and undefined outside IPython.
    placeholder_pairs = [(token, 'X') for token in tokens]
    features = sent2features(placeholder_pairs)
    # predict() takes a batch of sequences: wrap ours, then unwrap the result.
    predicted_labels = crf.predict([features])[0]
    return list(zip(tokens, predicted_labels))

In [9]:
# Inspect the first corpus sentence to see its (word, tag) pair structure.
print(tagged_sentences[0])

[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]


In [18]:
# Try the raw-text tagger on a sentence where "bank" is used as a verb.
my_sent = "He will bank on their support."
print(predict_pos_new(my_sent))

[('He', 'PRON'), ('will', 'VERB'), ('bank', 'VERB'), ('on', 'ADP'), ('their', 'DET'), ('support', 'NOUN'), ('.', '.')]


In [19]:
import gradio as gr

# Wrap the raw-text tagger in a small Gradio UI: one text box in, JSON out.
demo = gr.Interface(
    fn=predict_pos_new,
    inputs=gr.Textbox(label="Enter your text"),  # Text input component
    outputs=gr.JSON(label="Processed Output"),
)
# NOTE(review): share=True also requests a public share link in addition to the
# local server; switch to share=False if the demo should stay local.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://742bca6536f39c444b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


