**TASK 8.1**

In [1]:
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm
from nltk.classify import MaxentClassifier
from nltk.classify.util import accuracy as nltk_accuracy
import random
import spacy

# Download necessary datasets
nltk.download('treebank')
nltk.download('universal_tagset')

# -------------------------------
# Step 1: Load Data
# -------------------------------
tagged_sentences = list(treebank.tagged_sents(tagset='universal'))
random.shuffle(tagged_sentences)

train_size = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:train_size]
test_sents = tagged_sentences[train_size:]

# -------------------------------
# Step 2: HMM + Viterbi
# -------------------------------
trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(train_sents)

hmm_accuracy = hmm_tagger.evaluate(test_sents)
print("HMM (Viterbi) Accuracy:", hmm_accuracy)

# -------------------------------
# Step 3: Log-Linear Model (MaxEnt)
# -------------------------------

# Feature extractor
def extract_features(sentence, index):
    word = sentence[index][0]
    return {
        'word': word,
        'suffix(2)': word[-2:],
        'prefix(1)': word[0],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1][0]
    }

# Prepare training data
train_data = []
for sent in train_sents:
    for i in range(len(sent)):
        features = extract_features(sent, i)
        label = sent[i][1]
        train_data.append((features, label))

test_data = []
for sent in test_sents:
    for i in range(len(sent)):
        features = extract_features(sent, i)
        label = sent[i][1]
        test_data.append((features, label))

# Train MaxEnt classifier (log-linear model)
maxent_classifier = MaxentClassifier.train(train_data, max_iter=10)

# Evaluate
maxent_accuracy = nltk_accuracy(maxent_classifier, test_data)
print("Log-Linear (MaxEnt) Accuracy:", maxent_accuracy)

# -------------------------------
# Step 4: Comparison
# -------------------------------
if maxent_accuracy > hmm_accuracy:
    print("Log-Linear Model performs better.")
else:
    print("HMM with Viterbi performs better.")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_accuracy = hmm_tagger.evaluate(test_sents)
  O[i, k] = self._output_logprob(si, self._symbols[k])
  O[i, k] = self._output_logprob(si, self._symbols[k])


HMM (Viterbi) Accuracy: 0.4488950276243094
  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.48491        0.065
             2          -0.98007        0.815
             3          -0.67044        0.921
             4          -0.51957        0.948
             5          -0.43023        0.960
             6          -0.37041        0.967
             7          -0.32708        0.971
             8          -0.29396        0.975
             9          -0.26766        0.978
         Final          -0.24619        0.980
Log-Linear (MaxEnt) Accuracy: 0.9408050513022889
Log-Linear Model performs better.


**TAsk 8.2**

In [2]:
import nltk
from nltk.corpus import treebank
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import random

# Download required data
nltk.download('treebank')
nltk.download('universal_tagset')

# Load dataset
sentences = list(treebank.tagged_sents(tagset='universal'))
random.shuffle(sentences)

# Train-Test Split
train_data = sentences[:3000]
test_data = sentences[3000:]

######################################
# Part 1: HMM with Viterbi Decoding
######################################
from nltk.tag import hmm

trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(train_data)

hmm_accuracy = hmm_tagger.evaluate(test_data)
print("HMM (Viterbi) Accuracy:", hmm_accuracy)

######################################
# Part 2: Log-Linear Model (MaxEnt)
######################################

# Feature extractor
def extract_features(sentence, i):
    word = sentence[i]
    features = {
        'word': word,
        'is_capitalized': word[0].isupper(),
        'is_digit': word.isdigit(),
        'prefix-1': word[0],
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
    }
    if i > 0:
        features['prev_word'] = sentence[i-1]
    else:
        features['prev_word'] = '<START>'
    return features

# Prepare train and test sets
def prepare_dataset(tagged_sents):
    X, y = [], []
    for sent in tagged_sents:
        words, tags = zip(*sent)
        for i in range(len(words)):
            feats = extract_features(words, i)
            X.append(feats)
            y.append(tags[i])
    return X, y

X_train, y_train = prepare_dataset(train_data)
X_test, y_test = prepare_dataset(test_data)

# Convert features to dict vector
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=True)

X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Train log-linear model (MaxEnt = Logistic Regression)
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)

log_linear_accuracy = accuracy_score(y_test, y_pred)
print("Log-Linear Model Accuracy:", log_linear_accuracy)

######################################
# Part 3: Comparison
######################################
print("\nPerformance Comparison:")
print(f"HMM (Viterbi): {hmm_accuracy:.4f}")
print(f"Log-Linear Model: {log_linear_accuracy:.4f}")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_accuracy = hmm_tagger.evaluate(test_data)


HMM (Viterbi) Accuracy: 0.4535505534137545
Log-Linear Model Accuracy: 0.9578825187580318

Performance Comparison:
HMM (Viterbi): 0.4536
Log-Linear Model: 0.9579


**Task 9**

In [10]:
import nltk
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Download dataset
nltk.download('treebank')
nltk.download('universal_tagset')
nltk.download('punkt')

sentences = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Prepare words and tags
words = list(set(w for s in sentences for (w, t) in s))
tags = list(set(t for s in sentences for (w, t) in s))
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1
tag2idx = {t: i for i, t in enumerate(tags)}

X = [[word2idx.get(w, 1) for (w, t) in s] for s in sentences]
y = [[tag2idx[t] for (w, t) in s] for s in sentences]

max_len = 50
X = pad_sequences(X, maxlen=max_len, padding="post")
y = pad_sequences(y, maxlen=max_len, padding="post")
y = [to_categorical(i, num_classes=len(tags)) for i in y]

# Small BiLSTM model
model = Sequential()
model.add(Embedding(input_dim=len(word2idx), output_dim=64, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X, np.array(y), batch_size=32, epochs=1, verbose=1)  # small training

# ---- Read text from file ----
file_path = "/content/drive/MyDrive/NLP LAB TASK/NLP TASK9.txt"
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

# Download extra tokenizer resource
nltk.download("punkt_tab")

# Tokenize
tokens = nltk.word_tokenize(text)
test_seq = [word2idx.get(w, 1) for w in tokens]
test_seq = pad_sequences([test_seq], maxlen=max_len, padding="post")

# Predict tags
pred = model.predict(test_seq)[0]
pred_tags = [list(tag2idx.keys())[np.argmax(p)] for p in pred][:len(tokens)]

print("\nTagged Output:")
for word, tag in zip(tokens, pred_tags):
    print(f"{word} --> {tag}")

# Extract nouns, proper nouns, numbers
extracted_info = [word for word, tag in zip(tokens, pred_tags) if tag in ["NOUN", "PROPN", "NUM"]]
print("\nExtracted Information:", extracted_info)




[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.3980 - loss: 2.0796


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 623ms/step

Tagged Output:
Elon --> NOUN
Musk --> NOUN
founded --> NOUN
SpaceX --> NOUN
in --> ADP
2002 --> NOUN
and --> CONJ
lives --> NOUN
in --> ADP
Texas --> NOUN
. --> .

Extracted Information: ['Elon', 'Musk', 'founded', 'SpaceX', '2002', 'lives', 'Texas']


**Aim and Algorithm**

**Aim**

To build a POS Tagger for unstructured text using BiLSTM in Keras and extract nouns, proper nouns, and numbers from a file.


**Algorithm**

Import the required libraries and download NLTK datasets.

Load the Treebank corpus, create word–tag mappings, and prepare sequences with padding.

Build and train a BiLSTM POS tagging model using Keras.

Read the unstructured text from the given file path and tokenize it.

Convert tokens into sequences, predict POS tags, and extract nouns, proper nouns, and numbers.

Display the tagged output and the extracted information.