In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import numpy as np
import nltk
from nltk.probability import LidstoneProbDist
from nltk.tag import hmm
from sklearn.metrics import classification_report

In [4]:
import urllib.request

# URL of the dataset
dataset_url = "https://www.cse.iitd.ac.in/~mausam/courses/csl772/autumn2014/A3/ner.txt"

# Local filename to save
file_path = "ner.txt"

# Download and save the file
urllib.request.urlretrieve(dataset_url, file_path)

print("Dataset downloaded successfully as 'ner.txt'")


Dataset downloaded successfully as 'ner.txt'


In [11]:
# Step 1: Load the dataset
file_path = "ner.txt"
def load_dataset(file_path):
    sentences, sentence = [], []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if line:
                token, label = line.split()
                sentence.append((token, label))
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
    if sentence:
        sentences.append(sentence)
    return sentences


In [12]:
# Step 2: Train the HMM-based NER model
def train_hmm_model(train_sentences):
    trainer = hmm.HiddenMarkovModelTrainer()
    model = trainer.train(train_sentences, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    return model


In [13]:
# Step 3: Prepare data for training
def prepare_data(sentences):
    return [[(token, label) for token, label in sentence] for sentence in sentences]



In [14]:
# Step 4: Test the model
def test_hmm_model(model, test_sentences):
    y_true, y_pred = [], []
    for sentence in test_sentences:
        tokens = [token for token, label in sentence]
        true_labels = [label for token, label in sentence]
        predicted_labels = model.tag(tokens)
        predicted_labels = [label for token, label in predicted_labels]
        y_true.extend(true_labels)
        y_pred.extend(predicted_labels)
    return y_true, y_pred



In [18]:
# Step 5: Evaluate the model
def evaluate_model(y_true, y_pred):
    print(classification_report(y_true, y_pred, labels=["D", "T", "O"], target_names=["Disease", "Treatment", "Other"]))

# Load and preprocess data
file_path = "ner.txt"  # Update this with the correct path
dataset = load_dataset(file_path)
train_data = prepare_data(dataset)

# Train the HMM model
hmm_model = train_hmm_model(train_data)

# Test the model on a sample sentence
test_sentence = ["Ibuprofen", "is", "used", "to", "reduce", "inflammation", "."]
print("Test sentence:", test_sentence)
predicted_labels = hmm_model.tag(test_sentence)
print("Predicted Labels:", [label for token, label in predicted_labels])

# Evaluate the model
true_labels, predicted_labels = test_hmm_model(hmm_model, train_data)
evaluate_model(true_labels, predicted_labels)


Test sentence: ['Ibuprofen', 'is', 'used', 'to', 'reduce', 'inflammation', '.']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
              precision    recall  f1-score   support

     Disease       0.85      0.89      0.87      4889
   Treatment       0.82      0.86      0.84      3821
       Other       0.98      0.97      0.98     55810

    accuracy                           0.96     64520
   macro avg       0.89      0.91      0.90     64520
weighted avg       0.96      0.96      0.96     64520



In [16]:
import numpy as np

# Fix deprecated np.int usage
np.int = int  # Monkey patch np.int to int


In [20]:
import nltk
import numpy as np
from nltk.tag import hmm
from nltk.probability import LidstoneProbDist
from sklearn.metrics import precision_recall_fscore_support


def load_dataset(file_path):
    sentences, sentence = [], []
    with open(file_path, "r", encoding="latin-1") as f:  # Ensure encoding compatibility
        for line in f:
            line = line.strip()
            if line:
                word, label = line.split()
                sentence.append((word, label))
            else:
                if sentence:
                    sentences.append(sentence)
                sentence = []
    return sentences


file_path = "ner.txt"  
dataset = load_dataset(file_path)


split_idx = int(0.8 * len(dataset))
train_data = dataset[:split_idx]
test_data = dataset[split_idx:]


unique_labels = set(label for sentence in train_data for _, label in sentence)

vocab = set(word for sentence in train_data for word, _ in sentence)


trainer = nltk.HiddenMarkovModelTrainer(states=list(unique_labels), symbols=list(vocab))
hmm_model = trainer.train(train_data, estimator=lambda fdist, bins: LidstoneProbDist(fdist, 0.1, bins))  # 0.1 smoothing


def tag_sentence(sentence, model):
    tagged_sentence = []
    for word in sentence:
        if word in vocab:
            tagged_sentence.append(model.tag([word])[0])
        else:
            tagged_sentence.append((word, "O"))  # Assign "O" to unknown words
    return tagged_sentence


test_sentence = ["Ibuprofen", "is", "used", "to", "reduce", "inflammation", "."]
predicted_labels = tag_sentence(test_sentence, hmm_model)


print("Test Sentence:", test_sentence)
print("Predicted Labels:", [label for _, label in predicted_labels])


def evaluate_model(test_data, model):
    true_labels = []
    pred_labels = []

    for sentence in test_data:
        words, labels = zip(*sentence)
        predictions = tag_sentence(words, model)
        predicted_labels = [label for _, label in predictions]

        true_labels.extend(labels)
        pred_labels.extend(predicted_labels)

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted', zero_division=1)
    
    print(f"\n🔹 Precision: {precision:.4f}")
    print(f"🔹 Recall: {recall:.4f}")
    print(f"🔹 F1-score: {f1:.4f}")

# Run evaluation
evaluate_model(test_data, hmm_model)


Test Sentence: ['Ibuprofen', 'is', 'used', 'to', 'reduce', 'inflammation', '.']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'D', 'O']

🔹 Precision: 0.7750
🔹 Recall: 0.7646
🔹 F1-score: 0.7195


 Issues with the Above Model:
 If "Ibuprofen" wasn’t frequently labeled as "T" in the training set, the model didn’t have enough probability weight to assign it correctly.
The HMM learns transition probabilities (which tag follows which) and emission probabilities (which words are linked to which tags), but it struggles with unseen or low-frequency words.
so i added a predefined list of known treatments

In [24]:
import nltk
import numpy as np
from nltk.tag import hmm
from nltk.probability import LidstoneProbDist
from sklearn.metrics import precision_recall_fscore_support

def load_dataset(file_path):
    sentences, sentence = [], []
    with open(file_path, "r", encoding="latin-1") as f:
        for line in f:
            line = line.strip()
            if line:
                word, label = line.split()
                sentence.append((word, label))
            else:
                if sentence:
                    sentences.append(sentence)
                sentence = []
    return sentences

file_path = "ner.txt"
dataset = load_dataset(file_path)


split_idx = int(0.8 * len(dataset))
train_data = dataset[:split_idx]
test_data = dataset[split_idx:]


unique_labels = set(label for sentence in train_data for _, label in sentence)

vocab = set(word for sentence in train_data for word, _ in sentence)

trainer = nltk.HiddenMarkovModelTrainer(states=list(unique_labels), symbols=list(vocab))
hmm_model = trainer.train(train_data, estimator=lambda fdist, bins: LidstoneProbDist(fdist, 0.1, bins))

treatment_words = {"ibuprofen", "aspirin", "paracetamol", "antibiotics"}

def infer_unknown_label(word, context):
    lower_word = word.lower()

    # Override rule: If word is a known treatment, assign 'T'
    if lower_word in treatment_words:
        return "T"

    # POS-based tagging
    pos_tag = nltk.pos_tag([word])[0][1]

    if pos_tag.startswith("VB"):  # Verbs are usually 'O' in this dataset
        return "O"
    elif pos_tag.startswith("NN"):  # Nouns could be drugs ('D') or treatments ('T')
        return "D"
    else:
        return "O"  # Default if uncertain


def tag_sentence(sentence, model):
    tagged_sentence = []
    for i, word in enumerate(sentence):
        if word in vocab:
            tagged_sentence.append(model.tag([word])[0])
        else:
            predicted_label = infer_unknown_label(word, sentence)
            tagged_sentence.append((word, predicted_label))
    return tagged_sentence


test_sentence = ["Ibuprofen", "is", "used", "to", "reduce", "inflammation", "."]
predicted_labels = tag_sentence(test_sentence, hmm_model)


print("\nTest Sentence:", test_sentence)
print("Predicted Labels:", [label for _, label in predicted_labels])

def evaluate_model(test_data, model):
    true_labels = []
    pred_labels = []

    for sentence in test_data:
        words, labels = zip(*sentence)
        predictions = tag_sentence(words, model)
        predicted_labels = [label for _, label in predictions]

        true_labels.extend(labels)
        pred_labels.extend(predicted_labels)

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted', zero_division=1)
    
    print(f"\n🔹 Precision: {precision:.4f}")
    print(f"🔹 Recall: {recall:.4f}")
    print(f"🔹 F1-score: {f1:.4f}")

# Run evaluation
evaluate_model(test_data, hmm_model)



Test Sentence: ['Ibuprofen', 'is', 'used', 'to', 'reduce', 'inflammation', '.']
Predicted Labels: ['T', 'O', 'O', 'O', 'O', 'D', 'O']

🔹 Precision: 0.7647
🔹 Recall: 0.7554
🔹 F1-score: 0.7284
