In [None]:
import numpy as np
import hmmlearn.hmm as hmm
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support
import chardet



def detect_encoding(file_path):
    """Guess the text encoding of a file from its leading bytes.

    Only the first 10000 bytes are sampled (read in binary mode), so the
    guess reflects the beginning of the file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``"encoding"`` entry of the ``chardet.detect`` result for the
        sampled bytes.
    """
    with open(file_path, "rb") as handle:
        sample = handle.read(10000)
    guess = chardet.detect(sample)
    return guess["encoding"]


def load_data(file_path, encoding=None):
    """Read a one-token-per-line corpus into parallel token/label sequences.

    Each non-blank line is split on whitespace; the first field is the token
    and the last field is its label. Lines with fewer than two fields are
    ignored. A blank line closes the current sentence.

    Args:
        file_path: Path of the corpus file.
        encoding: Text encoding used to open the file. When ``None`` it is
            detected with ``detect_encoding`` and printed.

    Returns:
        A ``(sentences, labels)`` tuple of equally shaped lists of lists.
    """
    if encoding is None:
        encoding = detect_encoding(file_path)
        print(f"Detected Encoding: {encoding}")

    sentences, labels = [], []
    tokens, tags = [], []

    with open(file_path, 'r', encoding=encoding, errors="replace") as f:
        for raw in f:
            fields = raw.split()

            # No fields at all means a blank (or whitespace-only) line,
            # which marks a sentence boundary.
            if not fields:
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue

            # A lone field carries no label, so the line is skipped.
            if len(fields) < 2:
                continue

            tokens.append(fields[0])
            tags.append(fields[-1])

    # The file may end without a trailing blank line.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    return sentences, labels



def create_vocab(sentences):
    """Build a deterministic word -> index mapping from training sentences.

    Words that occur more than once receive indices ``1..n`` in sorted
    order; every other word is expected to be looked up as index ``0``,
    which is reserved for the ``"UNK"`` placeholder.

    Args:
        sentences: Iterable of token sequences.

    Returns:
        Dict mapping each kept word to its integer index, with
        ``"UNK"`` mapped to ``0``.
    """
    word_counts = Counter(word for sent in sentences for word in sent)

    # Sorting (instead of enumerating a set) keeps the indices identical
    # across interpreter runs, which string hash randomisation would
    # otherwise shuffle. A literal "UNK" token is excluded so that it
    # cannot claim an index that is then overwritten, leaving a gap.
    vocab = sorted(
        word
        for word, count in word_counts.items()
        if count > 1 and word != "UNK"
    )

    word2idx = {"UNK": 0}
    word2idx.update((word, idx) for idx, word in enumerate(vocab, start=1))
    return word2idx



def create_label_dict(labels):
    """Assign consecutive integer ids to the distinct labels.

    Ids follow the sorted order of the label strings, starting at 0.

    Args:
        labels: Iterable of label sequences.

    Returns:
        A ``(label2idx, idx2label)`` pair of mutually inverse dicts.
    """
    distinct = set()
    for lbl_seq in labels:
        distinct.update(lbl_seq)

    label2idx = {}
    idx2label = {}
    for position, label in enumerate(sorted(distinct)):
        label2idx[label] = position
        idx2label[position] = label
    return label2idx, idx2label


def prepare_sequences(sentences, labels, word2idx, label2idx):
    """Convert token and label sequences to integer index sequences.

    Tokens missing from ``word2idx`` are encoded as ``0``; a label missing
    from ``label2idx`` raises ``KeyError``.

    Args:
        sentences: List of token sequences.
        labels: List of label sequences.
        word2idx: Mapping from token to integer index.
        label2idx: Mapping from label to integer index.

    Returns:
        A ``(X, Y)`` tuple of nested lists of indices.
    """
    X = []
    for sent in sentences:
        X.append([word2idx.get(word, 0) for word in sent])

    Y = []
    for lbl_seq in labels:
        Y.append([label2idx[label] for label in lbl_seq])

    return X, Y


def train_hmm(X, Y, n_states, smoothing=0.01):
    """Estimate a categorical HMM from labelled sequences.

    The parameters are obtained by smoothed counting over the gold labels
    rather than by unsupervised fitting, so hidden state ``k`` of the
    returned model corresponds to label index ``k`` and decoded states can
    be mapped straight back to labels.

    Args:
        X: List of observation sequences (integer symbol indices).
        Y: List of label sequences (integer indices below ``n_states``),
            aligned element-wise with ``X``.
        n_states: Number of hidden states, i.e. number of distinct labels.
        smoothing: Positive pseudo-count added to every start, transition
            and emission count so that no probability is exactly zero.

    Returns:
        An hmmlearn model with ``startprob_``, ``transmat_`` and
        ``emissionprob_`` set, ready for ``predict``.

    Raises:
        ValueError: If ``X`` holds no tokens, if ``X`` and ``Y`` are not
            aligned, or if ``smoothing`` is not positive.
    """
    if smoothing <= 0:
        raise ValueError("smoothing must be positive")
    if len(X) != len(Y) or any(len(obs) != len(tags) for obs, tags in zip(X, Y)):
        raise ValueError("X and Y must contain sequences of matching lengths")

    pairs = [
        (np.asarray(obs, dtype=int), np.asarray(tags, dtype=int))
        for obs, tags in zip(X, Y)
        if len(obs)
    ]
    if not pairs:
        raise ValueError("X must contain at least one non-empty sequence")

    # Symbols are indices, so the alphabet size is the largest index + 1.
    n_symbols = max(int(obs.max()) for obs, _ in pairs) + 1

    start_counts = np.full(n_states, smoothing, dtype=float)
    trans_counts = np.full((n_states, n_states), smoothing, dtype=float)
    emit_counts = np.full((n_states, n_symbols), smoothing, dtype=float)

    for obs, tags in pairs:
        start_counts[tags[0]] += 1
        # np.add.at accumulates correctly when the same cell is hit
        # several times within one sequence (plain fancy-index += does not).
        np.add.at(trans_counts, (tags[:-1], tags[1:]), 1)
        np.add.at(emit_counts, (tags, obs), 1)

    # hmmlearn >= 0.2.8 models discrete symbols with CategoricalHMM; in
    # older releases that role was played by MultinomialHMM.
    model_cls = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
    model = model_cls(n_components=n_states)
    model.startprob_ = start_counts / start_counts.sum()
    model.transmat_ = trans_counts / trans_counts.sum(axis=1, keepdims=True)
    model.emissionprob_ = emit_counts / emit_counts.sum(axis=1, keepdims=True)
    return model



def predict_hmm(model, X):
    """Decode every observation sequence in ``X`` with ``model``.

    Each sequence is turned into a single-column array before being passed
    to ``model.predict``.

    Args:
        model: Object exposing ``predict(array)``.
        X: List of observation sequences.

    Returns:
        List with one ``model.predict`` result per input sequence.
    """
    return [model.predict(np.array(tokens).reshape(-1, 1)) for tokens in X]



def evaluate(predictions, Y, idx2label):
    """Print and return weighted precision, recall and F1 at token level.

    All sequences are flattened, indices are mapped back to label strings
    through ``idx2label``, and the scores are averaged over labels weighted
    by their support.

    Args:
        predictions: List of predicted index sequences.
        Y: List of gold index sequences, aligned with ``predictions``.
        idx2label: Mapping from integer index to label string.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats.
    """
    gold = [idx2label[idx] for idx in np.concatenate(Y)]
    guessed = [idx2label[idx] for idx in np.concatenate(predictions)]

    # zero_division=0 scores a label that is never predicted as 0, which is
    # what the default does too, but without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold, guessed, average='weighted', zero_division=0
    )
    print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}")
    return precision, recall, f1



def predict_new_sentence(model, sentence, word2idx, idx2label):
    """Label a tokenised sentence with a trained model.

    Args:
        model: Object exposing ``predict(array)`` that returns one state
            index per row.
        sentence: Sequence of token strings.
        word2idx: Mapping from token to integer index; tokens that are
            missing are encoded as ``0``.
        idx2label: Mapping from state index to label string.

    Returns:
        List of label strings, one per token; ``[]`` for an empty sentence.
    """
    # An empty sentence would produce a (0, 1) array, which the model
    # cannot decode; there is nothing to label anyway.
    if not sentence:
        return []

    encoded = np.array([word2idx.get(word, 0) for word in sentence]).reshape(-1, 1)
    states = model.predict(encoded)
    return [idx2label[idx] for idx in states]



# Location of the annotated corpus. load_data takes the first and the last
# whitespace-separated field of each line as token and label.
file_path = r"/content/ner.txt"


# No encoding is passed, so load_data detects it and prints the result.
sentences, labels = load_data(file_path)


# Lookup tables: word -> index (0 is the "UNK" slot) and label <-> index.
word2idx = create_vocab(sentences)
label2idx, idx2label = create_label_dict(labels)


# Integer-encoded observation (X) and label (Y) sequences.
X, Y = prepare_sequences(sentences, labels, word2idx, label2idx)


# One hidden state per distinct label.
n_states = len(label2idx)
hmm_model = train_hmm(X, Y, n_states)


# NOTE(review): predictions are scored on the same X/Y the model was trained
# on, so the printed metrics are not a held-out estimate.
y_pred = predict_hmm(hmm_model, X)
evaluate(y_pred, Y, idx2label)

# Ad-hoc example; tokens missing from word2idx are looked up as index 0.
test_sentence = ["Ibuprofen", "is", "used", "to", "reduce", "inflammation", "."]
predicted_labels = predict_new_sentence(hmm_model, test_sentence, word2idx, idx2label)

print("Test Sentence:", test_sentence)
print("Predicted Labels:", predicted_labels)

https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


Detected Encoding: ascii
Precision: 0.751, Recall: 0.818, F1-score: 0.783
Test Sentence: ['Ibuprofen', 'is', 'used', 'to', 'reduce', 'inflammation', '.']
Predicted Labels: ['D', 'O', 'O', 'O', 'O', 'O', 'O']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The HMM predicts "O" for the majority of tokens because the dataset is heavily class-imbalanced: the label "O" is by far the most frequent. This high frequency of "O" dominates the estimated probabilities. To improve performance and obtain more accurate predictions for minority labels such as "T" and "D", this imbalance must be addressed. This can be done by balancing the dataset, by applying techniques such as smoothing or re-weighting, or by enriching the feature set, so that the learning process is more representative of all labels.
