## Autocorrect System & HMM POS Tagger

In [1]:
import numpy as np
from collections import defaultdict, Counter
import re
import string

import nltk
nltk.download('brown')
nltk.download('universal_tagset')
nltk.download('punkt')

from nltk.corpus import brown

[nltk_data] Downloading package brown to /Users/dongnd/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/dongnd/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dongnd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# PART 2: HMM POS TAGGER

## 2.1. Load and Preprocess Data

In [2]:
def load_brown_corpus(tagset='universal'):
    """
    Read the tagged Brown corpus and lowercase every word.

    Args:
        tagset: forwarded unchanged to ``brown.tagged_sents``. 'universal'
            selects the coarse universal tagset (12 distinct tags show up in
            this notebook's training split); None presumably keeps the
            corpus's own fine-grained tags -- confirm against the NLTK
            reader documentation.

    Returns:
        list of sentences, each sentence a list of (lowercased word, tag) tuples
    """
    return [
        [(token.lower(), pos) for token, pos in sentence]
        for sentence in brown.tagged_sents(tagset=tagset)
    ]

def split_data(data, train_ratio=0.8):
    """
    Cut a sequence into a leading training part and a trailing test part.

    The split is positional (no shuffling): the first
    ``int(len(data) * train_ratio)`` items go to train, the rest to test.

    Args:
        data: sliceable sequence
        train_ratio: fraction of items placed in the training part

    Returns:
        (train, test) tuple of slices of ``data``
    """
    cut = int(train_ratio * len(data))
    head = data[:cut]
    tail = data[cut:]
    return head, tail

# Load the lowercased, universal-tagged corpus and split it positionally
# (split_data's default ratio of 0.8 is used).
print("Loading Brown corpus...")
tagged_sentences = load_brown_corpus(tagset='universal')
train_data, test_data = split_data(tagged_sentences)

# Report corpus and split sizes with thousands separators.
print(f"Total sentences: {len(tagged_sentences):,}")
print(f"Training sentences: {len(train_data):,}")
print(f"Test sentences: {len(test_data):,}")

# Show the first (up to) 10 tokens of the first training sentence.
print("\nSample sentence:")
sample = train_data[0][:10]
for word, tag in sample:
    print(f"  {word:15} → {tag}")

Loading Brown corpus...
Total sentences: 57,340
Training sentences: 45,872
Test sentences: 11,468

Sample sentence:
  the             → DET
  fulton          → NOUN
  county          → NOUN
  grand           → ADJ
  jury            → NOUN
  said            → VERB
  friday          → NOUN
  an              → DET
  investigation   → NOUN
  of              → ADP


## 2.2. Extract Tags and Vocabulary

In [3]:
def extract_vocab_and_tags(tagged_sents):
    """
    Collect the distinct words and the distinct tags of a tagged corpus.

    Args:
        tagged_sents: iterable of sentences, each an iterable of (word, tag) pairs

    Returns:
        (words, tags): two sets
    """
    # Flatten once so the corpus is traversed a single time.
    pairs = [pair for sentence in tagged_sents for pair in sentence]
    vocabulary = {token for token, _ in pairs}
    tag_inventory = {pos for _, pos in pairs}
    return vocabulary, tag_inventory

# Vocabulary and tag inventory are taken from the training split only,
# so words that occur only in the test split are not in word_vocab.
word_vocab, tag_set = extract_vocab_and_tags(train_data)

print(f"Vocabulary size: {len(word_vocab):,}")
print(f"Number of tags: {len(tag_set)}")
print(f"\nTags: {sorted(tag_set)}")

Vocabulary size: 45,755
Number of tags: 12

Tags: ['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


## 2.3. HMM Tagger Implementation

In [4]:
class HMMTagger:
    """
    Hidden Markov Model POS tagger (bigram transitions, add-k smoothing).

    Training counts tag-to-tag transitions, tag-to-word emissions and
    sentence-initial tags; decoding runs the Viterbi algorithm in log space.
    Words are lowercased at decoding time, so the training corpus is expected
    to be lowercased as well.
    """

    START_TAG = '<START>'  # pseudo-tag that precedes the first word of every sentence
    UNK_TOKEN = '<UNK>'    # emission key holding the probability of a word unseen with a tag
    _LOG_EPS = 1e-10       # added before log() so a zero probability cannot produce -inf

    def __init__(self, smoothing_k=1.0):
        """
        Initialize an untrained tagger.

        Args:
            smoothing_k: add-k smoothing coefficient (1.0 = Laplace smoothing)
        """
        self.smoothing_k = smoothing_k

        self.tag_counts = Counter()                    # C(tag)
        self.transition_counts = defaultdict(Counter)  # C(tag_i, tag_j), including <START> as tag_i
        self.emission_counts = defaultdict(Counter)    # C(tag, word)
        self.initial_counts = Counter()                # C(tag at the first position of a sentence)

        # Vocabulary and tag set (tags gains <START> once train() has run)
        self.vocab = set()
        self.tags = set()

        # Probabilities (filled in by _compute_probabilities after counting)
        self.transition_probs = {}  # A matrix: transition_probs[tag_i][tag_j]
        self.emission_probs = {}    # B matrix: emission_probs[tag][word], plus an <UNK> entry
        self.initial_probs = {}     # pi vector: initial_probs[tag]

    def train(self, tagged_sents):
        """
        Train the HMM from a tagged corpus.

        Counts are added to whatever the instance already holds, so calling
        train() again on more data accumulates; probabilities are recomputed
        from the accumulated counts each time.

        Args:
            tagged_sents: list of sentences, each sentence a list of (word, tag)
        """
        # Step 1: count events
        for sent in tagged_sents:
            prev_tag = self.START_TAG

            for i, (word, tag) in enumerate(sent):
                self.tag_counts[tag] += 1
                self.tags.add(tag)
                self.vocab.add(word)

                self.emission_counts[tag][word] += 1        # C(tag, word)
                self.transition_counts[prev_tag][tag] += 1  # C(prev_tag, tag)

                # Only the first tag of a sentence contributes to pi.
                if i == 0:
                    self.initial_counts[tag] += 1

                prev_tag = tag

        # <START> is kept in self.tags so it gets its own transition row.
        self.tags.add(self.START_TAG)

        # Step 2: turn counts into smoothed probabilities
        self._compute_probabilities()

        print("Training completed!")
        print(f"  Vocabulary size: {len(self.vocab):,}")
        print(f"  Number of tags: {len(self.tags) - 1}")  # Exclude <START>

    def _real_tags(self):
        """Return the tags other than <START>, sorted so iteration order is reproducible."""
        return sorted(t for t in self.tags if t != self.START_TAG)

    def _smoothed(self, count, total, n_outcomes):
        """
        Add-k estimate (count + k) / (total + k * n_outcomes).

        Returns 0.0 instead of dividing by zero when the denominator is 0
        (no observations and either k == 0 or no outcomes).
        """
        denom = total + self.smoothing_k * n_outcomes
        if denom <= 0:
            return 0.0
        return (count + self.smoothing_k) / denom

    def _compute_probabilities(self):
        """
        Calculate transition, emission, and initial probabilities with add-k smoothing.

        The number of outcomes used for transition and initial smoothing is the
        number of real tags: <START> can only be a source, never a destination,
        so counting it would leave each distribution summing to less than 1.
        """
        real_tags = self._real_tags()
        num_tags = len(real_tags)
        vocab_size = len(self.vocab)

        # Transition probabilities: P(j|i) = (C(i,j) + k) / (sum_j C(i,j) + k * num_tags)
        # Rows exist for every tag including <START>; columns only for real tags.
        self.transition_probs = {}
        for tag_i in self.tags:
            row_counts = self.transition_counts.get(tag_i, {})
            total_i = sum(row_counts.values())
            self.transition_probs[tag_i] = {
                tag_j: self._smoothed(row_counts.get(tag_j, 0), total_i, num_tags)
                for tag_j in real_tags
            }

        # Emission probabilities: P(w|t) = (C(t,w) + k) / (C(t) + k * vocab_size)
        # Only words seen with the tag are stored; every other word shares <UNK>,
        # which is exactly the add-k value for a zero count.
        self.emission_probs = {}
        for tag in real_tags:
            total_tag = self.tag_counts[tag]
            probs = {
                word: self._smoothed(count, total_tag, vocab_size)
                for word, count in self.emission_counts[tag].items()
            }
            probs[self.UNK_TOKEN] = self._smoothed(0, total_tag, vocab_size)
            self.emission_probs[tag] = probs

        # Initial probabilities: P(tag at the first position)
        total_sents = sum(self.initial_counts.values())
        self.initial_probs = {
            tag: self._smoothed(self.initial_counts.get(tag, 0), total_sents, num_tags)
            for tag in real_tags
        }

    def get_transition_prob(self, tag_i, tag_j):
        """
        Get P(tag_j | tag_i).

        Pairs missing from the table (e.g. a tag never seen in training) get
        the zero-count add-k estimate, using the same formula as the table.
        """
        row = self.transition_probs.get(tag_i)
        if row is not None and tag_j in row:
            return row[tag_j]
        total_i = sum(self.transition_counts.get(tag_i, {}).values())
        return self._smoothed(0, total_i, len(self._real_tags()))

    def get_emission_prob(self, tag, word):
        """
        Get P(word | tag).

        A word not stored for a known tag gets that tag's <UNK> probability;
        an unknown tag gets the zero-count add-k estimate.
        """
        probs = self.emission_probs.get(tag)
        if probs is not None:
            if word in probs:
                return probs[word]
            return probs[self.UNK_TOKEN]
        return self._smoothed(0, self.tag_counts.get(tag, 0), len(self.vocab))

    def get_initial_prob(self, tag):
        """
        Get P(tag at the first position of a sentence).

        Unknown tags get the zero-count add-k estimate; on an untrained model
        this returns 0.0 rather than raising ZeroDivisionError.
        """
        if tag in self.initial_probs:
            return self.initial_probs[tag]
        total_sents = sum(self.initial_counts.values())
        return self._smoothed(0, total_sents, len(self._real_tags()))

    def _log_emissions(self, tags, word):
        """Return the vector of log P(word.lower() | tag) for each tag in ``tags``."""
        word = word.lower()
        probs = np.array([self.get_emission_prob(tag, word) for tag in tags], dtype=float)
        return np.log(probs + self._LOG_EPS)

    def viterbi(self, sentence):
        """
        Viterbi algorithm to find the most probable tag sequence.

        Args:
            sentence: list of words (lowercased internally)

        Returns:
            best_tags: list of predicted tags, one per word ([] for an empty sentence)

        Raises:
            ValueError: if the model has no tags (train() has not been called)
        """
        if not sentence:
            return []

        # Sorted so that ties are broken identically on every run
        # (set iteration order of strings depends on the hash seed).
        tags = self._real_tags()
        if not tags:
            raise ValueError("HMMTagger has no tags; call train() before tagging.")

        n_tags = len(tags)
        n_words = len(sentence)

        # Log transition matrix and log initial vector are independent of the
        # word position, so they are built once per sentence.
        log_trans = np.log(
            np.array(
                [[self.get_transition_prob(tag_i, tag_j) for tag_j in tags] for tag_i in tags],
                dtype=float,
            )
            + self._LOG_EPS
        )
        log_init = np.log(
            np.array([self.get_initial_prob(tag) for tag in tags], dtype=float) + self._LOG_EPS
        )

        # v[t, j] = best log probability of any path ending in tag j at position t
        v = np.full((n_words, n_tags), -np.inf)
        # backpointer[t, j] = index of the previous tag on that best path
        backpointer = np.zeros((n_words, n_tags), dtype=int)

        # Step 1: initialization (t = 0): log(pi[j]) + log(b[j][word])
        v[0] = log_init + self._log_emissions(tags, sentence[0])

        # Step 2: recursion (t = 1 ... T-1)
        for t in range(1, n_words):
            # scores[i, j] = v[t-1, i] + log a[i][j]
            scores = v[t - 1][:, np.newaxis] + log_trans
            # argmax returns the first maximum, i.e. the lowest-index previous tag on ties
            backpointer[t] = scores.argmax(axis=0)
            v[t] = scores.max(axis=0) + self._log_emissions(tags, sentence[t])

        # Step 3: termination
        best_idx = int(np.argmax(v[n_words - 1]))

        # Step 4: backtrace from the last position to the first
        best_tags = [tags[best_idx]]
        for t in range(n_words - 1, 0, -1):
            best_idx = int(backpointer[t, best_idx])
            best_tags.append(tags[best_idx])

        best_tags.reverse()
        return best_tags

    def tag(self, sentence):
        """
        Tag a sentence.

        Args:
            sentence: list of words, or a string (split on whitespace)

        Returns:
            list of (word, tag) tuples; words keep their original casing
        """
        if isinstance(sentence, str):
            sentence = sentence.split()

        tags = self.viterbi(sentence)
        return list(zip(sentence, tags))

    def evaluate(self, test_sents):
        """
        Evaluate token-level accuracy on a tagged test set.

        Args:
            test_sents: list of tagged sentences, each a list of (word, tag)

        Returns:
            accuracy: fraction of tokens whose predicted tag equals the gold
                tag, or 0 when there are no tokens
        """
        correct = 0
        total = 0

        for sent in test_sents:
            words = [w for w, _ in sent]
            gold_tags = [t for _, t in sent]
            pred_tags = self.viterbi(words)

            for gold, pred in zip(gold_tags, pred_tags):
                if gold == pred:
                    correct += 1
                total += 1

        return correct / total if total > 0 else 0

## 2.4. Train and Evaluate HMM Tagger

In [5]:
# Build a tagger with smoothing_k=1.0 and fit it on the training split;
# train() prints the vocabulary size and tag count when it finishes.
hmm_tagger = HMMTagger(smoothing_k=1.0)
hmm_tagger.train(train_data)

Training completed!
  Vocabulary size: 45,755
  Number of tags: 12


In [6]:
# Test trên một vài câu
test_sentences = [
    "The cat sat on the mat",
    "I want to learn natural language processing",
    "The quick brown fox jumps over the lazy dog",
]

print("Sample Tagging Results:")

for sent in test_sentences:
    tagged = hmm_tagger.tag(sent)
    print(f"\nSentence: {sent}")
    print("Tags:")
    for word, tag in tagged:
        print(f"  {word:15} -> {tag}")

Sample Tagging Results:

Sentence: The cat sat on the mat
Tags:
  The             -> DET
  cat             -> NOUN
  sat             -> VERB
  on              -> ADP
  the             -> DET
  mat             -> NOUN

Sentence: I want to learn natural language processing
Tags:
  I               -> PRON
  want            -> VERB
  to              -> PRT
  learn           -> VERB
  natural         -> ADJ
  language        -> NOUN
  processing      -> VERB

Sentence: The quick brown fox jumps over the lazy dog
Tags:
  The             -> DET
  quick           -> ADJ
  brown           -> NOUN
  fox             -> NOUN
  jumps           -> VERB
  over            -> ADP
  the             -> DET
  lazy            -> ADJ
  dog             -> NOUN


In [7]:
print("Evaluating on test set...")

# Only the first 1000 test sentences are scored, not the whole test split.
test_subset = test_data[:1000]
accuracy = hmm_tagger.evaluate(test_subset)

print(f"\nAccuracy on test subset ({len(test_subset)} sentences): {accuracy:.2%}")

Evaluating on test set...

Accuracy on test subset (1000 sentences): 93.17%


## 2.5. Analyzing Transition and Emission Probabilities

In [8]:
# (previous tag, next tag) pairs to inspect; the trailing comments give an
# example word pair for each transition.
interesting_transitions = [
    ('DET', 'NOUN'),    # the cat
    ('DET', 'ADJ'),     # the big
    ('ADJ', 'NOUN'),    # big cat
    ('NOUN', 'VERB'),   # cat runs
    ('VERB', 'DET'),    # runs the
    ('PRON', 'VERB'),   # I run
]

# Print the smoothed P(tag_j | tag_i) learned by the tagger for each pair.
for tag_i, tag_j in interesting_transitions:
    prob = hmm_tagger.get_transition_prob(tag_i, tag_j)
    print(f"  P({tag_j} | {tag_i}) = {prob:.4f}")

  P(NOUN | DET) = 0.6190
  P(ADJ | DET) = 0.2453
  P(NOUN | ADJ) = 0.6644
  P(VERB | NOUN) = 0.1561
  P(DET | VERB) = 0.1636
  P(VERB | PRON) = 0.7170


In [9]:
print("\nTop 5 words for each tag:")

# Iterate tags alphabetically; <START> is skipped because it emits no words.
for tag in sorted(hmm_tagger.tags):
    if tag == '<START>':
        continue
    
    # Five most frequent words for this tag, taken from the raw emission
    # counts (not the smoothed probabilities), formatted as word(count).
    tag_words = hmm_tagger.emission_counts[tag].most_common(5)
    words_str = ", ".join([f"{w}({c})" for w, c in tag_words])
    print(f"  {tag:6}: {words_str}")


Top 5 words for each tag:
  .     : ,(48491), .(39534), ``(6160), ''(6119), ;(4884)
  ADJ   : new(1528), other(1507), first(923), many(921), such(913)
  ADP   : of(32940), in(18607), to(9686), for(8380), with(6153)
  ADV   : not(3933), when(1879), so(1222), only(1154), more(1077)
  CONJ  : and(24278), or(3798), but(3311), either(175), nor(169)
  DET   : the(61188), a(19375), his(5215), this(4567), an(3285)
  NOUN  : time(1286), af(994), man(879), years(843), state(772)
  NUM   : one(2374), two(1224), three(534), 1(525), 2(443)
  PRON  : it(6949), he(6685), i(3107), they(2899), we(2291)
  PRT   : to(12732), all(2277), there(1827), up(1224), out(1189)
  VERB  : is(9670), was(7334), be(5724), are(4188), had(3511)
  X     : de(63), la(34), et(25), comedie(14), quo(11)


## 2.6. Error Analysis

In [10]:
def analyze_errors(tagger, test_sents, n_samples=5, max_sents=500):
    """
    Collect every token the tagger gets wrong, with a little context.

    Args:
        tagger: object exposing ``viterbi(words) -> list of tags``
        test_sents: list of tagged sentences, each a list of (word, tag)
        n_samples: unused; kept only so existing positional calls keep working
        max_sents: number of leading sentences to check (default 500, the
            value that used to be hard-coded); None checks all of them

    Returns:
        list of dicts with keys 'word', 'gold', 'pred' and 'context', in
        corpus order. 'context' is the word with up to two neighbours on
        each side, joined by spaces.
    """
    errors = []

    for sent in test_sents[:max_sents]:
        words = [w for w, _ in sent]
        gold_tags = [t for _, t in sent]
        pred_tags = tagger.viterbi(words)

        for i, (word, gold, pred) in enumerate(zip(words, gold_tags, pred_tags)):
            if gold != pred:
                # Window of two tokens either side, clipped at the sentence start.
                context = " ".join(words[max(0, i - 2):i + 3])
                errors.append({
                    'word': word,
                    'gold': gold,
                    'pred': pred,
                    'context': context
                })

    return errors

# Collect mistakes on the test split (analyze_errors only looks at its first
# 500 sentences).
errors = analyze_errors(hmm_tagger, test_data)

print(f"Total errors found: {len(errors)}")
print("\nSample Errors:")

# Show the first 10 errors with gold tag, predicted tag and nearby words.
for err in errors[:10]:
    print(f"  Word: '{err['word']}'")
    print(f"  Gold: {err['gold']} | Predicted: {err['pred']}")
    print(f"  Context: ...{err['context']}...")
    print()

Total errors found: 499

Sample Errors:
  Word: 'because'
  Gold: ADP | Predicted: ADV
  Context: ...important , because of the...

  Word: 'beech'
  Gold: NOUN | Predicted: ADJ
  Context: ...of the beech pasture ....

  Word: 'that'
  Gold: DET | Predicted: ADP
  Context: ...what's that ? ?...

  Word: 'as-it-were'
  Gold: ADV | Predicted: VERB
  Context: ...gave me as-it-were the spirit...

  Word: 'demoniac'
  Gold: ADJ | Predicted: NOUN
  Context: ..., the demoniac , evil...

  Word: 'whole'
  Gold: NOUN | Predicted: ADJ
  Context: ...of this whole affair ....

  Word: 'registrar'
  Gold: NOUN | Predicted: ADV
  Context: ...am also registrar ....

  Word: 'blows'
  Gold: VERB | Predicted: NOUN
  Context: ...live wind blows , and...

  Word: 'water-line'
  Gold: NOUN | Predicted: ADJ
  Context: ...below the water-line interests me...

  Word: 'interests'
  Gold: VERB | Predicted: NOUN
  Context: ...the water-line interests me also...



In [11]:
# Confusion analysis
# Confusion analysis: confusion[gold][pred] = number of tokens whose gold tag
# was `gold` but which were tagged `pred`. Built from the `errors` list, so
# correct predictions are not represented.
# (defaultdict comes from the import cell at the top of the notebook.)
confusion = defaultdict(lambda: defaultdict(int))

for err in errors:
    confusion[err['gold']][err['pred']] += 1

print("Most common confusions:")

# Flatten to (gold, pred, count) triples and order by count, largest first;
# the sort is stable, so equal counts keep their insertion order.
confusion_list = [
    (gold, pred, count)
    for gold, preds in confusion.items()
    for pred, count in preds.items()
]
confusion_list.sort(key=lambda x: x[2], reverse=True)

for gold, pred, count in confusion_list[:10]:
    print(f"  {gold} -> {pred}: {count} times")

Most common confusions:
  VERB -> NOUN: 42 times
  PRT -> ADP: 38 times
  NOUN -> VERB: 32 times
  NOUN -> PRON: 31 times
  ADJ -> ADV: 30 times
  NOUN -> ADJ: 29 times
  ADJ -> NOUN: 22 times
  NOUN -> DET: 20 times
  ADV -> ADJ: 19 times
  ADP -> ADV: 17 times
