## Autocorrect System & HMM POS Tagger

In [None]:
import numpy as np
from collections import defaultdict, Counter
import re
import string

import nltk
nltk.download('brown')
nltk.download('universal_tagset')
nltk.download('punkt')

from nltk.corpus import brown

# PART 1: AUTOCORRECT SYSTEM

## 1.1. Building the Vocabulary from the Corpus

In [None]:
def load_corpus():
    """
    Read the NLTK Brown corpus and normalise its tokens.

    Tokens that are not purely alphabetic (per ``str.isalpha``) are dropped;
    the remaining tokens are lowercased.

    Returns:
        words: list of lowercased alphabetic tokens, in corpus order
    """
    cleaned = []
    for token in brown.words():
        # Filter on the raw token first, then lowercase the survivors.
        if token.isalpha():
            cleaned.append(token.lower())
    return cleaned

def build_vocab(words):
    """
    Collect the distinct words of a corpus.

    Args:
        words: iterable of word strings

    Returns:
        set containing every distinct word exactly once
    """
    vocabulary = set()
    vocabulary.update(words)
    return vocabulary

def count_word_freq(words):
    """
    Tally how many times each word occurs.

    Args:
        words: iterable of word strings

    Returns:
        word_freq: Counter object mapping {word: count}
    """
    frequencies = Counter()
    frequencies.update(words)
    return frequencies

# Load the corpus once, then derive the vocabulary set and the frequency
# table from the same token list so the two stay consistent.
corpus_words = load_corpus()
vocab = build_vocab(corpus_words)
word_freq = count_word_freq(corpus_words)

# Report basic corpus statistics (counts printed with thousands separators).
print(f"Total words in corpus: {len(corpus_words):,}")
print(f"Vocabulary size: {len(vocab):,}")
print(f"\nTop 10 most frequent words:")
for word, count in word_freq.most_common(10):
    print(f"  {word}: {count:,}")

## 1.2. Minimum Edit Distance

In [None]:
def min_edit_distance(source, target):
    """
    Calculate the minimum edit (Levenshtein) distance between two strings.

    Insertions, deletions and substitutions each cost 1; a matching
    character costs 0.

    Args:
        source: source string
        target: target string

    Returns:
        distance: minimum number of edits turning source into target,
            as a built-in ``int``
    """
    # Each DP row depends only on the row above it, so two rows are enough.
    # previous[j] = edit distance between source[:i-1] and target[:j].
    # Row 0: inserting j characters turns the empty string into target[:j].
    previous = list(range(len(target) + 1))

    for i, source_char in enumerate(source, start=1):
        # Column 0: deleting i characters turns source[:i] into the empty string.
        current = [i]
        for j, target_char in enumerate(target, start=1):
            sub_cost = 0 if source_char == target_char else 1
            current.append(min(
                previous[j] + 1,             # Delete
                current[j - 1] + 1,          # Insert
                previous[j - 1] + sub_cost,  # Substitute/Match
            ))
        previous = current

    return previous[-1]

# Classic word pairs used to sanity-check min_edit_distance
# (unit cost for insert, delete and substitute).
test_cases = [
    ("cat", "cut"),     # expected distance 1
    ("intention", "execution"),  # expected distance 5
    ("kitten", "sitting"),       # expected distance 3
    ("sunday", "saturday"),      # expected distance 3
]

print("Testing Edit Distance:")
for source, target in test_cases:
    dist = min_edit_distance(source, target)
    print(f"  '{source}' -> '{target}': {dist}")

## 1.3. Backtrace

In [None]:
def min_edit_distance_with_backtrace(source, target):
    """
    Calculate the edit distance and one optimal sequence of edit operations.

    Insertions, deletions and substitutions each cost 1. When several
    operations are equally cheap for a cell, the preference order is
    match/substitute, then insert, then delete.

    Args:
        source: source string
        target: target string

    Returns:
        distance: minimum edit distance, as a built-in ``int``
        operations: list of ``(operation, detail)`` pairs in left-to-right
            order. ``operation`` is one of 'MATCH', 'SUBSTITUTE', 'INSERT',
            'DELETE'. ``detail`` is the affected character, or ``"a->b"``
            for a substitution of ``a`` by ``b``.
    """
    # Backpointer codes: which move produced the optimal cost of a cell.
    MATCH, SUBSTITUTE, INSERT, DELETE = range(4)

    m, n = len(source), len(target)

    # D[i][j] = edit distance between source[:i] and target[:j]
    D = [[0] * (n + 1) for _ in range(m + 1)]
    backpointer = [[MATCH] * (n + 1) for _ in range(m + 1)]

    # First column: only deletions can reach an empty target prefix.
    for i in range(1, m + 1):
        D[i][0] = i
        backpointer[i][0] = DELETE

    # First row: only insertions can build target from an empty source.
    for j in range(1, n + 1):
        D[0][j] = j
        backpointer[0][j] = INSERT

    # Fill tables
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            is_match = source[i - 1] == target[j - 1]

            diagonal_cost = D[i - 1][j - 1] + (0 if is_match else 1)
            insert_cost = D[i][j - 1] + 1
            delete_cost = D[i - 1][j] + 1

            best = min(diagonal_cost, insert_cost, delete_cost)
            D[i][j] = best

            # The order of these checks defines the tie-break preference.
            if best == diagonal_cost:
                backpointer[i][j] = MATCH if is_match else SUBSTITUTE
            elif best == insert_cost:
                backpointer[i][j] = INSERT
            else:
                backpointer[i][j] = DELETE

    # Backtrace from the bottom-right cell to the origin. The boundary
    # backpointers set above steer the walk along row 0 / column 0, so no
    # special-casing of i == 0 or j == 0 is needed here.
    operations = []
    i, j = m, n
    while i > 0 or j > 0:
        move = backpointer[i][j]
        if move == MATCH:
            operations.append(('MATCH', source[i - 1]))
            i -= 1
            j -= 1
        elif move == SUBSTITUTE:
            operations.append(('SUBSTITUTE', f"{source[i-1]}->{target[j-1]}"))
            i -= 1
            j -= 1
        elif move == INSERT:
            operations.append(('INSERT', target[j - 1]))
            j -= 1
        else:  # DELETE
            operations.append(('DELETE', source[i - 1]))
            i -= 1

    # Operations were collected end-to-start; present them start-to-end.
    operations.reverse()
    return D[m][n], operations

# Show the edit script for one example pair; MATCH steps are skipped so
# only the actual edits are printed.
source, target = "intention", "execution"
dist, ops = min_edit_distance_with_backtrace(source, target)
print(f"\nEdit Distance: '{source}' -> '{target}' = {dist}")
print("Operations:")
for op, char in ops:
    if op != 'MATCH':
        print(f"  {op}: {char}")

## 1.4. Candidate Generation

In [None]:
def edits_one(word):
    """
    Produce every string obtainable from word by a single edit.

    The edits are: deleting one character, swapping two adjacent
    characters, replacing one character with a lowercase letter, and
    inserting one lowercase letter. Because a character may be replaced
    by itself, the result also contains word (when word is non-empty).

    Args:
        word: input string

    Returns:
        set of candidate strings
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()

    # Walk over every cut point, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insert: "abc" -> "xabc", "axbc", "abxc", "abcx"
        for letter in alphabet:
            results.add(head + letter + tail)

        if not tail:
            # Nothing to delete, replace or swap at the end of the word.
            continue

        # Delete: "abc" -> "bc", "ac", "ab"
        results.add(head + tail[1:])

        # Replace: "abc" -> "xbc", "axc", "abx"
        for letter in alphabet:
            results.add(head + letter + tail[1:])

        # Transpose: "abc" -> "bac", "acb"
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])

    return results

def edits_two(word):
    """
    Produce every string obtainable from word by two successive single edits.

    Each result of edits_one(word) is expanded with edits_one again and the
    results are pooled into one set.

    Args:
        word: input string

    Returns:
        set of candidate strings
    """
    reachable = set()
    for first_pass in edits_one(word):
        reachable.update(edits_one(first_pass))
    return reachable

# Demonstrate how quickly the candidate space grows with edit distance.
test_word = "cat"
e1 = edits_one(test_word)
e2 = edits_two(test_word)

print(f"Word: '{test_word}'")
print(f"Edits distance 1: {len(e1)} words")
print(f"Edits distance 2: {len(e2)} words")
# e1 is a set, so which 10 items are shown is not a fixed order.
print(f"\nSample edits_one: {list(e1)[:10]}")

## 1.5. Autocorrect Class

In [None]:
class Autocorrect:
    """
    Spelling corrector based on edit distance and a unigram language model.

    Candidates are vocabulary words within edit distance 1 (preferred) or 2
    of the input. They are ranked by Laplace-smoothed unigram probability;
    ties are broken alphabetically so results are reproducible across runs.
    """

    _LETTERS = 'abcdefghijklmnopqrstuvwxyz'

    def __init__(self, vocab, word_freq):
        """
        Initialize with vocabulary and word frequencies.

        Args:
            vocab: set of valid words
            word_freq: mapping (e.g. Counter) of word -> occurrence count
        """
        self.vocab = vocab
        self.word_freq = word_freq
        self.total_words = sum(word_freq.values())

    def _edits_one(self, word):
        """Return the set of strings one edit away from word.

        Edits are single-character deletes, adjacent transposes, replaces
        and inserts over the lowercase ASCII alphabet.
        """
        letters = self._LETTERS
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

        deletes = [left + right[1:] for left, right in splits if right]
        transposes = [left + right[1] + right[0] + right[2:]
                      for left, right in splits if len(right) > 1]
        replaces = [left + c + right[1:]
                    for left, right in splits if right
                    for c in letters]
        inserts = [left + c + right
                   for left, right in splits
                   for c in letters]

        return set(deletes + transposes + replaces + inserts)

    def _edits_two(self, word):
        """Return the set of strings reachable by two successive single edits."""
        return {e2 for e1 in self._edits_one(word) for e2 in self._edits_one(e1)}

    def _known(self, words):
        """Filter words that are in vocabulary."""
        return {w for w in words if w in self.vocab}

    def get_candidates(self, word):
        """
        Generate the candidate set, in order of priority:

        1. the word itself, if it is in the vocabulary;
        2. vocabulary words one edit away;
        3. vocabulary words two edits away;
        4. the word itself, unchanged, when nothing else is known.
        """
        if word in self.vocab:
            return {word}

        candidates = self._known(self._edits_one(word))
        if candidates:
            return candidates

        # Filter while generating: the full distance-2 set is very large,
        # and only the (few) vocabulary hits need to be kept in memory.
        candidates = self._known(
            e2 for e1 in self._edits_one(word) for e2 in self._edits_one(e1)
        )
        if candidates:
            return candidates

        return {word}

    def word_probability(self, word):
        """
        Calculate P(word) = Language Model probability.
        Use unigram model with Laplace smoothing.
        """
        # P(word) = (count(word) + 1) / (total_words + vocab_size)
        return (self.word_freq.get(word, 0) + 1) / (self.total_words + len(self.vocab))

    def _rank_key(self, candidate):
        """Sort key: highest probability first, then alphabetical.

        The alphabetical component makes ranking independent of set
        iteration order, which varies between runs for strings.
        """
        return (-self.word_probability(candidate), candidate)

    def correct(self, word):
        """
        Return the candidate with the highest probability.

        The noisy channel model is argmax P(c) * P(w|c); the error model
        P(w|c) is not modelled here, so only P(c) is used. The input is
        lowercased first. Equally probable candidates are resolved
        alphabetically.
        """
        candidates = self.get_candidates(word.lower())
        return min(candidates, key=self._rank_key)

    def correct_with_scores(self, word, top_k=5):
        """
        Return the top candidates with their probabilities.

        Args:
            word: word to correct (lowercased before lookup)
            top_k: maximum number of candidates to return (default 5)

        Returns:
            list of (candidate, probability) pairs, best first
        """
        candidates = self.get_candidates(word.lower())
        ranked = sorted(candidates, key=self._rank_key)
        return [(c, self.word_probability(c)) for c in ranked[:top_k]]

# Build the corrector from the corpus-derived vocabulary and frequencies.
autocorrector = Autocorrect(vocab, word_freq)

# Misspellings to try; the trailing comment on each line is the intended word.
test_words = [
    "speling",     # spelling
    "correc",      # correct
    "naturla",     # natural
    "languge",     # language
    "computr",     # computer
    "helo",        # hello
    "teh",         # the
]

print("Autocorrect Results:")
for word in test_words:
    correction = autocorrector.correct(word)
    print(f"  '{word}' -> '{correction}'")

# Show the ranked candidates and their probabilities for one input.
print("Detailed scores for 'speling':")
for word, score in autocorrector.correct_with_scores("speling"):
    print(f"  {word}: {score:.6f}")

## 1.6. Evaluation

In [None]:
# Evaluation pairs: (misspelled input, expected correction).
test_set = [
    ("teh", "the"),
    ("woudl", "would"),
    ("peolpe", "people"),
    ("becuase", "because"),
    ("recieve", "receive"),
    ("occured", "occurred"),
    ("seperate", "separate"),
    ("definately", "definitely"),
    ("goverment", "government"),
    ("enviroment", "environment"),
]

correct_count = 0
print("Evaluation Results:")

for misspelled, correct in test_set:
    prediction = autocorrector.correct(misspelled)
    is_correct = prediction == correct
    # is_correct is a bool; adding it counts True as 1 and False as 0.
    correct_count += is_correct
    
    status = "CORRECT" if is_correct else "INCORRECT"
    print(f"  {status} '{misspelled}' -> '{prediction}' (expected: '{correct}')")

# Accuracy as a percentage of exactly matching predictions.
accuracy = correct_count / len(test_set) * 100
print(f"Accuracy: {correct_count}/{len(test_set)} = {accuracy:.1f}%")