In [19]:
import json
import numpy as np
from collections import defaultdict
import time

In [25]:
def load_data(json_file):
    """Load raw data from JSON file."""
    with open(json_file, 'r') as f:
        data = json.load(f)
    return [
        (sentence.split(), tags) 
        for sentence, tags in data
    ]

def split_data(sentences, test_ratio=0.2):
    """Split data into train/test sets."""
    np.random.seed(42)
    indices = np.random.permutation(len(sentences))
    split_idx = int(len(sentences) * (1 - test_ratio))
    return [sentences[i] for i in indices[:split_idx]], [sentences[i] for i in indices[split_idx:]]

def map_to_4tags(tag):
    """Collapse 36 tags into 4 categories."""
    if tag.startswith('N'): return 'N'
    elif tag.startswith('V'): return 'V'
    elif tag.startswith('JJ') or tag.startswith('RB'): return 'A'
    else: return 'O'

def preprocess_4tag(sentences):
    """Convert tags in sentences to 4 categories."""
    return [
        ([word.lower() for word in words], [map_to_4tags(tag) for tag in tags])
        for (words, tags) in sentences
    ]


In [26]:
def create_mappings(sentences):
    """Generate tag/word indices from training data."""
    all_tags, all_words = set(), set()
    for words, tags in sentences:
        all_tags.update(tags)
        all_words.update(words)
    tag2idx = {tag: i for i, tag in enumerate(all_tags)}
    word2idx = {word: i for i, word in enumerate(all_words)}
    idx2tag = {i: tag for tag, i in tag2idx.items()}
    return tag2idx, word2idx, idx2tag, all_tags, all_words


In [27]:
def train_model(train_data, tag2idx, word2idx):
    """Train HMM with numpy matrices (add-1 smoothing)."""
    num_tags = len(tag2idx)
    num_words = len(word2idx)
    
    # Initialize matrices
    transition = np.ones((num_tags, num_tags))
    initial = np.ones(num_tags)
    emission = np.ones((num_tags, num_words))
    
    for words, tags in train_data:
        prev_tag_idx = None
        for i, (word, tag) in enumerate(zip(words, tags)):
            word_idx = word2idx.get(word.lower(), -1)
            tag_idx = tag2idx[tag]
            
            if i == 0:
                initial[tag_idx] += 1
            else:
                transition[prev_tag_idx, tag_idx] += 1
            
            if word_idx != -1:
                emission[tag_idx, word_idx] += 1
            
            prev_tag_idx = tag_idx
    
    # Normalize probabilities
    initial = np.log(initial / initial.sum())
    transition = np.log(transition / transition.sum(axis=1, keepdims=True))
    emission = np.log(emission / emission.sum(axis=1, keepdims=True))
    
    return initial, transition, emission

In [28]:
def viterbi(sentence_words, initial, transition, emission, tag2idx, word2idx):
    """Fast Viterbi decoding with numpy."""
    n = len(sentence_words)
    num_tags = len(tag2idx)
    
    # Precompute word indices
    word_indices = [word2idx.get(word.lower(), -1) for word in sentence_words]
    
    # Initialize DP tables
    viterbi = np.zeros((n, num_tags)) + initial
    backpointers = np.zeros((n, num_tags), dtype=int)
    
    # First word
    if word_indices[0] != -1:
        viterbi[0] += emission[:, word_indices[0]]
    
    # Iterate
    for t in range(1, n):
        emit = emission[:, word_indices[t]] if word_indices[t] != -1 else 0
        scores = viterbi[t-1][:, None] + transition + emit
        viterbi[t] = np.max(scores, axis=0)
        backpointers[t] = np.argmax(scores, axis=0)
    
    # Backtrack
    best_path = [np.argmax(viterbi[-1])]
    for t in reversed(range(1, n)):
        best_path.insert(0, backpointers[t, best_path[0]])
    
    return best_path


In [None]:
def get_top_probabilities(matrix, idx2tag, top_k=5, is_transition=False):
    """Extract top probabilities from numpy matrix."""
    results = {}
    for i in range(matrix.shape[0]):
        tag = idx2tag[i]
        probs = matrix[i]
        top_indices = np.argsort(probs)[-top_k:][::-1]
        results[tag] = [
            (idx2tag[j] if is_transition else j, np.exp(probs[j]))  # Fixed parenthesis
            for j in top_indices
        ]
    return results

def print_tag_probabilities(initial_probs, transition_probs, emission_probs, 
                           idx2tag, word2idx, top_k=5):
    """Print probabilities in readable format."""
    # Reverse word index for lookup
    idx2word = {v: k for k, v in word2idx.items()}
    
    print("\n=== Initial Probabilities ===")
    initial_exp = np.exp(initial_probs)
    for tag_idx in np.argsort(initial_exp)[::-1][:top_k]:
        print(f"{idx2tag[tag_idx]}: {initial_exp[tag_idx]:.4f}")
    
    print("\n=== Top Transition Probabilities ===")
    trans_top = get_top_probabilities(np.exp(transition_probs), idx2tag, top_k, True)
    for tag, probs in list(trans_top.items())[:3]:  # Print first 3 tags for brevity
        print(f"From {tag}:")
        for target_tag, prob in probs:
            print(f"  → {target_tag}: {prob:.4f}")
    
    print("\n=== Top Emission Probabilities ===")
    emit_top = get_top_probabilities(np.exp(emission_probs), idx2tag, top_k)
    for tag, probs in list(emit_top.items())[:3]:  # Print first 3 tags
        print(f"Tag {tag}:")
        for word_idx, prob in probs:
            word = idx2word.get(word_idx, "UNK")
            print(f"  - {word}: {prob:.4f}")



36-Tag Model Probabilities

=== Initial Probabilities ===
DT: 0.2448
NNP: 0.1981
IN: 0.1363
PRP: 0.0861
CC: 0.0536

=== Top Transition Probabilities ===
From IN:
  → DT: 1.3763
  → NNP: 1.1591
  → NN: 1.1171
  → JJ: 1.1039
  → CD: 1.0939
From UH:
  → IN: 1.0500
  → PRP: 1.0500
  → VBG: 1.0247
  → -RRB-: 1.0247
  → JJR: 1.0247
From VBZ:
  → VBN: 1.1858
  → DT: 1.1786
  → RB: 1.0983
  → IN: 1.0970
  → NNP: 1.0757

=== Top Emission Probabilities ===
Tag IN:
  - of: 1.0862
  - in: 1.0635
  - for: 1.0315
  - on: 1.0184
  - that: 1.0184
Tag UH:
  - ``no.'': 1.0001
  - ``mindful: 1.0001
  - Viacom's: 1.0001
  - reviewing: 1.0001
  - Force: 1.0001
Tag VBZ:
  - is: 1.0323
  - has: 1.0164
  - says: 1.0080
  - isn't: 1.0022
  - says.: 1.0019

4-Tag Model Probabilities

=== Initial Probabilities ===
O: 0.5764
N: 0.2957
A: 0.0986
V: 0.0293

=== Top Transition Probabilities ===
From N:
  → O: 1.5032
  → N: 1.4347
  → V: 1.1957
  → A: 1.0541
From O:
  → N: 1.4569
  → O: 1.3708
  → V: 1.1768
  → A: 1

In [29]:
sentences = load_data("penn-data.json")
train_36, test_36 = split_data(sentences)

# Preprocess for 4-tag
train_4 = preprocess_4tag(train_36)
test_4 = preprocess_4tag(test_36)

# ------------------- 36-Tag Configuration ------------------- #
print("Training 36-tag model...")
tag2idx_36, word2idx_36, idx2tag_36, _, _ = create_mappings(train_36)
initial_36, trans_36, emit_36 = train_model(train_36, tag2idx_36, word2idx_36)

# Predict
preds_36 = []
for words, _ in test_36:
    path = viterbi(words, initial_36, trans_36, emit_36, tag2idx_36, word2idx_36)
    preds_36.append([idx2tag_36[idx] for idx in path])

# Evaluate
correct_36 = sum(1 for (_, tags), pred in zip(test_36, preds_36) for t, p in zip(tags, pred) if t == p)
total_36 = sum(len(tags) for (_, tags) in test_36)
print(f"36-Tag Accuracy: {correct_36 / total_36:.4f}")

# ------------------- 4-Tag Configuration ------------------- #
print("\nTraining 4-tag model...")
tag2idx_4, word2idx_4, idx2tag_4, _, _ = create_mappings(train_4)
initial_4, trans_4, emit_4 = train_model(train_4, tag2idx_4, word2idx_4)

# Predict
preds_4 = []
for words, _ in test_4:
    path = viterbi(words, initial_4, trans_4, emit_4, tag2idx_4, word2idx_4)
    preds_4.append([idx2tag_4[idx] for idx in path])

# Evaluate
correct_4 = sum(1 for (_, tags), pred in zip(test_4, preds_4) for t, p in zip(tags, pred) if t == p)
total_4 = sum(len(tags) for (_, tags) in test_4)
print(f"4-Tag Accuracy: {correct_4 / total_4:.4f}")


Training 36-tag model...
36-Tag Accuracy: 0.7286

Training 4-tag model...
4-Tag Accuracy: 0.8818


In [32]:
print("\n" + "="*40)
print("36-Tag Model Probabilities")
print("="*40)
print_tag_probabilities(
    initial_36, 
    trans_36, 
    emit_36, 
    idx2tag_36, 
    word2idx_36
)

# ------------------- 4-Tag Probabilities ------------------- #
print("\n" + "="*40)
print("4-Tag Model Probabilities")
print("="*40)
print_tag_probabilities(
    initial_4, 
    trans_4, 
    emit_4, 
    idx2tag_4, 
    word2idx_4
)


36-Tag Model Probabilities

=== Initial Probabilities ===
DT: 0.2448
NNP: 0.1981
IN: 0.1363
PRP: 0.0861
CC: 0.0536

=== Top Transition Probabilities ===
From IN:
  → DT: 1.3763
  → NNP: 1.1591
  → NN: 1.1171
  → JJ: 1.1039
  → CD: 1.0939
From UH:
  → IN: 1.0500
  → PRP: 1.0500
  → VBG: 1.0247
  → -RRB-: 1.0247
  → JJR: 1.0247
From VBZ:
  → VBN: 1.1858
  → DT: 1.1786
  → RB: 1.0983
  → IN: 1.0970
  → NNP: 1.0757

=== Top Emission Probabilities ===
Tag IN:
  - of: 1.0862
  - in: 1.0635
  - for: 1.0315
  - on: 1.0184
  - that: 1.0184
Tag UH:
  - ``no.'': 1.0001
  - ``mindful: 1.0001
  - Viacom's: 1.0001
  - reviewing: 1.0001
  - Force: 1.0001
Tag VBZ:
  - is: 1.0323
  - has: 1.0164
  - says: 1.0080
  - isn't: 1.0022
  - says.: 1.0019

4-Tag Model Probabilities

=== Initial Probabilities ===
O: 0.5764
N: 0.2957
A: 0.0986
V: 0.0293

=== Top Transition Probabilities ===
From N:
  → O: 1.5032
  → N: 1.4347
  → V: 1.1957
  → A: 1.0541
From O:
  → N: 1.4569
  → O: 1.3708
  → V: 1.1768
  → A: 1