In [11]:
import numpy as np
from collections import defaultdict
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import sys
import math

# --- 1. Data Parsing ---

def parse_sentences(file_path):
    """
    Parse a POS-tagged corpus file into a list of sentences.

    Every non-blank line of the file is treated as one sentence made of
    whitespace-separated ``word_TAG`` tokens. The tag is whatever follows
    the LAST underscore, so words that themselves contain underscores
    (e.g. ``a_b_NN``) are handled correctly.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.
        Malformed tokens (no underscore, empty word, or empty tag) are
        skipped, and sentences left with no valid token are dropped.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (raised by ``open``).
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            sentence = []
            # split() with no argument splits on any run of whitespace, so
            # tabs and repeated spaces separate tokens instead of being
            # glued into a neighbouring word.
            for pair in line.split():
                # rpartition splits on the last underscore only.
                word, separator, tag = pair.rpartition('_')
                if not separator or not word or not tag:
                    continue  # Skip tokens that are not of the form word_TAG

                sentence.append((word, tag))

            if sentence:
                sentences.append(sentence)
    return sentences

# --- 2. HMM Model Training (Task b) ---

def train_hmm(train_sentences, smoothing_k=1):
    """
    Estimate add-k smoothed HMM parameters from tagged sentences.

    Transitions are counted from a synthetic "START" state into the first
    tag of each sentence and then tag-to-tag; emissions are counted per
    (tag, word). All probabilities are stored as natural-log values.
    A short sample of the learned probabilities is printed to stdout.

    Args:
        train_sentences: Iterable of sentences, each an iterable of
            (word, tag) pairs.
        smoothing_k: Additive smoothing constant applied to every count.

    Returns:
        (transition_probs, emission_probs, all_tags) where
        transition_probs[prev_tag][tag] and emission_probs[tag][word] are
        log probabilities, emission_probs[tag]["<UNK>"] is the log
        probability reserved for unseen words, and all_tags is the set of
        tags observed in training.
    """
    trans_freq = defaultdict(lambda: defaultdict(int))
    emit_freq = defaultdict(lambda: defaultdict(int))
    tag_freq = defaultdict(int)
    all_tags = set()
    vocab = set()

    # --- Pass over the corpus: tally transitions and emissions ---
    for tagged_sentence in train_sentences:
        previous = "START"
        for word, tag in tagged_sentence:
            vocab.add(word)
            all_tags.add(tag)
            tag_freq[tag] += 1
            trans_freq[previous][tag] += 1
            emit_freq[tag][word] += 1
            previous = tag

    # Smoothing mass added to each denominator: one slot per tag for
    # transitions, one slot per vocabulary word plus one for "<UNK>" for
    # emissions.
    transition_smoothing = smoothing_k * len(all_tags)
    emission_smoothing = smoothing_k * (len(vocab) + 1)

    # --- Transition log probabilities ("START" row first, then each tag) ---
    transition_probs = defaultdict(lambda: defaultdict(float))
    shown_transitions = []
    for source in ["START", *all_tags]:
        outgoing = trans_freq[source]
        denominator = sum(outgoing.values()) + transition_smoothing
        for target in all_tags:
            seen = outgoing[target]
            p = (seen + smoothing_k) / denominator
            transition_probs[source][target] = np.log(p)
            # Only transitions actually observed are candidates for display.
            if seen > 0:
                shown_transitions.append((f"{source} -> {target}", p))

    # --- Emission log probabilities ---
    emission_probs = defaultdict(lambda: defaultdict(float))
    shown_emissions = []
    for tag in all_tags:
        denominator = tag_freq[tag] + emission_smoothing
        word_freq = emit_freq[tag]
        row = emission_probs[tag]
        for word in vocab:
            seen = word_freq[word]
            p = (seen + smoothing_k) / denominator
            row[word] = np.log(p)
            # Only reasonably frequent pairs are candidates for display.
            if seen > 5:
                shown_emissions.append((f"P('{word}'|{tag})", p))
        # Unseen words get the zero-count smoothed probability for this tag.
        row["<UNK>"] = np.log(smoothing_k / denominator)

    # --- Display a sample of the model ---
    rule = "=" * 40
    print("\n" + rule)
    print(" MODEL PROBABILITIES (Sample)")
    print(rule)

    print("\n[Top 10 Transition Probabilities (Tag -> Tag)]")
    top_transitions = sorted(shown_transitions, key=lambda item: item[1], reverse=True)
    for label, p in top_transitions[:10]:
        print(f"{label:<20} : {p:.6f}")

    print("\n[Sample 10 Emission Probabilities (Word | Tag)]")
    top_emissions = sorted(shown_emissions, key=lambda item: item[1], reverse=True)
    for label, p in top_emissions[:10]:
        print(f"{label:<20} : {p:.6f}")
    print(rule + "\n")

    return transition_probs, emission_probs, all_tags

# --- 3. Viterbi Algorithm (Task c) ---

def viterbi_decode(sentence_words, all_tags, transition_probs, emission_probs):
    """
    Predict the most likely tag sequence for a sentence using Viterbi.

    Args:
        sentence_words: Sequence of word strings to tag.
        all_tags: Collection of candidate tags; they are processed in
            sorted order, which also fixes how ties are broken (the
            earliest tag in sorted order wins).
        transition_probs: Mapping such that transition_probs[prev][cur]
            is the log probability of moving from tag ``prev`` to tag
            ``cur``; must also contain a "START" row.
        emission_probs: Mapping such that emission_probs[tag][word] is
            the log probability of ``tag`` emitting ``word``; each tag
            row is consulted under the "<UNK>" key for words it lacks.

    Returns:
        A list of tags, one per word, in sentence order. An empty
        sentence yields an empty list.
    """
    num_words = len(sentence_words)
    if num_words == 0:
        # Nothing to tag; avoids indexing the first word of an empty sentence.
        return []

    tags_list = sorted(all_tags)
    num_tags = len(tags_list)

    def emission_scores(word):
        # Log emission of `word` under every tag, falling back to "<UNK>"
        # for tags whose emission table does not contain the word.
        scores = np.empty(num_tags)
        for j, tag in enumerate(tags_list):
            tag_emissions = emission_probs[tag]
            key = word if word in tag_emissions else "<UNK>"
            scores[j] = tag_emissions[key]
        return scores

    # The transition table does not depend on the position in the sentence,
    # so it is looked up once: rows are previous tags, columns current tags.
    transition_matrix = np.array(
        [[transition_probs[prev_tag][current_tag] for current_tag in tags_list]
         for prev_tag in tags_list],
        dtype=float,
    ).reshape(num_tags, num_tags)
    start_scores = np.array(
        [transition_probs["START"][tag] for tag in tags_list], dtype=float
    )

    viterbi = np.full((num_tags, num_words), -np.inf)
    backpointers = np.zeros((num_tags, num_words), dtype=int)
    columns = np.arange(num_tags)

    # Initialization
    viterbi[:, 0] = start_scores + emission_scores(sentence_words[0])

    # Recursion
    for t in range(1, num_words):
        # candidates[i, j] = score of reaching tag j at step t via tag i.
        candidates = (
            viterbi[:, t - 1][:, np.newaxis]
            + transition_matrix
            + emission_scores(sentence_words[t])
        )
        # argmax returns the first maximum, i.e. the earliest previous tag.
        best_prev = candidates.argmax(axis=0)
        backpointers[:, t] = best_prev
        viterbi[:, t] = candidates[best_prev, columns]

    # Termination & Backtracking (collect back-to-front, then reverse once)
    current_tag_index = int(np.argmax(viterbi[:, -1]))
    path_indices = [current_tag_index]
    for t in range(num_words - 1, 0, -1):
        current_tag_index = int(backpointers[current_tag_index, t])
        path_indices.append(current_tag_index)
    path_indices.reverse()

    return [tags_list[index] for index in path_indices]

# --- 4. Main Execution ---

def main(file_path="wsj_pos_tagged_en.txt", k_folds=5):
    """
    Run k-fold cross-validation of the HMM tagger and print the results.

    For each fold the model is trained with train_hmm on the training
    sentences, every test sentence is decoded with viterbi_decode, and a
    per-tag classification report is printed. The mean accuracy over all
    folds is printed at the end.

    Args:
        file_path: Path of the tagged corpus passed to parse_sentences.
        k_folds: Number of cross-validation folds.

    Returns:
        None. Problems with the input (missing file, no sentences, fewer
        sentences than folds) are reported on stdout and end the run early.
    """
    print(f"Loading data from {file_path}...")
    try:
        sentences = parse_sentences(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return

    if not sentences:
        print("Error: No sentences parsed.")
        return

    print(f"Found {len(sentences)} sentences.")
    if len(sentences) < k_folds:
        print(f"Error: Need at least {k_folds} sentences for {k_folds}-fold cross-validation.")
        return
    print(f"Starting {k_folds}-Fold Cross-Validation...\n")

    # Fixed seed so the fold assignment is reproducible between runs.
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    fold_reports = []

    # Split over sentence positions and index the plain list, rather than
    # converting the ragged nested list into a NumPy object array.
    sentence_positions = np.arange(len(sentences))
    for fold, (train_index, test_index) in enumerate(kf.split(sentence_positions)):
        print(f"--- Fold {fold+1}/{k_folds} ---")

        train_sentences = [sentences[i] for i in train_index]
        test_sentences = [sentences[i] for i in test_index]

        # Train and Display Probabilities
        transition_probs, emission_probs, all_tags = train_hmm(train_sentences)

        print("Predicting tags on test set...")
        all_true_tags = []
        all_pred_tags = []

        for sentence in test_sentences:
            words = [word for word, _ in sentence]
            true_tags = [tag for _, tag in sentence]

            pred_tags = viterbi_decode(words, all_tags, transition_probs, emission_probs)

            all_true_tags.extend(true_tags)
            all_pred_tags.extend(pred_tags)

        # Evaluate: text form for display, dict form for the final average.
        report = classification_report(all_true_tags, all_pred_tags, zero_division=0)
        print(f"Results for Fold {fold+1}:")
        print(report)
        fold_reports.append(classification_report(all_true_tags, all_pred_tags, output_dict=True, zero_division=0))

    # Final Averages
    print("\n--- Average Performance Across All Folds ---")
    avg_accuracy = np.mean([r['accuracy'] for r in fold_reports])
    print(f"Average Accuracy:  {avg_accuracy:.4f}")

# Run the cross-validation experiment only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading data from wsj_pos_tagged_en.txt...
Found 3914 sentences.
Starting 5-Fold Cross-Validation...

--- Fold 1/5 ---

 MODEL PROBABILITIES (Sample)

[Top 10 Transition Probabilities (Tag -> Tag)]
$ -> CD              : 0.920312
MD -> VB             : 0.762516
. -> ''              : 0.709302
TO -> VB             : 0.568917
DT -> NN             : 0.466272
JJ -> NN             : 0.446996
PRP$ -> NN           : 0.404545
POS -> NN            : 0.384505
NNP -> NNP           : 0.383098
EX -> VBZ            : 0.370690

[Sample 10 Emission Probabilities (Word | Tag)]
P(','|,)             : 0.269457
P('.'|.)             : 0.222174
P('the'|DT)          : 0.187954
P('to'|TO)           : 0.137630
P('of'|IN)           : 0.100764
P('and'|CC)          : 0.096017
P('a'|DT)            : 0.085983
P('in'|IN)           : 0.067015
P(''s'|POS)          : 0.052762
P('$'|$)             : 0.052585

Predicting tags on test set...
Results for Fold 1:
              precision    recall  f1-score   support

      