In [13]:
import numpy as np
import pandas as pd  # Added pandas for CSV handling
from collections import defaultdict
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import sys

# --- 1. Data Parsing ---

def parse_sentences(file_path):
    """
    Parse a POS-tagged corpus file into a list of sentences.

    The file is read line by line; every non-blank line is treated as one
    sentence whose tokens are separated by whitespace and written as
    ``word_TAG``. Each token is split on its LAST underscore, so words
    that themselves contain underscores are kept intact.

    Args:
        file_path: Path to the UTF-8 encoded tagged text file.

    Returns:
        A list of sentences. Each sentence is a list of (word, tag) tuples.
        Lines that yield no well-formed token contribute no sentence.

    Raises:
        FileNotFoundError: Propagated from open() when file_path is missing.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            sentence = []
            # split() with no argument separates on any run of whitespace
            # (spaces, tabs, ...) and never yields empty strings, so blank
            # lines simply produce no tokens.
            for pair in line.split():
                # rpartition splits on the last underscore; `sep` is empty
                # when the token contains no underscore at all.
                word, sep, tag = pair.rpartition('_')
                if not sep or not word or not tag:
                    # Malformed token (no underscore, or an empty word/tag):
                    # skip it so empty strings never enter the tag set or
                    # the vocabulary.
                    continue
                sentence.append((word, tag))

            if sentence:
                sentences.append(sentence)
    return sentences


# --- 2. HMM Model Training (Task b) ---

def train_hmm(train_sentences, smoothing_k=1):
    """
    Estimate add-k smoothed HMM parameters from tagged sentences.

    Args:
        train_sentences: Iterable of sentences, each an iterable of
            (word, tag) pairs.
        smoothing_k: Pseudo-count added to every transition and emission
            count before normalising.

    Returns:
        A (transition_probs, emission_probs, all_tags) tuple where
        - transition_probs[prev_tag][tag] is logP(tag | prev_tag); the
          sentence-initial state is stored under the key "START",
        - emission_probs[tag][word] is logP(word | tag), with an extra
          "<UNK>" entry per tag for words unseen in training,
        - all_tags is the set of tags observed in the training data.
    """
    start_state = "START"
    unk_token = "<UNK>"

    # --- Tally raw counts in a single pass over the corpus ---
    trans_tally = defaultdict(lambda: defaultdict(int))
    emit_tally = defaultdict(lambda: defaultdict(int))
    tag_totals = defaultdict(int)
    all_tags = set()
    vocab = set()

    for sent in train_sentences:
        previous = start_state  # every sentence begins in the START state
        for token, label in sent:
            vocab.add(token)
            all_tags.add(label)
            tag_totals[label] += 1
            trans_tally[previous][label] += 1
            emit_tally[label][token] += 1
            previous = label

    # Total pseudo-count mass added to each denominator by add-k smoothing.
    # The emission vocabulary is one larger than observed to make room
    # for the <UNK> token.
    tag_smoothing_mass = smoothing_k * len(all_tags)
    word_smoothing_mass = smoothing_k * (len(vocab) + 1)

    # --- Transition log-probabilities: START first, then every tag ---
    transition_probs = defaultdict(lambda: defaultdict(float))
    for source in [start_state, *all_tags]:
        outgoing = trans_tally[source]
        denominator = sum(outgoing.values()) + tag_smoothing_mass
        for target in all_tags:
            smoothed = (outgoing[target] + smoothing_k) / denominator
            transition_probs[source][target] = np.log(smoothed)

    # --- Emission log-probabilities, plus the <UNK> fallback per tag ---
    emission_probs = defaultdict(lambda: defaultdict(float))
    for label in all_tags:
        denominator = tag_totals[label] + word_smoothing_mass
        observed = emit_tally[label]
        row = emission_probs[label]
        for token in vocab:
            smoothed = (observed[token] + smoothing_k) / denominator
            row[token] = np.log(smoothed)
        # An unseen word has a raw count of zero, leaving only the pseudo-count.
        row[unk_token] = np.log(smoothing_k / denominator)

    return transition_probs, emission_probs, all_tags


# --- 3. Helper Function to Save to CSV ---

def save_model_to_csv(transition_probs, emission_probs, fold_num):
    """
    Write the transition and emission tables of one fold to CSV files.

    The log-probabilities are converted back to ordinary probabilities
    before saving. Two files are written to the current working directory:
    ``transitions_fold_<fold_num>.csv`` (rows = previous tag, columns = next
    tag) and ``emissions_fold_<fold_num>.csv`` (rows = tag, columns = word).

    Args:
        transition_probs: Nested mapping prev_tag -> tag -> log-probability.
        emission_probs: Nested mapping tag -> word -> log-probability.
        fold_num: Fold identifier embedded in the output file names.

    Returns:
        None.
    """
    print(f"  -> Saving probability tables for Fold {fold_num}...")

    # --- Save Transition Probabilities ---
    # DataFrame() puts the outer dict keys on the columns, so transpose to
    # get Rows=Prev Tag, Cols=Next Tag.
    df_trans = pd.DataFrame(transition_probs).T
    # Apply np.exp to the whole frame at once: this is vectorised and does
    # not rely on DataFrame.map, which is unavailable before pandas 2.1.
    # Cells missing from the nested dict come out as NaN and are written as 0.
    df_trans = np.exp(df_trans).fillna(0)

    trans_filename = f"transitions_fold_{fold_num}.csv"
    df_trans.to_csv(trans_filename)

    # --- Save Emission Probabilities ---
    # Same layout trick: Rows=Tag, Cols=Word after the transpose.
    df_emit = pd.DataFrame(emission_probs).T
    df_emit = np.exp(df_emit).fillna(0)

    emit_filename = f"emissions_fold_{fold_num}.csv"
    df_emit.to_csv(emit_filename)
    print(f"     Saved '{trans_filename}' and '{emit_filename}'")


# --- 4. Viterbi Algorithm (Task c) ---

def viterbi_decode(sentence_words, all_tags, transition_probs, emission_probs):
    """
    Predict the most likely tag sequence for a sentence with Viterbi.

    Args:
        sentence_words: Sequence of word strings to tag.
        all_tags: Collection of candidate tags; they are processed in
            sorted order.
        transition_probs: Nested mapping prev_tag -> tag -> log-probability,
            with the sentence-initial state stored under "START".
        emission_probs: Nested mapping tag -> word -> log-probability. A
            word missing for a tag falls back to that tag's "<UNK>" entry.

    Returns:
        A list of tags, one per input word. An empty sentence yields [].

    Raises:
        ValueError: If the sentence is non-empty but all_tags is empty.
    """
    num_words = len(sentence_words)
    if num_words == 0:
        # Nothing to tag; avoids indexing sentence_words[0] below.
        return []

    tags_list = sorted(all_tags)
    num_tags = len(tags_list)
    if num_tags == 0:
        raise ValueError("viterbi_decode requires at least one tag in all_tags")

    def emission_scores(word):
        """Return logP(word | tag) for every tag, using <UNK> where unseen."""
        scores = np.empty(num_tags)
        for idx, tag in enumerate(tags_list):
            tag_emissions = emission_probs[tag]
            key = word if word in tag_emissions else "<UNK>"
            scores[idx] = tag_emissions[key]
        return scores

    # viterbi[i, t]: best log-score of any path ending in tag i at word t.
    # backpointers[i, t]: index of the previous tag on that best path.
    viterbi = np.full((num_tags, num_words), -np.inf)
    backpointers = np.zeros((num_tags, num_words), dtype=int)

    # --- Initialization Step ---
    start_scores = np.array([transition_probs["START"][tag] for tag in tags_list])
    viterbi[:, 0] = start_scores + emission_scores(sentence_words[0])

    # --- Recursion Step ---
    if num_words > 1:
        # Dense [prev_tag, current_tag] matrix, built once per sentence so
        # the per-word maximisation runs inside numpy rather than in a
        # Python N x N loop.
        transition_matrix = np.array(
            [[transition_probs[prev_tag][tag] for tag in tags_list]
             for prev_tag in tags_list]
        )
        for t in range(1, num_words):
            # candidate[i, j]: score of reaching tag j at word t from tag i.
            # The emission term is added before maximising so that ties are
            # resolved on exactly the same sums as a per-pair loop would use.
            candidate = (
                viterbi[:, t - 1][:, None]
                + transition_matrix
                + emission_scores(sentence_words[t])[None, :]
            )
            # argmax returns the first maximal index, i.e. the lowest
            # previous-tag index wins ties.
            backpointers[:, t] = np.argmax(candidate, axis=0)
            viterbi[:, t] = candidate.max(axis=0)

    # --- Termination Step ---
    path_indices = [int(np.argmax(viterbi[:, -1]))]

    # --- Backtracking ---
    # Collect indices from last word to first, then reverse once; this
    # avoids the quadratic cost of repeatedly inserting at the list head.
    for t in range(num_words - 1, 0, -1):
        path_indices.append(int(backpointers[path_indices[-1], t]))
    path_indices.reverse()

    return [tags_list[idx] for idx in path_indices]


# --- 5. Main Execution: K-Fold CV and Evaluation (Tasks a & d) ---

def main(file_path="wsj_pos_tagged_en.txt", k_folds=5):
    """
    Run K-fold cross-validation of the HMM POS tagger and print the results.

    For every fold the model is trained on the training split, its
    probability tables are saved to CSV, the held-out sentences are tagged
    with Viterbi decoding, and a per-tag classification report is printed.
    Accuracy and macro-averaged precision/recall/F1, averaged over all
    folds, are printed at the end.

    Args:
        file_path: Path to the tagged corpus file.
        k_folds: Number of cross-validation folds.

    Returns:
        None. If the file is missing or yields no sentences, an error
        message is printed and the function returns early.
    """
    print(f"Loading and parsing data from {file_path}...")
    try:
        sentences = parse_sentences(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        # Name the path that was actually tried rather than a fixed file name.
        print(f"Please make sure '{file_path}' is in the same directory.")
        return  # return instead of sys.exit so a notebook kernel stays alive

    if not sentences:
        print("Error: No sentences were parsed. Check file format.")
        return

    print(f"Found {len(sentences)} sentences.")
    print(f"Starting {k_folds}-Fold Cross-Validation...\n")

    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    # Split over sentence positions and index the original list. Converting
    # the ragged sentence list to an object ndarray changes shape when all
    # sentences happen to have the same length, so that step is avoided.
    sentence_positions = np.arange(len(sentences))

    fold_reports = []

    for fold, (train_index, test_index) in enumerate(kf.split(sentence_positions)):
        print(f"--- Fold {fold+1}/{k_folds} ---")

        train_sentences = [sentences[i] for i in train_index]
        test_sentences = [sentences[i] for i in test_index]

        # --- Task b: Train the HMM ---
        print("Training HMM (calculating probabilities)...")
        transition_probs, emission_probs, all_tags = train_hmm(train_sentences)

        # Persist this fold's probability tables for inspection.
        save_model_to_csv(transition_probs, emission_probs, fold+1)

        # --- Tasks c & d: Predict and Evaluate ---
        print("Predicting tags on test set...")
        all_true_tags = []
        all_pred_tags = []

        for sentence in test_sentences:
            words = [word for word, _ in sentence]
            true_tags = [tag for _, tag in sentence]

            # Task c: Viterbi Decode
            pred_tags = viterbi_decode(words, all_tags, transition_probs, emission_probs)

            all_true_tags.extend(true_tags)
            all_pred_tags.extend(pred_tags)

        # Task d: Evaluate Performance for this fold.
        # The text report is printed; the dict form is kept for averaging.
        report = classification_report(all_true_tags, all_pred_tags, zero_division=0)
        print(f"Results for Fold {fold+1}:")
        print(report)
        fold_reports.append(
            classification_report(
                all_true_tags, all_pred_tags, output_dict=True, zero_division=0
            )
        )

    # --- Final Averaged Results ---
    print("\n--- Average Performance Across All Folds ---")

    avg_precision = np.mean([r['macro avg']['precision'] for r in fold_reports])
    avg_recall = np.mean([r['macro avg']['recall'] for r in fold_reports])
    avg_f1 = np.mean([r['macro avg']['f1-score'] for r in fold_reports])
    avg_accuracy = np.mean([r['accuracy'] for r in fold_reports])

    print(f"Average Accuracy:  {avg_accuracy:.4f}")
    print(f"Average Precision (Macro): {avg_precision:.4f}")
    print(f"Average Recall (Macro):    {avg_recall:.4f}")
    print(f"Average F1-Score (Macro):  {avg_f1:.4f}")


# Run the cross-validation experiment only when this module is the entry
# point, so importing it does not trigger the run.
if __name__ == "__main__":
    main()

Loading and parsing data from wsj_pos_tagged_en.txt...
Found 3914 sentences.
Starting 5-Fold Cross-Validation...

--- Fold 1/5 ---
Training HMM (calculating probabilities)...
  -> Saving probability tables for Fold 1...
     Saved 'transitions_fold_1.csv' and 'emissions_fold_1.csv'
Predicting tags on test set...
Results for Fold 1:
              precision    recall  f1-score   support

           #       0.00      0.00      0.00         1
           $       0.76      1.00      0.86       129
          ''       0.75      1.00      0.86       148
           ,       0.95      1.00      0.98       953
       -LRB-       1.00      0.54      0.70        24
       -RRB-       0.95      0.74      0.83        27
           .       0.90      1.00      0.95       782
           :       1.00      0.89      0.94       105
          CC       0.99      0.99      0.99       451
          CD       0.95      0.71      0.81       717
          DT       0.79      0.98      0.88      1627
          EX     