# New Section

In [19]:

import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from tqdm import tqdm

file_path = "/Assignment9.csv"
print("Loading dataset...")
df = pd.read_csv(file_path)
print("Dataset loaded successfully.")

sentence_starts = df[df["Sentence #"].notna()].index
train_cutoff = sentence_starts[47700] if len(sentence_starts) > 47700 else len(df)
test_start = sentence_starts[-259] if len(sentence_starts) >= 259 else 0

train_df = df.iloc[:train_cutoff]
test_df = df.iloc[test_start:]
print(f"Training set: {len(train_df)} rows")
print(f"Testing set: {len(test_df)} rows")

states = train_df["NE Tag"].unique()
observations = train_df["POS"].unique()
print(f"State space (NE Tags): {len(states)} unique tags")
print(f"Observation space (POS Tags): {len(observations)} unique tags")

first_tags = train_df.loc[train_df["Sentence #"].notna(), "NE Tag"]
initial_counts = Counter(first_tags)
total_sentences = len(first_tags)
Pi = {tag: count / total_sentences for tag, count in initial_counts.items()}
print("Initial state distribution (Pi) computed.")

pd.DataFrame(list(Pi.items()), columns=["State", "Probability"]).to_csv("Pi.csv", index=False)

transition_counts = defaultdict(lambda: Counter())
prev_tag = None
print("Computing transition probabilities...")
for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Transition Probabilities"):
    if pd.notna(row["Sentence #"]):
        prev_tag = None
    if prev_tag is not None:
        transition_counts[prev_tag][row["NE Tag"]] += 1
    prev_tag = row["NE Tag"]

total_transitions = {tag: sum(transition_counts[tag].values()) for tag in states}
A = {tag: {next_tag: count / total_transitions[tag] for next_tag, count in transition_counts[tag].items()} for tag in states if total_transitions[tag] > 0}
print("State transition probabilities (A) computed.")

pd.DataFrame([(s1, s2, prob) for s1, trans in A.items() for s2, prob in trans.items()],
             columns=["From_State", "To_State", "Probability"]).to_csv("A.csv", index=False)

print("Computing observation probabilities...")
emission_counts = defaultdict(lambda: Counter())
for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Observation Probabilities"):
    emission_counts[row["NE Tag"]][row["POS"]] += 1

total_emissions = {tag: sum(emission_counts[tag].values()) for tag in states}
B = {tag: {obs: count / total_emissions[tag] for obs, count in emission_counts[tag].items()} for tag in states if total_emissions[tag] > 0}
print("State observation probabilities (B) computed.")

pd.DataFrame([(s, o, prob) for s, obs in B.items() for o, prob in obs.items()],
             columns=["State", "Observation", "Probability"]).to_csv("B.csv", index=False)

def forward_algorithm(test_sentence):
    alpha = [{}]

    for state in states:
        alpha[0][state] = Pi.get(state, 1e-6) * B.get(state, {}).get(test_sentence[0], 0.001)

    for t in range(1, len(test_sentence)):
        alpha.append({})
        for state in states:
            alpha[t][state] = sum(
                alpha[t-1][prev] * A.get(prev, {}).get(state, 1e-6) * B.get(state, {}).get(test_sentence[t], 0.01)
                for prev in states
            )

    return sum(alpha[-1].values())

print("Evaluating test sentences...")
test_sentences = test_df.groupby("Sentence #")["POS"].apply(list).tolist()
test_probabilities = [forward_algorithm(sentence) for sentence in tqdm(test_sentences, desc="Processing")]

print("Saving test sentence probabilities...")
pd.DataFrame({"Sentence #": range(1, len(test_probabilities) + 1), "Probability": test_probabilities}).to_csv("Test_Probabilities.csv", index=False)

print("\nTest Sentence Probabilities:")
for i, prob in enumerate(test_probabilities):
    print(f"Sentence {i+1}: Probability = {prob:.6f}")


Loading dataset...
Dataset loaded successfully.
Training set: 1042948 rows
Testing set: 5627 rows
State space (NE Tags): 17 unique tags
Observation space (POS Tags): 42 unique tags
Initial state distribution (Pi) computed.
Computing transition probabilities...


Transition Probabilities: 100%|██████████| 1042948/1042948 [00:56<00:00, 18573.08it/s]


State transition probabilities (A) computed.
Computing observation probabilities...


Observation Probabilities: 100%|██████████| 1042948/1042948 [00:52<00:00, 19854.28it/s]


State observation probabilities (B) computed.
Evaluating test sentences...


Processing: 100%|██████████| 259/259 [00:00<00:00, 57075.85it/s]

Saving test sentence probabilities...

Test Sentence Probabilities:
Sentence 1: Probability = 0.080196
Sentence 2: Probability = 0.080196
Sentence 3: Probability = 0.080196
Sentence 4: Probability = 0.117401
Sentence 5: Probability = 0.221256
Sentence 6: Probability = 0.007236
Sentence 7: Probability = 0.221256
Sentence 8: Probability = 0.105228
Sentence 9: Probability = 0.221256
Sentence 10: Probability = 0.221256
Sentence 11: Probability = 0.011014
Sentence 12: Probability = 0.011014
Sentence 13: Probability = 0.080196
Sentence 14: Probability = 0.221256
Sentence 15: Probability = 0.080196
Sentence 16: Probability = 0.117401
Sentence 17: Probability = 0.221256
Sentence 18: Probability = 0.011014
Sentence 19: Probability = 0.080196
Sentence 20: Probability = 0.016561
Sentence 21: Probability = 0.065535
Sentence 22: Probability = 0.065535
Sentence 23: Probability = 0.080196
Sentence 24: Probability = 0.018787
Sentence 25: Probability = 0.221256
Sentence 26: Probability = 0.105228
Sente


