In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
df = pd.read_csv('NER dataset.csv', encoding = 'ISO-8859-1')

In [5]:
df.drop(['Sentence #'], axis=1, inplace=True)

In [7]:
df

Unnamed: 0,Word,POS,Tag
0,Thousands,NNS,O
1,of,IN,O
2,demonstrators,NNS,O
3,have,VBP,O
4,marched,VBN,O
...,...,...,...
1048570,they,PRP,O
1048571,responded,VBD,O
1048572,to,TO,O
1048573,the,DT,O


In [8]:
from collections import defaultdict

def train_hmm(tagged_data):
    # Initialize matrices
    emission_matrix = defaultdict(lambda: defaultdict(int))
    transition_matrix = defaultdict(lambda: defaultdict(int))
    
    # Count occurrences of word-tag pairs and transitions
    for sentence in tagged_data:
        prev_tag = '<s>'  # special tag for the start of a sentence
        for word, tag in sentence:
            emission_matrix[tag][word] += 1
            transition_matrix[prev_tag][tag] += 1
            prev_tag = tag
    
    # Normalize matrices to get probabilities
    for tag in emission_matrix:
        total_emission = sum(emission_matrix[tag].values())
        for word in emission_matrix[tag]:
            emission_matrix[tag][word] /= total_emission
    
    for prev_tag in transition_matrix:
        total_transition = sum(transition_matrix[prev_tag].values())
        for tag in transition_matrix[prev_tag]:
            transition_matrix[prev_tag][tag] /= total_transition
    
    return emission_matrix, transition_matrix

def print_matrix(matrix, matrix_name):
    print(f"\n{matrix_name} Matrix:")
    for row in matrix:
        print(row, end='\t')
        for col in matrix[row]:
            print(f"{col}:{matrix[row][col]:.4f}", end='\t')
        print()

def viterbi(sentence, emission_matrix, transition_matrix):
    # Initialize Viterbi matrix
    viterbi_matrix = [defaultdict(float) for _ in range(len(sentence))]
    
    # Initialize backpointer matrix
    backpointer_matrix = [defaultdict(int) for _ in range(len(sentence))]
    
    # Initialize the first column of the Viterbi matrix
    for tag in emission_matrix:
        viterbi_matrix[0][tag] = transition_matrix['<s>'][tag] * emission_matrix[tag][sentence[0]]
    
    # Fill in the rest of the Viterbi matrix
    for t in range(1, len(sentence)):
        for tag in emission_matrix:
            max_prob = 0
            max_prev_tag = None
            for prev_tag in emission_matrix:
                prob = viterbi_matrix[t-1][prev_tag] * transition_matrix[prev_tag][tag] * emission_matrix[tag][sentence[t]]
                if prob > max_prob:
                    max_prob = prob
                    max_prev_tag = prev_tag
            viterbi_matrix[t][tag] = max_prob
            backpointer_matrix[t][tag] = max_prev_tag
    
    # Find the tag for the last word
    max_last_prob = 0
    max_last_tag = None
    for tag in emission_matrix:
        prob = viterbi_matrix[-1][tag]
        if prob > max_last_prob:
            max_last_prob = prob
            max_last_tag = tag
    
    # Backtrace to find the optimal path
    pos_tags = [max_last_tag]
    for t in range(len(sentence) - 1, 0, -1):
        pos_tags.insert(0, backpointer_matrix[t][pos_tags[0]])
    
    return pos_tags

# Example usage
tagged_dataset = [
    [('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')],
    [('A', 'DT'), ('bird', 'NN'), ('is', 'VBZ'), ('singing', 'VBG')],
]

emission_matrix, transition_matrix = train_hmm(tagged_dataset)

print_matrix(emission_matrix, "Emission")
print_matrix(transition_matrix, "Transition")

# POS tagging for a new sentence
new_sentence = ['The', 'dog', 'is', 'barking']
predicted_tags = viterbi(new_sentence, emission_matrix, transition_matrix)

print("\nPredicted POS tags for the new sentence:")
for word, tag in zip(new_sentence, predicted_tags):
    print(f"{word}/{tag}", end=' ')



Emission Matrix:
DT	The:0.3333	the:0.3333	A:0.3333	
NN	cat:0.3333	mat:0.3333	bird:0.3333	
VBZ	is:1.0000	
IN	on:1.0000	
VBG	singing:1.0000	

Transition Matrix:
<s>	DT:1.0000	
DT	NN:1.0000	
NN	VBZ:1.0000	
VBZ	IN:0.5000	VBG:0.5000	
IN	DT:1.0000	

Predicted POS tags for the new sentence:
The/0 dog/0 is/0 barking/None 