### Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import model_selection

### Load Dataset & Pre Processing

In [3]:
def process_tag(tag):
    tag = [t.split("-")[1] if t != 'O' else t for t in tag]
    tag = ["<s>"] + tag
    return tag

def process_sentence(sentence):
    sentence = [sent.lower() for sent in sentence]
    sentence = ["<s>"] + sentence # Add Start Token
    return sentence

def process_data(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tags = df.groupby("Sentence #")["Tag"].apply(list).values
    
    sentences = [process_sentence(sentence) for sentence in sentences]
    tags = [process_tag(tag) for tag in tags]
    
    return sentences, tags

data_path = r"ner_dataset.csv"
sentences, tags = process_data(data_path)

## Hidden Markov Model

In [4]:
# No of Tags
# <s> - Start Token

tag = set()
for t in tags:
    tag.update(set(t))
    
print("Number of Tags :", len(tag))
print("Tags :", list(tag))

Number of Tags : 10
Tags : ['O', 'nat', 'eve', 'geo', 'tim', 'art', '<s>', 'gpe', 'per', 'org']


### Transition Matrix

In [5]:
def create_transition_matrix(tags):
    
    # Bigram
    bigram = {}
    for tag in tags:
        for idx in range(len(tag)-1):
            b_tuple = (tag[idx], tag[idx+1])
            if(bigram.get(b_tuple, -1) == -1):
                bigram[b_tuple] = 1
            else:
                bigram[b_tuple] += 1
              
    # Tags    
    tag = set()
    for t in tags:
        tag.update(set(t))
    no_tag = len(tag)
    tag = list(tag)
    
    # Transition Matrix
    transition_matrix = pd.DataFrame(np.zeros((no_tag, no_tag)), index=tag, columns=tag)
    
    # Populate Transition Matrix
    for tag_first in tag:
        for tag_second in tag:
            transition_matrix[tag_first][tag_second] = bigram.get((tag_first, tag_second), 0)
            
    transition_matrix = transition_matrix / transition_matrix.sum(axis=0)
    
    return transition_matrix
    
transition_matrix = create_transition_matrix(tags)
transition_matrix.T

Unnamed: 0,O,nat,eve,geo,tim,art,<s>,gpe,per,org
O,0.889694,0.000223,0.000324,0.040195,0.021812,0.000437,0.0,0.014839,0.012882,0.019595
nat,0.781746,0.202381,0.0,0.003968,0.003968,0.0,0.0,0.0,0.007937,0.0
eve,0.5375,0.0,0.451786,0.0,0.010714,0.0,0.0,0.0,0.0,0.0
geo,0.807015,0.0,2.2e-05,0.164686,0.020525,8.9e-05,0.0,0.003243,0.002888,0.001533
tim,0.745251,3.7e-05,0.000819,0.004655,0.243128,0.000149,0.0,0.002123,0.001899,0.001937
art,0.548711,0.0,0.0,0.002865,0.012894,0.425501,0.0,0.0,0.005731,0.004298
<s>,0.715403,0.000229,0.000209,0.069539,0.010738,0.000375,0.0,0.062324,0.083801,0.057382
gpe,0.86904,0.0,6.2e-05,0.008465,0.00193,6.2e-05,0.0,0.012324,0.081601,0.026516
per,0.470184,5.8e-05,0.0,0.006895,0.005843,0.0,0.0,0.00187,0.504017,0.011132
org,0.513422,0.0,5.4e-05,0.001192,0.008776,0.000217,0.0,0.004036,0.017661,0.454641


### Emission Matrix

In [6]:
def create_emission_matrix(sentences, tags):
                
    # Vocab
    vocab = set()
    for s in sentences:
        vocab.update(set(s))
    no_vocab = len(vocab)
    vocab = list(vocab)
    
    # Tags
    tag = set()
    for t in tags:
        tag.update(set(t))
    tag = list(tag)
    tag.remove("<s>")
    no_tag = len(tag)
    
    # Emission Matrix
    emission_matrix = pd.DataFrame(np.zeros((no_vocab, no_tag)), index=vocab, columns=tag)
    
    # Populate Transition Matrix
    no = len(sentences)
    pair = {}
    
    for i in range(no):
        for idx in range(len(sentences[i])):
            t = tags[i][idx] 
            v = sentences[i][idx]
            if(pair.get((t, v), -1) == -1):
                pair[(t, v)] = 1
            else:
                pair[(t, v)] += 1
    
    for (t, v), val in pair.items():
        if(t == "<s>"):
            continue
        emission_matrix[t][v] = val
    
    emission_matrix = emission_matrix / emission_matrix.sum(axis=0)
    
    return emission_matrix

emission_matrix = create_emission_matrix(sentences, tags)
emission_matrix.T

Unnamed: 0,fraught,stubbornly,gubernatorial,pro-secular,yake,mules,excuse,guarantee,6.1,branco,...,10-person,kofoworola,hangout,mistakenly,adi,qazi,thanh,bleeding,needs,frelimo
O,1e-06,2e-06,3e-06,2e-06,0.0,2e-06,1e-05,1.2e-05,6e-06,0.0,...,1e-06,0.0,1e-06,1.5e-05,0.0,0.0,0.0,8e-06,0.000104,0.0
nat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eve,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
geo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.2e-05,0.0,0.0,4.4e-05,0.0,4.4e-05,0.0,0.0,0.0
tim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
art,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gpe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
per,0.0,0.0,0.0,0.0,2.9e-05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000175,0.0,0.0,0.0,0.0
org,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.1e-05


### Viterbi Algorithm

In [7]:
def viterbo_algorithm(transition_matrix, emission_matrix, sent):
    
    start_tok = "<s>"
    tags = list(emission_matrix.columns)

    matrix = pd.DataFrame(np.zeros((len(tags), len(sent))), index=tags)
    trace_back = pd.DataFrame(np.full((len(tags), len(sent)), -1), index = tags)
    
    result = [0] * len(sent)
    
    # Initial Prob
    for t in tags:
        word = sent[0]
        matrix[0][t] = transition_matrix[start_tok][t] * emission_matrix[t][word]
    
    # Forward Pass 
    for idx in range(1, len(sent)):
        prev_idx = idx - 1
        word = sent[idx]
        
        for tag in tags:
            possible_path = []
            for t in tags:
                possible_path.append(matrix[prev_idx][t] * transition_matrix[t][tag] * emission_matrix[tag][word])
            matrix[idx][tag] = max(possible_path)
            trace_back[idx][tag] = np.argmax(possible_path)
    
    # Backward Pass
    idx = len(sent) - 1
    last_idx = np.argmax(matrix[idx])
    
    while(last_idx != -1):
        result[idx] = tags[last_idx]
        last_idx = trace_back[idx][tags[last_idx]]
        idx -= 1
        
    return result

In [8]:
viterbo_algorithm(transition_matrix, emission_matrix, ["london", "is", "famous", "for", "monday"])

['geo', 'O', 'O', 'O', 'tim']

In [9]:
import nltk

# Define the text to be analyzed
text = "GeeksforGeeks is a recognised platform for online learning in India"

# Tokenize the text into words
tokens = nltk.word_tokenize(text)

# Apply part-of-speech tagging to the tokens
tagged = nltk.pos_tag(tokens)

# Apply named entity recognition to the tagged words
entities = nltk.chunk.ne_chunk(tagged)

# Print the entities found in the text
for entity in entities:
	if hasattr(entity, 'label') and entity.label() == 'ORGANIZATION':
		print(entity.label(),'-->', ''.join(c[0] for c in entity))
	elif hasattr(entity, 'label') and entity.label() == 'GPE':
		print(entity.label(), '-->',''.join(c[0] for c in entity))

ORGANIZATION --> GeeksforGeeks
GPE --> India


In [10]:
entities

ModuleNotFoundError: No module named 'svgling'

Tree('S', [Tree('ORGANIZATION', [('GeeksforGeeks', 'NNP')]), ('is', 'VBZ'), ('a', 'DT'), ('recognised', 'JJ'), ('platform', 'NN'), ('for', 'IN'), ('online', 'NN'), ('learning', 'NN'), ('in', 'IN'), Tree('GPE', [('India', 'NNP')])])