In [1]:
import random
import os
import re

from sklearn.model_selection import GroupShuffleSplit
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn.metrics import *
from hmmlearn import hmm
from typing import List
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
SEP       = os.path.sep
ROOT_PATH = SEP.join(os.getcwd().split(SEP)[:-2])
DATA_PATH = f'{ROOT_PATH}/Dataset/NameEntity'

In [3]:
df = pd.read_csv(f'{DATA_PATH}/NERdataset.csv', encoding = 'latin1')
df = df.fillna(method  = 'ffill')
df = df.rename(columns = {'Sentence #' : 'sentence'})
df.head()

Unnamed: 0,sentence,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [4]:
def pre_processing(text_column):
    
    text_column = text_column.str.lower()
    text_column = text_column.str.replace(r'\d+', 'NUM')
    
    stop_words  = set(stopwords.words('english'))
    text_column = text_column.apply(lambda x: ' '.join([word for word in x.split() 
                                                        if word not in stop_words]))
    return text_column

In [5]:
preprocessed_df = pre_processing(df.Word)
preprocessed_df.head()

  text_column = text_column.str.replace(r'\d+', 'NUM')


0        thousands
1                 
2    demonstrators
3                 
4          marched
Name: Word, dtype: object

In [6]:
df_ = df
df_['Word'] = preprocessed_df

df_ = df_[(df_['Word'] !='') | (df_['Word'].isna())]
df_.head()

Unnamed: 0,sentence,Word,POS,Tag
0,Sentence: 1,thousands,NNS,O
2,Sentence: 1,demonstrators,NNS,O
4,Sentence: 1,marched,VBN,O
6,Sentence: 1,london,NNP,B-geo
8,Sentence: 1,protest,VB,O


In [7]:
tags   = list(set(df.POS.values))
words  = list(set(df.Word.values))
words_ = list(set(df_.Word.values)) 

len(tags), len(words), len(words_)

(42, 29764, 29763)

In [8]:
x, y = df.drop('POS', axis = 1), df.POS
gs   = GroupShuffleSplit(n_splits = 2, test_size = .33, random_state = 42)

train_idx, test_idx = next(gs.split(x, y, groups = df['sentence']))
train_data          = df.loc[train_idx]
test_data           = df.loc[test_idx]

In [9]:
x_, y_ = df_.drop('POS', axis = 1), df_.POS
df_.reset_index(drop = True, inplace = True)

gs = GroupShuffleSplit(n_splits = 2, test_size = .33, random_state = 42)
train_idx_, test_idx_ = next(gs.split(x_, y_, groups = df_['sentence']))
train_data_           = df_.loc[train_idx_]
test_data_            = df_.loc[test_idx_]

In [10]:
dfupdate      = train_data.sample(frac = .15, replace = False, random_state = 42)
dfupdate.Word = 'UNKNOWN'
train_data.update(dfupdate)

words   = list(set(train_data.Word.values))
word2id = {w   : idx for idx, w in enumerate(words)}
tag2id  = {t   : idx for idx, t in enumerate(tags)}
id2tag  = {idx :   t for idx, t in enumerate(tags)}

len(tags), len(words)

(42, 23607)

In [11]:
count_tags              = dict(train_data.POS.value_counts())
count_tags_to_words     = train_data.groupby(['POS']).apply(lambda grp: grp.groupby('Word')['POS'].count().to_dict()).to_dict()
count_init_tags         = dict(train_data.groupby('sentence').first().POS.value_counts())

count_tags_to_next_tags = np.zeros((len(tags), len(tags)), dtype = int)
sentences               = list(train_data.sentence)
pos                     = list(train_data.POS)

In [12]:
for idx, sentence in tqdm(enumerate(sentences), position = 0, leave = True):
    
    if (idx > 0) and (sentence == sentences[idx - 1]):
        
        prev_tagid = tag2id[pos[idx - 1]]
        next_tagid = tag2id[pos[idx]]
        
        count_tags_to_next_tags[prev_tagid][next_tagid] += 1

702936it [00:00, 1070628.31it/s]


In [13]:
start_prob            = np.zeros((len(tags), ))
trans_mat             = np.zeros((len(tags), len(tags)))
emission_prob         = np.zeros((len(tags), len(words)))
num_sentences         = sum(count_init_tags.values())
sum_tags_to_next_tags = np.sum(count_tags_to_next_tags, axis = 1)

In [14]:
for tag, tagid in tqdm(tag2id.items(), position = 0, leave = True):
    
    float_counttag    = float(count_tags.get(tag, 0))
    start_prob[tagid] = count_init_tags.get(tag, 0) / num_sentences
    
    for word, wordid in word2id.items():
        emission_prob[tagid][wordid] = count_tags_to_words.get(tag, {}).get(word, 0) / float_counttag
        
    
    for tag_, tagid_ in tag2id.items():
        trans_mat[tagid][tagid_] = count_tags_to_next_tags[tagid][tagid_] / sum_tags_to_next_tags[tagid]

100%|██████████| 42/42 [00:00<00:00, 87.97it/s]


In [15]:
count_words = {}
for word in train_data.Word.values: count_words[word] = count_words.get(word, 0) + 1

count_word_transitions = {}
for sentence in train_data.groupby('sentence'):
    
    words = sentence[1]['Word'].values
    for idx in range(len(words) - 1):
        
        w1, w2 = words[idx], words[idx + 1]
        if w1 not in count_word_transitions:
            count_word_transitions[w1] = {}
            
        count_word_transitions[w1][w2] = count_word_transitions[w1].get(w2, 0) + 1

In [16]:
word_transition_matrix  = np.zeros((len(word2id) + 1, len(word2id) + 1))
sum_words_to_next_words = np.sum([
                                    count_word_transitions[w1][w2] for w1 in count_word_transitions
                                    for w2 in count_word_transitions[w1]
                                ])
for w1, w1id in tqdm(word2id.items()):
    for w2, w2id in word2id.items():
        word_transition_matrix[w1id][w2id] = count_word_transitions.get(w1, {}).get(w2, 0) / sum_words_to_next_words
       
print(word_transition_matrix.shape)

100%|██████████| 23607/23607 [05:06<00:00, 77.13it/s]

(23608, 23608)





In [17]:
def calculate_log_likelihood(sentence: List[str], word_transition_matrix) -> float:
    
    sentence_ids    = [word2id.get(w, word2id['UNKNOWN']) for w in sentence]
    log_likelihood = np.log(word_transition_matrix[sentence_ids[0]][sentence_ids[1]])
    
    for idx in range(1, len(sentence_ids) - 1):
        
        log_likelihood += np.log(word_transition_matrix[sentence_ids[idx]][sentence_ids[idx + 1]] + 1e-10)
        
    return log_likelihood

In [18]:
calculate_log_likelihood(['this', 'is', 'a', 'test', 'sentence'], word_transition_matrix)

-41.259970813020175

In [19]:
model = hmm.MultinomialHMM(n_components = len(tags), algorithm = 'viterbi',
                           random_state = 42)

model.startprob_    = start_prob
model.transmat_     = trans_mat
model.emissionprob_ = emission_prob

In [20]:
test_data.loc[~test_data['Word'].isin(words), 'Word'] = 'UNKNOWN'
word_test = list(test_data.Word)
samples   = []

for idx, val in enumerate(word_test):
    samples.append([word2id[val]])
    
lengths, count = [], 0
sentences      = list(test_data.sentence)

for idx, sentence in tqdm(enumerate(sentences), position = 0, leave = True):
    
    if (idx > 0) and (sentence == sentences[idx - 1]):
        count += 1
        
    elif idx > 0:
        lengths.append(count)
        count = 1
        
    else: 
        count = 1

345639it [00:00, 2422453.78it/s]


In [21]:
pos_predict = model.predict(samples, lengths)
pos_predict

array([32, 29,  6, ...,  7,  0, 27], dtype=int32)

In [22]:
tags_test = list(test_data.POS)
pos_test  = np.zeros((len(tags_test), ), dtype = int)

for idx, val in enumerate(tags_test):
    pos_test[idx] = tag2id[val]
    
len(pos_predict), len(pos_test), len(samples), len(word_test)

(345615, 345639, 345639, 345639)

In [23]:
def report(pred, gt):
    
    print(f'The  accuracy is {accuracy_score(gt, pred) * 100:.2f}%')
    print(f'The precision is {precision_score(gt, pred, average = "weighted"):.2f}')
    print(f'The    recall is {recall_score(gt, pred, average = "weighted"):.2f}')
    print(f'The  F1 Score is {f1_score(gt, pred, average = "weighted"):.2f}')
    

min_length = min(len(pos_predict), len(pos_test))
report(pos_predict[:min_length], pos_test[:min_length])

The  accuracy is 28.67%
The precision is 0.29


  _warn_prf(average, modifier, msg_start, len(result))


The    recall is 0.29
The  F1 Score is 0.21
