In [1]:
import yaml    # YAML Encode-Decode library
import numpy as np    # Scientific computational library
from sklearn.feature_extraction import DictVectorizer    # Dictorinary to Vector
from nltk.tag import pos_tag    # Part of Speech tagger
from sklearn.naive_bayes import BernoulliNB    # Classifier

In [2]:
# Read the annotated chat sentences used as training data.
# safe_load is used instead of yaml.load: calling yaml.load without an
# explicit Loader is rejected by PyYAML >= 6 and, on older versions, can
# construct arbitrary Python objects from the file.
with open('ent.yaml', 'r') as f:
    data = yaml.safe_load(f)

In [3]:
# Features To Train
def get_features(index, word, tokens):
    """Build the feature dictionary for one token of a sentence.

    Args:
        index: Position of ``word`` inside ``tokens``.
        word: The token being described.
        tokens: All tokens of the sentence, in order.

    Returns:
        A dict with the keys "word", "postag", "nextword", "nextwordtag",
        "previousword" and "previoustag". The placeholders 'BOS' and 'EOS'
        stand in for the missing neighbour of the first and last token.
    """
    prev_word = 'BOS'
    next_word = 'EOS'
    if len(tokens) > index + 1:
        next_word = tokens[index + 1]
    # The token at position 1 does have a predecessor (position 0), so the
    # test must be ``index > 0`` rather than ``index - 1 > 0``.
    if index > 0:
        prev_word = tokens[index - 1]
    # Each word is tagged on its own (a one-element list), not in the
    # context of the whole sentence.
    val, tag = pos_tag([word])[0]
    prev_word, prev_tag = pos_tag([prev_word])[0]
    next_word, next_tag = pos_tag([next_word])[0]
    dic = {
        "word": val,
        "postag": tag,
        "nextword": next_word,
        "nextwordtag": next_tag,
        "previousword": prev_word,
        "previoustag": prev_tag,
    }
    return dic

In [4]:
train_data = []
label = []

# Build one feature dictionary (train_data) and one entity label (label)
# per token. Tokens without an annotation get the default label "O".
for dic in data:
    # Split sentence to tokens
    token = dic['text'].split(' ')

    # Map token position -> entity label for this sentence.
    # Each annotation carries a 'pos' key plus one key naming the entity;
    # the label is the first key that is not 'pos', so the lookup does not
    # depend on the order of the keys inside the annotation.
    # `or []` also covers a sentence whose 'entity' key is present but empty.
    entity_at = {}
    for ent in dic.get('entity') or []:
        name = next((key for key in ent if key != 'pos'), None)
        if name is not None:
            # setdefault: the first annotation for a position wins
            entity_at.setdefault(ent['pos'], name)

    for i, word in enumerate(token):
        label.append(entity_at.get(i, 'O'))
        # Text to Features
        train_data.append(get_features(i, word, token))

In [5]:
# Converting the feature dictionaries to a dense feature matrix:
# learn the feature vocabulary first, then encode the same rows with it.
vec = DictVectorizer()
vec.fit(train_data)
feature = vec.transform(train_data).toarray()

In [6]:
# Train the Bernoulli Naive Bayes entity classifier on the encoded tokens
bnb = BernoulliNB(
    alpha=0.1,
    binarize=0.1,
)
bnb.fit(feature, label)

BernoulliNB(alpha=0.1, binarize=0.1, class_prior=None, fit_prior=True)

In [7]:
# Get Entity Result
def get_enitity(sentence):
    """Print every token of ``sentence`` that the model tags as an entity.

    The sentence is split on single spaces, each token is turned into a
    feature dictionary, encoded with ``vec`` and classified with ``bnb``.
    One line is printed per token whose predicted label is not 'O'.
    Nothing is returned.
    """
    words = sentence.split(" ")
    rows = [get_features(pos, word, words) for pos, word in enumerate(words)]

    predicted = bnb.predict(vec.transform(rows).toarray())
    for pos, tag in enumerate(predicted):
        if tag != 'O':
            print("Entity: {}, Value: {}".format(tag, words[pos]))

In [8]:
# Sample utterances used to try out the entity extractor
user_text_list = ["hi", "i am akshay", "what is mul of 1 and 2"]

for text in user_text_list:
    # Echo the utterance, list the entities found in it, then a divider
    print(text)
    get_enitity(text)
    print("-----------------------")

hi
-----------------------
i am akshay
Entity: name, Value: akshay
-----------------------
what is mul of 1 and 2
Entity: name, Value: what
Entity: number, Value: 1
Entity: number, Value: 2
-----------------------
