In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import math
import json
from dataclasses import dataclass, field
from typing import List, Optional

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Transformer libraries
from sentence_transformers import SentenceTransformer # For estimating the distance between (sub)sequences
from sentence_transformers import util

In [2]:
# Instantiate SBERT
sentence_model = SentenceTransformer('all-mpnet-base-v2')

In [2]:
# Load spacy language model
nlp = spacy.load("en_core_web_trf")

# Load the GloVe embeddings
glove_file = datapath('/home/aum/Desktop/projects/nlp/models/glove.6B.100d.txt')
# glove_file = datapath('/home/aumaron/Desktop/nlp/nlp_playground/models/glove.6B/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [3]:
# Dimensional space of the embeddings

model[model.index_to_key[0]].shape[0]

100

In [4]:
entity_data = pd.read_excel("../data/entity_data.xlsx")
root_word_corpus = pd.read_excel("../data/root_word_corpus.xlsx")
column_names = entity_data["column_names"].tolist()
old_entity_corpus = open("../misc/corpus_transformed.json")
old_entity_corpus = json.load(old_entity_corpus)

In [5]:
# column_names

In [6]:
intermediate_corpus = root_word_corpus[['id', 'name', 'entity']].to_dict('records')

In [8]:
column_name = 'branch/name'
column_name =  re.sub(r'[@_!#$%^&*()<>?[\]./\\+|}{~:-]', ' ', column_name)  # Removal of special characters
column_name = re.sub(r"[ ]{2,}", " ", column_name)  # Remove additional spaces
print(column_name)

branch name


In [9]:
# 

query_embedding = sentence_model.encode([column_name])

1. Find NER using Spacy
2. Find individual tokens in intermediate corpus
3. If not found in step 2, find semantically similar words in R^d 100 dimensional space
4. Future scope:
    % is removed as a character
    Certain columns containing '%' in the beginning or end are percentage columns
    Need to add the exception for %
    
5. Challenges -
    - False positives in model based NER
    
6. Future scope and experimentation
    - Implement a heuristic algorithm to funnel out wrong predictions from the predicted universe
    - Use randomly selected values to funnel out entities further
    - Try 200d vectors
    - Try 300d vectors
    - Try 768d BERT embeddings (non context specific word embeddings)
    - Try extracting phrase (in this case column names) embeddings as a whole

In [7]:
# %%timeit

def model_based_ner(string: str) -> list:
    word = nlp(column_name)
    entity_list = []
    for token_num, token in enumerate(word):
#         print(f"{token.text} -> {token.ent_iob_} -> {token.ent_type_}")
        if token.ent_type_:
            entity_list.append(token.ent_type_)
    
    return entity_list

In [8]:
# %%timeit

@dataclass
class CorpusSearch:
    column_name: str
    word_corpus: List[dict]
    word_matrix: np.ndarray
    confidence_required: float
    embedding_model: KeyedVectors
    entity_from_corpus: list = field(default_factory=list)
    
    def __post_init__(self):
        word_split = self.column_name.split(" ")
        for word_num, word in enumerate(word_split):
            self.word_id = word_num
            self.root_word = word
            try:
                self.root_word_embedding = self.embedding_model.get_vector(self.root_word)
            except KeyError:
                self.root_word_embedding = np.zeros((100,))
            self.last_word_embedding = self.root_word_embedding
            self.entity_from_corpus.extend(self.find_in_corpus())
            
#         # Check if last word has empty entity
#         last_sub_word_entity = list(filter(
#             lambda word_is: (word_is.get("actual_word") == word_split[-1]) and (not word_is.get("entity")), 
#             self.entity_from_corpus))
# #         print('Last word entity -> ', last_sub_word_entity)
#         # If non-empty, then check this word in corpus
#         if last_sub_word_entity:
#             theta_list = []
#             word_id = last_sub_word_entity[-1].get("word_id")
#             self.last_word_embedding = self.root_word_embedding.squeeze()[:100]
#             closest_index = self.get_closest_word()

#             if closest_index or closest_index == 0:
#                 closest_entity = self.word_corpus[closest_index] if (closest_index or closest_index == 0) else ""
#                 print(closest_entity)
#                 self.entity_from_corpus[-1].update({"actual_word": word_split[-1],
#                                                "entity": [closest_entity["entity"]],
#                                                "closest_root": closest_entity["name"],
#                                                "word_id": word_id})
#             else:
#                 print(f"--------\nNo entities found!\n--------")

#             print('Updated entity object -> ', self.entity_from_corpus) 
    
    def find_in_corpus(self) -> list:
        entity_list = []
        filtered_list = list(filter(lambda word_meta: word_meta["name"] == self.root_word, 
                                    self.word_corpus))
        meta_obj = {
            "actual_word": self.root_word,
            "entity": [],
            "closest_root:": self.root_word,
            "word_id": self.word_id
        }
        if filtered_list:
            for each_object in filtered_list:
                meta_obj["entity"].append(each_object.get("entity"))
            entity_list.append(meta_obj)
        else:
            closest_index = self.get_closest_word()
            if closest_index or closest_index == 0:
                closest_entity = self.word_corpus[closest_index] if (closest_index or closest_index == 0) else ""
                entity_list = [{"actual_word": self.root_word,
                                "entity": [closest_entity["entity"]],
                                "closest_root": closest_entity["name"],
                                "word_id": self.word_id}]
            else:
                entity_list = [{"actual_word": self.root_word,
                                "entity": [],
                                "closest_root": "",
                                "word_id": self.word_id}]

        return entity_list
    
    @staticmethod
    def embedding_product(a, b):
        cos_theta = a.dot(b.T)/(np.sqrt(np.sum(np.square(a)))*(np.sqrt(np.sum(np.square(b)))))
        _angle = math.acos(cos_theta)

        return cos_theta, _angle


    def get_closest_word(self):
        theta_list = []
        angle_list = []
        for column_vec in range(self.word_matrix.T.shape[1]):
            doc_product, angle_between_vectors = self.embedding_product(self.last_word_embedding, 
                                                                        self.word_matrix.T[:, column_vec])
            theta_list.append(doc_product)
            angle_list.append(angle_between_vectors)  # If needed for validation

        # Cut-off: filter theta list based on the confidence required
        filtered_theta_index_list = [
            theta_list.index(score) for score in theta_list if np.abs(score)>=self.confidence_required]

        # Find index of the top score
        closest_vector = None
        if filtered_theta_index_list:
            closest_vector = theta_list.index(max([theta_list[filter_index] for filter_index in filtered_theta_index_list]))
#         print("closest_vect",closest_vector)
    #     print(theta_list)
#         print(theta_list.index(max(theta_list)))
    #     print(theta_list[theta_list.index(max(theta_list))])
    #     print(angle_list.index(min(angle_list)))

        return closest_vector


In [9]:
# %%timeit

def construct_corpus_matrix(word_corpus: List[dict], embedding_model):
    word_array = np.empty([len(word_corpus), embedding_model[embedding_model.index_to_key[0]].shape[0]])
    for row_number, root in enumerate(word_corpus):
        try:
            word_array[row_number, :] = embedding_model.get_vector(root.get("name"))
        except KeyError:
            word_array[row_number, :] = np.zeros([100,])
        
    return word_array

# corpus_array = construct_corpus_matrix(intermediate_corpus, model)

In [10]:
# %%timeit

In [11]:
# Step 0
corpus_array = construct_corpus_matrix(intermediate_corpus, model)
# print(corpus_array.shape)

entity_from_old_method = []
entity_from_model_list = []
entity_from_search = []
for column_name in entity_data["column_names"].values:
    old_entity_list = [
        word_obj["entity"] for word_obj in list(filter(lambda corpus: corpus.get("name") == column_name, old_entity_corpus))
    ]
#     print("old entities", old_entity_list)
    old_entity_list = list(set(old_entity_list))
    if not old_entity_list:
        old_entity_list = [""]
    entity_from_old_method.append(old_entity_list)
    
    column_name =  re.sub(r'[@_!#$%^&*()<>?[\]./\\+|}{~:-]', ' ', column_name)  # Removal of special characters
    column_name = re.sub(r"[ ]{2,}", " ", column_name)  # Remove additional spaces
    # Step 1 
    entity_from_model = model_based_ner(column_name)
    print('Model NER -> ', entity_from_model)
    if entity_from_model:
        entity_from_model = list(set(entity_from_model))
    else:
        entity_from_model = [""]
    entity_from_model_list.append(entity_from_model)

    # Step 2: Find in corpus
    corpus_search_obj = CorpusSearch(
        column_name=column_name,
        word_corpus=intermediate_corpus,
        word_matrix=corpus_array,
        confidence_required=0.,
        embedding_model=model,
    )

    print('Search in Corpus -> ', corpus_search_obj.entity_from_corpus)

    # Check if no entities found
    non_identified_entity = list(filter(lambda word_is: not word_is.get("entity"), 
                                        corpus_search_obj.entity_from_corpus))
    if non_identified_entity and (len(non_identified_entity) == len(column_name.split(" "))):
        print(f"--------\nNo entities found!\n--------")

    extract_entities = [word_obj["entity"] for word_obj in corpus_search_obj.entity_from_corpus]
    extract_entities = [word for word_list in extract_entities for word in list(set(word_list))]
#     print(extract_entities)
    
    entity_from_search.append(", ".join(extract_entities))
    for i, word in enumerate(entity_from_search):
        if word == "":
            entity_from_search.pop(i)
            entity_from_search.insert(i, "Default")


entity_from_old_method = [word for word_list in entity_from_old_method for word in word_list]
entity_from_model_list = [word for word_list in entity_from_model_list for word in word_list]

entity_data["old_recognition_method"] = entity_from_old_method
entity_data["model_based_recognition"] = entity_from_model_list
entity_data["new_algorithm_based_recognition"] = entity_from_search

# if not entity_from_model:
#     word_split = column_name.split(" ")
#     # Step 2.a: Find sub_words in corpus
#     entity_from_corpus = []
#     for word_num, word in enumerate(word_split):
#         entity_from_corpus.extend(find_in_corpus(word, word_num, intermediate_corpus))  # Can be replaced using a mat mul
#     print('Simple search in Corpus -> ', entity_from_corpus)
    
#     # Step 2.b: Find if the last sub_word has returned an entity
#     # Note: There can be 4 possibilities:
#         # 1. All sub words can return entity
#         # 2. Any sub_word other than the trailing sub_word returns an entity
#         # 3. Any sub_word including the trailing sub_word returns an entity
#         # 4. None of them return an entity
    
#     # 2.b.1: Check if all words contain entity
#     non_identified_entity = list(filter(lambda word_is: not word_is.get("entity"), entity_from_corpus))
#     if non_identified_entity and (len(non_identified_entity) == len(word_split)):
#         print(f"--------\nNo entities found!\n--------")
        
#     # 2.b.2:
    
    
#     # 2.b.2: Check if last word has empty entity
#     last_sub_word_entity = list(filter(
#         lambda word_is: (word_is.get("actual_word") == word_split[-1]) and (not word_is.get("entity")), 
#         entity_from_corpus))
#     print('Last word entity -> ', last_sub_word_entity)
    
#     # If non-empty, then check this word in corpus
#     if last_sub_word_entity:
#         theta_list = []
#         word_id = last_sub_word_entity[-1].get("word_id")
#         last_word_embedding = model.get_vector(word_split[-1])
#         closest_index = get_closest_word(last_word_embedding.squeeze()[:100], corpus_array, 0.3)
        
#         if closest_index:
#             closest_entity = intermediate_corpus[closest_index] if (closest_index or closest_index == 0) else ""
#             entity_from_corpus[-1].update({"actual_word": word_split[-1],
#                                            "entity": [closest_entity["entity"]],
#                                            "closest_root": closest_entity["name"],
#                                            "word_id": word_id})
#         else:
#             print(f"--------\nNo entities found!\n--------")
        
#         print('Updated entity object -> ', entity_from_corpus)
    
    


# # corpus_array = construct_corpus_matrix(intermediate_corpus, model)
# # corpus_array.shape

Model NER ->  []
Search in Corpus ->  [{'actual_word': 'date', 'entity': ['Date'], 'closest_root:': 'date', 'word_id': 0}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'state', 'entity': ['Location'], 'closest_root:': 'state', 'word_id': 0}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'city', 'entity': ['Location'], 'closest_root:': 'city', 'word_id': 0}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'retailer', 'entity': ['Person'], 'closest_root:': 'retailer', 'word_id': 0}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'store', 'entity': ['Sales'], 'closest_root': 'sold', 'word_id': 0}, {'actual_word': 'type', 'entity': ['Product'], 'closest_root:': 'type', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'sales', 'entity': ['Sales'], 'closest_root:': 'sales', 'word_id': 0}, {'actual_word': 'value', 'entity': ['Currency', 'Sales'], 'closest_root:': 'value', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word

Model NER ->  []
Search in Corpus ->  [{'actual_word': 'receipt', 'entity': ['Cost'], 'closest_root': 'invoice', 'word_id': 0}, {'actual_word': 'num', 'entity': ['Product'], 'closest_root': 'sku', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'po', 'entity': ['Location'], 'closest_root': 'branch', 'word_id': 0}, {'actual_word': 'num', 'entity': ['Product'], 'closest_root': 'sku', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'basic', 'entity': ['Currency'], 'closest_root': 'value', 'word_id': 0}, {'actual_word': 'rate', 'entity': ['Percent'], 'closest_root:': 'rate', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'vendor', 'entity': ['Organisation'], 'closest_root:': 'vendor', 'word_id': 0}, {'actual_word': 'name', 'entity': ['Person'], 'closest_root:': 'name', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'vendor', 'entity': ['Organisation'], 'closest_root:': 'vendor', 'word_id': 0}, {'actua

Search in Corpus ->  [{'actual_word': 'rcv', 'entity': ['Product'], 'closest_root': 'sku', 'word_id': 0}, {'actual_word': 'user', 'entity': ['Person'], 'closest_root:': 'user', 'word_id': 1}, {'actual_word': 'details', 'entity': ['Date'], 'closest_root': 'date', 'word_id': 2}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'po', 'entity': ['Location'], 'closest_root': 'branch', 'word_id': 0}, {'actual_word': 'user', 'entity': ['Person'], 'closest_root:': 'user', 'word_id': 1}, {'actual_word': 'details', 'entity': ['Date'], 'closest_root': 'date', 'word_id': 2}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'cost', 'entity': ['Cost'], 'closest_root:': 'cost', 'word_id': 0}, {'actual_word': 'center', 'entity': ['Location', 'Organisation'], 'closest_root:': 'center', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'acc', 'entity': ['Person'], 'closest_root': 'team', 'word_id': 0}, {'actual_word': 'description', 'entity': ['Location'], 'closest_root':

Model NER ->  []
Search in Corpus ->  [{'actual_word': 'sub', 'entity': ['Location'], 'closest_root': 'zone', 'word_id': 0}, {'actual_word': 'brand', 'entity': ['Product'], 'closest_root:': 'brand', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'mr', 'entity': ['Person'], 'closest_root': 'name', 'word_id': 0}, {'actual_word': 'hq', 'entity': ['Location'], 'closest_root': 'headquarter', 'word_id': 1}, {'actual_word': 'id', 'entity': ['Person'], 'closest_root': 'user', 'word_id': 2}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'net', 'entity': ['Revenue'], 'closest_root': 'profit', 'word_id': 0}, {'actual_word': 'sales', 'entity': ['Sales'], 'closest_root:': 'sales', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'net', 'entity': ['Revenue'], 'closest_root': 'profit', 'word_id': 0}, {'actual_word': 'sales', 'entity': ['Sales'], 'closest_root:': 'sales', 'word_id': 1}, {'actual_word': 'ly', 'entity': ['Product'], 'closest_root': 's

Model NER ->  []
Search in Corpus ->  [{'actual_word': 'sales', 'entity': ['Sales'], 'closest_root:': 'sales', 'word_id': 0}, {'actual_word': 'target', 'entity': ['Cost'], 'closest_root': 'cost', 'word_id': 1}, {'actual_word': 'diff', 'entity': ['Product'], 'closest_root': 'sku', 'word_id': 2}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'position', 'entity': ['Location'], 'closest_root': 'place', 'word_id': 0}, {'actual_word': 'id', 'entity': ['Person'], 'closest_root': 'user', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'status', 'entity': ['Location'], 'closest_root': 'place', 'word_id': 0}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'vacancy', 'entity': ['Percent'], 'closest_root': 'rate', 'word_id': 0}, {'actual_word': 'substatus', 'entity': [], 'closest_root': '', 'word_id': 1}]
Model NER ->  []
Search in Corpus ->  [{'actual_word': 'vacancy', 'entity': ['Percent'], 'closest_root': 'rate', 'word_id': 0}, {'actual_word': 'open', 'en

In [12]:
entity_data.to_excel("validation_output_final_.xlsx", index=False)

In [30]:
corpus_search_obj.entity_from_corpus

[{'actual_word': 'kk', 'entity': [], 'closest_root': '', 'word_id': 0},
 {'actual_word': 'kk', 'entity': [], 'closest_root': '', 'word_id': 1}]

In [101]:
intermediate_corpus[26]

{'id': 26, 'name': 'company', 'entity': 'Organisation'}

In [19]:
util.cos_sim(model.get_vector("organization"), model.get_vector("organisation"))

tensor([[0.8785]])

In [29]:
sneaker = model.get_vector("man")
shoes = model.get_vector("woman")

dot_prod, angle = vector_cosine(sneaker, shoes)
print(dot_prod, angle)

0.83234936 0.5874632370639816
