In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import math
from typing import List, Optional

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Transformer libraries
from sentence_transformers import SentenceTransformer # For estimating the distance between (sub)sequences
from sentence_transformers import util

In [29]:
# Instantiate SBERT
sentence_model = SentenceTransformer('all-mpnet-base-v2')

Collecting en-core-web-trf==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.2.0/en_core_web_trf-3.2.0-py3-none-any.whl (460.2 MB)
     |████████████████████████████████| 460.2 MB 87 kB/s              


Installing collected packages: en-core-web-trf
Successfully installed en-core-web-trf-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [2]:
# Load spacy language model
nlp = spacy.load("en_core_web_trf")

# Load the GloVe embeddings
glove_file = datapath('/home/aum/Desktop/projects/nlp/models/glove.6B.100d.txt')
# glove_file = datapath('/home/aumaron/Desktop/nlp/nlp_playground/models/glove.6B/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

  glove2word2vec(glove_file, word2vec_glove_file)


In [3]:
# Dimensional space of the embeddings

model[model.index_to_key[0]].shape[0]

100

In [4]:
entity_data = pd.read_excel("../data/entity_data.xlsx")
root_word_corpus = pd.read_excel("../data/root_word_corpus.xlsx")
column_names = entity_data["column_names"].tolist()

In [5]:
# column_names

In [6]:
intermediate_corpus = root_word_corpus[['id', 'name', 'entity']].to_dict('records')

In [18]:
column_name = 'order_id'
column_name =  re.sub(r'[@_!#$%^&*()<>?[\]./\\|}{~:-]', ' ', column_name)  # Removal of special characters
column_name = re.sub(r"[ ]{2,}", " ", column_name)  # Remove additional spaces
print(column_name)

order id


In [39]:
# 

query_embedding = sentence_model.encode([column_name])

(100,)

1. Find NER using Spacy
2. Find individual tokens in intermediate corpus
3. If not found in step 2, find semantically similar words in R^d 100 dimensional space
4. Future scope:
    % is removed as a character
    Certain columns containing '%' in the beginning or end are percentage columns
    Need to add the exception for %
    
5. Challenges -
    - False positives in model based NER
    
6. Future scope and experimentation
    - Try 200d vectors
    - Try 300d vectors
    - Try 768d BERT embeddings (non context specific word embeddings)
    - Try extracting phrase (in this case column names) embeddings as a whole

In [20]:
# %%timeit

def model_based_ner(string: str) -> list:
    word = nlp(column_name)
    entity_list = []
    for token in word:
        print(f"{token.ent_iob_} -> {token.ent_type_}")
        if token.ent_type_:
            entity_list.append(token.ent_type_)
    
    return entity_list

In [9]:
# %%timeit

def find_in_corpus(root_word: str, word_corpus: List[dict]) -> list:
    entity_list = []
    filtered_list = list(filter(lambda word_meta: word_meta["name"] == root_word, word_corpus))
    if filtered_list:
        for each_object in filtered_list:
            entity_list.append({each_object.get("name"): each_object.get("entity")})
    else:
        entity_list = [{root_word: ""}]
    
    return entity_list

In [10]:
# %%timeit

def construct_corpus_matrix(word_corpus: List[dict], embedding_model):
    word_array = np.empty([len(word_corpus), embedding_model[embedding_model.index_to_key[0]].shape[0]])
    for row_number, root in enumerate(word_corpus):
        try:
            word_array[row_number, :] = embedding_model.get_vector(root.get("name"))
        except KeyError:
            word_array[row_number, :] = np.zeros([100,])
        
    return word_array

# corpus_array = construct_corpus_matrix(intermediate_corpus, model)

In [11]:
# %%timeit

def embedding_product(a, b):
    cos_theta = a.dot(b.T)/(np.sqrt(np.sum(np.square(a)))*(np.sqrt(np.sum(np.square(b)))))
    _angle = math.acos(cos_theta)

    return cos_theta, _angle


def get_closest_word(root_word_embedding: np.ndarray, 
                     matrix: np.ndarray,
                     confidence_required: Optional[float] = 0.5):
    theta_list = []
    angle_list = []
    for column_vec in range(matrix.T.shape[1]):
        doc_product, angle_between_vectors = embedding_product(root_word_embedding, matrix.T[:, column_vec])
        theta_list.append(doc_product)
        angle_list.append(angle_between_vectors)  # If needed for validation
    
    # Cut-off: filter theta list based on the confidence required
    filtered_theta_index_list = [theta_list.index(score) for score in theta_list if np.abs(score)>=confidence_required]
    
    # Find index of the top score
    closest_vector = None
    if filtered_theta_index_list:
        closest_vector = theta_list.index(max([theta_list[filter_index] for filter_index in filtered_theta_index_list]))
        
#     print(theta_list)
#     print(theta_list.index(max(theta_list)))
#     print(theta_list[theta_list.index(max(theta_list))])
#     print(angle_list.index(min(angle_list)))
    
    return closest_vector


In [19]:
# Step 0
corpus_array = construct_corpus_matrix(intermediate_corpus, model)
# print(corpus_array.shape)

# Step 1 
entity_from_model = model_based_ner(column_name)
print('Model NER -> ', entity_from_model)

# Step 2: Find in corpus
if not entity_from_model:
    word_split = column_name.split(" ")
    # Step 2.a: Find sub_words in corpus
    entity_from_corpus = []
    for word in word_split:
        entity_from_corpus.extend(find_in_corpus(word, intermediate_corpus))  # Can be replaced using a mat mul
    print('Simple search in Corpus -> ', entity_from_corpus)
    
    # Step 2.b: Find if the last sub_word has returned an entity
    # Note: There can be 4 possibilities:
        # 1. All sub words can return entity
        # 2. Any sub_word other than the trailing sub_word returns an entity
        # 3. Any sub_word including the trailing sub_word returns an entity
        # 4. None of them return an entity
    
    # 2.b.1: Check if all words contain
    # 2.b.2: Check if last word has empty entity
    last_sub_word_entity = list(filter(lambda word_is: word_is.get(word_split[-1]) == "", entity_from_corpus))
    print('Last word entity -> ', last_sub_word_entity)
    
    # If non-empty, then check this word in corpus
    if last_sub_word_entity:
        theta_list = []
        last_word_embedding = model.get_vector(word_split[-1])
        closest_index = get_closest_word(last_word_embedding.squeeze()[:100], corpus_array, 0)
        closest_entity = intermediate_corpus[closest_index] if (closest_index or closest_index == 0) else ""
        entity_from_corpus[-1].update({word_split[-1]: closest_entity[]})
        print('Updated entity object -> ', entity_from_corpus)
    
    


# corpus_array = construct_corpus_matrix(intermediate_corpus, model)
# corpus_array.shape

O -> 
O -> 
O -> 
Model NER ->  []
Simple search in Corpus ->  [{'order': ''}, {'id': ''}]
Last word entity ->  [{'id': ''}]
Updated entity object ->  [{'order': ''}, {'id': {'id': 35, 'name': 'user', 'entity': 'Person'}}]


  cos_theta = a.dot(b.T)/(np.sqrt(np.sum(np.square(a)))*(np.sqrt(np.sum(np.square(b)))))


In [20]:
intermediate_corpus[26]

{'id': 26, 'name': 'vendor', 'entity': 'Organisation'}

In [29]:
sneaker = model.get_vector("man")
shoes = model.get_vector("woman")

dot_prod, angle = vector_cosine(sneaker, shoes)
print(dot_prod, angle)

0.83234936 0.5874632370639816
