In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import math
from typing import List

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [2]:
# Load the spaCy "en_core_web_trf" pipeline once; model_based_ner() calls it to tag entities.
nlp = spacy.load("en_core_web_trf")

In [3]:
# Load the pre-trained GloVe vectors (glove.6B, 100d) as a gensim KeyedVectors model.
# NOTE(review): this is an absolute, machine-specific path; point it at your local copy
# (or move it into a config cell) before running elsewhere. `datapath()` was dropped
# because it is gensim's helper for its bundled test data and added nothing for an
# absolute path.
glove_file = '/home/aum/Desktop/projects/nlp/models/glove.6B.100d.txt'
# glove_file = '/home/aumaron/Desktop/nlp/nlp_playground/models/glove.6B/glove.6B.100d.txt'

# GloVe text files lack the "<vocab_size> <vector_size>" header line of the word2vec
# format. gensim >= 4.0 reads them directly with no_header=True, so the deprecated
# glove2word2vec conversion (which rewrote the whole file into a temp directory on
# every run) is no longer needed.
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [4]:
# Sanity check: length of the vector stored for the first vocabulary key.
model[model.index_to_key[0]].shape[0]

100

In [5]:
# Read the two Excel inputs; the relative paths resolve against the current working directory.
entity_data = pd.read_excel("../data/entity_data.xlsx")
root_word_corpus = pd.read_excel("../data/root_word_corpus.xlsx")
# Plain Python list of the values in the "column_names" column of entity_data.
column_names = entity_data["column_names"].tolist()

In [6]:
# column_names

In [7]:
# One dict per corpus row, keeping only the id / name / entity columns;
# find_in_corpus() and construct_corpus_matrix() consume this list of records.
intermediate_corpus = root_word_corpus[['id', 'name', 'entity']].to_dict('records')

In [8]:
# Normalise one raw column name into space-separated words:
#   inner substitution -> every listed special character becomes a single space
#   outer substitution -> runs of two or more spaces collapse to one
column_name = 'taluka_name'
column_name = re.sub(
    r"[ ]{2,}",
    " ",
    re.sub(r'[@_!#$%^&*()<>?[\]./\\|}{~:-]', ' ', column_name),
)
print(column_name)

taluka name


1. Find NER using Spacy
2. Find individual tokens in intermediate corpus
3. If not found in step 2, find semantically similar words in the 100-dimensional GloVe embedding space ($\mathbb{R}^{100}$)
4. Future scope:
    '%' is currently removed along with the other special characters.
    Columns whose names begin or end with '%' are percentage columns,
    so an exception for '%' needs to be added to the clean-up step.
    
5. Challenges:
    - latitude-longitude

In [9]:
# %%timeit

def model_based_ner(string: str) -> list:
    """Return the spaCy entity label of every token in ``string`` that has one.

    Args:
        string: Text to analyse, e.g. a cleaned column name.

    Returns:
        The ``ent_type_`` of each token that carries a non-empty entity type, in
        token order. Tokens without an entity are skipped, so the list is empty
        when nothing is recognised.
    """
    # Analyse the argument itself: the previous version ran the pipeline on the
    # notebook-level ``column_name`` and silently ignored ``string``.
    doc = nlp(string)
    return [token.ent_type_ for token in doc if token.ent_type_]

In [10]:
# %%timeit

def find_in_corpus(root_word: str, word_corpus: List[dict]) -> list:
    """Look up ``root_word`` in the corpus and return its entity mappings.

    Every record whose ``"name"`` equals ``root_word`` contributes one
    single-key dict ``{name: entity}``, in corpus order. When no record
    matches, the result is the placeholder ``[{root_word: ""}]``.
    """
    matches = [
        {record.get("name"): record.get("entity")}
        for record in word_corpus
        if record["name"] == root_word
    ]
    # An empty string marks "word not present in the corpus".
    return matches or [{root_word: ""}]

In [11]:
# %%timeit

def construct_corpus_matrix(word_corpus: List[dict], embedding_model):
    """Stack the embedding of every corpus entry's ``"name"`` into one matrix.

    Args:
        word_corpus: Records with a ``"name"`` key holding the word to embed.
        embedding_model: Object exposing ``index_to_key``, ``__getitem__`` and
            ``get_vector`` (the notebook passes a gensim ``KeyedVectors``).

    Returns:
        Array of shape ``(len(word_corpus), dim)``. Row ``i`` is the vector for
        ``word_corpus[i]["name"]``, or all zeros when the lookup raises
        ``KeyError``.
    """
    # Take the width from the model itself so the zero fallback always matches;
    # the previous hard-coded 100 broke for any other embedding size.
    embedding_dim = embedding_model[embedding_model.index_to_key[0]].shape[0]

    # Start from zeros so rows for unknown words need no separate assignment.
    word_array = np.zeros([len(word_corpus), embedding_dim])
    for row_number, root in enumerate(word_corpus):
        try:
            word_array[row_number, :] = embedding_model.get_vector(root.get("name"))
        except KeyError:
            # Word missing from the model: keep the all-zero row.
            continue

    return word_array

# corpus_array = construct_corpus_matrix(intermediate_corpus, model)

In [12]:
# %%timeit

def vector_cosine(a, b):
    """Cosine similarity and angle between two vectors.

    Args:
        a: NumPy vector.
        b: NumPy vector with the same length as ``a``.

    Returns:
        ``(cos_theta, angle)`` where ``angle`` is in radians. If either vector
        has zero norm the cosine is undefined and ``(nan, nan)`` is returned.
    """
    denominator = np.sqrt(np.sum(np.square(a))) * np.sqrt(np.sum(np.square(b)))
    if denominator == 0:
        # Undefined for a zero vector; return NaN explicitly rather than
        # dividing by zero and emitting a RuntimeWarning.
        return float("nan"), float("nan")

    # Floating-point rounding can give values like 1.0000000000000002 for
    # (nearly) parallel vectors, which makes math.acos raise a domain error.
    cos_theta = np.clip(a.dot(b.T) / denominator, -1.0, 1.0)
    _angle = math.acos(cos_theta)

    return cos_theta, _angle


def get_closest_word(root_word_embedding: np.ndarray, 
                     matrix: np.ndarray):
    """Find the row of ``matrix`` most similar to ``root_word_embedding``.

    Each row is compared with ``vector_cosine``. Rows whose similarity is NaN
    are ignored when ranking.

    Args:
        root_word_embedding: Query vector.
        matrix: 2-D array with one candidate vector per row.

    Returns:
        ``(row_index, cosine_similarity)`` of the best-matching row, or ``None``
        when ``matrix`` has no rows or no row yields a defined similarity.
        Earlier versions returned nothing, so callers that ignore the result
        are unaffected.
    """
    theta_list = []
    angle_list = []
    for row_vector in matrix:
        cos_theta, angle_between_vectors = vector_cosine(root_word_embedding, row_vector)
        theta_list.append(cos_theta)
        angle_list.append(angle_between_vectors)
    # TODO: Can add cut-offs
    print(theta_list)

    similarities = np.asarray(theta_list, dtype=float)
    if similarities.size == 0 or np.isnan(similarities).all():
        # Nothing to rank: max()/index() on this input used to raise or pick
        # an arbitrary NaN entry.
        return None

    # NaN-aware arg-max/arg-min: the built-in max()/min() give order-dependent
    # answers when NaN is present.
    closest_row = int(np.nanargmax(similarities))
    print(closest_row)
    print(theta_list[closest_row])
    print(int(np.nanargmin(np.asarray(angle_list, dtype=float))))

    return closest_row, theta_list[closest_row]
    
    

In [13]:
# Step 0: embed every corpus entry once; one row per record in intermediate_corpus.
corpus_array = construct_corpus_matrix(intermediate_corpus, model)
print(corpus_array.shape)

# Step 1: ask the spaCy pipeline for entity labels on the cleaned column name.
entity_from_model = model_based_ner(column_name)
print(entity_from_model)

# Step 2: Find in corpus (only when the model returned no entity at all)
if not entity_from_model:
    word_split = column_name.split(" ")
    # Step 2.a: Find sub_words in corpus
    entity_from_corpus = []
    for word in word_split:
        entity_from_corpus.extend(find_in_corpus(word, intermediate_corpus))  # Can be replaced using a mat mul
    print(entity_from_corpus)
    
    # Step 2.b: Find if the last sub_word has returned an entity
    # Note: There can be 4 possibilities:
        # 1. All sub words can return entity
        # 2. Any sub_word other than the trailing sub_word returns an entity
        # 3. Any sub_word including the trailing sub_word returns an entity
        # 4. None of them return an entity
    
    # Keep only the placeholder entries {last_word: ""} that find_in_corpus emits
    # when the trailing sub_word is missing from the corpus.
    last_sub_word_entity = list(filter(lambda word_is: word_is.get(word_split[-1]) == "", entity_from_corpus))
    print(last_sub_word_entity)
    
    # A non-empty list means the trailing sub_word had no corpus entity, so fall
    # back to comparing its embedding against the corpus matrix.
    if last_sub_word_entity:
        # NOTE(review): theta_list is assigned here but never used in this cell.
        theta_list = []
        # NOTE(review): presumably raises KeyError for an out-of-vocabulary word
        # (construct_corpus_matrix guards the same call) — confirm and handle.
        last_word_embedding = model.get_vector(word_split[-1])
        get_closest_word(last_word_embedding, corpus_array)
        
        
    
    


# corpus_array = construct_corpus_matrix(intermediate_corpus, model)
# corpus_array.shape

(66, 100)
['PERSON']


In [41]:
# Peek at a single corpus record by list position.
intermediate_corpus[60]

{'id': 60, 'name': 'profit', 'entity': 'Revenue'}