In [119]:
import pandas as pd
import numpy as np
import spacy
import re
from typing import List

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [19]:
# Load the spaCy pipeline once; model_based_ner() reads it through the global name `nlp`.
nlp = spacy.load("en_core_web_trf")

In [122]:
# Location of the GloVe 6B 100-d text vectors.
# NOTE(review): this is an absolute path on one machine, and datapath() comes from
# gensim.test.utils (presumably intended for gensim's bundled test data) — confirm and
# consider a configurable, relative models directory instead.
glove_file = datapath('/home/aum/Desktop/projects/nlp/models/glove.6B.100d.txt')
# Alternative location used on another machine:
# glove_file = datapath('/home/aumaron/Desktop/nlp/nlp_playground/models/glove.6B/glove.6B.100d.txt')
# Convert the GloVe text file into a word2vec-format copy at a temporary path.
# NOTE(review): gensim 4 documents glove2word2vec as deprecated in favour of
# KeyedVectors.load_word2vec_format(..., binary=False, no_header=True) — confirm against
# the installed gensim version before switching.
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

# Load the converted vectors; `model` is the embedding lookup used by the cells below.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [173]:
# Embedding dimensionality: length of the vector stored for the first vocabulary key.
model[model.index_to_key[0]].shape[0]

100

In [7]:
# Read the two Excel inputs (paths are relative to the notebook's working directory).
# presumably entity_data holds the column names to be tagged — verify against the workbook.
entity_data = pd.read_excel("../data/entity_data.xlsx")
root_word_corpus = pd.read_excel("../data/root_word_corpus.xlsx")
# Plain Python list of the values in the "column_names" column.
column_names = entity_data["column_names"].tolist()

In [120]:
# column_names

In [201]:
# Keep only id / name / entity and turn every row into a dict; this list of records is
# what find_in_corpus() and construct_corpus_matrix() receive as `word_corpus`.
intermediate_corpus = root_word_corpus[['id', 'name', 'entity']].to_dict('records')

In [106]:
def clean_column_name(raw_name: str) -> str:
    """Turn a raw column name into space-separated sub-words.

    Every special character (including ``_``, ``-``, ``.`` and ``%``) is replaced
    by a space, runs of spaces are collapsed to one, and the result is trimmed so
    that a leading or trailing special character does not leave an empty token
    when the name is later split on ``" "``.

    Args:
        raw_name: Column name as it appears in the source data.

    Returns:
        The cleaned name, e.g. ``"branch_manager"`` -> ``"branch manager"``.
    """
    spaced = re.sub(r'[@_!#$%^&*()<>?[\]./\\|}{~:-]', ' ', raw_name)  # Removal of special characters
    return re.sub(r"[ ]{2,}", " ", spaced).strip()  # Remove additional spaces


column_name = clean_column_name('branch_manager')
print(column_name)

branch manager


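1. Run named-entity recognition (NER) on the cleaned column name using spaCy
2. If NER finds nothing, look up the individual tokens in the intermediate corpus
3. If a token is not found in step 2, find semantically similar words in the 100-dimensional embedding space ($\mathbb{R}^{100}$)
4. Future scope:
    - `%` is currently removed along with the other special characters
    - Columns whose names contain `%` at the beginning or end are percentage columns
    - An exception for `%` needs to be added to the cleaning step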

In [199]:
# %%timeit

def model_based_ner(string: str, pipeline=None) -> list:
    """Return the entity type of every token in ``string`` that has one.

    Args:
        string: Text to analyse (typically a cleaned column name).
        pipeline: Optional callable that maps text to an iterable of tokens
            exposing ``ent_type_``. Defaults to the notebook-level spaCy
            pipeline ``nlp``.

    Returns:
        A list of entity-type labels, one per tagged token, in token order.
        Empty when no token carries an entity type.
    """
    # Parse the argument itself; the previous version ignored it and always
    # parsed the global `column_name`.
    active_pipeline = nlp if pipeline is None else pipeline
    doc = active_pipeline(string)
    # ent_type_ is an empty string for tokens outside any entity, so truthiness filters them out.
    return [token.ent_type_ for token in doc if token.ent_type_]

In [204]:
# %%timeit

def find_in_corpus(root_word: str, word_corpus: List[dict]) -> list:
    """Look up ``root_word`` in the corpus by exact match on each record's "name".

    Args:
        root_word: The word to search for.
        word_corpus: Corpus records; each one is indexed with ``["name"]``.

    Returns:
        One single-entry dict ``{name: entity}`` per matching record, in corpus
        order. An empty list when nothing matches.
    """
    return [
        {record.get("name"): record.get("entity")}
        for record in word_corpus
        if record["name"] == root_word
    ]

In [197]:
# %%timeit

def construct_corpus_matrix(word_corpus: List[dict], embedding_model):
    """Stack the embedding of every corpus entry's "name" into one matrix.

    Args:
        word_corpus: Corpus records; the value under "name" is the word embedded.
        embedding_model: Object exposing ``index_to_key``, ``__getitem__`` and
            ``get_vector`` (e.g. a gensim ``KeyedVectors``).

    Returns:
        A ``(len(word_corpus), embedding_dim)`` float array. Row ``i`` is the
        vector of ``word_corpus[i]["name"]``, or all zeros when the model raises
        ``KeyError`` for that name (out of vocabulary).
    """
    # Probe the dimensionality from the model itself rather than assuming 100,
    # so the zero fallback always matches the row width.
    embedding_dim = embedding_model[embedding_model.index_to_key[0]].shape[0]
    # Pre-fill with zeros: out-of-vocabulary rows are then already correct.
    word_array = np.zeros((len(word_corpus), embedding_dim))
    for row_number, root in enumerate(word_corpus):
        try:
            word_array[row_number, :] = embedding_model.get_vector(root.get("name"))
        except KeyError:
            # Unknown word: leave the zero row in place.
            continue

    return word_array

# corpus_array = construct_corpus_matrix(intermediate_corpus, model)

70.2 µs ± 381 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [205]:
# Step 1: model-based NER on the cleaned column name.
entity_list = model_based_ner(column_name)
print(entity_list)

# Step 2: corpus lookup, attempted only when NER returned no entity types.
if not entity_list:
    # Split on single spaces to get the sub-words of the column name.
    word_split = column_name.split(" ")
    # Step 2.a: Collect every corpus match for each sub-word, in sub-word order
    entity_from_corpus = []
    for word in word_split:
        entity_from_corpus.extend(find_in_corpus(word, intermediate_corpus))
    # Step 2.b (not implemented in this cell): check whether the trailing sub-word returned an entity
    # Note: There are 4 possibilities:
        # 1. All sub-words return an entity
        # 2. Only sub-words other than the trailing sub-word return an entity
        # 3. Some sub-words, including the trailing sub-word, return an entity
        # 4. None of them return an entity
        
    print(entity_from_corpus)
    # Build the corpus embedding matrix; in this cell only its shape is inspected.
    corpus_array = construct_corpus_matrix(intermediate_corpus, model)
    print(corpus_array.shape)


# corpus_array = construct_corpus_matrix(intermediate_corpus, model)
# corpus_array.shape

[]
[{'branch': 'Location'}, {'branch': 'Organisation'}]
(64, 100)
