# Entity Resolution on Unlabeled Text Data

#### By Ahsan Khan

In [2]:
# Set up Library imports

import pandas as pd
import re
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import jaccard_similarity_score

In [3]:
# Read csv files into Pandas dataframes

entities_df = pd.read_csv('C:\\Users\\Ahsan\\Desktop\\Capstone\\prototype\\Random entity test 1.csv')

# Every path separator is written as an escaped backslash. A lone "\A" is not a
# valid escape sequence and newer Python versions warn about it.
targets_df = pd.read_csv('C:\\Users\\Ahsan\\Desktop\\Capstone\\prototype\\All targets.csv')

# Deep copies of the raw frames, kept aside because preprocessing overwrites
# the text column of the frames it is given.
intact_entities_df = entities_df.copy()
intact_targets_df = targets_df.copy()

In [4]:
# Inspect the entity names as loaded from the CSV.
entities_df

Unnamed: 0,Random entities
0,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
1,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
2,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
3,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
4,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
5,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
6,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
7,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
8,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...
9,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...


In [5]:
# Inspect the candidate target names as loaded from the CSV.
targets_df

Unnamed: 0,Target
0,Blackrock Advisors (UK) Limited
1,"BlackRock Financial Management, Inc."
2,Commingled Pension Trust Fund
3,Commingled Pension Trust Fund
4,Commingled Pension Trust Fund
5,Commingled Pension Trust Fund
6,Commingled Pension Trust Fund
7,Commingled Pension Trust Fund
8,Commingled Pension Trust Fund
9,Commingled Pension Trust Fund


### Preprocessing 

In the preprocessing stage we remove all punctuation and numbers and convert all text to lowercase. This ensures uniformity across the corpus and all incoming entities.

In [6]:
# Remove punctuation and numbers, and convert to lower case 

def preprocess(df):
    """Normalise the text held in the first column of ``df``.

    Each value is lower-cased and reduced to its runs of letters, joined by
    single spaces. Punctuation, digits and underscores are therefore dropped,
    which is what the notebook's description of this step promises.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column contains the strings to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Its first column is overwritten in place, so
        callers that need the raw text must keep their own copy.
    """
    def _clean(text):
        # [^\W\d_] is "word character, but not a digit or underscore",
        # i.e. letters only. A bare \w would let numbers through.
        return " ".join(re.findall(r'[^\W\d_]+', str.lower(text)))

    df.iloc[:, 0] = df.iloc[:, 0].apply(_clean)
    return df

### Generating 3 character tokens (markov N-grams)

In [7]:
# A function for generating markov tokens 

def remove_spaces(s):
    """Return ``s`` with all whitespace stripped out."""
    chunks = s.split()  # no separator given: splits on any run of whitespace
    return "".join(chunks)

def split_3_char(s):
    """Return the overlapping 3-character windows of ``s`` joined by spaces.

    For example ``"abcde"`` becomes ``"abc bcd cde"``.

    Parameters
    ----------
    s : str
        Text to slice, normally with whitespace already removed.

    Returns
    -------
    str
        Space-separated 3-character tokens. A string of three characters or
        fewer is returned unchanged, because there is nothing to slide over.
    """
    # Built with a loop rather than recursion: input shorter than three
    # characters would never reach a ``len(s) == 3`` base case, and very long
    # input would exceed the interpreter's recursion limit.
    if len(s) <= 3:
        return s
    return ' '.join(s[i:i + 3] for i in range(len(s) - 2))

In [8]:
# Generating markov tokens of our dataframe

def tokenize_all(df):
    """Build the 3-character token string for every value in the first column.

    Whitespace is removed from each value before it is sliced, so tokens run
    across word boundaries. Returns a list with one token string per row, in
    row order.
    """
    return [split_3_char(remove_spaces(text)) for text in df.iloc[:, 0]]

### Checking Jaccard similarity (scrapped)

It was found that Jaccard similarity was not giving us the best match, even though it is commonly used for text data. Say the token "ele" appears 3 times in the title of an entity: Jaccard similarity works on sets, so it counts "ele" only once and the repetition is lost. Cosine similarity on token counts keeps that information and gives us a better match.

In [9]:
# Jaccard similarity function

def get_jaccard_sim(str1, str2):
    """Jaccard similarity between the whitespace-separated tokens of two strings.

    Both strings are split on whitespace and compared as sets, so a repeated
    token counts once.

    Parameters
    ----------
    str1, str2 : str
        Token strings to compare.

    Returns
    -------
    float
        ``|A ∩ B| / |A ∪ B|``, in the range 0.0 to 1.0. When neither string
        contains a token the ratio is undefined; 0.0 is returned so that two
        empty names are never reported as a match.
    """
    a = set(str1.split())
    b = set(str2.split())
    union_size = len(a | b)
    if union_size == 0:
        # Both token sets are empty: avoid dividing by zero.
        return 0.0
    return float(len(a & b)) / union_size

In [10]:
# Running our functions on data

# Clean both frames, then turn every name into its string of 3-character tokens.
entities_df, targets_df = preprocess(entities_df), preprocess(targets_df)
ent_corpus, targ_corpus = tokenize_all(entities_df), tokenize_all(targets_df)

# Number of entity rows; marks where the targets begin in the combined corpus.
ent_size = len(ent_corpus)

# One combined corpus: entity token strings first, target token strings after.
corpus_ent_targ = ent_corpus + targ_corpus

In [11]:
# Token strings generated for the entities, one string per entity row.
ent_corpus

['jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca cas ast str tru rus ust ste tee eeo eof ofb fbl bla lac ack ckr kro roc ock ckl kli lia iab abi bil ili lit ity tym yma mat atc tch chi hin ing ngf gfu fun und nds dsl sle lev eve ver era rag age ged eds dst stl tlg',
 'jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca cas ast str tru rus ust ste tee eeo eof ofb fbl bla lac ack ckr kro roc ock ckl kli lia iab abi bil ili lit ity tym yma mat atc tch chi hin ing ngf gfu fun und nds dsl sle lev eve ver era rag age ged eds dst stl tlg',
 'jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca cas ast str tru rus ust ste tee eeo eof ofb fbl bla lac ack ckr kro roc ock ckl kli lia iab abi bil ili lit ity tym yma mat atc tch chi hin ing ngf gfu fun und nds dsl sle lev eve ver era rag age ged eds dst stl tlg',
 'jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca ca

In [12]:
# Main corpus: the entity token strings followed by the target token strings

corpus_ent_targ

['jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca cas ast str tru rus ust ste tee eeo eof ofb fbl bla lac ack ckr kro roc ock ckl kli lia iab abi bil ili lit ity tym yma mat atc tch chi hin ing ngf gfu fun und nds dsl sle lev eve ver era rag age ged eds dst stl tlg',
 'jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca cas ast str tru rus ust ste tee eeo eof ofb fbl bla lac ack ckr kro roc ock ckl kli lia iab abi bil ili lit ity tym yma mat atc tch chi hin ing ngf gfu fun und nds dsl sle lev eve ver era rag age ged eds dst stl tlg',
 'jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca cas ast str tru rus ust ste tee eeo eof ofb fbl bla lac ack ckr kro roc ock ckl kli lia iab abi bil ili lit ity tym yma mat atc tch chi hin ing ngf gfu fun und nds dsl sle lev eve ver era rag age ged eds dst stl tlg',
 'jpm pmo mor org rga gan anb nba ban ank nki kir ire rel ela lan and ndp dpl plc lca ca

## Cosine similarity


#### Making TF matrix (not tf-idf)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Count-based vectorizer with default settings: term counts only, no idf weighting.
vectorizer = CountVectorizer()
# Fit on the combined corpus so entities and targets share one vocabulary.
# The later slicing by ent_size relies on the rows of tf following the order
# of corpus_ent_targ (entities first, then targets).
tf = vectorizer.fit_transform(corpus_ent_targ)

After applying tf-idf scores I noticed the 'idf' part was giving lower weights to tokens that appeared more often in the corpus. In our case this is not what we want: all tokens should be given equal weight, as each token represents a chunk of the entity's title. Hence plain tf scoring, rather than tf-idf scoring, is better in this case.

Now we have a sparse matrix of all tokens.

In [15]:
# NOTE(review): the recorded output of this cell is a TypeError raised by
# set_trace(), which suggests `print` had been rebound in the kernel session
# (leftover debugging state) — restart the kernel and re-run to confirm.
print(tf)

TypeError: set_trace() takes 0 positional arguments but 1 was given

In [16]:
# Show the sparse matrix summary: shape, dtype and number of stored elements.
tf

<440x645 sparse matrix of type '<class 'numpy.int64'>'
	with 11683 stored elements in Compressed Sparse Row format>

Now finding the cosine similarity of each entity with every target.

In [47]:
# Cosine similarity on the raw term-count (tf) matrix

# The first ent_size rows of tf are passed as the entities, the remaining rows as the targets.
similarity_ent_targ = cosine_similarity(tf[0:ent_size], tf[ent_size:])
similarity_ent_targ # 2-D array: one row per entity, one cosine score per target

array([[0.18856181, 0.17153801, 0.14142136, ..., 0.10721125, 0.10721125,
        0.10721125],
       [0.18856181, 0.17153801, 0.14142136, ..., 0.10721125, 0.10721125,
        0.10721125],
       [0.18856181, 0.17153801, 0.14142136, ..., 0.10721125, 0.10721125,
        0.10721125],
       ...,
       [0.        , 0.        , 0.        , ..., 0.61588176, 0.61588176,
        0.61588176],
       [0.        , 0.        , 0.        , ..., 0.61588176, 0.61588176,
        0.61588176],
       [0.        , 0.        , 0.        , ..., 0.61588176, 0.61588176,
        0.61588176]])

In [48]:
# Index of target with max score for each entry.

# NOTE(review): argmax always yields an index and no minimum-similarity cut-off
# is applied here, so every entity is paired with a target however low its
# best score is.
targ_indexes = similarity_ent_targ.argmax(axis=1)
targ_indexes # Target positions chosen by argmax over each entity's row of cosine scores

array([179, 179, 179, 179, 179, 179, 179, 179, 179, 179,   2,   2,   2,
         2,   2,   2,   2,   2,   2,   2,   2, 213, 213, 213, 216, 217,
       218,  18,  98,  31,  31,  18,  18,  18,  18, 227, 227, 227, 218,
       172,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,
        18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18, 256,
       256, 256, 256], dtype=int64)

In [49]:
# Switch back to the untouched copies so the output shows the original
# spelling of each name rather than the preprocessed text.
entities_df, targets_df = intact_entities_df, intact_targets_df
entities_series = entities_df.iloc[:, 0]

In [50]:
# Get targets given indexes

# Pull the best-matching target row for every entity, in entity order.
associated_targets = targets_df.iloc[targ_indexes]
# reset_index(drop=False) keeps each target's original position as an "index"
# column, so the result is a two-column frame despite the "_series" name.
associated_targets_series = associated_targets.iloc[:, 0].reset_index(drop=False)

In [51]:
# Concatenating final dataframe

# Place each entity next to its matched target (column-wise concat, aligned on the row index).
df = pd.concat([entities_series, associated_targets_series], axis="columns")

Creating a csv file

In [52]:
# Write the matched table to disk, leaving out the row index.
# NOTE(review): absolute, machine-specific output path — consider a configurable directory.
df.to_csv('C:\\Users\\Ahsan\\Desktop\\Capstone\\Prototype\\matched.csv', index=False)

## Output table of entities matched to their targets

In [33]:
# Final table: each entity, the position of its matched target, and the target name.
df

Unnamed: 0,Random entities,index,Target
0,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
1,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
2,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
3,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
4,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
5,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
6,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
7,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
8,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
9,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
