# Entity Resolution on Unlabeled Text Data

#### By Ahsan Khan

In [2]:
# Set up Library imports

import pandas as pd
import re
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
#from sklearn.metrics import jaccard_similarity_score

In [3]:
# Read csv files into Pandas dataframes

entities_df = pd.read_csv('All entities.csv')

targets_df = pd.read_csv('All targets.csv')

# deep copy
intact_entities_df = entities_df.copy()
intact_targets_df = targets_df.copy()

In [4]:
entities_df

Unnamed: 0,All entities
0,BLACKROCK ADV UKLTD-MORGAN-AGG
1,BLACKROCK FIN MG AAF-MORGANTRN
2,Commingled Pension Trust Fund (Core Bond) of J...
3,COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...
4,COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...
...,...
367,MORGAN STANLEY & CO INTERNATIONAL PLC
368,MORGAN STANLEY & CO INTL P-GBR
369,MORGAN STANLEY & CO INTL P-GBR
370,MORGAN STANLEY & CO INTL P-IOS


In [8]:
targets_df

Unnamed: 0,Target
0,Blackrock Advisors (UK) Limited
1,"BlackRock Financial Management, Inc."
2,Commingled Pension Trust Fund
3,Commingled Pension Trust Fund
4,Commingled Pension Trust Fund
...,...
367,Morgan Stanley & Co. International plc
368,Morgan Stanley & Co. International plc
369,Morgan Stanley & Co. International plc
370,Morgan Stanley & Co. International plc


### Preprocessing 

In the preprocessing state we are removing all punctuation, numbers and converting all text to lowercase. This will ensure uniformity in the corpus and all incoming entities

In [9]:
# Remove punctuation and numbers, and convert to lower case 

def preprocess(df):
    df.iloc[:,0] = df.iloc[:,0].apply(lambda x : str.lower(x)) # lower case
    df.iloc[:,0] = df.iloc[:,0].apply\
            (lambda x : " ".join(re.findall('[\w]+',x))) # remove punc, nums 
    return df

In [15]:
entities_df.iloc[:,0]

0                         BLACKROCK ADV UKLTD-MORGAN-AGG
1                         BLACKROCK FIN MG AAF-MORGANTRN
2      Commingled Pension Trust Fund (Core Bond) of J...
3      COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...
4      COMMINGLED PENSION TRUST FUND (CORE PLUS BOND)...
                             ...                        
367                MORGAN STANLEY & CO INTERNATIONAL PLC
368                       MORGAN STANLEY & CO INTL P-GBR
369                       MORGAN STANLEY & CO INTL P-GBR
370                       MORGAN STANLEY & CO INTL P-IOS
371                         MORGAN STANLEY & CO IN11-LCW
Name: All entities, Length: 372, dtype: object

### Generating 3 character tokens (markov N-grams)

In [10]:
# A function for generating markov tokens 

remove_spaces = lambda s : "".join(s.split())

def split_3_char(s):
    if len(s) == 3:
        return s
    return s[:3] + ' ' + split_3_char(s[1:])

In [11]:
# Generating markov tokens of our dataframe

def tokenize_all(df):
    l = list(df.iloc[:,0])
    Tokens = []
    for w in l:
        w = remove_spaces(w)
        tok = split_3_char(w)
        Tokens.append(tok)
    return Tokens

### Checking jaccard similarty (scraped)

It was found that Jaccard similarity was not giving us the best match even though it is known best for text data. Say we have the token "ele" 3 repeat times in the title of an entity. Jaccard similarity will count it as just 1 "ele" and hence defeat the purpose. Cosine similarity will give us a better matching.

In [16]:
# jaccard similartiy function

def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

## Preprocessing

In [18]:
# Running our functions on data

entities_df = preprocess(entities_df)
targets_df = preprocess(targets_df)

In [20]:
entities_df

Unnamed: 0,All entities
0,blackrock adv ukltd morgan agg
1,blackrock fin mg aaf morgantrn
2,commingled pension trust fund core bond of jpm...
3,commingled pension trust fund core plus bond o...
4,commingled pension trust fund core plus bond o...
...,...
367,morgan stanley co international plc
368,morgan stanley co intl p gbr
369,morgan stanley co intl p gbr
370,morgan stanley co intl p ios


In [21]:
targets_df

Unnamed: 0,Target
0,blackrock advisors uk limited
1,blackrock financial management inc
2,commingled pension trust fund
3,commingled pension trust fund
4,commingled pension trust fund
...,...
367,morgan stanley co international plc
368,morgan stanley co international plc
369,morgan stanley co international plc
370,morgan stanley co international plc


## Tokenization 

In [23]:
ent_corpus = tokenize_all(entities_df)
targ_corpus = tokenize_all(targets_df)

In [30]:
targ_corpus[3]

'com omm mmi min ing ngl gle led edp dpe pen ens nsi sio ion ont ntr tru rus ust stf tfu fun und'

In [31]:
ent_corpus[3]

'com omm mmi min ing ngl gle led edp dpe pen ens nsi sio ion ont ntr tru rus ust stf tfu fun und ndc dco cor ore rep epl plu lus usb sbo bon ond ndo dof ofj fjp jpm pmo mor org rga gan anc nch cha has ase seb eba ban ank nkn kna'

In [32]:
ent_size = len(ent_corpus)
corpus_ent_targ = ent_corpus + targ_corpus # A giant corpus containing both entities and targets markov tokens.

In [43]:
ent_size

372

In [42]:
# Main corpus

len(corpus_ent_targ)

744

## Cosine similariy


#### Making TF matrix (not tf-idf)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
tf = vectorizer.fit_transform(corpus_ent_targ)

After applying tf-idf scores I noticed the 'idf' part was giving less weights to tokens that appeared more often in the corpus. In our case this is not what we want. We want all the tokens to be given equal weights as each token represents a chunk of the entity's title. Hence, just tf scoring and not tf-idf scoring is better in this case.

Now we have a sparse maxtrix of all tokens

In [39]:
print(tf)

  (0, 130)	1
  (0, 461)	1
  (0, 59)	1
  (0, 165)	1
  (0, 457)	1
  (0, 726)	1
  (0, 631)	1
  (0, 162)	1
  (0, 447)	1
  (0, 64)	1
  (0, 239)	1
  (0, 948)	1
  (0, 911)	1
  (0, 454)	1
  (0, 523)	1
  (0, 833)	1
  (0, 220)	1
  (0, 547)	1
  (0, 656)	1
  (0, 712)	1
  (0, 355)	1
  (0, 86)	1
  (0, 554)	1
  (0, 68)	1
  (1, 130)	1
  :	:
  (743, 427)	1
  (743, 121)	1
  (743, 857)	1
  (743, 425)	1
  (743, 612)	1
  (743, 845)	1
  (743, 97)	1
  (743, 679)	1
  (743, 610)	1
  (743, 806)	1
  (743, 826)	1
  (743, 95)	1
  (743, 603)	1
  (743, 487)	1
  (743, 329)	1
  (743, 558)	1
  (743, 643)	1
  (743, 555)	1
  (743, 961)	1
  (743, 300)	1
  (743, 725)	1
  (743, 80)	1
  (743, 172)	1
  (743, 638)	1
  (743, 520)	1


In [41]:
tf

<744x976 sparse matrix of type '<class 'numpy.int64'>'
	with 20487 stored elements in Compressed Sparse Row format>

Now finding cosine similarity of each entity with every target.`

In [47]:
# cosine on Tf

similarity_ent_targ = cosine_similarity(tf[0:ent_size], tf[ent_size:])
similarity_ent_targ # List of lists containing each entity and its weights

array([[0.18856181, 0.17153801, 0.14142136, ..., 0.10721125, 0.10721125,
        0.10721125],
       [0.18856181, 0.17153801, 0.14142136, ..., 0.10721125, 0.10721125,
        0.10721125],
       [0.18856181, 0.17153801, 0.14142136, ..., 0.10721125, 0.10721125,
        0.10721125],
       ...,
       [0.        , 0.        , 0.        , ..., 0.61588176, 0.61588176,
        0.61588176],
       [0.        , 0.        , 0.        , ..., 0.61588176, 0.61588176,
        0.61588176],
       [0.        , 0.        , 0.        , ..., 0.61588176, 0.61588176,
        0.61588176]])

In [48]:
# Index of target with max score for each entry.

targ_indexes = similarity_ent_targ.argmax(axis=1)
targ_indexes #Indexes of targets determined by argmax on cosine similarity weights

array([179, 179, 179, 179, 179, 179, 179, 179, 179, 179,   2,   2,   2,
         2,   2,   2,   2,   2,   2,   2,   2, 213, 213, 213, 216, 217,
       218,  18,  98,  31,  31,  18,  18,  18,  18, 227, 227, 227, 218,
       172,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,
        18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18, 256,
       256, 256, 256], dtype=int64)

In [49]:
entities_df = intact_entities_df
targets_df = intact_targets_df
entities_series = entities_df.iloc[:,0]

In [50]:
# Get targets given indexes

associated_targets = targets_df.iloc[targ_indexes] 
associated_targets_series = associated_targets.iloc[:,0]
associated_targets_series = associated_targets_series.reset_index(drop=False)

In [51]:
# Concatenating final dataframe

df = pd.concat([entities_series, associated_targets_series], axis=1)

Creating a csv file

In [52]:
df.to_csv('C:\\Users\\Ahsan\\Desktop\\Capstone\\Prototype\\matched.csv', index=False)

## Output table of matched entities to its target

In [33]:
df

Unnamed: 0,Random entities,index,Target
0,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
1,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
2,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
3,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
4,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
5,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
6,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
7,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
8,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
9,JP MORGAN BANK (IRELAND) PLC AS TRUSTEE OF BLA...,179,J.P. Morgan Bank (Ireland) plc
