This notebook sets up a series of functions that normalize company-name text, strip stop words, and compute string-match scores.


In [1]:
# %%sh
# pip install fuzzywuzzy --user
# pip install python-Levenshtein --user
# pip install normalizr --user
# pip install jellyfish --user

In [2]:
#configure environment vars
# import sys
# import os
# sys.path.append(os.path.abspath("/home/ad.edap-cluster.com/dsmith04/.local/lib/python3.5/site-packages"))

In [3]:

import normalizr
from normalizr import Normalizr

# Stop words removed from company names by norm(). Each entry is matched as a
# whole word (regex \b...\b) against text that norm() has already upper-cased,
# so entries must be upper case. Add words to this list as needed.
stop_words_lst = [
    'INC',
    'INCORPORATED',
    'INCORP',
    'CO',
    'COMPANY',
    'L L C',
    'L L P',
    'THE',
    'LLC',
    'LLP',
]

# regex to support stop word list
import re

#pandas and numpy for data frame
import pandas as pd
import numpy as np

# needed for cosine similarity
import math
from collections import Counter

#Difflib
import difflib
from difflib import SequenceMatcher

# Fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Jellyfish
import jellyfish


In [4]:
# Text Normalization
def norm(string):
    """Upper-case, normalize and strip stop words from a single name.

    The text is upper-cased, run through the normalizr pipeline below, has
    every entry of ``stop_words_lst`` removed as a whole word, and is then
    run through the same pipeline a second time.

    Parameters
    ----------
    string : str
        Raw text to clean (must support ``.upper()``).

    Returns
    -------
    pandas.Series
        Series built from the cleaned string.
    """
    upper_text = string.upper()
    cleaner = Normalizr(language='en')
    # normalizr steps, applied in the listed order
    pipeline = [
        ('replace_punctuation', {'replacement': ''}),
        'remove_accent_marks',
        ('replace_hyphens', {'replacement': ' '}),
        'replace_symbols',
        'remove_extra_whitespaces',
    ]
    cleaned = cleaner.normalize(upper_text, pipeline)

    # drop each stop word wherever it appears as a whole word
    for stop_word in stop_words_lst:
        cleaned = re.sub(r'\b' + stop_word + r'\b', '', cleaned)

    # second pass over the pipeline after stop words were blanked out
    return pd.Series(cleaner.normalize(cleaned, pipeline))

In [5]:
# Cosine Distance Function
def get_cosine(x):
    """Cosine similarity between the word-count vectors of two strings.

    Parameters
    ----------
    x : sequence
        Row-like object; the strings compared are ``x[3]`` and ``x[4]``
        (the Norm_A / Norm_B columns when applied row-wise in this notebook).

    Returns
    -------
    float
        Dot product of the two ``\\w+`` token-count vectors divided by the
        product of their magnitudes; 0.0 when that product is zero
        (i.e. at least one string has no tokens).
    """
    first, second = x[3], x[4]
    tokenizer = re.compile(r'\w+')
    counts_a = Counter(tokenizer.findall(first))
    counts_b = Counter(tokenizer.findall(second))

    # only tokens present in both texts contribute to the dot product
    shared_tokens = set(counts_a) & set(counts_b)
    dot_product = sum(counts_a[tok] * counts_b[tok] for tok in shared_tokens)

    magnitude_a = math.sqrt(sum(count ** 2 for count in counts_a.values()))
    magnitude_b = math.sqrt(sum(count ** 2 for count in counts_b.values()))
    magnitude = magnitude_a * magnitude_b

    if magnitude:
        return float(dot_product) / magnitude
    return 0.0

In [6]:
# Difflib
def difflib(x):
    """Return the SequenceMatcher similarity ratio of ``x[3]`` and ``x[4]``.

    NOTE: defining this function rebinds the name ``difflib``, hiding the
    ``difflib`` module imported at the top of the notebook; only the
    separately imported ``SequenceMatcher`` is used here.
    """
    matcher = SequenceMatcher(None, x[3], x[4])
    return matcher.ratio()

In [7]:
# FuzzyWuzzy
def fuzzratio(x):
    """Return ``fuzz.ratio`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return fuzz.ratio(left, right)

def fuzzpartialratio(x):
    """Return ``fuzz.partial_ratio`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return fuzz.partial_ratio(left, right)

def fuzztokensortratio(x):
    """Return ``fuzz.token_sort_ratio`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return fuzz.token_sort_ratio(left, right)

def fuzztokensetratio(x):
    """Return ``fuzz.token_set_ratio`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return fuzz.token_set_ratio(left, right)

In [8]:
def compute_jaccard_similarity_score(x):
    """Character-level Jaccard similarity of the strings ``x[3]`` and ``x[4]``.

    Each string is reduced to its set of distinct characters; the score is
    the size of the intersection divided by the size of the union.

    Parameters
    ----------
    x : sequence
        Row-like object; positions 3 and 4 hold the two strings
        (the Norm_A / Norm_B columns when applied row-wise in this notebook).

    Returns
    -------
    float
        Value in [0.0, 1.0]. When both strings are empty (possible after
        norm() strips a name made only of stop words) the union is empty and
        0.0 is returned instead of raising ZeroDivisionError, matching the
        zero-denominator result of get_cosine.
    """
    chars_a = set(x[3])
    chars_b = set(x[4])
    union = chars_a | chars_b
    if not union:
        # two empty strings: nothing to compare, avoid dividing by zero
        return 0.0
    return len(chars_a & chars_b) / float(len(union))

In [9]:
def levenshtein(x):
    """Return ``jellyfish.levenshtein_distance`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return jellyfish.levenshtein_distance(left, right)

def damerau_levenshtein(x):
    """Return ``jellyfish.damerau_levenshtein_distance`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return jellyfish.damerau_levenshtein_distance(left, right)

def jaro_distance(x):
    """Return the Jaro score for the strings at positions 3 and 4 of row ``x``.

    jellyfish renamed ``jaro_distance`` to ``jaro_similarity`` and later
    removed the old name, so the new name is preferred and the old one is
    used only when the installed jellyfish does not provide the new one.

    Parameters
    ----------
    x : sequence
        Row-like object; positions 3 and 4 hold the two strings.

    Returns
    -------
    Whatever the underlying jellyfish Jaro function returns.
    """
    jaro = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance
    return jaro(x[3], x[4])

def jaro_winkler(x):
    """Return the Jaro-Winkler score for the strings at positions 3 and 4 of row ``x``.

    jellyfish renamed ``jaro_winkler`` to ``jaro_winkler_similarity`` and
    later removed the old name, so the new name is preferred and the old one
    is used only when the installed jellyfish does not provide the new one.

    Parameters
    ----------
    x : sequence
        Row-like object; positions 3 and 4 hold the two strings.

    Returns
    -------
    Whatever the underlying jellyfish Jaro-Winkler function returns.
    """
    winkler = getattr(jellyfish, 'jaro_winkler_similarity', None) or jellyfish.jaro_winkler
    return winkler(x[3], x[4])

def j_match_rating(x):
    """Return ``jellyfish.match_rating_comparison`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return jellyfish.match_rating_comparison(left, right)

def hamming_distance(x):
    """Return ``jellyfish.hamming_distance`` for the strings at positions 3 and 4 of row ``x``."""
    left, right = x[3], x[4]
    return jellyfish.hamming_distance(left, right)

def soundext1(x):
    """Soundex-encode each whitespace-separated word of ``x[3]``.

    Returns the per-word ``jellyfish.soundex`` codes joined by single spaces,
    with surrounding whitespace stripped.
    """
    codes = [jellyfish.soundex(word) for word in x[3].split()]
    return ' '.join(codes).strip()

def soundext2(x):
    """Soundex-encode each whitespace-separated word of ``x[4]``.

    Returns the per-word ``jellyfish.soundex`` codes joined by single spaces,
    with surrounding whitespace stripped.
    """
    codes = [jellyfish.soundex(word) for word in x[4].split()]
    return ' '.join(codes).strip()

def soundexcompare(x):
    """Return whether the values at positions 18 and 19 of row ``x`` are equal.

    In this notebook's driver cell those positions are the Soundex_Text1 and
    Soundex_Text2 columns, so this depends on the column order of the frame.
    """
    first_code, second_code = x[18], x[19]
    return first_code == second_code

def metaphone1(x):
    """Return ``jellyfish.metaphone`` applied to the whole string at ``x[3]``."""
    return jellyfish.metaphone(x[3])

def metaphone2(x):
    """Return ``jellyfish.metaphone`` applied to the whole string at ``x[4]``."""
    return jellyfish.metaphone(x[4])

def metacompare(x):
    """Return whether the values at positions 21 and 22 of row ``x`` are equal.

    In this notebook's driver cell those positions are the Metaphone_Text1 and
    Metaphone_Text2 columns, so this depends on the column order of the frame.
    """
    first_code, second_code = x[21], x[22]
    return first_code == second_code

def nysiis1(x):
    """NYSIIS-encode each whitespace-separated word of ``x[3]``.

    Returns the per-word ``jellyfish.nysiis`` codes joined by single spaces,
    with surrounding whitespace stripped.
    """
    codes = [jellyfish.nysiis(word) for word in x[3].split()]
    return ' '.join(codes).strip()

def nysiis2(x):
    """NYSIIS-encode each whitespace-separated word of ``x[4]``.

    Returns the per-word ``jellyfish.nysiis`` codes joined by single spaces,
    with surrounding whitespace stripped.
    """
    codes = [jellyfish.nysiis(word) for word in x[4].split()]
    return ' '.join(codes).strip()

def iiscompare(x):
    """Return whether the values at positions 24 and 25 of row ``x`` are equal.

    In this notebook's driver cell those positions are the NYSIIS_Text1 and
    NYSIIS_Text2 columns, so this depends on the column order of the frame.
    """
    first_code, second_code = x[24], x[25]
    return first_code == second_code

In [10]:
# Load the headerless pairs file and name its three columns (positions 0-2).
df = pd.read_csv('companies.csv', header=None)
df.columns = ['Company_A', 'Company_B','Known_Match']

# Normalized names become columns 3 and 4, which every scorer below reads
# positionally as x[3] / x[4].
for raw_col, norm_col in (('Company_A', 'Norm_A'), ('Company_B', 'Norm_B')):
    df[norm_col] = df[raw_col].apply(norm)

# Row-wise scorers, applied in this exact order. The order is load-bearing:
# soundexcompare, metacompare and iiscompare read columns by position
# (18/19, 21/22, 24/25), so inserting or reordering entries breaks them.
row_scorers = [
    ('Cosine_Dist', get_cosine),
    ('Difflib', difflib),
    ('Fuzzratio', fuzzratio),
    ('Fuzzpartialratio', fuzzpartialratio),
    ('Fuzztokensortratio', fuzztokensortratio),
    ('Fuzztokensetratio', fuzztokensetratio),
    ('Jaccard', compute_jaccard_similarity_score),
    ('Levenshtein', levenshtein),
    ('Damerau_Levenshtein', damerau_levenshtein),
    ('Jaro_Distance', jaro_distance),
    ('Jaro_Winkler', jaro_winkler),
    ('Jelly_Match_Rating', j_match_rating),
    ('Hamming_Distance', hamming_distance),
    ('Soundex_Text1', soundext1),
    ('Soundex_Text2', soundext2),
    ('Soundex_Compare', soundexcompare),
    ('Metaphone_Text1', metaphone1),
    ('Metaphone_Text2', metaphone2),
    ('Metaphone_Compare', metacompare),
    ('NYSIIS_Text1', nysiis1),
    ('NYSIIS_Text2', nysiis2),
    ('NYSIIS_Compare', iiscompare),
]
for score_col, scorer in row_scorers:
    df[score_col] = df.apply(scorer, axis=1)
df

loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading
loading


Unnamed: 0,Company_A,Company_B,Known_Match,Norm_A,Norm_B,Cosine_Dist,Difflib,Fuzzratio,Fuzzpartialratio,Fuzztokensortratio,...,Hamming_Distance,Soundex_Text1,Soundex_Text2,Soundex_Compare,Metaphone_Text1,Metaphone_Text2,Metaphone_Compare,NYSIIS_Text1,NYSIIS_Text2,NYSIIS_Compare
0,test company file for pandas read,test company,1,TEST FILE FOR PANDAS READ,TEST,0.447214,0.275862,28,100,28,...,21,T230 F400 F600 P532 R300,T230,False,TST FL FR PNTS RT,TST,False,TAST FAL FAR PAND RAD,TAST,False
1,general electric,GenerAR Electric,1,GENERAL ELECTRIC,GENERAR ELECTRIC,0.5,0.9375,94,94,94,...,1,G564 E423,G566 E423,False,JNRL ELKTRK,JNRR ELKTRK,False,GANARAL ELACTRAC,GANARAR ELACTRAC,False
2,The General Electric Co.,GEnERaal Electric,1,GENERAL ELECTRIC,GENERAAL ELECTRIC,0.5,0.969697,97,94,97,...,11,G564 E423,G564 E423,True,JNRL ELKTRK,JNRL ELKTRK,True,GANARAL ELACTRAC,GANARAL ELACTRAC,True
3,Company A,Different Company B,0,A,DIFFERENT B,0.0,0.0,0,0,0,...,11,A000,D165 B000,False,A,TFRNT B,False,A,DAFARAD B,False
4,The cocoa bean company,cocoa bean inc,0,COCOA BEAN,COCOA BEAN,1.0,1.0,100,100,100,...,0,C200 B500,C200 B500,True,KK BN,KK BN,True,CAC BAN,CAC BAN,True
5,Somecompany L.L.C.,SomeCompany L L C,1,SOMECOMPANY,SOMECOMPANY,1.0,1.0,100,100,100,...,0,S525,S525,True,SMKMPN,SMKMPN,True,SANACANPANY,SANACANPANY,True
6,SomeCompany LLP,SomeCompany LLP.,1,SOMECOMPANY,SOMECOMPANY,1.0,1.0,100,100,100,...,0,S525,S525,True,SMKMPN,SMKMPN,True,SANACANPANY,SANACANPANY,True
7,ThisCompany LLC,ThatCompany Inc,0,THISCOMPANY,THATCOMPANY,0.0,0.818182,82,82,82,...,2,T251,T325,False,0SKMPN,0TKMPN,False,TASCANPANY,TATCANPANY,False
8,WidgetCo,WidgetCo L.L.C.,1,WIDGETCO,WIDGETCO,1.0,1.0,100,100,100,...,0,W323,W323,True,WJTK,WJTK,True,WADGATC,WADGATC,True
