# Test matching with different score_cutoff values

In [1]:
import pandas as pd
import mwdsbe
import schuylkill as skool
import time

In [2]:
# Load the registry through the project's mwdsbe helper.
registry = mwdsbe.load_registry() # geopandas df

In [3]:
# Read the 'general_funds' sheet of the report workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine without editing it.
gf = pd.read_excel(
    r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\mwdsbe\data\cwedp_37_report.xlsx',
    sheet_name='general_funds',
)

In [4]:
# Number of rows in the registry.
len(registry)

3119

In [5]:
# Number of rows in the general_funds sheet as loaded, before any filtering.
len(gf)

324213

In [6]:
# Filter gf: drop rows with the irrelevant MAJ_CLASS value (1) and rows that
# have no vendor name.
gf = gf.loc[gf['MAJ_CLASS'] != 1]
# dropna(subset=...) removes the null-name rows directly instead of re-selecting
# by index label, so it stays correct even if the index has duplicate labels.
gf = gf.dropna(subset=['VEND_NAME'])
# Row count after filtering.
len(gf)

243375

In [7]:
# Clean the name columns of both tables with skool.clean_strings, then drop
# rows whose primary name column is null.
# NOTE(review): the meaning of the positional `True` flag is not visible here;
# ignore_words is presumably the list of tokens stripped from names. Confirm
# against skool.clean_strings.
ignore_words = [
    'inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co',
    'associates', 'services', 'company', 'enterprises', 'enterprise',
    'service', 'corporation',
]

cleaned_registry = skool.clean_strings(
    registry, ['company_name', 'dba_name'], True, ignore_words
).dropna(subset=['company_name'])

cleaned_gf = skool.clean_strings(
    gf, ['VEND_NAME'], True, ignore_words
).dropna(subset=['VEND_NAME'])

## Fuzz 95 + TFIDF 85
* Number of wrong matches: 6
* Highest match score among the wrong matches: 0.916

In [34]:
# Run the four-stage match and time it: fuzzy_merge on company_name, then on
# dba_name (score_cutoff=95), followed by tf_idf_merge on the same two columns
# (score_cutoff=85). Each .pipe call passes the previous result to the next
# merge function as its first argument.
# perf_counter() is a monotonic interval clock, so the measured duration cannot
# be skewed by system clock adjustments during this long-running cell.
t1 = time.perf_counter()
merged3 = (
    skool.fuzzy_merge(cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=85)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=85)
)
t = time.perf_counter() - t1  # elapsed seconds

In [35]:
# Report the elapsed time of the merge in minutes (t is in seconds).
print('Execution time:', t/60, 'min')

Execution time: 22.648447954654692 min


In [36]:
# Drop rows whose VEND_NAME is null, presumably the registry rows that found
# no general_funds match (TODO confirm against skool's merge output).
matched3 = merged3.dropna(subset=['VEND_NAME'])

In [37]:
# Number of rows kept for the Fuzz 95 + TFIDF 85 run.
len(matched3)

136

In [38]:
# Narrow the matches to the registry names, the match score and the matched
# vendor name.
simple_matched3 = matched3[[
    'company_name',
    'dba_name',
    'match_probability',
    'VEND_NAME',
]]

In [39]:
# Write the narrowed TF-IDF 85 matches to an Excel workbook, column headers included.
# NOTE(review): absolute, machine-specific output path.
simple_matched3.to_excel(
    r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\offset\fuzz95+tfidf85.xlsx',
    header=True,
)

## Fuzz 95 + TFIDF 90

In [15]:
# Run the four-stage match and time it: fuzzy_merge on company_name, then on
# dba_name (score_cutoff=95), followed by tf_idf_merge on the same two columns
# (score_cutoff=90). Each .pipe call passes the previous result to the next
# merge function as its first argument.
# perf_counter() is a monotonic interval clock, so the measured duration cannot
# be skewed by system clock adjustments during this long-running cell.
t1 = time.perf_counter()
merged = (
    skool.fuzzy_merge(cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=90)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=90)
)
t = time.perf_counter() - t1  # elapsed seconds

In [16]:
# Report the elapsed time of the merge in minutes (t is in seconds).
print('Execution time:', t/60, 'min')

Execution time: 22.16238275766373 min


In [17]:
# Drop rows whose VEND_NAME is null, presumably the registry rows that found
# no general_funds match (TODO confirm against skool's merge output).
matched = merged.dropna(subset=['VEND_NAME'])

In [23]:
# Number of rows kept for the Fuzz 95 + TFIDF 90 run.
len(matched)

129

In [24]:
# Narrow the matches to the registry names, the match score and the matched
# vendor name.
simple_matched = matched[[
    'company_name',
    'dba_name',
    'match_probability',
    'VEND_NAME',
]]

In [26]:
# Write the narrowed TF-IDF 90 matches to an Excel workbook, column headers included.
# NOTE(review): absolute, machine-specific output path.
simple_matched.to_excel(
    r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\offset\fuzz95_tfidf90.xlsx',
    header=True,
)

Fuzz95+TFIDF90 has fewer matches (129) than Fuzz95+TFIDF85 minus its wrong matches (136 - 6 = 130).

In [None]:
# Collect the matched company names of the TF-IDF 85 run (set1) and the
# TF-IDF 90 run (set2) so the two runs can be compared.
# The original read from `t85`, which is never defined in this notebook and
# raised NameError; matched3 holds the Fuzz 95 + TFIDF 85 matches.
# NOTE(review): matched3 still contains the wrong matches noted for that run.
# If the comparison was meant to exclude them, load that curated set instead.
set1=set(matched3['company_name'])
set2=set(matched['company_name'])

## Fuzz 95 + TFIDF 95

In [19]:
# Run the four-stage match and time it: fuzzy_merge on company_name, then on
# dba_name (score_cutoff=95), followed by tf_idf_merge on the same two columns
# (score_cutoff=95). Each .pipe call passes the previous result to the next
# merge function as its first argument.
# perf_counter() is a monotonic interval clock, so the measured duration cannot
# be skewed by system clock adjustments during this long-running cell.
t1 = time.perf_counter()
merged2 = (
    skool.fuzzy_merge(cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=95)
)
t = time.perf_counter() - t1  # elapsed seconds

In [20]:
# Report the elapsed time of the merge in minutes (t is in seconds).
print('Execution time:', t/60, 'min')

Execution time: 22.20575442711512 min


In [21]:
# Drop rows whose VEND_NAME is null, presumably the registry rows that found
# no general_funds match (TODO confirm against skool's merge output).
matched2 = merged2.dropna(subset=['VEND_NAME'])

In [27]:
# Number of rows kept for the Fuzz 95 + TFIDF 95 run.
len(matched2)

124

In [28]:
# Narrow the matches to the registry names, the match score and the matched
# vendor name.
simple_matched2 = matched2[[
    'company_name',
    'dba_name',
    'match_probability',
    'VEND_NAME',
]]

In [29]:
# Write the narrowed TF-IDF 95 matches to an Excel workbook, column headers included.
# NOTE(review): absolute, machine-specific output path.
simple_matched2.to_excel(
    r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\offset\fuzz95_tfidf95.xlsx',
    header=True,
)

## Best Score_cutoff 0.913

In [50]:
# Keep the TF-IDF 85 matches whose match_probability is strictly above 0.913.
# The mask is built from matched3 itself; simple_matched3 is only a column
# subset of it, so the values are the same.
# NOTE(review): the TF-IDF 85 section records 0.916 as the highest score among
# wrong matches, which a 0.913 cutoff would not exclude. Confirm the intended value.
best_matched = matched3.loc[matched3['match_probability'] > 0.913]

In [51]:
# Number of matches remaining above the 0.913 cutoff.
len(best_matched)

127

In [52]:
# Write the filtered matches (all columns) to an Excel workbook, column headers included.
# NOTE(review): absolute, machine-specific output path.
best_matched.to_excel(
    r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\fuzz95_tfidf913.xlsx',
    header=True,
)

## Fuzz 95 + TFIDF 80

In [53]:
# Run the four-stage match and time it: fuzzy_merge on company_name, then on
# dba_name (score_cutoff=95), followed by tf_idf_merge on the same two columns
# (score_cutoff=80). Each .pipe call passes the previous result to the next
# merge function as its first argument.
# perf_counter() is a monotonic interval clock, so the measured duration cannot
# be skewed by system clock adjustments during this long-running cell.
t1 = time.perf_counter()
merged4 = (
    skool.fuzzy_merge(cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=95)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="company_name", right_on="VEND_NAME", score_cutoff=80)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_gf, left_on="dba_name", right_on="VEND_NAME", score_cutoff=80)
)
t = time.perf_counter() - t1  # elapsed seconds

In [54]:
# Report the elapsed time of the merge in minutes (t is in seconds).
print('Execution time:', t/60, 'min')

Execution time: 22.857625369230906 min


In [56]:
# Drop rows whose VEND_NAME is null, presumably the registry rows that found
# no general_funds match (TODO confirm against skool's merge output).
matched4 = merged4.dropna(subset=['VEND_NAME'])

In [57]:
# Number of rows kept for the Fuzz 95 + TFIDF 80 run.
len(matched4)

150

In [58]:
# Narrow the matches to the registry names, the match score and the matched
# vendor name.
simple_matched4 = matched4[[
    'company_name',
    'dba_name',
    'match_probability',
    'VEND_NAME',
]]

In [59]:
# Write the narrowed TF-IDF 80 matches to an Excel workbook, column headers included.
# NOTE(review): absolute, machine-specific output path.
simple_matched4.to_excel(
    r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\offset\fuzz95_tfidf80.xlsx',
    header=True,
)

In [60]:
# Write the full TF-IDF 80 match table (all columns) to an Excel workbook,
# column headers included.
# NOTE(review): absolute, machine-specific output path.
matched4.to_excel(
    r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\fuzz95_tfidf80.xlsx',
    header=True,
)