# Comparing Company Names using TF-IDF/ N-grams / cosine similarity

# Table of Contents

1. [Data](#data)
2. [Functions](#functions)
3. [Matching with TF-IDF](#tfidf)
4. [Comparing between fuzzy and TF-IDF](#compare)
    1. [Fuzzy match with score-cutoff 95](#fuzz95)
    2. [TF-IDF with score-cutoff 90](#tfidf90)
    3. [TF-IDF with score-cutoff 85](#tfidf85)
    4. [TF-IDF with score-cutoff 80](#tfidf80)
5. [Test Cosine Similary with sklearn](#cossimsklarn)

<a id="data"></a>
## Data

In [2]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import pandas as pd
import numpy as np

In [3]:
# import registry
registry = mwdsbe.load_registry() # geopandas df

In [4]:
np.shape(registry)

(3119, 20)

In [5]:
registry.head()

Unnamed: 0_level_0,company_name,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,mailing_state,mailing_zip,certification_type,capability,local,out_of_state,location_standard,lat,lng,geometry
registry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,119 Degrees Architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,MBE,"NAICS 5413 Architectural, Engineering, and Rel...",True,False,1503 GREEN ST,39.964275,-75.163042,POINT (-75.16304190105227 39.96427495800303)
1,12Bravo Group,,JEFFREY,YEKENCHIK,236 McKendimen Road,Medford Lakes,NJ,8055.0,236 McKendimen Road,Medford Lakes,NJ,8055.0,DSBE,"Addition, alteration and renovation for-sale b...",False,True,,,,
2,1st Choice Financial Group,ProVisio,Kathrina,Nease,133 N. 21st Street,Camp Hill,PA,17011.0,133 N. 21st Street,Camp Hill,PA,17011.0,WBE,NAICS 928120 International Affairs,False,False,,,,
3,212 Harakawa Inc.,Two Twelve,Ann,Harakawa,"236 W 27th Street, Suite 802",New York,NY,10001.0,"236 W 27th Street, Suite 802",New York,NY,10001.0,MWBE,Graphic Design Services; Graphic design servic...,False,True,,,,
4,215 Media Solutions,,Dewain,Johnson,810 Felton Avenue,Sharon Hill,PA,19079.0,810 Felton Avenue,Sharon Hill,PA,19079.0,MBE,NAICS 5414 Specialized Design Services ; NAICS...,False,False,,,,


In [6]:
# import license data
license = licenses.CommercialActivityLicenses().download()

In [7]:
np.shape(license)

(203479, 4)

In [8]:
license.head()

Unnamed: 0,license_num,issue_date,license_status,company_name
0,188053,1990-01-01T00:00:00Z,Active,BIRMINGHAM FIRE INS CO OF PA T
1,58781,1990-01-12T00:00:00Z,Active,CLAYMAN EDWARD P ESQ
2,57406,1990-01-23T00:00:00Z,Active,BRENNAN J F
3,332055,1990-02-08T00:00:00Z,Active,TASTY BAKING COMPANY
4,188058,1990-12-31T00:00:00Z,Active,MAGARGEE BROS INC


In [9]:
# clean_license = clean_license.drop_duplicates(subset='company_name', keep='first', inplace=False)

NameError: name 'clean_license' is not defined

<a id="functions"></a>
## Functions

In [13]:
# N-Grams
import re

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [14]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [15]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# company_names = clean_mini_registry['company_name']
# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# tf_idf_matrix = vectorizer.fit_transform(company_names)

In [16]:
# company_names_2 = clean_license['company_name']
# company_names_2 = company_names_2.dropna()

In [17]:
# tf_idf_matrix_2 = vectorizer.fit_transform(company_names_2)

In [18]:
# print(tf_idf_matrix[0])

In [19]:
# from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# cosine_similarity(tf_idf_matrix[0], tf_idf_matrix_2) # dimension error

In [21]:
# test for cosine similarity
# data1 = ['apple', 'qple', 'applr', 'aple', 'grape']

In [22]:
# tf_idf_matrix_test = vectorizer.fit_transform(data1)

In [23]:
# matches_test = awesome_cossim_top(tf_idf_matrix_test, tf_idf_matrix_test, 2, 0.8)

In [24]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

<a id="tfidf"></a>
## Matching with TF-IDF

In [24]:
# get_matches_df(matches_test, tf_idf_matrix_test)

In [181]:
# test combine 2 datasets and apply TF-IDF and cosine similarity approach
combined = clean_mini_registry.append(clean_license, sort=False)

In [182]:
company_names = combined['company_name'].values.astype('U')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [124]:
# print(tf_idf_matrix[0])
# company_names

In [184]:
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 2, 0.9)

In [185]:
matches_df = get_matches_df(matches, company_names, top=10000)

In [125]:
# matches_df[:10]

In [81]:
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches

In [29]:
# test dataset with size 100
mini_registry = registry[:100] # for testing purpose

# clean company_name and dba_name of clean datasets
ignore = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd']
clean_mini_registry = skool.clean_strings(mini_registry, ['company_name', 'dba_name'], True, ignore)
clean_license = skool.clean_strings(license, ['company_name'], True, ignore)

In [30]:
# test combine 2 datasets and apply TF-IDF and cosine similarity approach
combined = clean_mini_registry.append(clean_license, sort=False)

# transform to matrix
company_names = combined['company_name'].values.astype('U')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

# calculate cosine similarity
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 2, 0.9)

In [139]:
matches_df = get_matches_df(matches, company_names, top=1000)

In [135]:
# matches_df[-10:]

In [187]:
# last company name of clean_mini_registry
matches_df.loc[matches_df['left_side'] == 'advanced integration'] # this should be the cutting line of matrix mult

Unnamed: 0,left_side,right_side,similairity
142,advanced integration,advanced integration,1.0
143,advanced integration,advanced integration,1.0


In [188]:
matches_df = matches_df[:144]

In [189]:
matches_df[40:50]

Unnamed: 0,left_side,right_side,similairity
40,a m e mechanical,a m e mechanical,1.0
41,a m e mechanical,a m e mechanical,1.0
42,a g consulting engineering,a g consulting engineering,1.0
43,a l jackson company p a,a l jackson company p a,1.0
44,a lee cook hauling,a lee cook hauling,1.0
45,a lee cook hauling,a lee cook hauling,1.0
46,a m electric,a m electric,1.0
47,a m electric,d a m electric,0.911687
48,a m painting,a m painting,1.0
49,a c advisory,a c advisory,1.0


In [190]:
filter_matches = matches_df.duplicated(subset='left_side', keep='first') # all duplicates except their first occurrence will be marked as True

In [191]:
filter_matches = matches_df[filter_matches]

In [192]:
filter_matches

Unnamed: 0,left_side,right_side,similairity
1,119 degrees architects,119 degrees architects,1.0
8,24 hour cleaning services,24 hour cleaning service,0.984436
10,259 strategies,259 strategies,1.0
15,4u services,4u services,1.0
19,521 management,521 management,1.0
21,6 degrees consulting,6 degrees consulting,1.0
23,84 lumber company,84 lumber company,1.0
25,a a court reporting,a a court reporting,1.0
29,a c environmental services,a c environmental services,1.0
31,a i security,a i security,1.0


In [193]:
len(filter_matches)

45

In [194]:
len(filter_matches) == len(np.unique(filter_matches['left_side'])) # all unique

True

<a id="compare"></a>
## Comparing between fuzzy and TF-IDF

<a id="fuzz95"></a>
### Fuzzy match with score-cutoff 95 (best score-cutoff)

In [32]:
mini_registry = registry[:100]

# clean company_name and dba name
ignore = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore)
cleaned_mini_registry = skool.clean_strings(mini_registry, ['company_name', 'dba_name'], True, ignore)
cleaned_license = skool.clean_strings(license, ['company_name'], True, ignore)

cleaned_mini_registry = cleaned_mini_registry.drop_duplicates(subset='company_name', keep='first', inplace=False)
cleaned_license = cleaned_license.drop_duplicates(subset='company_name', keep='first', inplace=False)

In [78]:
print('Size of mini_registry data: ', len(cleaned_mini_registry))
print('Size of license data: ', len(cleaned_license))

Size of mini_registry data:  100
Size of license data:  194686


In [39]:
# fuzzy on company_name and dba_name with 95 score-cutoff
# match company_name and dba_name
import time
t1 = time.time()
merged = (
    skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_mini_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=95)
)
t = time.time() - t1
matched = merged.dropna(subset=['company_name_y'])

In [40]:
print('Fuzzywuzzy match: ', len(matched), 'out of 100')

Fuzzywuzzy match:  49 out of 100


In [41]:
print('Execution fuzzywuzzy match: ', t/60, ' minute')

Execution fuzzywuzzy match:  0.5492017984390258  minute


In [82]:
matched[['company_name_x', 'company_name_y', 'match_probability']]

Unnamed: 0,company_name_x,company_name_y,match_probability
0,119 degrees architects,119 degrees architects,1.0
5,22,22,1.0
7,24 hour cleaning services,24 hour cleaning service,0.98
8,259 strategies,259 strategies,1.0
12,4u services,4u services,1.0
15,521 management,521 management,1.0
16,6 degrees consulting,6 degrees consulting,1.0
17,84 lumber company,84 lumber company,1.0
18,a a court reporting,a a court reporting,1.0
21,a c environmental services,a c environmental services,1.0


<a id="tfidf90"></a>
### TF-IDF with score-cutoff 90

In [33]:
# TF-IDF on company_name with 90 score-cutoff
combined = cleaned_mini_registry.append(cleaned_license, sort=False)

# transform to matrix
from sklearn.feature_extraction.text import TfidfVectorizer

company_names = combined['company_name'].values.astype('U')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

# calculate cosine similarity
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 2, 0.9)
t = time.time() - t1

In [34]:
print('Execution time for cossim 90: ', t/60, ' minute')

Execution time for cossim 90:  1.5641103386878967  minute


In [35]:
matches_df = get_matches_df(matches, company_names, top=1000)

In [36]:
matches

<194830x194830 sparse matrix of type '<class 'numpy.float64'>'
	with 200648 stored elements in Compressed Sparse Row format>

In [119]:
# last company name of clean_mini_registry
last_index = len(cleaned_mini_registry) - 1
last_company_name = cleaned_mini_registry.loc[last_index, 'company_name']
last_company_index_to_consider = matches_df.loc[matches_df['left_side'] == last_company_name].index.tolist()[-1] # this should be the cutting line of matrix mult

In [120]:
matches_df = matches_df[:last_company_index_to_consider + 1]

filter_matches = matches_df.duplicated(subset='left_side', keep='first') # all duplicates except their first occurrence will be marked as True

filter_matches = matches_df[filter_matches]

print('TF-IDF match: ', len(filter_matches), ' out of ', len(cleaned_mini_registry))

TF-IDF match:  45  out of  100


In [121]:
filter_matches

Unnamed: 0,left_side,right_side,similairity
1,119 degrees architects,119 degrees architects,1.0
8,24 hour cleaning services,24 hour cleaning service,0.98448
10,259 strategies,259 strategies,1.0
15,4u services,4u services,1.0
19,521 management,521 management,1.0
21,6 degrees consulting,6 degrees consulting,1.0
23,84 lumber company,84 lumber company,1.0
25,a a court reporting,a a court reporting,1.0
29,a c environmental services,a c environmental services,1.0
31,a i security,a i security,1.0


In [122]:
# Explore different companies result between fuzzy match(matched) and TDIDF match(filter_matches)
fuzzy_matched = matched['company_name_x']
tfidf_matched = filter_matches['left_side']
pd.concat([fuzzy_matched, tfidf_matched]).drop_duplicates(keep=False)

5                                 22
25                    a bob s towing
35                      a m painting
40               a v rental services
58                  ac s contractors
89                   adg enterprises
58     aaaa office warehouse surplus
125        adept consulting services
dtype: object

<a id="tfidf80"></a>
### TF-IDF with different score-cutoff 80

In [88]:
# TF-IDF on company_name with 80 score-cutoff
combined = cleaned_mini_registry.append(cleaned_license, sort=False)

# transform to matrix
from sklearn.feature_extraction.text import TfidfVectorizer

company_names = combined['company_name'].values.astype('U')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

# calculate cosine similarity
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 2, 0.8)
t = time.time() - t1

In [89]:
print('Time for calculating cosine similarity: ', t/60, ' minute')

Time for calculating cosine similarity:  1.6015166680018107  minute


In [90]:
matches_df = get_matches_df(matches, company_names, top=1000)

In [91]:
# last company name of clean_mini_registry
last_index = len(cleaned_mini_registry) - 1
last_company_name = cleaned_mini_registry.loc[last_index, 'company_name']
last_company_index_to_consider = matches_df.loc[matches_df['left_side'] == last_company_name].index.tolist()[-1] # this should be the cutting line of matrix mult

In [92]:
matches_df = matches_df[:last_company_index_to_consider + 1]

filter_matches = matches_df.duplicated(subset='left_side', keep='first') # all duplicates except their first occurrence will be marked as True

filter_matches = matches_df[filter_matches]

print('TF-IDF match: ', len(filter_matches), ' out of ', len(cleaned_mini_registry))

TF-IDF match:  56  out of  100


In [93]:
filter_matches

Unnamed: 0,left_side,right_side,similairity
1,119 degrees architects,119 degrees architects,1.0
8,24 hour cleaning services,24 hour cleaning service,0.98448
10,259 strategies,259 strategies,1.0
15,4u services,4u services,1.0
19,521 management,521 management,1.0
21,6 degrees consulting,6 degrees consulting,1.0
23,84 lumber company,84 lumber company,1.0
25,a a court reporting,a a court reporting,1.0
29,a c environmental services,a c environmental services,1.0
31,a i security,a i security,1.0


<a id="tfidf85"></a>
### TF-IDF with different score-cutoff 85

In [31]:
# TF-IDF on company_name with 80 score-cutoff
combined = cleaned_mini_registry.append(cleaned_license, sort=False)

# transform to matrix
from sklearn.feature_extraction.text import TfidfVectorizer

company_names = combined['company_name'].values.astype('U')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

# calculate cosine similarity
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 2, 0.85)
t = time.time() - t1

NameError: name 'cleaned_mini_registry' is not defined

In [111]:
print('Execution time for cossim 85: ', t/60, ' minute')

Execution time for cossim 85:  1.6140947222709656  minute


In [112]:
matches_df = get_matches_df(matches, company_names, top=1000)

In [113]:
# last company name of clean_mini_registry
last_index = len(cleaned_mini_registry) - 1
last_company_name = cleaned_mini_registry.loc[last_index, 'company_name']
last_company_index_to_consider = matches_df.loc[matches_df['left_side'] == last_company_name].index.tolist()[-1] # this should be the cutting line of matrix mult

In [114]:
matches_df = matches_df[:last_company_index_to_consider + 1]

filter_matches = matches_df.duplicated(subset='left_side', keep='first') # all duplicates except their first occurrence will be marked as True

filter_matches = matches_df[filter_matches]

print('TF-IDF match: ', len(filter_matches), ' out of ', len(cleaned_mini_registry))

TF-IDF match:  46  out of  100


In [115]:
filter_matches

Unnamed: 0,left_side,right_side,similairity
1,119 degrees architects,119 degrees architects,1.0
8,24 hour cleaning services,24 hour cleaning service,0.98448
10,259 strategies,259 strategies,1.0
15,4u services,4u services,1.0
19,521 management,521 management,1.0
21,6 degrees consulting,6 degrees consulting,1.0
23,84 lumber company,84 lumber company,1.0
25,a a court reporting,a a court reporting,1.0
29,a c environmental services,a c environmental services,1.0
31,a i security,a i security,1.0


## Full Dataset with TF-IDF

In [10]:
all_company_names = pd.concat([registry['company_name'].dropna(), license['company_name'].dropna()]).unique()

In [11]:
len(all_company_names)

201279

In [25]:
# transform to matrix
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(all_company_names)

# calculate cosine similarity
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 2, 0.85)
t = time.time() - t1

In [26]:
print('Execution time for cossim 85: ', t, ' sec')

Execution time for cossim 85:  140.76658654212952  sec


In [27]:
matches

<201279x201279 sparse matrix of type '<class 'numpy.float64'>'
	with 214803 stored elements in Compressed Sparse Row format>

In [28]:
matches_df = get_matches_df(matches, all_company_names, top=1000000)

IndexError: index 214803 is out of bounds for axis 0 with size 214803

In [113]:
# last company name of clean_mini_registry
last_index = len(cleaned_mini_registry) - 1
last_company_name = cleaned_mini_registry.loc[last_index, 'company_name']
last_company_index_to_consider = matches_df.loc[matches_df['left_side'] == last_company_name].index.tolist()[-1] # this should be the cutting line of matrix mult

In [114]:
matches_df = matches_df[:last_company_index_to_consider + 1]

filter_matches = matches_df.duplicated(subset='left_side', keep='first') # all duplicates except their first occurrence will be marked as True

filter_matches = matches_df[filter_matches]

print('TF-IDF match: ', len(filter_matches), ' out of ', len(cleaned_mini_registry))

TF-IDF match:  46  out of  100


<a id="cossimsklarn"></a>
## Test Cosine Similary with sklearn

In [151]:
# from sklearn.metrics.pairwise import cosine_similarity
# t1 = time.time()
# cosine_similarity(tf_idf_matrix, tf_idf_matrix)
# t = time.time() - t1

In [94]:
############################################################

In [141]:
# TF-IDF on company_name with 80 score-cutoff
combined = cleaned_mini_registry.append(cleaned_license, sort=False)

# transform to matrix
from sklearn.feature_extraction.text import TfidfVectorizer

company_names = combined['company_name'].values.astype('U')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

# calculate cosine similarity
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85)
t = time.time() - t1

In [144]:
print('Execution time for cossim 90:', t, 'sec')

Execution time for cossim 90: 99.12980008125305 sec


In [149]:
# time for fuzzywuzzy for whole registry data

# clean company_name and dba name
ignore = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore)
cleaned_license = skool.clean_strings(license, ['company_name'], True, ignore)

cleaned_registry = cleaned_registry.drop_duplicates(subset='company_name', keep='first', inplace=False)

In [150]:
import time
t1 = time.time()
merged = (
    skool.fuzzy_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_mini_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=95)
)
t = time.time() - t1
matched = merged.dropna(subset=['company_name_y'])

print('Fuzzywuzzy match: ', len(matched), 'out of', len(cleaned_registry))

print('Execution fuzzywuzzy match: ', t/60, ' minute')

Fuzzywuzzy match:  1353 out of 3117
Execution fuzzywuzzy match:  14.572765370210012  minute


In [133]:
###########################################################

In [95]:
# def filterMatches(df):
#     for i in range(1, len(df)):
#         print(matches_df[i]['left_side'])

In [96]:
# filterMatches(matches_df)

In [97]:
# matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
# len(matches_df)

In [98]:
# matches_df.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\TFIDF.xlsx', index = None, header=True)

In [99]:
# mini_combined = clean_mini_registry.append(clean_license[:50], sort=False)

In [100]:
# company_names = mini_combined['company_name']
# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# tf_idf_matrix = vectorizer.fit_transform(company_names.values.astype('U'))

In [101]:
# matches = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

In [102]:
# matches_df = get_matches_df(matches, company_names) # subview error

In [103]:
# matches_df

In [104]:
# matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
# matches_df

## Comparing two different datasets using K nearst N

In [105]:
# Transform messy data into tdidf matrix

In [106]:
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.feature_extraction.text import TfidfVectorizer

# # clean data: clean_mini_registry
# clean_names = clean_mini_registry['company_name']
# # messy data: clean_license

# vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
# tfidf = vectorizer.fit_transform(clean_names)

# from sklearn.neighbors import NearestNeighbors
# nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

# messy_names = clean_license['company_name'].values

# ### matching query:
# def getNearestN(query):
#   queryTFIDF_ = vectorizer.transform(query)
#   distances, indices = nbrs.kneighbors(queryTFIDF_)
#   return distances, indices

# distances, indices = getNearestN(messy_names.astype('U')) # getting nearest n

# # creating df
# index = [i for i in range(5)]
# columns = ['score', 'clean', 'messy']
# matches = pd.DataFrame(index=index, columns=columns)

# for i,j in enumerate(indices):
#     j = j[0]
#     matches[j]['score'] = round(distances[i][0],2)
#     matches[j]['clean'] = clean_names.values[j][0][0]
#     matches[j]['messy'] = messy_names[i]

In [126]:
# matches[0][0] = 0

In [108]:
#matches[matches['Match confidence (lower is better)'] < 0.1]

In [109]:
# len(clean_names)