# TF-IDF

In [1]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import pandas as pd
import numpy as np

In [2]:
# import registry
registry = mwdsbe.load_registry() # geopandas df

In [3]:
registry.head()

Unnamed: 0_level_0,company_name,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,mailing_state,mailing_zip,certification_type,capability,local,out_of_state,location_standard,lat,lng,geometry
registry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,119 Degrees Architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,MBE,"NAICS 5413 Architectural, Engineering, and Rel...",True,False,1503 GREEN ST,39.964275,-75.163042,POINT (-75.16304190105227 39.96427495800303)
1,12Bravo Group,,JEFFREY,YEKENCHIK,236 McKendimen Road,Medford Lakes,NJ,8055.0,236 McKendimen Road,Medford Lakes,NJ,8055.0,DSBE,"Addition, alteration and renovation for-sale b...",False,True,,,,
2,1st Choice Financial Group,ProVisio,Kathrina,Nease,133 N. 21st Street,Camp Hill,PA,17011.0,133 N. 21st Street,Camp Hill,PA,17011.0,WBE,NAICS 928120 International Affairs,False,False,,,,
3,212 Harakawa Inc.,Two Twelve,Ann,Harakawa,"236 W 27th Street, Suite 802",New York,NY,10001.0,"236 W 27th Street, Suite 802",New York,NY,10001.0,MWBE,Graphic Design Services; Graphic design servic...,False,True,,,,
4,215 Media Solutions,,Dewain,Johnson,810 Felton Avenue,Sharon Hill,PA,19079.0,810 Felton Avenue,Sharon Hill,PA,19079.0,MBE,NAICS 5414 Specialized Design Services ; NAICS...,False,False,,,,


In [5]:
# import license data
license = licenses.CommercialActivityLicenses.download()

In [6]:
license.head()

Unnamed: 0,license_num,issue_date,license_status,company_name
0,188053,1990-01-01T00:00:00Z,Active,BIRMINGHAM FIRE INS CO OF PA T
1,58781,1990-01-12T00:00:00Z,Active,CLAYMAN EDWARD P ESQ
2,57406,1990-01-23T00:00:00Z,Active,BRENNAN J F
3,332055,1990-02-08T00:00:00Z,Active,TASTY BAKING COMPANY
4,188058,1990-12-31T00:00:00Z,Active,MAGARGEE BROS INC


In [6]:
mini_registry = registry[:5]

In [7]:
# clean company_name and dba_name of clean datasets
ignore = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd']
registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore)
license = skool.clean_strings(license, ['company_name'], True, ignore)

In [8]:
from ftfy import fix_text

ModuleNotFoundError: No module named 'ftfy'

In [12]:
import re

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [17]:
ngrams(registry['company_name'].iloc[0])

['119',
 '19 ',
 '9 d',
 ' de',
 'deg',
 'egr',
 'gre',
 'ree',
 'ees',
 'es ',
 's a',
 ' ar',
 'arc',
 'rch',
 'chi',
 'hit',
 'ite',
 'tec',
 'ect',
 'cts']

In [9]:
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

In [10]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

company_names = registry['company_name'].iloc[:100]
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [14]:
tf_idf_matrix

<100x852 sparse matrix of type '<class 'numpy.float64'>'
	with 1740 stored elements in Compressed Sparse Row format>

In [15]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [16]:
all_company_names = pd.concat([registry['company_name'].dropna(), license['company_name'].dropna()]).unique()

In [17]:
len(all_company_names)

196713

In [18]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(all_company_names)

In [19]:
import time
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 102.27375102043152


In [20]:
matches

<196713x196713 sparse matrix of type '<class 'numpy.float64'>'
	with 214256 stored elements in Compressed Sparse Row format>

In [21]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similarity': similairity})

In [22]:
matches_df = get_matches_df(matches, all_company_names, top=100000)

In [23]:
pd.options.display.max_rows = 999

In [26]:
matched = matches_df.loc[(matches_df['similarity'] < 0.99999) & (matches_df['similarity'] > 0.94)]

In [27]:
len(matched)

768