In [98]:
import pandas as pd
import numpy as np
import time
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danterangel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [99]:

str_replaces_list = [
    ['*',' '],
    ['.',' '],
    ['-',' '],
    ['-',' '],
    ['01 ',''],
    ['02 ',''],
    ['03 ',''],
    ['04 ',''],
    ['05 ',''],
    ['06 ',''],
    ['07 ',''],
    ['08 ',''],
    ['09 ',''],
    ['/', ' '],
    ['AMEX','American Express'],
    ['IRS','Internal Revenue Service'],
    ['SYNCB','SYNCHRONY'],
    ['JPMCB - CARD SERVICE','JPMCB - CARD'],
    ['JPMCB','JPMChase Bank'],
    ['JPM CB','JPMChase Bank'],
    ['5TH 3RD','Fifth third'],
    ['5/3','Fifth third'],
    ['1NEVADACU','One Nevada'],
    ['CBNA','Citibank, N.A.'],
    ['TOYOTA MOTOR CREDIT','Toyota Financial Services'],
    ['ADVANT','Advantage'],
    ['CAPONE','Capital one'],
    ['CRED','Credit'],
    ['Ins ', 'Insurance '],
    ['P & C', 'Property & Casualty'],
    ['P&C', 'Property & Casualty'],
    ['P& C', 'Property & Casualty'],
    ['PROP & CAS', 'PROPERTY & CASUALTY'],
    ['INS ', 'INSURANCE '],
    ['CORP ', 'CORPORATE '],
    ['Ins. Co.', 'Insurance Companty'],
    ['Cas.', 'Casuality'],
    ['CAS UALTY', 'CASUALTY'],
    ['CAS CO', 'CASUALITY COMPANY'],
    ['COUNTRY INS', 'COMPANY INSURANCE'],
    ['FBSD','1st Financial Bank'],
    ['121FCU','Financial Credit Union'],
    ['121 FCU','Financial Credit Union'],
    ['1st Comm CU','first Community credit union'],
    ['1ST COMM','first Community'],
    ['1ST FEDERAL','first federal'],
    ['1ST FINANCIAL BANK ','first financial bank']
]
#str_replaces_list = []
b_words = [
    'insurance',
    'insurances',
    'bank',
    'counties',
    'credit',
    'finance',
    'financial',
    'services',
    'service',
    'credit',
    'service',
    'corporate',
    'company',
    'casuality',
    'exchange',
    'association',
    'cooperative',
    'property'
]

b_words = stopwords.words('english') + b_words

In [100]:
def str_replace(string, str_replaces_list):
    for i in str_replaces_list:
        string = string.replace(i[0], i[1])
    return string.lower()

In [101]:
def ngrams(string, n=12):
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower() #make lower case
    string = str_replace(string,str_replaces_list)
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string) #remove the list of chars defined above
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single space
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [102]:
def train_vectorize(org_name_clean, unique_org,analyzer= 'char_wb'):
    t1 = time.time()
    print('=====================================================Vecorizing the data - this could take a few minutes for large datasets... =====================================================')
    vectorizer = TfidfVectorizer(min_df=1, analyzer=analyzer, stop_words=b_words)#, lowercase=False)
    tfidf = vectorizer.fit_transform(org_name_clean)
    t = time.time()-t1
    print("=====================  COMPLETED IN:", t)
    print('=====================  Vecorizing completed...')
    nbrs = NearestNeighbors(n_neighbors=NEIGHBORS, n_jobs=-1).fit(tfidf)
    query_tf_idf = vectorizer.transform(unique_org)
    distances, indices = nbrs.kneighbors(query_tf_idf)
    return distances, indices

In [103]:
def find_matches(matches,indices, distances, org_name_clean, unique_org, invert= True):
    
    t1 = time.time()
    print('=====================  finding matches...')
    for i,j in enumerate(indices):
        for k in range(0,len(j)):
            str1 = org_name_clean[j[k]].lower()
            str2 = unique_org[i].lower()
            similarity = fuzz.ratio(str1,str2)
            similarity_token_sort_ratio = fuzz.token_sort_ratio(str1,str2)
            if similarity > FILTER_SIMILARITY or similarity_token_sort_ratio > FILTER_SIMILARITY:
                if invert == True:
                    arcus_name = org_name_clean[j[k]]
                    quicken_name = unique_org[i]
                else:                    
                    arcus_name = unique_org[i]
                    quicken_name = org_name_clean[j[k]]
                temp = [round(distances[i][k],2), arcus_name , quicken_name, similarity, similarity_token_sort_ratio]
                matches.append(temp)

    t = time.time() - t1
    print("=====================  COMPLETED IN:", t)
    return matches

## Files Names

In [104]:
file = "data_all.xlsx"
file_quicken = 'full_data_quicken.xlsx'
FILTER_SIMILARITY = 55
NEIGHBORS = 40
ANALIZERS = [
    'char_wb',
    'char',
    'word',
    ngrams
]

## Get the Quicken Dataset

In [105]:
dataset = pd.read_excel(file_quicken, usecols=["Payable To"])
print(dataset.shape, 'dataset completo')
dataset = pd.DataFrame({'name': dataset['Payable To'].unique()})
dataset = dataset
print(dataset.shape, 'dataset con datos unicos')

(345610, 1) dataset completo
(21109, 1) dataset con datos unicos


In [106]:
dataset.head(5)

Unnamed: 0,name
0,SYNCB/PPC
1,CapitalOne
2,SYNCB/LOW
3,Goldenwest FCU
4,DISCOVER FIN SVCS LLC


## Get Arcus Dataset

In [107]:
payees = pd.read_excel(file,sheet_name='USA-Full List')
payees.dropna(subset=['Name'], inplace=True)
payees['name'] = payees['Name']
payees.drop('Name', axis='columns', inplace=True)
print(payees.shape, " dataset arcus completo")

(16398, 2)  dataset arcus completo


In [108]:
payees.head(5)

Unnamed: 0,Category,name
0,Airline,American Airlines
1,Airline,United Travel Card
2,Auto Finance,"Ally Financial, Inc."
3,Auto Finance,American Honda Finance Corp
4,Auto Finance,BMW Financial Services


## Copy Quicken Dataset and replace with our manual filter data and lower each string

In [109]:
dataset_client = dataset.copy()
dataset_client.name = dataset_client.apply(lambda row : str_replace(row['name'], str_replaces_list), axis = 1)
dataset_client = dataset_client.name

## Copy Arcus Dataset and replace with our manual filter data and lower each string

In [110]:
dataset_payees = payees.copy()
dataset_payees.name = dataset_payees.apply(lambda row : str_replace(row['name'], str_replaces_list), axis = 1)
dataset_payees = dataset_payees.name

### Match names Arcus vs Quicken with KNeighbors

In [111]:
clean_org_names = dataset_payees
org_name_clean = payees.name
matches = []



In [112]:
print("=====================  Match names Arcus vs Quicken with KNeighbors")
t1 = time.time()
unique_org = dataset_client # set used for increased performance
for i in ANALIZERS:
    distances, indices = train_vectorize(org_name_clean, unique_org, analyzer=i)
    unique_org = dataset.name
    matches = matches + find_matches(matches,indices, distances, org_name_clean, unique_org)
t = time.time() - t1
print("=====================  COMPLETED IN:", t)




In [113]:
print('Building data frame...')  
matches_df = pd.DataFrame(matches, columns=['match confidence(knn)','Arcus name','Quicken name', 'similarity', 'similarity_token_sort_ratio'])
print('Done')

Building data frame...
Done


In [114]:
matches_df.sample(20)

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
1202384,1.0,Service Finance Company,Hawks View Insurance,51,56
1018959,0.56,Village of Argenta,UNION GROVE VILLAGE,38,65
1274396,0.32,Southern California Edison,Southern Fidelity Ins Co,60,60
2329733,0.83,AAA Hawaii Insurance,AAA INSURANCE,76,79
139664,0.42,First Sentry Bank Loan,Bank of Travelers Rest,32,59
170152,0.63,Quality Water Solutions Inc,"Quicken Loans, Inc",58,45
3189956,0.43,Penn Treaty Network America Insurance Company,Great American E&S Insurance Company,73,64
1445791,0.38,United American Insurance,ERIE INSURANCE,63,67
2151271,0.53,Henry County Water,MCHENRY COUNTY,75,75
3106783,0.41,Russian Media Group,GERMANIA INS GROUP,59,54


### Match names Quicken vs Arcus with KNeighbors

In [115]:
clean_org_names = dataset_client
org_name_clean = dataset.name

In [116]:
print("=====================  Match names Arcus vs Quicken with KNeighbors")
t1 = time.time()
unique_org = dataset_payees
for i in ANALIZERS:
    distances, indices = train_vectorize(org_name_clean, unique_org, analyzer=i)
    unique_org = payees.name
    matches = matches + find_matches(matches,indices, distances, org_name_clean, unique_org, invert= False)
print("=====================  COMPLETED IN:", t)



In [117]:
print('Building data frame...')  
matches_df = pd.DataFrame(matches, columns=['match confidence(knn)','Arcus name','Quicken name', 'similarity', 'similarity_token_sort_ratio'])
print('Done')

Building data frame...
Done


In [127]:
matches_df.sample(20, random_state=42)

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
162432,0.31,Seattle Metropolitan Credit Union Loan,ATLANTA POSTAL CREDIT UNION,65,55
136651,0.33,Loyal American Insurance Co,Central Insurance Company,62,62
169250,0.48,Edison Tax Collector,MELBA KREADY TAX COLLECTOR,70,70
1080006,0.9,Old National Bank,NATIONAL FINANCE 120,59,59
1176894,0.59,Rural Mutual Ins.,WAWANESA MUTUAL INS COMPANY,55,56
83917,0.43,Freedom Federal Credit Union Loan,WELCOME FEDERAL CRED,60,60
1203566,1.09,Wolverine Mutual Insurance Co,Amica MUTUAL INSURANCE CO,78,70
5088726,0.45,Turnstone Homeowners Association,TEXAS WINDSTORM INSURANCE ASSOCIATION,58,58
1087781,1.16,Community Financial Services FCU,NASSAU FINANCIAL FCU,58,62
96593,0.35,Scient Federal Credit Union Loan,Heartland Credit Union,59,67


### Match names Quicken vs Arcus with KNeighbors

In [128]:
matches_df = matches_df.drop_duplicates(subset=['Arcus name', 'Quicken name'])
matches_df.shape

(417500, 5)

In [129]:
matches_df[matches_df['similarity_token_sort_ratio'] > 65]

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
0,0.31,Capital One,CapitalOne,95,95
2,0.45,Old West FCU,Goldenwest FCU,85,85
18,0.36,Brauns Capital One RS,CAPITAL ONE BANK USA N,60,79
43,0.00,Barclays Bank Delaware,BARCLAYS BANK DELAWARE,100,100
47,0.49,Barclays Bank Consumer Loans,BARCLAYS BANK DELAWARE,68,68
...,...,...,...,...,...
41740483,0.97,Water And Power Community Credit Union,OUR COMMUNITY CREDIT,66,69
41740485,0.99,Water And Power Community Credit Union,U of VA Community Credit Union,74,76
41740502,0.97,Westerly Community Credit Union Loan,HAPO COMMUNITY CREDIT UNION,75,76
41740509,1.00,Winslow Community FCU Loan,UTAH COMMUNITY FCU,64,68


In [157]:
capital_one =  matches_df[matches_df['Quicken name'] == 'CAPITAL ONE BANK USA N']
capital_one[capital_one['similarity_token_sort_ratio'] > 90]
capital_one.sort_values('similarity_token_sort_ratio', ascending=False)

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
18,0.36,Brauns Capital One RS,CAPITAL ONE BANK USA N,60,79
5121382,0.41,Amana Capital One RS,CAPITAL ONE BANK USA N,62,76
5121667,0.44,BNB RS CAPITAL ONE RS,CAPITAL ONE BANK USA N,60,74
1072771,1.06,BJ's Capital One RS,CAPITAL ONE BANK USA N,63,73
1072759,0.97,"Capital One, NA",CAPITAL ONE BANK USA N,76,72
...,...,...,...,...,...
359135,0.50,Pinnacle Bank Nebraska,CAPITAL ONE BANK USA N,59,50
17,0.35,American State Bank Loan,CAPITAL ONE BANK USA N,57,48
5063917,0.46,ACNB Bank Loan,CAPITAL ONE BANK USA N,56,44
5126334,0.43,Associated Bank Loan,CAPITAL ONE BANK USA N,57,43


In [131]:
capital_one = matches_df[matches_df['Quicken name'] == 'CAPITAL ONE']
capital_one[capital_one['similarity_token_sort_ratio'] > 85]
capital_one.sort_values('similarity_token_sort_ratio', ascending=False).drop_duplicates(subset=['Arcus name', 'Quicken name'])

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
345,0.0,Capital One,CAPITAL ONE,100,100
1073117,0.75,"Capital One, NA",CAPITAL ONE,85,88
20854479,0.78,Carpet One,CAPITAL ONE,76,76
20854155,0.87,BDP CAPITAL ONE RS,CAPITAL ONE,76,76
1073141,0.92,Bose Capital One RS,CAPITAL ONE,73,73
1073136,0.92,Sony Capital One RS,CAPITAL ONE,73,73
1073127,0.9,BJ's Capital One RS,CAPITAL ONE,73,73
1073119,0.82,York Capital One RS,CAPITAL ONE,73,73
1073118,0.8,VISA CAPITAL ONE RS,CAPITAL ONE,73,73
1073137,0.92,UBID CAPITAL ONE RS,CAPITAL ONE,73,73


In [132]:
matches_df[matches_df['Quicken name'] == 'BARCLAYS BANK DELAWA']

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
1478,0.14,Barclays Bank Delaware,BARCLAYS BANK DELAWA,95,95
1479,0.39,The Callaway Bank,BARCLAYS BANK DELAWA,49,59
1480,0.42,Baldwin State Bank Loan,BARCLAYS BANK DELAWA,56,47
1481,0.45,Brady National Bank Loan,BARCLAYS BANK DELAWA,55,59
1482,0.48,Badger Bank Loan,BARCLAYS BANK DELAWA,61,44
1483,0.49,Barclays Bank Consumer Loans,BARCLAYS BANK DELAWA,71,71
1484,0.49,Abby Bank Loan,BARCLAYS BANK DELAWA,59,41
1485,0.49,Dart Bank Loan,BARCLAYS BANK DELAWA,59,59
1486,0.5,Legacy Bank Loan,BARCLAYS BANK DELAWA,61,61
1487,0.51,Heartland Bank,BARCLAYS BANK DELAWA,53,59


In [133]:
matches_df[matches_df['Quicken name'] == 'AMERICAN HONDA FINANCE']

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
1545,0.25,American Honda Finance Corp,AMERICAN HONDA FINANCE,90,90
1546,0.37,American Finco,AMERICAN HONDA FINANCE,72,78
1547,0.39,American Eagle Financial Credit Union,AMERICAN HONDA FINANCE,61,61
1548,0.4,Mahindra Finance USA LLC,AMERICAN HONDA FINANCE,61,52
1549,0.41,Mariner Finance,AMERICAN HONDA FINANCE,65,49
1550,0.42,Old American Insurance,AMERICAN HONDA FINANCE,68,82
1551,0.43,Indiana American Water Company,AMERICAN HONDA FINANCE,49,58
1552,0.43,Life Insurance Company of North America,AMERICAN HONDA FINANCE,39,59
1553,0.44,American National Insurance Company,AMERICAN HONDA FINANCE,67,67
1554,0.45,Combined Insurance Company of America,AMERICAN HONDA FINANCE,41,58


In [134]:
matches_df[matches_df['Quicken name'] == 'BANK OF AMER']

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
16924,0.37,Bank of Commerce Loan,BANK OF AMER,67,61
16925,0.38,Bank of America - AFS,BANK OF AMER,73,77
16926,0.39,Bank of Commerce,BANK OF AMER,79,50
16927,0.4,First Bank of Berne,BANK OF AMER,65,45
16928,0.41,Bank of America Line of Credit,BANK OF AMER,57,57
16929,0.41,Bank of Oak Ridge,BANK OF AMER,69,48
16930,0.42,First American Bank,BANK OF AMER,39,71
16931,0.43,Bank of Early Loan,BANK OF AMER,67,60
16932,0.43,Bank of The Prairie Loan,BANK OF AMER,56,39
16933,0.44,Bank of New Madrid Loan,BANK OF AMER,57,46


In [135]:
matches_df[matches_df['Quicken name'] == 'Bank of America']

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
56595,0.31,First American Bank,Bank of America,53,82
56596,0.35,Bank of America - AFS,Bank of America,83,88
56597,0.35,Bank of America Line of Credit,Bank of America,67,67
56598,0.36,Bank of America Consumer Loans MA,Bank of America,62,62
56599,0.36,Bank of America Consumer Loans RI,Bank of America,62,62
56600,0.36,Bank of America Consumer Loans PA,Bank of America,62,62
56601,0.37,Bank of America Consumer Loans CT,Bank of America,62,62
56602,0.38,Bank of Commerce Loan,Bank of America,72,61
56603,0.38,Bank of America Consumer Loans ME,Bank of America,62,62
56604,0.4,Bank of America Consumer Loans NH,Bank of America,62,62


In [136]:
bank = matches_df[matches_df['Quicken name'] == 'BANK OF AMER']
bank.sort_values(by=['similarity_token_sort_ratio'], ascending=False)

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
16925,0.38,Bank of America - AFS,BANK OF AMER,73,77
16930,0.42,First American Bank,BANK OF AMER,39,71
375724,0.58,Bank of America Mortgage,BANK OF AMER,67,67
16924,0.37,Bank of Commerce Loan,BANK OF AMER,67,61
16931,0.43,Bank of Early Loan,BANK OF AMER,67,60
16935,0.45,Bank of Utah,BANK OF AMER,75,58
5073864,0.49,Bean Creek H.O.A,BANK OF AMER,50,57
16928,0.41,Bank of America Line of Credit,BANK OF AMER,57,57
1090115,0.76,New Amer Fund,BANK OF AMER,48,56
16934,0.45,American Bank of the North Loan,BANK OF AMER,47,56


In [137]:
amica_mutual = matches_df[matches_df['Arcus name'] == 'Amica Mutual Insurance Company']
amica_mutual.sort_values(by='similarity_token_sort_ratio', ascending=False)

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
143783,0.00,Amica Mutual Insurance Company,Amica Mutual INsurance Company,100,100
175302,0.00,Amica Mutual Insurance Company,AMICA MUTUAL INSURANCE COMPANY,100,100
38721,0.00,Amica Mutual Insurance Company,Amica Mutual Insurance Company,100,100
44697,0.25,Amica Mutual Insurance Company,Alfa Mutual Insurance Company,92,92
161722,0.25,Amica Mutual Insurance Company,ALFA MUTUAL INSURANCE COMPANY,92,92
...,...,...,...,...,...
119405,0.42,Amica Mutual Insurance Company,MUNICIPAL MUTUAL INS,60,52
138638,0.39,Amica Mutual Insurance Company,MUSCATINE MUTUAL INS ASS OCIATION,60,51
51703,0.48,Amica Mutual Insurance Company,WAWANESA MUTUAL INS CO,62,50
125017,0.46,Amica Mutual Insurance Company,Farm Bureau Casualty Insurance,57,50


In [138]:
matches_df[matches_df['Quicken name'] == 'Partners Federal CU']

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
106944,0.31,Partners Federal Credit Union,Partners Federal CU,79,75
106945,0.38,Apple Federal Credit Union,Partners Federal CU,58,49
106946,0.38,Star USA Federal Credit Union,Partners Federal CU,58,58
106947,0.38,Realtors Federal Credit Union,Partners Federal CU,62,58
106948,0.38,APL Federal Credit Union,Partners Federal CU,56,51
106949,0.4,Strata Federal Credit Union,Partners Federal CU,57,57
106950,0.4,Altra Federal Credit Union,Partners Federal CU,62,49
106951,0.4,Fortera Federal Credit Union,Partners Federal CU,64,60
106952,0.41,CP Federal Credit Union,Partners Federal CU,57,52
106953,0.41,Harvesters Federal Credit Union Loan,Partners Federal CU,62,58


In [139]:
len(matches_df['Quicken name'].unique())

19341

In [140]:
len(matches_df['Quicken name'])

417500

In [141]:
len(matches_df['Arcus name'].unique())

14933

In [142]:
len(matches_df['Arcus name'])

417500

In [143]:
result = payees.merge(matches_df, left_on='name', right_on='Arcus name', how='inner')

In [144]:
amica_mutual = result[result['Arcus name'] == 'Amica Mutual Insurance Company']
amica_mutual.sort_values(by='similarity_token_sort_ratio', ascending=False)

Unnamed: 0,Category,name,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
216840,Insurance,Amica Mutual Insurance Company,0.00,Amica Mutual Insurance Company,Amica Mutual INsurance Company,100,100
216896,Insurance,Amica Mutual Insurance Company,0.00,Amica Mutual Insurance Company,AMICA MUTUAL INSURANCE COMPANY,100,100
216678,Insurance,Amica Mutual Insurance Company,0.00,Amica Mutual Insurance Company,Amica Mutual Insurance Company,100,100
216686,Insurance,Amica Mutual Insurance Company,0.25,Amica Mutual Insurance Company,Alfa Mutual Insurance Company,92,92
216872,Insurance,Amica Mutual Insurance Company,0.25,Amica Mutual Insurance Company,ALFA MUTUAL INSURANCE COMPANY,92,92
...,...,...,...,...,...,...,...
216806,Insurance,Amica Mutual Insurance Company,0.42,Amica Mutual Insurance Company,MUNICIPAL MUTUAL INS,60,52
216833,Insurance,Amica Mutual Insurance Company,0.39,Amica Mutual Insurance Company,MUSCATINE MUTUAL INS ASS OCIATION,60,51
216699,Insurance,Amica Mutual Insurance Company,0.48,Amica Mutual Insurance Company,WAWANESA MUTUAL INS CO,62,50
216814,Insurance,Amica Mutual Insurance Company,0.46,Amica Mutual Insurance Company,Farm Bureau Casualty Insurance,57,50


In [145]:
print(len(matches_df['Quicken name'].unique()), ' quicken unique')
print(len(matches_df['Arcus name'].unique()), ' arcus unique')
print(matches_df.shape, ' all data')

19341  quicken unique
14933  arcus unique
(417500, 5)  all data


In [146]:
print(len(matches_df['Quicken name'].unique()), ' quicken unique')
print(len(matches_df['Arcus name'].unique()), ' arcus unique')
print(matches_df.shape, ' all data')

19341  quicken unique
14933  arcus unique
(417500, 5)  all data


In [147]:
sim_m_75 = matches_df[matches_df['similarity'] > 60]

In [148]:
len(sim_m_75['Quicken name'].unique())

16139

In [149]:
sim_a_75 = matches_df[(matches_df['match confidence(knn)'] < 1) & (matches_df['match confidence(knn)'] > 0 )]

In [150]:
sim_a_75.sort_values(by=['match confidence(knn)', 'similarity'], ascending=True)

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
70474,0.07,New Jersey Manufacturers Insurance Company,NEW JERSEY MANUFACTURES INSURANCE COMPANY,99,99
143165,0.07,Michigan Basic Property Insurance Assoc,Michigan Basic Property Insurance ASSc,99,99
137424,0.08,Indiana Farm Bureau Insurance,Indian Farm Bureau Insurance,98,98
154770,0.08,Xcel Federal Credit Union,EXCEL Federal Credit Union,98,78
175188,0.08,Universal Property & Casualty,Universal Property & Casualy,98,98
...,...,...,...,...,...
1173545,0.99,Harris County MUD 188,HARRIS COUNTY MUD 153,90,90
1173546,0.99,Harris County MUD 127,HARRIS COUNTY MUD 153,90,90
1173547,0.99,Harris County MUD 261,HARRIS COUNTY MUD 153,90,90
1173549,0.99,Harris County MUD 162,HARRIS COUNTY MUD 153,90,90


In [151]:
matches_df.to_excel("matched_names_tf_idf.xlsx", engine="xlsxwriter", index=False)
matches_df.to_csv("matched_names_tf_idf.csv", index=False)

In [152]:
matches_df.shape

(417500, 5)

In [153]:
print(len(matches_df['Quicken name'].unique()), ' quicken unique')
print(len(matches_df['Arcus name'].unique()), ' arcus unique')
print(matches_df.shape, ' all data')

19341  quicken unique
14933  arcus unique
(417500, 5)  all data


In [154]:
nNames = len(matches_df['Quicken name'].unique())
print("Total dataset", dataset_client.shape[0])
print("Total Names found ", nNames)
print("Percent : ", 100 * nNames / dataset_client.shape[0], '%')

Total dataset 21109
Total Names found  19341
Percent :  91.62442560045479 %
