In [19]:
import pandas as pd
import numpy as np
import time
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danterangel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:

str_replaces_list = [
    ['*',' '],
    ['.',' '],
    ['-',' '],
    ['-',' '],
    ['01 ',''],
    ['02 ',''],
    ['03 ',''],
    ['04 ',''],
    ['05 ',''],
    ['06 ',''],
    ['07 ',''],
    ['08 ',''],
    ['09 ',''],
    ['/', ' '],
    ['AMEX','American Express'],
    ['IRS','Internal Revenue Service'],
    ['SYNCB','SYNCHRONY'],
    ['JPMCB - CARD SERVICE','JPMCB - CARD'],
    ['JPMCB','JPMChase Bank'],
    ['JPM CB','JPMChase Bank'],
    ['5TH 3RD','Fifth third'],
    ['5/3','Fifth third'],
    ['1NEVADACU','One Nevada'],
    ['CBNA','Citibank, N.A.'],
    ['TOYOTA MOTOR CREDIT','Toyota Financial Services'],
    ['ADVANT','Advantage'],
    ['CAPONE','Capital one'],
    ['CRED','Credit'],
    ['Ins ', 'Insurance '],
    ['P & C', 'Property & Casualty'],
    ['P&C', 'Property & Casualty'],
    ['P& C', 'Property & Casualty'],
    ['PROP & CAS', 'PROPERTY & CASUALTY'],
    ['INS ', 'INSURANCE '],
    ['CORP ', 'CORPORATE '],
    ['Ins. Co.', 'Insurance Companty'],
    ['Cas.', 'Casuality'],
    ['CAS UALTY', 'CASUALTY'],
    ['CAS CO', 'CASUALITY COMPANY'],
    ['COUNTRY INS', 'COMPANY INSURANCE'],
    ['FBSD','1st Financial Bank'],
    ['121FCU','Financial Credit Union'],
    ['121 FCU','Financial Credit Union'],
    ['1st Comm CU','first Community credit union'],
    ['1ST COMM','first Community'],
    ['1ST FEDERAL','first federal'],
    ['1ST FINANCIAL BANK ','first financial bank']
]
#str_replaces_list = []
b_words = [
    'insurance',
    'insurances',
    'bank',
    'counties',
    'credit',
    'finance',
    'financial',
    'services',
    'service',
    'credit',
    'service',
    'corporate',
    'company',
    'casuality',
    'exchange',
    'association',
    'cooperative',
    'property'
]

b_words = stopwords.words('english') + b_words

In [21]:
def str_replace(string, str_replaces_list):
    for i in str_replaces_list:
        string = string.replace(i[0], i[1])
    return string.lower()

In [22]:
def ngrams(string, n=12):
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower() #make lower case
    string = str_replace(string,str_replaces_list)
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string) #remove the list of chars defined above
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single space
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [23]:
def train_vectorize(org_name_clean, unique_org,analyzer= 'char_wb'):
    t1 = time.time()
    print('=====================================================Vecorizing the data - this could take a few minutes for large datasets... =====================================================')
    vectorizer = TfidfVectorizer(min_df=1, analyzer=analyzer, stop_words=b_words)#, lowercase=False)
    tfidf = vectorizer.fit_transform(org_name_clean)
    t = time.time()-t1
    print("=====================  COMPLETED IN:", t)
    print('=====================  Vecorizing completed...')
    nbrs = NearestNeighbors(n_neighbors=NEIGHBORS, n_jobs=-1).fit(tfidf)
    query_tf_idf = vectorizer.transform(unique_org)
    distances, indices = nbrs.kneighbors(query_tf_idf)
    return distances, indices

In [24]:
def find_matches(matches,indices, distances, org_name_clean, unique_org, invert= True):
    
    t1 = time.time()
    print('=====================  finding matches...')
    for i,j in enumerate(indices):
        for k in range(0,len(j)):
            str1 = org_name_clean[j[k]].lower()
            str2 = unique_org[i].lower()
            similarity = fuzz.ratio(str1,str2)
            similarity_token_sort_ratio = fuzz.token_sort_ratio(str1,str2)
            if similarity > FILTER_SIMILARITY or similarity_token_sort_ratio > FILTER_SIMILARITY:
                if invert == True:
                    arcus_name = org_name_clean[j[k]]
                    quicken_name = unique_org[i]
                else:                    
                    arcus_name = unique_org[i]
                    quicken_name = org_name_clean[j[k]]
                temp = [round(distances[i][k],2), arcus_name , quicken_name, similarity, similarity_token_sort_ratio]
                matches.append(temp)

    t = time.time() - t1
    print("=====================  COMPLETED IN:", t)
    return matches

## Files Names

In [25]:
file = "data_all.xlsx"
file_quicken = 'full_data_quicken.xlsx'
FILTER_SIMILARITY = 55
NEIGHBORS = 40
ANALIZERS = [
    'char_wb',
    'char',
    'word',
    ngrams
]

## Get the Quicken Dataset

In [26]:
dataset = pd.read_excel(file_quicken, usecols=["Payable To"])
print(dataset.shape, 'dataset completo')
dataset = pd.DataFrame({'name': dataset['Payable To'].unique()})
dataset = dataset
print(dataset.shape, 'dataset con datos unicos')

(345610, 1) dataset completo
(21109, 1) dataset con datos unicos


In [27]:
dataset.head(5)

Unnamed: 0,name
0,SYNCB/PPC
1,CapitalOne
2,SYNCB/LOW
3,Goldenwest FCU
4,DISCOVER FIN SVCS LLC


## Get Arcus Dataset

In [28]:
payees = pd.read_excel(file,sheet_name='USA-Full List')
payees.dropna(subset=['Name'], inplace=True)
payees['name'] = payees['Name']
payees.drop('Name', axis='columns', inplace=True)
print(payees.shape, " dataset arcus completo")

(16398, 2)  dataset arcus completo


In [29]:
payees.head(5)

Unnamed: 0,Category,name
0,Airline,American Airlines
1,Airline,United Travel Card
2,Auto Finance,"Ally Financial, Inc."
3,Auto Finance,American Honda Finance Corp
4,Auto Finance,BMW Financial Services


## Copy Quicken Dataset and replace with our manual filter data and lower each string

In [30]:
dataset_client = dataset.copy()
dataset_client.name = dataset_client.apply(lambda row : str_replace(row['name'], str_replaces_list), axis = 1)
dataset_client = dataset_client.name

## Copy Arcus Dataset and replace with our manual filter data and lower each string

In [31]:
dataset_payees = payees.copy()
dataset_payees.name = dataset_payees.apply(lambda row : str_replace(row['name'], str_replaces_list), axis = 1)
dataset_payees = dataset_payees.name

### Match names Arcus vs Quicken with KNeighbors

In [32]:
clean_org_names = dataset_payees
org_name_clean = payees.name
matches = []

In [33]:
print("=====================  Match names Arcus vs Quicken with KNeighbors")
t1 = time.time()
unique_org = dataset_client # set used for increased performance
for i in ANALIZERS:
    distances, indices = train_vectorize(org_name_clean, unique_org, analyzer=i)
    unique_org = dataset.name
    matches = matches + find_matches(matches,indices, distances, org_name_clean, unique_org)
t = time.time() - t1
print("=====================  COMPLETED IN:", t)




In [34]:
print('Building data frame...')  
matches_df = pd.DataFrame(matches, columns=['match confidence(knn)','Arcus name','Quicken name', 'similarity', 'similarity_token_sort_ratio'])
print('Done')

Building data frame...
Done


In [35]:
matches_df.sample(20)

Unnamed: 0,match confidence(knn),Arcus name,Quicken name,similarity,similarity_token_sort_ratio
4817298,0.59,Sanitary Garbage Company,Grange Property & Casualty Company,52,57
1928347,0.28,Rogue Federal Credit Union,Energy One Federal Credit Union,84,81
152549,0.41,Family Security Credit Union,University of Kentucky Federal Credit Union,54,56
4814016,0.52,Polam FCU,CAP COMM FCU,67,38
3386725,0.52,Lennox Employees Credit Union,PUBLIX EMPLOYEES FED C,63,63
816780,0.37,Coastal Federal Credit Union,HANSCOM FEDERAL CRED,62,62
4538637,0.4,Clermont County Treasurer,Outagamie County Treasurer,75,71
70321,0.46,Charter Township of Canton,CALN TOWNSHIP,56,62
2360117,1.21,First Financial Federal Credit Union,TOPLINE FEDERAL CRD UN,62,59
3927287,0.36,Erie County Water Authority,WRIGHT COUNTY AUDITOR/ TREASURER,58,66


### Match names Quicken vs Arcus with KNeighbors

In [36]:
clean_org_names = dataset_client
org_name_clean = dataset.name

In [None]:
print("=====================  Match names Arcus vs Quicken with KNeighbors")
t1 = time.time()
unique_org = dataset_payees
for i in ANALIZERS:
    distances, indices = train_vectorize(org_name_clean, unique_org, analyzer=i)
    unique_org = payees.name
    matches = matches + find_matches(matches,indices, distances, org_name_clean, unique_org, invert= False)
print("=====================  COMPLETED IN:", t)



In [None]:
print('Building data frame...')  
matches_df = pd.DataFrame(matches, columns=['match confidence(knn)','Arcus name','Quicken name', 'similarity', 'similarity_token_sort_ratio'])
print('Done')

In [None]:
matches_df.sample(20, random_state=42)

### Match names Quicken vs Arcus with KNeighbors

In [None]:
matches_df = matches_df.drop_duplicates(subset=['Arcus name', 'Quicken name'])
matches_df.shape

In [None]:
matches_df[matches_df['similarity_token_sort_ratio'] > 65]

In [None]:
capital_one =  matches_df[matches_df['Quicken name'] == 'CAPITAL ONE BANK USA N']
capital_one[capital_one['similarity_token_sort_ratio'] > 90]
capital_one.sort_values('similarity_token_sort_ratio', ascending=False)

In [None]:
capital_one = matches_df[matches_df['Quicken name'] == 'CAPITAL ONE']
capital_one[capital_one['similarity_token_sort_ratio'] > 85]
capital_one.sort_values('similarity_token_sort_ratio', ascending=False).drop_duplicates(subset=['Arcus name', 'Quicken name'])

In [None]:
matches_df[matches_df['Quicken name'] == 'BARCLAYS BANK DELAWA']

In [None]:
matches_df[matches_df['Quicken name'] == 'AMERICAN HONDA FINANCE']

In [None]:
matches_df[matches_df['Quicken name'] == 'BANK OF AMER']

In [None]:
matches_df[matches_df['Quicken name'] == 'Bank of America']

In [None]:
bank = matches_df[matches_df['Quicken name'] == 'BANK OF AMER']
bank.sort_values(by=['similarity_token_sort_ratio'], ascending=False)

In [None]:
amica_mutual = matches_df[matches_df['Arcus name'] == 'Amica Mutual Insurance Company']
amica_mutual.sort_values(by='similarity_token_sort_ratio', ascending=False)

In [None]:
matches_df[matches_df['Quicken name'] == 'Partners Federal CU']

In [None]:
len(matches_df['Quicken name'].unique())

In [None]:
len(matches_df['Quicken name'])

In [None]:
len(matches_df['Arcus name'].unique())

In [None]:
len(matches_df['Arcus name'])

In [None]:
result = payees.merge(matches_df, left_on='name', right_on='Arcus name', how='inner')

In [None]:
amica_mutual = result[result['Arcus name'] == 'Amica Mutual Insurance Company']
amica_mutual.sort_values(by='similarity_token_sort_ratio', ascending=False)

In [None]:
print(len(matches_df['Quicken name'].unique()), ' quicken unique')
print(len(matches_df['Arcus name'].unique()), ' arcus unique')
print(matches_df.shape, ' all data')

In [None]:
print(len(matches_df['Quicken name'].unique()), ' quicken unique')
print(len(matches_df['Arcus name'].unique()), ' arcus unique')
print(matches_df.shape, ' all data')

In [None]:
sim_m_75 = matches_df[matches_df['similarity'] > 60]

In [None]:
len(sim_m_75['Quicken name'].unique())

In [None]:
sim_a_75 = matches_df[(matches_df['match confidence(knn)'] < 1) & (matches_df['match confidence(knn)'] > 0 )]

In [None]:
sim_a_75.sort_values(by=['match confidence(knn)', 'similarity'], ascending=True)

In [None]:
matches_df.to_excel("matched_names_tf_idf.xlsx", engine="xlsxwriter", index=False)
matches_df.to_csv("matched_names_tf_idf.csv", index=False)

In [None]:
matches_df.shape

In [None]:
print(len(matches_df['Quicken name'].unique()), ' quicken unique')
print(len(matches_df['Arcus name'].unique()), ' arcus unique')
print(matches_df.shape, ' all data')

In [None]:
nNames = len(matches_df['Quicken name'].unique())
print("Total dataset", dataset_client.shape[0])
print("Total Names found ", nNames)
print("Percent : ", 100 * nNames / dataset_client.shape[0], '%')