In [None]:
import pandas as pd
import ray
from name_matching.name_matcher import NameMatcher

Initialize Ray

In [None]:
# Start Ray for this session. ignore_reinit_error=True lets this cell be
# re-run in a live kernel without raising because Ray is already initialised.
ray.init(ignore_reinit_error=True)

Load some dummy data

In [None]:
# Read both CSV files, using the first column of each file as the row index.
adjusted_names = pd.read_csv('../test/adjusted_test_names.csv', index_col=0)
test_names = pd.read_csv('../test/test_names.csv', index_col=0)

# Preview the first five rows of each frame.
display(adjusted_names.head(5), test_names.head(5))

Initialize the NameMatcher object (the company_name column to match on is specified later, when the data is loaded)

In [None]:
# Build the matcher; every option is spelled out explicitly as a keyword
# argument so the configuration is visible in one place.
matcher = NameMatcher(
    ngrams=(2, 5),
    top_n=10,
    number_of_rows=500,
    number_of_matches=3,
    lowercase=True,
    punctuations=True,
    remove_ascii=True,
    legal_suffixes=False,
    common_words=False,
    preprocess_split=False,
    verbose=False,
)

Set the desired string matching metrics

In [None]:
# Register the string-distance metrics the matcher should use, one per line.
matcher.set_distance_metrics([
    'iterative_sub_string',
    'pearson_ii',
    'bag',
    'fuzzy_wuzzy_partial_string',
    'editex',
])

Load the main part of the data that should be matched

In [None]:
# Hand the test_names frame to the matcher, pointing it at the
# 'company_name' column.
matcher.load_and_process_master_data(
    'company_name',
    test_names,
    transform=True,
)

Define a remote function that runs the name matching

In [None]:
@ray.remote
def match_name_parallel(adjusted_names, matcher):
    """Match one batch of names as a Ray remote task.

    Parameters
    ----------
    adjusted_names
        The rows to be matched; passed to ``matcher.match_names`` as
        ``to_be_matched``.
    matcher
        The matcher object whose ``match_names`` method performs the
        matching.

    Returns
    -------
    Whatever ``matcher.match_names`` returns for this batch, matched on the
    'company_name' column.
    """
    return matcher.match_names(
        to_be_matched=adjusted_names,
        column_matching='company_name',
    )

Split the names into chunks of 100 rows and collect the remote function calls in a list

In [None]:
# Number of rows handed to each remote task.
CHUNK_SIZE = 100

# Put the matcher into Ray's object store once. Passing it by value to every
# .remote() call would make Ray serialise and store a separate copy per chunk.
matcher_ref = ray.put(matcher)

# One pending remote call (object reference) per chunk of rows. Ray resolves
# matcher_ref to the matcher object before the task body runs; .iloc makes the
# positional row slicing explicit.
results = [
    match_name_parallel.remote(
        adjusted_names.iloc[start:start + CHUNK_SIZE],
        matcher_ref,
    )
    for start in range(0, len(adjusted_names), CHUNK_SIZE)
]

Get the results once all of the workers have finished their work

In [None]:
# Fetch the output of every remote call, then stack the per-chunk results
# into a single object.
chunk_matches = ray.get(results)
matches = pd.concat(chunk_matches)

Recombine the data

In [None]:
# Step 1: attach the match results to test_names by aligning the two indexes.
names_with_matches = test_names.merge(
    matches,
    how='left',
    left_index=True,
    right_index=True,
)

# Step 2: pull in the adjusted_names row whose index equals match_index_0;
# overlapping column names coming from adjusted_names get a '_matched' suffix.
# NOTE(review): the matching above used test_names as the master data and
# adjusted_names as the rows to be matched. Confirm that match_index_0 really
# refers to adjusted_names rows (and that matches shares its index with
# test_names) rather than the other way round.
complete_matched_data = names_with_matches.merge(
    adjusted_names,
    how='left',
    left_on='match_index_0',
    right_index=True,
    suffixes=['', '_matched'],
)
complete_matched_data