In [1]:
import os
import pandas as pd
from thefuzz import process, fuzz
from typing import List, Tuple, Dict
from tqdm import tqdm

tqdm.pandas()

In [2]:
# params
focus_weight=0.8

### Preprocess function

In [3]:
to_remove_words_list = [
    "inc",
    "inc.",
    "incorporated",
    "llc",
    "l.l.c.",
    "l.l.c",
    "co.",
    "securities",
    "security",
    "s.a.",
    "s.a",
    "l.p.",
    "plc",
    "ag",
    "s.a.s",
    "s.a.s.",
    "s.p.a.",
    "s.p.a",
    "sa",
    "ab",
    "trust",
    "limited",
    "ltd",
    "ltd.",
]

replace_dict = {'bank corporation': 'bank', 'bank corporate': 'bank', 'bank corp': 'bank'}


# functions
def preprocess(cur_txt: str, to_remove_words_list: List=to_remove_words_list, replace_dict: Dict=replace_dict) -> str:
    """
    Preprocess names before matching:
    1. remove commas
    2. replace by name
    3. remove words in to_remove_words_list

    Args:
        cur_txt (str): current name
        to_remove_words_list (List[srt]): list of words to remove

    Returns:
        str: preprocessed equity name
    """
    #* 0. lower the name
    cur_txt = cur_txt.lower()
    #* 1. replace comma
    cur_txt = cur_txt.replace(",", "")
    #* 2. replace by name
    for cur_key in replace_dict:
        if cur_key in cur_txt:
            cur_txt = cur_txt.replace(cur_key, replace_dict[cur_key])
            break
    #* 3. remove words in to_remove_words_list
    cur_txt = cur_txt.split(" ")
    result_str_list = [
        cur_word
        for cur_word in cur_txt
        if cur_word not in to_remove_words_list
    ]

    return " ".join(result_str_list).strip()

### Test

In [4]:
cur_path = 'data/04_splits/00.csv'

In [5]:


cur_target = pd.read_csv(cur_path)
cur_target.head()
cur_target = cur_target.iloc[:5, :]

In [6]:
search_df = pd.read_csv('data/02_primary/search.csv')
search_df['preprocessed'] = search_df['Full Name'].apply(preprocess)
search_df.head()

Unnamed: 0,Full Name,No.,preprocessed
0,1 MAJ Metalska,1,1 maj metalska
1,1 MAJ Zavrsni Rad Gradjevinarstvu,2,1 maj zavrsni rad gradjevinarstvu
2,1300 Smiles,3,1300 smiles
3,2 Park Street Trust,4,2 park street
4,25 Novembar Celinac,5,25 novembar celinac


In [7]:
search_df.columns

Index(['Full Name', 'No. ', 'preprocessed'], dtype='object')

In [8]:
def keep_fist_n(cur_txt, n=1):
    cur_txt_list = cur_txt.split(' ')
    return ' '.join(cur_txt_list[:n])

In [9]:
cur_target['preprocessed'] = cur_target['Target Name'].apply(preprocess)
cur_target.head()

Unnamed: 0,Target Name,Target No.,preprocessed
0,3I Group PLC,190,3i group
1,3I INFRASTRUCTURE PLC,12378,3i infrastructure
2,4finance S.A.,15299,4finance
3,50Hertz Transmission GmbH,7693,50hertz transmission gmbh
4,6PM HOLDINGS P.L.C.,15484,6pm holdings p.l.c.


In [10]:
def extract_func(cur_focus_txt: str, cur_txt, search_df):
    cur_df = search_df.copy()
    cur_df['cur_focus_search_score'] = cur_df['preprocessed'].apply(lambda x: fuzz.partial_ratio(cur_focus_txt, x))
    cur_df['cur_simple_score'] = cur_df['preprocessed'].apply(lambda x: fuzz.ratio(cur_txt, x))
    cur_df['final_score'] = cur_df['cur_focus_search_score'] * focus_weight + cur_df['cur_simple_score'] * (1 - focus_weight)
    cur_df = cur_df.sort_values('final_score', ascending=False)
    
    # return the first one
    return cur_focus_txt, cur_df.iloc[0]['Full Name'], cur_df.iloc[0]['No. '], cur_df.iloc[0]['final_score']

In [11]:
matched_df = cur_target.copy()
# match first one, first two, first three
cur_first_dict = {1: 'First', 2: 'Second', 3: 'Third'}
for cur_first_n in tqdm(range(1, 4)):
    # get target df
    cur_target_df = cur_target.copy()
    print(cur_target_df.shape)
    cur_target_df['cur_focus_target'] = cur_target_df['preprocessed'].apply(keep_fist_n, n=cur_first_n)
    # apply function
    cur_focus_txt_list = []
    cur_matched_name_list = []
    cur_matched_no_list = []
    cur_matched_score_list = []
    for _, cur_row in tqdm(cur_target_df.iterrows()):
        cur_focus_txt, cur_matched_name, cur_matched_no, cur_matched_score = extract_func(cur_row['cur_focus_target'], cur_row['preprocessed'], search_df)
        cur_focus_txt_list.append(cur_focus_txt)
        cur_matched_name_list.append(cur_matched_name)
        cur_matched_no_list.append(cur_matched_no)
        cur_matched_score_list.append(cur_matched_score)
    # add to matched df
    matched_df[f'{cur_first_dict[cur_first_n]} word'] = cur_focus_txt_list
    matched_df[f'Top matched name for {cur_first_dict[cur_first_n]} word'] = cur_matched_name_list
    matched_df[f'Top matched No. for {cur_first_dict[cur_first_n]} word'] = cur_matched_name_list
    matched_df[f'Matched score for {cur_first_dict[cur_first_n]} word'] = cur_matched_score_list

  0%|          | 0/3 [00:00<?, ?it/s]

(5, 3)


5it [00:19,  3.88s/it]
 33%|███▎      | 1/3 [00:19<00:38, 19.42s/it]

(5, 3)


5it [00:27,  5.42s/it]
 67%|██████▋   | 2/3 [00:46<00:23, 23.95s/it]

(5, 3)


5it [00:28,  5.75s/it]
100%|██████████| 3/3 [01:15<00:00, 25.10s/it]


In [12]:
matched_df

Unnamed: 0,Target Name,Target No.,preprocessed,First word,Top matched name for First word,Top matched No. for First word,Matched score for First word,Second word,Top matched name for Second word,Top matched No. for Second word,Matched score for Second word,Third word,Top matched name for Third word,Top matched No. for Third word,Matched score for Third word
0,3I Group PLC,190,3i group,3i,3I Infotech,3I Infotech,88.4,3i group,Securities Group,Securities Group,95.4,3i group,Securities Group,Securities Group,95.4
1,3I INFRASTRUCTURE PLC,12378,3i infrastructure,3i,3I Infotech,3I Infotech,91.4,3i infrastructure,A Infrastructure,A Infrastructure,93.4,3i infrastructure,A Infrastructure,A Infrastructure,93.4
2,4finance S.A.,15299,4finance,4finance,INA,INA,91.0,4finance,INA,INA,91.0,4finance,INA,INA,91.0
3,50Hertz Transmission GmbH,7693,50hertz transmission gmbh,50hertz,Hertz,Hertz,86.6,50hertz transmission,Hertz,Hertz,86.6,50hertz transmission gmbh,Hertz,Hertz,86.6
4,6PM HOLDINGS P.L.C.,15484,6pm holdings p.l.c.,6pm,6PM Holdings,6PM Holdings,95.4,6pm holdings,6PM Holdings,6PM Holdings,95.4,6pm holdings p.l.c.,6PM Holdings,6PM Holdings,95.4
