In [None]:
import re

import pandas as pd

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

from config.config import THRESHOLDS, RAW_DIR, MAX_WORKERS
from config.profiler import Profiler
from utils.io import export_data_for_validation
from utils.format import normalize_text
from scripts.datahandler import get_data

In [None]:
# Module-level placeholders for the series read by the matching functions.
# They start as None and are bound to real data in a later cell.
match_norm_series = match_original_series = None
data_norm_series = data_original_series = None

In [None]:
def regex_matching(s1_clean, s2_series):
    """Flag the entries of ``s2_series`` that contain ``s1_clean``.

    The search is case-insensitive and literal: ``s1_clean`` is escaped, so
    regex metacharacters in it carry no special meaning.

    Args:
        s1_clean: Search string.
        s2_series: pandas Series of strings to search in.

    Returns:
        Boolean Series aligned with ``s2_series``. An empty ``s1_clean``
        matches nothing, and missing entries of ``s2_series`` are ``False``.
    """
    if not s1_clean:
        # An empty pattern is found in every string, which would flag every
        # row as a match; treat "nothing to search for" as "no match".
        return pd.Series(False, index=s2_series.index, dtype=bool)

    pattern = re.compile(re.escape(s1_clean), re.IGNORECASE)
    # na=False keeps the mask strictly boolean when entries are missing.
    return s2_series.str.contains(pattern, na=False)


def reverse_regex_matching(s1_clean, s2_series):
    """Flag the entries of ``s2_series`` that occur inside ``s1_clean``.

    Each entry is escaped and searched for in ``s1_clean`` ignoring case,
    i.e. this is the reverse direction of ``regex_matching``.

    Args:
        s1_clean: String that is searched in.
        s2_series: pandas Series of candidate substrings.

    Returns:
        Series aligned with ``s2_series`` holding ``True`` where the entry is
        a non-empty string found in ``s1_clean`` and ``False`` otherwise.
    """

    def _occurs_in_s1(s2):
        # An empty candidate compiles to an empty pattern, which is found in
        # every string and would produce a false positive; missing or
        # non-string candidates cannot be escaped at all.
        if not isinstance(s2, str) or not s2:
            return False
        return re.search(re.escape(s2), s1_clean, re.IGNORECASE) is not None

    return s2_series.apply(_occurs_in_s1)


def calculate_best_match(idx):
    """Look up the first data entry that matches the MATCH row at ``idx``.

    The normalized MATCH value is compared with every normalized data value
    in both directions (MATCH value contained in data value, and data value
    contained in MATCH value). The first data row hit by either test is
    reported with its original (non-normalized) text.

    Args:
        idx: Positional index into the module-level MATCH series.

    Returns:
        dict with the keys ``MATCH`` (original MATCH value),
        ``BEST_FOUND_MATCH`` (first matching original data value or ``None``),
        ``TRUE_MATCH`` (always ``None`` here) and ``BEST_MATCH`` (``True`` if
        any data row matched, else ``False``).
    """
    original_value = match_original_series.iloc[idx]
    needle = match_norm_series.iloc[idx]

    forward_hits = regex_matching(needle, data_norm_series)
    reverse_hits = reverse_regex_matching(needle, data_norm_series)
    candidates = data_original_series[forward_hits | reverse_hits]

    found = not candidates.empty
    return {
        "MATCH": original_value,
        "BEST_FOUND_MATCH": candidates.iloc[0] if found else None,
        "TRUE_MATCH": None,
        "BEST_MATCH": found
    }

In [None]:
# Run configuration for this notebook.
TEST_FLAG = False  # forwarded to get_data; its effect is defined there
DATA_COLUMN = "STREET"  # column of DATA that is compared
MATCH_COLUMN = "STREET"  # column of MATCH that is compared
MATCH_FILE_NAME = "847ff2869e9fa08110422d98fe15553a1931fc8d59876977b3ab0f45.csv"
MATCH_FILE = RAW_DIR.joinpath(MATCH_FILE_NAME)

# Load both tables; get_data receives the test flag and the match-file path.
DATA, MATCH = get_data(TEST_FLAG, MATCH_FILE)

# Discard the loaded index and renumber rows 0..n-1 in place, so that row
# labels and row positions coincide.
DATA.reset_index(drop=True, inplace=True)
MATCH.reset_index(drop=True, inplace=True)

In [None]:
# File-specific preprocessing: derive the STREET column for this data source.
...
# STREET is STREET_NAME and STREET_NO joined by a single space.
# NOTE(review): assuming both are string columns, a missing value in either
# part makes the whole STREET value missing — confirm dtypes and intent.
DATA = DATA.assign(STREET=DATA["STREET_NAME"] + " " + DATA["STREET_NO"])

In [None]:
# Replace missing values by rebuilding the column rather than calling
# ``fillna(..., inplace=True)`` on ``DATA[DATA_COLUMN]``: that chained in-place
# call works on an intermediate object, emits a FutureWarning in pandas 2.x
# and leaves the frame unchanged under Copy-on-Write.
DATA = DATA.assign(**{DATA_COLUMN: DATA[DATA_COLUMN].fillna("")})
data_original_series = DATA[DATA_COLUMN]
data_norm_series = data_original_series.apply(normalize_text)

# Same treatment for the MATCH side.
MATCH = MATCH.assign(**{MATCH_COLUMN: MATCH[MATCH_COLUMN].fillna("")})
match_original_series = MATCH[MATCH_COLUMN]
match_norm_series = match_original_series.apply(normalize_text)

In [None]:
# Row positions of MATCH (0 .. len(MATCH) - 1). calculate_best_match looks
# rows up with ``.iloc``, so positions are generated directly instead of
# reusing index labels, which only equal positions for a fresh RangeIndex.
indices = list(range(len(MATCH)))

In [None]:
def perform_matching():
    """Run ``calculate_best_match`` for every entry of ``indices`` on a thread pool.

    Returns:
        list: One result dict per successfully processed index, in the same
        order as ``indices``. An index whose task raised is reported on
        stdout and left out of the list.
    """
    completed = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Remember each task's submission position so the input order can be
        # restored once all tasks have finished.
        futures = {
            executor.submit(calculate_best_match, idx): (position, idx)
            for position, idx in enumerate(indices)
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="Matching", unit="match"):
            position, idx = futures[future]
            try:
                completed[position] = future.result()
            except Exception as e:
                print(f"Exception occurred for index {idx}: {e}")

    # as_completed yields futures in completion order, which varies between
    # runs; sorting by submission position makes the output reproducible.
    return [completed[position] for position in sorted(completed)]

In [None]:
# Profile the matching run; the profile name is the first 13 characters of
# the match file name plus a "_regex" suffix.
prof = Profiler(name=f"{MATCH_FILE_NAME[:13]}_regex")
prof.enable()
try:
    results = perform_matching()
finally:
    # Stop profiling even if matching fails or is interrupted, so the
    # profiler does not stay enabled in the kernel.
    prof.disable()
prof.save_show_profile()

In [None]:
# Tabulate the per-row results, add the thresholded flag and hand the table
# to the validation export.
results_df = pd.DataFrame(results).assign(
    BEST_MATCH_BINARY=lambda frame: frame["BEST_MATCH"] >= THRESHOLDS.regex
)
export_data_for_validation(results_df, MATCH_FILE_NAME, DATA_COLUMN, "REGEX")