In [None]:
import pandas as pd
import numpy as np

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer

from config.config import THRESHOLDS, RAW_DIR, MAX_WORKERS
from config.profiler import Profiler
from utils.io import export_data_for_validation
from utils.format import normalize_text
from scripts.datahandler import get_data

In [None]:
# Shared state read by calculate_best_match(). Declared up front so the
# function can be defined before the data is loaded; every name is rebound
# to its real value in a later cell.
match_norm_series = match_original_series = None
data_original_series = data_vectors = None
vectorizer = None

In [None]:
def calculate_best_match(idx):
    """Find the DATA row with the highest token-set Jaccard similarity to one MATCH row.

    Reads the module-level globals ``match_original_series``,
    ``match_norm_series``, ``data_original_series``, ``data_vectors`` and
    ``vectorizer``; all of them must be populated before this is called.

    Parameters
    ----------
    idx : int
        Positional (``.iloc``) index of the MATCH row to score.

    Returns
    -------
    dict
        ``MATCH``            - the original MATCH text at ``idx``.
        ``BEST_FOUND_MATCH`` - the original DATA text of the top-scoring row,
                               or ``None`` when the normalised MATCH text is
                               empty or there are no DATA rows.
        ``TRUE_MATCH``       - always ``None`` in this function.
        ``BEST_MATCH``       - the top Jaccard score, or ``None`` in the same
                               cases as ``BEST_FOUND_MATCH``.
    """
    match_original = match_original_series.iloc[idx]
    query_text = match_norm_series.iloc[idx].strip()

    no_match = {
        "MATCH": match_original,
        "BEST_FOUND_MATCH": None,
        "TRUE_MATCH": None,
        "BEST_MATCH": None,
    }
    if not query_text:
        return no_match

    match_vector = vectorizer.transform([query_text])

    # transform() silently drops tokens that are not in the fitted vocabulary,
    # so match_vector.sum() under-counts the query's tokens and would inflate
    # the score. Count the distinct tokens with the vectorizer's own analyzer
    # so out-of-vocabulary tokens still enlarge the union.
    query_token_count = len(set(vectorizer.build_analyzer()(query_text)))

    # np.asarray(...).ravel() accepts both the np.matrix returned by sparse
    # matrices and the ndarray returned by sparse arrays.
    # NOTE: the row sums equal token-set sizes only for a binary vectorizer,
    # which is how this notebook configures it.
    intersection = np.asarray(data_vectors.multiply(match_vector).sum(axis=1)).ravel()
    data_token_counts = np.asarray(data_vectors.sum(axis=1)).ravel()
    union = data_token_counts + query_token_count - intersection

    # Without any DATA rows there is nothing to take a maximum over.
    if intersection.size == 0:
        return no_match

    # An empty DATA row paired with an analyzable-token-free query gives 0/0;
    # treat that as similarity 0 instead of NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        jaccard_similarities = np.nan_to_num(intersection / union)

    # argmax returns the first row on ties, including the all-zero case.
    best_match_idx = int(jaccard_similarities.argmax())

    return {
        "MATCH": match_original,
        "BEST_FOUND_MATCH": data_original_series.iloc[best_match_idx],
        "TRUE_MATCH": None,
        "BEST_MATCH": jaccard_similarities[best_match_idx],
    }

In [None]:
# Run configuration: which column of each frame is compared and which raw
# file supplies the MATCH side. TEST_FLAG is passed straight to get_data().
TEST_FLAG = False
DATA_COLUMN = MATCH_COLUMN = "STREET"
MATCH_FILE_NAME = "847ff2869e9fa08110422d98fe15553a1931fc8d59876977b3ab0f45.csv"
MATCH_FILE = RAW_DIR.joinpath(MATCH_FILE_NAME)

DATA, MATCH = get_data(TEST_FLAG, MATCH_FILE)

# calculate_best_match() addresses rows by position, so give both frames a
# clean 0..n-1 index (in place, discarding the old index).
for _frame in (DATA, MATCH):
    _frame.reset_index(drop=True, inplace=True)
del _frame

In [None]:
# File specific code
...
# Build the comparison column as "<STREET_NAME> <STREET_NO>".
DATA = DATA.assign(
    STREET=lambda frame: frame["STREET_NAME"] + " " + frame["STREET_NO"]
)
DATA

In [None]:
# Replace missing values with "" before normalising. The filled column is
# assigned back instead of calling fillna(inplace=True) on DATA[col]: that
# chained in-place form warns in pandas 2.x and has no effect under
# Copy-on-Write, which would let NaN reach normalize_text().
DATA[DATA_COLUMN] = DATA[DATA_COLUMN].fillna("")
data_original_series = DATA[DATA_COLUMN]
data_norm_series = data_original_series.apply(normalize_text)

MATCH[MATCH_COLUMN] = MATCH[MATCH_COLUMN].fillna("")
match_original_series = MATCH[MATCH_COLUMN]
match_norm_series = match_original_series.apply(normalize_text)

In [None]:
# One task per MATCH row: these index labels are handed to
# calculate_best_match() by perform_matching().
indices = MATCH.index.to_list()

# binary=True records token presence rather than counts, so row sums are
# token-set sizes.
vectorizer = CountVectorizer(binary=True)

In [None]:
# Learn the vocabulary from the normalised DATA text, then encode the same
# text once; the encoded rows are reused for every MATCH row.
data_vectors = vectorizer.fit(data_norm_series).transform(data_norm_series)

In [None]:
def perform_matching():
    """Score every entry of ``indices`` concurrently and return the results in input order.

    Submits one ``calculate_best_match(idx)`` task per entry of the
    module-level ``indices`` list to a thread pool of ``MAX_WORKERS`` workers.

    Returns
    -------
    list[dict]
        One result dict per successfully scored index, ordered as in
        ``indices``. Ordering by submission position rather than completion
        order keeps the output identical from run to run. An index whose task
        raised is reported on stdout and left out of the list.
    """
    completed = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Remember each task's submission position so the original order can
        # be restored after as_completed() yields in completion order.
        futures = {
            executor.submit(calculate_best_match, idx): (position, idx)
            for position, idx in enumerate(indices)
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="Matching", unit="match"):
            position, idx = futures[future]
            try:
                completed[position] = future.result()
            except Exception as e:
                # Best effort: one failing row must not abort the whole run.
                print(f"Exception occurred for index {idx}: {e}")
    return [completed[position] for position in sorted(completed)]

In [None]:
# Profile the whole matching run; the profile name is the first 13
# characters of the match file name plus "_jaccard".
prof = Profiler(
    name=f"{MATCH_FILE_NAME[:13]}_jaccard",
)

prof.enable()
results = perform_matching()
prof.disable()

prof.save_show_profile()

In [None]:
# Tabulate the per-row results and flag those whose best score reaches the
# configured Jaccard threshold, then hand the frame to the validation export.
results_df = pd.DataFrame(results)
meets_threshold = results_df["BEST_MATCH"].ge(THRESHOLDS.jaccard)
results_df["BEST_MATCH_BINARY"] = meets_threshold

export_data_for_validation(
    results_df,
    MATCH_FILE_NAME,
    DATA_COLUMN,
    "JACCARD",
)