In [None]:
import pandas as pd
import numpy as np

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer

from config.config import THRESHOLDS, RAW_DIR, MAX_WORKERS
from config.profiler import Profiler
from utils.io import export_data_for_validation
from utils.format import normalize_text
from scripts.datahandler import get_data

In [None]:
# Shared state read by the worker function `calculate_best_match`. Each
# placeholder is rebound to real data by a later cell before matching runs.
match_ngrams_matrix = match_original_series = None
data_ngrams_matrix = data_original_series = None

# Character n-gram length; the Dice comparison here always uses bigrams.
n = 2

In [None]:
def dice_coefficient(match_vector, data_vectors):
    """Return the Dice coefficient between one n-gram count row and many rows.

    The overlap of two n-gram multisets is the sum of the element-wise
    *minimum* of their counts, so

        dice = 2 * sum(min(a, b)) / (sum(a) + sum(b))

    Multiplying the counts instead over-counts repeated n-grams and can push
    the score above 1 (e.g. two identical rows holding one n-gram three times
    would score 3.0), which is why the minimum is used here.

    Args:
        match_vector: sparse matrix of shape (1, n_features) with
            non-negative n-gram counts for the query text.
        data_vectors: sparse matrix of shape (n_rows, n_features) with
            non-negative n-gram counts for the candidate texts.

    Returns:
        1-D float ``numpy.ndarray`` of length ``n_rows``. Rows with no shared
        n-grams, and pairs in which both sides are empty, score 0.0.
    """
    match_counts = np.asarray(match_vector.toarray()).ravel()
    data_csr = data_vectors.tocsr()

    # Only columns where the query has a count can contribute to the overlap.
    active_cols = np.flatnonzero(match_counts)
    if active_cols.size == 0:
        return np.zeros(data_csr.shape[0], dtype=float)

    # Column-slice the candidates down to the query's n-grams, then clip each
    # stored count to the query's count for that n-gram (element-wise minimum).
    overlap = data_csr[:, active_cols].tocsr()
    overlap.sum_duplicates()  # one stored entry per (row, column) before clipping
    query_counts = match_counts[active_cols]
    overlap.data = np.minimum(overlap.data, query_counts[overlap.indices])

    intersection = np.asarray(overlap.sum(axis=1)).ravel()
    data_sizes = np.asarray(data_csr.sum(axis=1)).ravel()

    numerator = 2.0 * intersection
    denominator = match_counts.sum() + data_sizes

    with np.errstate(divide="ignore", invalid="ignore"):
        dice_coefficients = np.nan_to_num(numerator / denominator)

    return np.asarray(dice_coefficients, dtype=float).ravel()

def calculate_best_match(idx):
    """Score the query at position ``idx`` against every candidate row.

    Reads the module-level ``match_original_series`` / ``match_ngrams_matrix``
    for the query and ``data_original_series`` / ``data_ngrams_matrix`` for
    the candidates.

    Returns:
        dict with keys ``MATCH`` (original query text), ``BEST_FOUND_MATCH``
        (original text of the highest-scoring candidate), ``TRUE_MATCH``
        (always ``None`` here) and ``BEST_MATCH`` (that candidate's score).
        When the query vector has no stored n-grams, ``BEST_FOUND_MATCH`` and
        ``BEST_MATCH`` are ``None``.
    """
    query_text = match_original_series.iloc[idx]
    query_vector = match_ngrams_matrix[idx]

    record = {
        "MATCH": query_text,
        "BEST_FOUND_MATCH": None,
        "TRUE_MATCH": None,
        "BEST_MATCH": None,
    }

    # A query without any n-grams cannot be compared; keep the empty record.
    if query_vector.nnz != 0:
        scores = dice_coefficient(query_vector, data_ngrams_matrix)
        winner = scores.argmax()
        record["BEST_MATCH"] = scores[winner]
        record["BEST_FOUND_MATCH"] = data_original_series.iloc[winner]

    return record

In [None]:
# --- Run configuration -------------------------------------------------------
TEST_FLAG = False

# Column compared on each side; both frames use the same column name here.
DATA_COLUMN = MATCH_COLUMN = "FULLNAME"

# Input file holding the texts to be matched, resolved under RAW_DIR.
MATCH_FILE_NAME = "847ff2869e9fa08110422d98fe15553a1931fc8d59876977b3ab0f45.csv"
MATCH_FILE = RAW_DIR.joinpath(MATCH_FILE_NAME)

# --- Load ---------------------------------------------------------------------
DATA, MATCH = get_data(TEST_FLAG, MATCH_FILE)

# Drop the existing index labels so that row labels equal row positions; the
# matching step uses MATCH's labels as positions for `.iloc` and matrix rows.
MATCH.reset_index(drop=True, inplace=True)
DATA.reset_index(drop=True, inplace=True)

In [None]:
# File specific code
...
# DATA = DATA.assign(STREET=DATA["STREET_NAME"] + " " + DATA["STREET_NO"])
# Fill missing name parts before concatenating: `NaN + " " + "Smith"` is NaN,
# which would discard the whole FULLNAME of a row that lacks only one part.
# The final strip removes the separator left over when a part was missing.
DATA = DATA.assign(
    FULLNAME=(
        DATA["FIRSTNAME"].fillna("") + " " + DATA["LASTNAME"].fillna("")
    ).str.strip()
)

In [None]:
# Preview only the first rows: the frame holds personal names, and whatever is
# rendered here is stored in the saved notebook's output.
DATA.head()

In [None]:
# Missing values become "" so every row can be normalised and vectorised.
# The column is replaced through `assign` instead of
# `DATA[col].fillna("", inplace=True)`: that chained in-place call operates on
# a temporary Series, is deprecated in pandas 2.x, and leaves the frame
# unchanged once Copy-on-Write is active.
DATA = DATA.assign(**{DATA_COLUMN: DATA[DATA_COLUMN].fillna("")})
data_original_series = DATA[DATA_COLUMN]
data_norm_series = data_original_series.apply(normalize_text)

MATCH = MATCH.assign(**{MATCH_COLUMN: MATCH[MATCH_COLUMN].fillna("")})
match_original_series = MATCH[MATCH_COLUMN]
match_norm_series = match_original_series.apply(normalize_text)

In [None]:
# Row labels of MATCH; one matching task is submitted per label.
indices = MATCH.index.tolist()

# Character n-gram counter of fixed length `n`; binary=False keeps
# per-text occurrence counts instead of presence flags.
vectorizer = CountVectorizer(
    analyzer="char",
    ngram_range=(n, n),
    binary=False,
)

In [None]:
# Fit a single vocabulary on both corpora so that the two count matrices
# share the same columns.
all_texts = pd.concat([data_norm_series, match_norm_series])
vectorizer.fit(all_texts)

# Transform candidates first, then queries, with the shared vocabulary.
data_ngrams_matrix, match_ngrams_matrix = (
    vectorizer.transform(texts)
    for texts in (data_norm_series, match_norm_series)
)

In [None]:
def perform_matching():
    """Run ``calculate_best_match`` for every entry of ``indices`` in a thread pool.

    Futures are consumed as they complete so the progress bar advances
    smoothly, but the results are returned in the order of ``indices`` rather
    than in completion order; this keeps the output reproducible between runs.

    Returns:
        list of the dicts produced by ``calculate_best_match``, ordered like
        ``indices``. An index whose task raised is reported on stdout and
        omitted from the list.
    """
    results_by_idx = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(calculate_best_match, idx): idx for idx in indices}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Matching", unit="match"):
            idx = futures[future]
            try:
                results_by_idx[idx] = future.result()
            except Exception as e:
                # Best effort: report the failing index and keep going.
                print(f"Exception occurred for index {idx}: {e}")
    # Re-establish input order; failed indices are simply absent.
    return [results_by_idx[idx] for idx in indices if idx in results_by_idx]

In [None]:
# Profile the matching run; the profile name is the first 13 characters of the
# input file name plus a "_dice" suffix.
prof = Profiler(name=f"{MATCH_FILE_NAME[:13]}_dice")
prof.enable()
try:
    results = perform_matching()
finally:
    # Always stop profiling, even when matching raises or the cell is
    # interrupted, so the profiler is not left enabled in the running kernel.
    prof.disable()
prof.save_show_profile()

In [None]:
# Pin the expected columns so the frame has them even when `results` is empty.
results_df = pd.DataFrame(
    results,
    columns=["MATCH", "BEST_FOUND_MATCH", "TRUE_MATCH", "BEST_MATCH"],
)
# Coerce scores to numeric for the comparison only: a column made up solely of
# None (every query empty) is object-typed and cannot be compared with `>=`.
# Missing scores become NaN and therefore compare as False.
results_df["BEST_MATCH_BINARY"] = (
    pd.to_numeric(results_df["BEST_MATCH"], errors="coerce") >= THRESHOLDS.dice
)
export_data_for_validation(results_df, MATCH_FILE_NAME, DATA_COLUMN, "DICE")