In [None]:
#%pip install "splink<4"  # this notebook uses the Splink 3 API (splink.duckdb.* modules, DuckDBLinker)


In [None]:
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_level_library as cll
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.blocking_rule_library as brl
from splink.comparison import Comparison
from splink.duckdb.linker import DuckDBLinker

import os
import warnings

# Silence DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# NOTE(review): `spark` is not defined anywhere in this notebook; presumably it
# is injected by the hosting notebook runtime — confirm. Nothing else visible in
# this file uses Spark (the linker below is DuckDB-based).
spark.sparkContext.setCheckpointDir("Files/tmp_checkpoints")

# Directory that receives the HTML report written later in the notebook.
os.makedirs("Files/TempReports", exist_ok=True)

In [None]:
from splink.datasets import splink_datasets

# Load the bundled `historical_50k` demo dataset as the input records.
df = splink_datasets.historical_50k

# Preview the first 1000 rows (the last expression of the cell is rendered).
df.head(1000)

In [None]:
# Blocking rules that generate the candidate pairs scored at prediction time.
prediction_blocking_rules = [
    brl.block_on(["substr(dob,1,4)", "postcode_fake"]),
    brl.block_on(["dob", "lower(substr(first_name,1,1))"]),
    brl.block_on(["lower(surname)", "postcode_fake"]),
    brl.block_on(["lower(surname)", "lower(first_name)"]),
]

# first_name and surname use exactly the same fuzzy-name configuration.
name_comparisons = [
    ctl.name_comparison(
        name_column,
        set_to_lowercase=True,
        include_exact_match_level=True,
        damerau_levenshtein_thresholds=[1, 2],
        jaro_winkler_thresholds=[0.9, 0.8],
        term_frequency_adjustments=False,
    )
    for name_column in ("first_name", "surname")
]

# Date-of-birth comparison; thresholds and metrics are passed as parallel
# lists (1 "month", 1 "year").
dob_comparison = ctl.date_comparison(
    "dob",
    cast_strings_to_date=True,
    invalid_dates_as_null=True,
    levenshtein_thresholds=[2],
    damerau_levenshtein_thresholds=[],
    datediff_thresholds=[1, 1],
    datediff_metrics=["month", "year"],
)

# Splink settings dictionary for a single-dataset deduplication job.
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
    "comparisons": [
        *name_comparisons,
        dob_comparison,
        cl.exact_match("birth_place"),
        ctl.postcode_comparison("postcode_fake", set_to_lowercase=True),
        cl.exact_match("gender"),
        cl.exact_match("occupation"),
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
    # Currently disabled option, kept for reference:
    # "additional_columns_to_retain": ["Street","Locality","Town","County"]
    "em_convergence": 0.001,
}

In [None]:
# Build the DuckDB-backed linker from the input frame and the settings above.
linker = DuckDBLinker(df, settings)

# Columns / SQL expressions whose value distributions are profiled.
columns_to_profile = ["first_name", "surname", "postcode_fake", "substr(dob, 1,4)"]
linker.profile_columns(columns_to_profile, top_n=10, bottom_n=5)
    

In [None]:
# Chart the cumulative number of comparisons produced by the blocking rules
# configured in `settings` (rendered as the cell output).
linker.cumulative_num_comparisons_from_blocking_rules_chart()

In [None]:
# SQL conditions over the left (l) and right (r) record of a pair. Both require
# equal first name and surname, plus either equal dob or equal postcode.
deterministic_rules = [
    "l.first_name = r.first_name and l.surname = r.surname and l.dob = r.dob",
    "l.first_name = r.first_name and l.surname = r.surname and l.postcode_fake = r.postcode_fake",
]

# Estimate the prior probability that two random records match, passing the
# rules above together with a recall of 0.70.
linker.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.70,
)

In [None]:
# Estimate the u parameters by random sampling, with max_pairs set to 1e7.
linker.estimate_u_using_random_sampling(max_pairs=1e7)

In [None]:
# Estimate the m parameters from the "cluster" column of the input data.
# NOTE(review): presumably "cluster" holds a ground-truth entity id — confirm.
linker.estimate_m_from_label_column("cluster")

In [None]:
# Run four EM training sessions in sequence, each blocking on a different
# three-column combination.
em_training_column_sets = (
    ["surname", "first_name", "dob"],
    ["surname", "first_name", "postcode_fake"],
    ["first_name", "postcode_fake", "dob"],
    ["surname", "postcode_fake", "dob"],
)

em_sessions = []
for em_training_columns in em_training_column_sets:
    # Same variable name as before so it still holds the last rule afterwards.
    training_blocking_rule = brl.block_on(em_training_columns)
    em_sessions.append(
        linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
    )

# Keep the individual session objects available under their original names.
training_session_1, training_session_2, training_session_3, training_session_4 = em_sessions

In [None]:
# One further EM training session, this time blocking on first name and
# occupation. (There is no training_session_5 in this notebook.)
training_blocking_rule = brl.block_on(["first_name", "occupation"])
training_session_6 = linker.estimate_parameters_using_expectation_maximisation(
    training_blocking_rule
)

In [None]:
# Render the match weights chart for the trained model as the cell output.
linker.match_weights_chart()

In [None]:
# Run pairwise prediction with a match-probability threshold of 0.5.
results = linker.predict(threshold_match_probability=0.5)

In [None]:
# Show the predictions as a pandas DataFrame, fetched with limit=1000.
display(results.as_pandas_dataframe(limit=1000))

In [None]:

# Write the comparison viewer dashboard to an HTML file (overwriting any
# previous report) and render that file inline in the notebook.
dashboard_path = "Files/TempReports/comparisons.html"
linker.comparison_viewer_dashboard(results, dashboard_path, overwrite=True)

# Read the report inside a `with` block so the file handle is always closed;
# the previous bare open() call left it open for the life of the kernel.
with open(dashboard_path, "r") as f:
    displayHTML(f.read())


In [None]:
# Cluster the pairwise predictions using a match-probability threshold of 0.9.
clusters = linker.cluster_pairwise_predictions_at_threshold(results, threshold_match_probability=0.9)

In [None]:
# Fetch the cluster table with limit=1000, then order those rows by cluster_id
# for display (the sort is applied after the limit).
clusters_preview = clusters.as_pandas_dataframe(limit=1000)
display(clusters_preview.sort_values('cluster_id'))