In [1]:
# Uncomment and run this cell if you're running in Google Colab.
# !pip install ipywidgets
# !pip install splink
# !jupyter nbextension enable --py widgetsnbextension

In [8]:
import splink.comparison_library as cl
from splink.internals.testing import comparison_vector_value
from splink import DuckDBAPI

# Evaluate a single record pair against an exact-match comparison using the
# DuckDB backend.
db_api = DuckDBAPI()
comparison = cl.ExactMatch("first_name")

# Left as the trailing expression so the notebook renders the returned value.
comparison_vector_value(
    comparison,
    {"first_name_l": "Robin", "first_name_r": "Robin"},
    db_api,
)

{'comparison_vector_value': 1, 'label_for_charts': 'Exact match on first_name'}

In [4]:
import splink.comparison_library as cl
from splink.internals.testing import comparison_vector_value
from splink import DuckDBAPI
import ipywidgets as widgets
from IPython.display import display, HTML, Markdown

def create_comparison_playground(column_name):
    """Build an interactive widget for experimenting with Splink comparisons.

    The returned widget contains a dropdown listing comparison classes from
    the comparison library (``cl``) and two text boxes holding the left and
    right values. Whenever a control changes, the selected comparison is
    evaluated against the two values with ``comparison_vector_value`` on a
    DuckDB backend, and the result is rendered together with the comparison's
    docstring. Failures are rendered in the output area instead of being lost
    inside the widget callback.

    Args:
        column_name: Base name of the column being compared. Record keys are
            derived from it (``<column_name>_l`` / ``<column_name>_r``);
            comparisons that need more than one column use suffixed variants
            such as ``<column_name>_forename``.

    Returns:
        An ``ipywidgets.VBox`` holding the dropdown, both inputs and the
        output area.
    """
    # Standard library only; imported locally so the function is self-contained.
    from html import escape

    db_api = DuckDBAPI()

    comparison_types = [
        'ExactMatch', 'LevenshteinAtThresholds', 'JaroAtThresholds',
        'JaroWinklerAtThresholds', 'DamerauLevenshteinAtThresholds',
        'CosineSimilarityAtThresholds', 'JaccardAtThresholds',
        'AbsoluteDateDifferenceAtThresholds', 'AbsoluteTimeDifferenceAtThresholds',
        'ArrayIntersectAtSizes', 'DateOfBirthComparison', 'DistanceFunctionAtThresholds',
        'DistanceInKMAtThresholds', 'EmailComparison', 'ForenameSurnameComparison',
        'NameComparison', 'PostcodeComparison'
    ]

    # Example (left, right) inputs loaded into the text boxes when a comparison
    # type is selected.
    # NOTE(review): the cosine-similarity example uses plain strings; that
    # comparison may expect numeric arrays - confirm against the Splink docs.
    default_values = {
        'ExactMatch': ('john', 'jon'),
        'LevenshteinAtThresholds': ('smith', 'smyth'),
        'JaroAtThresholds': ('martha', 'matha'),
        'JaroWinklerAtThresholds': ('williams', 'willaims'),
        'DamerauLevenshteinAtThresholds': ('receive', 'recieve'),
        'CosineSimilarityAtThresholds': ('data science', 'science data'),
        'JaccardAtThresholds': ('python programming', 'programming python'),
        'AbsoluteDateDifferenceAtThresholds': ('2023-01-01', '2023-01-15'),
        'AbsoluteTimeDifferenceAtThresholds': ('12:00:00', '12:15:30'),
        'ArrayIntersectAtSizes': ('apple,banana,cherry', 'banana,cherry,date'),
        'DateOfBirthComparison': ('1990-05-15', '1990-05-16'),
        'DistanceFunctionAtThresholds': ('10', '15'),
        'DistanceInKMAtThresholds': ('51.5074,-0.1278', '51.5074,-0.1290'),
        'EmailComparison': ('john.doe@example.com', 'johndoe@example.com'),
        'ForenameSurnameComparison': ('John Doe', 'Jon Doe'),
        'NameComparison': ('Elizabeth Taylor', 'Elisabeth Taylor'),
        'PostcodeComparison': ('SW1A 1AA', 'SW1A 1AB')
    }

    # Prefer the __init__ docstring and fall back to the class docstring.
    docstrings = {}
    for comp_type in comparison_types:
        class_obj = getattr(cl, comp_type)
        init_doc = getattr(class_obj.__init__, '__doc__', None)
        docstrings[comp_type] = init_doc if init_doc else class_obj.__doc__

    # Derived column names for comparisons that read more than one column.
    forename_col = f"{column_name}_forename"
    surname_col = f"{column_name}_surname"
    lat_col = f"{column_name}_lat"
    long_col = f"{column_name}_long"

    def get_comparison(comp_type):
        """Instantiate the comparison class named ``comp_type``.

        Most classes only need the column name; the branches below supply the
        extra arguments the remaining constructors require.
        """
        if comp_type == 'DateOfBirthComparison':
            # The text box always yields strings, hence input_is_string=True.
            return cl.DateOfBirthComparison(column_name, input_is_string=True)
        if comp_type == 'ForenameSurnameComparison':
            return cl.ForenameSurnameComparison(forename_col, surname_col)
        if comp_type == 'DistanceInKMAtThresholds':
            return cl.DistanceInKMAtThresholds(lat_col, long_col, [1, 10, 100])
        if comp_type == 'AbsoluteDateDifferenceAtThresholds':
            return cl.AbsoluteDateDifferenceAtThresholds(
                column_name,
                input_is_string=True,
                metrics=["day", "month", "year"],
                thresholds=[1, 1, 1],
            )
        if comp_type == 'AbsoluteTimeDifferenceAtThresholds':
            # The example inputs are bare times, so the format is spelled out.
            return cl.AbsoluteTimeDifferenceAtThresholds(
                column_name,
                input_is_string=True,
                metrics=["minute", "hour"],
                thresholds=[1, 1],
                datetime_format="%H:%M:%S",
            )
        if comp_type == 'DistanceFunctionAtThresholds':
            # The constructor takes the name of a SQL function rather than a
            # Python callable; levenshtein is used here as an illustration.
            return cl.DistanceFunctionAtThresholds(
                column_name, "levenshtein", [1, 2], higher_is_more_similar=False
            )
        if comp_type == 'ArrayIntersectAtSizes':
            return cl.ArrayIntersectAtSizes(column_name, [1, 2, 3])
        return getattr(cl, comp_type)(column_name)

    def build_record(comp_type, left_value, right_value):
        """Turn the two text-box strings into the record dict for ``comp_type``."""
        record = {}
        for suffix, raw in (("l", left_value), ("r", right_value)):
            if comp_type == 'ForenameSurnameComparison':
                # "John Doe" -> forename "John", surname "Doe".
                forename, _, surname = raw.strip().partition(' ')
                record[f"{forename_col}_{suffix}"] = forename
                record[f"{surname_col}_{suffix}"] = surname.strip()
            elif comp_type == 'DistanceInKMAtThresholds':
                # "lat,long" -> two floats; malformed input raises ValueError,
                # which run_comparison reports in the output area.
                lat, lng = (float(part) for part in raw.split(','))
                record[f"{lat_col}_{suffix}"] = lat
                record[f"{long_col}_{suffix}"] = lng
            elif comp_type == 'ArrayIntersectAtSizes':
                record[f"{column_name}_{suffix}"] = raw.split(',')
            else:
                record[f"{column_name}_{suffix}"] = raw
        return record

    def run_comparison(change):
        """Evaluate the selected comparison and refresh the output area.

        ``change`` is the traitlets change payload; it is not inspected because
        the current widget values are read directly.
        """
        comp_type = comparison_select.value

        try:
            comparison = get_comparison(comp_type)
            record = build_record(comp_type, left_input.value, right_input.value)
            result = comparison_vector_value(comparison, record, db_api)
            html_output = f"""
        <h3>Comparison Result:</h3>
        <p><strong>Comparison type:</strong> {escape(comp_type)}</p>
        <p><strong>Comparison description:</strong> {escape(str(result['label_for_charts']))}</p>
        <p><strong>Comparison Vector Value:</strong> {escape(str(result['comparison_vector_value']))}</p>
        """
        except Exception as exc:
            # Deliberately broad: an error raised inside a widget callback is
            # not shown in the output area, so report it there instead.
            html_output = f"""
        <h3>Comparison failed:</h3>
        <p><strong>Comparison type:</strong> {escape(comp_type)}</p>
        <p><strong>{escape(type(exc).__name__)}:</strong> {escape(str(exc))}</p>
        """

        markdown_docstring = docstrings.get(comp_type) or "No docstring available"

        # wait=True defers the clear until new content arrives.
        output.clear_output(wait=True)
        with output:
            display(HTML(html_output))
            display(Markdown("### Comparison Function Docstring:"))
            display(Markdown(markdown_docstring))

    def on_comparison_change(change):
        """Load the example inputs for the newly selected type, then evaluate once."""
        left_default, right_default = default_values.get(change['new'], ('', ''))
        text_inputs = (left_input, right_input)
        # Detach the text observers while both boxes are updated so the
        # comparison is not evaluated against a half-updated pair.
        for text_input in text_inputs:
            text_input.unobserve(run_comparison, names='value')
        try:
            left_input.value = left_default
            right_input.value = right_default
        finally:
            for text_input in text_inputs:
                text_input.observe(run_comparison, names='value')
        run_comparison(change)

    comparison_select = widgets.Dropdown(
        options=comparison_types,
        value='ExactMatch',
        description='Comparison:',
    )
    left_input = widgets.Text(description=f"{column_name} Left:", value=default_values['ExactMatch'][0])
    right_input = widgets.Text(description=f"{column_name} Right:", value=default_values['ExactMatch'][1])
    output = widgets.Output()

    comparison_select.observe(on_comparison_change, names='value')
    for text_input in (left_input, right_input):
        text_input.observe(run_comparison, names='value')

    # Render the result for the initial selection so the output area is not
    # blank until the first edit.
    run_comparison(None)

    return widgets.VBox([comparison_select, left_input, right_input, output])

# Build the playground for a column called "column" and render it in the cell
# output.
playground = create_comparison_playground(column_name="column")
display(playground)

VBox(children=(Dropdown(description='Comparison:', options=('ExactMatch', 'LevenshteinAtThresholds', 'JaroAtTh…

In [13]:
# List the attribute names available on the module imported as `cl`.
dir(cl)

['AbsoluteDateDifferenceAtThresholds',
 'AbsoluteTimeDifferenceAtThresholds',
 'ArrayIntersectAtSizes',
 'CosineSimilarityAtThresholds',
 'CustomComparison',
 'DamerauLevenshteinAtThresholds',
 'DateOfBirthComparison',
 'DistanceFunctionAtThresholds',
 'DistanceInKMAtThresholds',
 'EmailComparison',
 'ExactMatch',
 'ForenameSurnameComparison',
 'JaccardAtThresholds',
 'JaroAtThresholds',
 'JaroWinklerAtThresholds',
 'LevenshteinAtThresholds',
 'NameComparison',
 'PostcodeComparison',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__']

In [26]:
import splink.internals.comparison_library as cl
# Names of the comparison classes to look up on `cl`.
comparison_types = [
    'ExactMatch', 'LevenshteinAtThresholds', 'JaroAtThresholds',
    'JaroWinklerAtThresholds', 'DamerauLevenshteinAtThresholds',
    'CosineSimilarityAtThresholds', 'JaccardAtThresholds',
    'AbsoluteDateDifferenceAtThresholds', 'AbsoluteTimeDifferenceAtThresholds',
    'ArrayIntersectAtSizes', 'DateOfBirthComparison', 'DistanceFunctionAtThresholds',
    'DistanceInKMAtThresholds', 'EmailComparison', 'ForenameSurnameComparison',
    'NameComparison', 'PostcodeComparison'
]

# Example (left, right) input pair for each comparison type.
default_values = {
    'ExactMatch': ('john', 'jon'),
    'LevenshteinAtThresholds': ('smith', 'smyth'),
    'JaroAtThresholds': ('martha', 'matha'),
    'JaroWinklerAtThresholds': ('williams', 'willaims'),
    'DamerauLevenshteinAtThresholds': ('receive', 'recieve'),
    'CosineSimilarityAtThresholds': ('data science', 'science data'),
    'JaccardAtThresholds': ('python programming', 'programming python'),
    'AbsoluteDateDifferenceAtThresholds': ('2023-01-01', '2023-01-15'),
    'AbsoluteTimeDifferenceAtThresholds': ('12:00:00', '12:15:30'),
    'ArrayIntersectAtSizes': ('apple,banana,cherry', 'banana,cherry,date'),
    'DateOfBirthComparison': ('1990-05-15', '1990-05-16'),
    'DistanceFunctionAtThresholds': ('10', '15'),
    'DistanceInKMAtThresholds': ('51.5074,-0.1278', '51.5074,-0.1290'),
    'EmailComparison': ('john.doe@example.com', 'johndoe@example.com'),
    'ForenameSurnameComparison': ('John Doe', 'Jon Doe'),
    'NameComparison': ('Elizabeth Taylor', 'Elisabeth Taylor'),
    'PostcodeComparison': ('SW1A 1AA', 'SW1A 1AB')
}

# Prefer the __init__ docstring and fall back to the class docstring, the same
# lookup order create_comparison_playground uses.
docstrings = {}
for comp_type in comparison_types:
    creator_class = getattr(cl, comp_type)
    docstrings[comp_type] = creator_class.__init__.__doc__ or creator_class.__doc__

# The original final line was an unfinished attribute access (a SyntaxError
# that prevented the cell from running); show the collected docstring instead.
print(docstrings['JaccardAtThresholds'])
