This collection of scenarios demonstrates how to solve various data quality problems by exploiting patterns found (or validated) by Desbordante.

In this scenario, we showcase a simple application that performs typo detection in a table.

The idea of this scenario is described in the paper "Solving Data Quality Problems with Desbordante: a Demo" by G. Chernishev et al., available at https://arxiv.org/abs/2307.14935. There is also an interactive demo at https://desbordante.streamlit.app/.

# Typo mining example using Desbordante algorithms.

In [None]:
# Install the pinned Desbordante release used by this notebook.
!pip install desbordante==2.3.2
# Download the example dataset; it is saved as Workshop.csv in the working directory.
!wget https://raw.githubusercontent.com/Desbordante/desbordante-core/refs/heads/main/examples/datasets/Workshop.csv
# colorama provides the coloured console output, jellyfish the Levenshtein distance.
!pip install colorama jellyfish

from functools import reduce
from itertools import groupby, islice
import operator

from colorama import Style, Fore
from jellyfish import levenshtein_distance
import desbordante
import pandas

def setup_pandas_print():
    """Configure pandas display options so that printed frames are not truncated or wrapped."""
    display_options = {
        'display.max_columns': None,
        'display.width': None,
        'display.max_colwidth': None,
        'display.expand_frame_repr': False,
    }
    for option_name, option_value in display_options.items():
        pandas.set_option(option_name, option_value)

setup_pandas_print()

Collecting desbordante==2.3.2
  Downloading desbordante-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading desbordante-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: desbordante
Successfully installed desbordante-2.3.2
--2025-03-20 17:45:38--  https://raw.githubusercontent.com/Desbordante/desbordante-core/refs/heads/main/examples/datasets/Workshop.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 95017 (93K) [text/plain]
Saving to: ‘Workshop.csv’


2025-03-20 17:45:39 (10.1 MB/s) - ‘Workshop.csv’ saved [95017/95017]

Collecting colorama

## Setting up various algorithm parameters.

In [None]:
# Value cluster filtering parameters.
# RADIUS: a deviating value is kept only if its distance to the most common
# value of its cluster is strictly below RADIUS (absolute difference for
# numbers, Levenshtein distance for strings).
# RATIO: a cluster is kept only if the share of its rows that deviate from
# the most common value is strictly below RATIO.
RADIUS = 3
RATIO = 0.1

# Algorithm that finds exact FDs and its config.
EXACT_ALGORITHM_TYPE = desbordante.fd.algorithms.Default
EXACT_ALGO_CONFIG = {}

# Algorithm that finds approximate FDs and its config.
APPROXIMATE_ALGORITHM_TYPE = desbordante.afd.algorithms.Default
ERROR = 0.005 # Highest error for almost holding FDs.
APPROXIMATE_ALGO_CONFIG = {'error': ERROR}

# Parameters for pandas.read_csv(...).
DATASET_PATH = 'Workshop.csv'
HEADER = 0
SEPARATOR = ','

# Index of the almost holding FD. Chosen in advance purely for
# demonstration purposes. In a real usage scenario this should be a
# choice for the user.
FD_INDEX = 2


# Sanity checks on the configuration: the approximate search needs a
# positive error threshold, while the exact search must not be given one.
assert APPROXIMATE_ALGO_CONFIG['error'] > 0.0, 'Typo mining relies on non-zero error'
assert EXACT_ALGO_CONFIG.get('error', 0.0) == 0.0, 'Error must be 0 for precise algorithm'

# Variables to simplify the configuration string construction below.
# They hold the class names of the selected algorithms.
EXACT_ALGORITHM = EXACT_ALGORITHM_TYPE.__name__
APPROXIMATE_ALGORITHM = APPROXIMATE_ALGORITHM_TYPE.__name__

# A message listing the parameters of this usage scenario, to be displayed
# to the user. FD_INDEX is not part of it. The `{NAME=}` f-string form
# renders both the variable name and its repr.
CONFIG_STRING = f"""Starting typo discovery scenario with parameters:
{RADIUS=}
{RATIO=}
{ERROR=}
{DATASET_PATH=}
{EXACT_ALGORITHM=}
{APPROXIMATE_ALGORITHM=}
{HEADER=}
{SEPARATOR=}"""

## Defining necessary functions.

In [None]:
def get_squashed_sorted_clusters(dataset: pandas.DataFrame, lhs_indices, rhs_index):
    """Collect the RHS values and their row counts for every LHS that violates the FD.

    Args:
        dataset: table to inspect.
        lhs_indices: positional indices of the FD's left-hand side columns.
        rhs_index: positional index of the FD's right-hand side column.

    Returns:
        A list of ``(lhs, [(rhs_value, count), ...])`` pairs, one per LHS value
        combination that occurs with more than one distinct RHS value. The
        inner list is ordered by descending count, ties broken by ascending
        RHS value.
    """
    def lhs_of(entry):
        # entry is ((*lhs, rhs), count); strip the count and the trailing rhs.
        full_row, _ = entry
        return full_row[:-1]

    column_names = [dataset.columns[position] for position in lhs_indices]
    column_names += [dataset.columns[rhs_index]]

    # Sorting by index puts rows that share an LHS next to each other, which
    # is what itertools.groupby requires. Missing values are counted too.
    occurrences = dataset.value_counts(column_names, dropna=False).sort_index()

    clusters = []
    for lhs, entries in groupby(occurrences.items(), key=lhs_of):
        rhs_counts = [(full_row[-1], count) for full_row, count in entries]
        if len(rhs_counts) < 2:
            # A single RHS value means the FD is not violated for this LHS.
            continue
        # Most frequent RHS first; equal counts are ordered by the RHS value.
        rhs_counts.sort(key=lambda pair: (-pair[1], pair[0]))
        clusters.append((lhs, rhs_counts))
    return clusters

def number_metric(a, b):
    """Return the absolute difference between two numeric values."""
    difference = a - b
    return abs(difference)


def string_metric(a, b):
    """Return the Levenshtein distance between the string forms of both values."""
    first, second = str(a), str(b)
    return levenshtein_distance(first, second)


def filter_radius(squashed_sorted_clusters, metric, radius=None) -> list:
    """Keep only the deviating values that lie close to the most common value.

    Args:
        squashed_sorted_clusters: iterable of ``(lhs, value_data)`` pairs where
            ``value_data`` is a sequence of ``(value, count)`` pairs whose
            first element is the most common value.
        metric: callable ``metric(most_common_value, value)`` returning a
            distance. Any exception it raises (e.g. ``TypeError`` from a
            numeric metric applied to strings) propagates to the caller.
        radius: exclusive upper bound on the distance. Defaults to the
            module-level ``RADIUS`` when omitted.

    Returns:
        A list of ``(lhs, [most_common_pair, *close_pairs])`` entries. Clusters
        without any close deviating value are dropped.
    """
    if radius is None:
        radius = RADIUS

    filtered = []
    for lhs, value_data in squashed_sorted_clusters:
        most_common_pair = value_data[0]
        most_common_value, _ = most_common_pair
        # Only the values after the most common one are typo candidates.
        close_value_pairs = [
            pair for pair in value_data[1:]
            if metric(most_common_value, pair[0]) < radius
        ]
        if close_value_pairs:
            filtered.append((lhs, [most_common_pair] + close_value_pairs))
    return filtered

def filter_ratio(squashed_sorted_clusters, ratio=None):
    """Keep only the clusters in which deviating rows are rare.

    Args:
        squashed_sorted_clusters: iterable of ``(lhs, value_info)`` pairs where
            ``value_info`` is a sequence of ``(value, count)`` pairs whose
            first element is the most common value.
        ratio: exclusive upper bound on the share of rows that do not carry
            the most common value. Defaults to the module-level ``RATIO``
            when omitted.

    Returns:
        A list with the clusters (unchanged) that satisfy the bound.
    """
    if ratio is None:
        ratio = RATIO

    kept = []
    for cluster in squashed_sorted_clusters:
        _, value_info = cluster
        _, most_common_count = value_info[0]
        total_values = sum(count for _, count in value_info)
        deviating_values = total_values - most_common_count
        if deviating_values / total_values < ratio:
            kept.append(cluster)
    return kept


def filter_squashed_sorted_clusters(squashed_sorted_clusters):
    """Apply the radius filter and then the ratio filter to the clusters.

    The numeric metric is tried first; if it raises ``TypeError`` the radius
    filter is re-run from scratch with the string metric.
    """
    try:
        within_radius = filter_radius(squashed_sorted_clusters, number_metric)
    except TypeError:
        within_radius = filter_radius(squashed_sorted_clusters, string_metric)
    return filter_ratio(within_radius)


def get_result_set(df, algo_type, algo_config):
    """Run an FD discovery algorithm on a table and return its FDs as a set.

    Args:
        df: table passed to the algorithm as ``table``.
        algo_type: algorithm class; instantiated without arguments.
        algo_config: keyword options forwarded to both ``load_data`` and
            ``execute``.

    Returns:
        The set built from ``get_fds()`` of the executed algorithm.
    """
    miner = algo_type()
    miner.load_data(table=df, **algo_config)
    miner.execute(**algo_config)
    discovered = miner.get_fds()
    return set(discovered)


def make_display_df(squashed_sorted_clusters, original_df, lhs_indices, rhs_index):
    """Flatten the clusters into a table with one row per (LHS, RHS value) pair.

    The resulting frame has a leading 'rows count' column, followed by the
    LHS columns and finally the RHS column, named as in ``original_df``.
    """
    header = ['rows count']
    header.extend(original_df.columns[col] for col in lhs_indices)
    header.append(original_df.columns[rhs_index])

    rows = [
        (count, *lhs, value)
        for lhs, value_info in squashed_sorted_clusters
        for value, count in value_info
    ]
    return pandas.DataFrame(rows, columns=header)

def print_display_df(display_df):
    """Print the cluster table, colouring most common values green and deviations red.

    The frame is expected to have the layout produced by ``make_display_df``:
    a leading count column, then the LHS columns, then the RHS column, with
    the rows of one cluster adjacent and its most common value first.

    The first row of every LHS group is printed green and the remaining rows
    of that group red, so the most common value of each cluster is
    highlighted even when the table contains several clusters.
    """
    df_lines = display_df.to_string(index=False).splitlines()
    if display_df.empty:
        # There are no data rows to colour; print whatever to_string() gave.
        print('\n'.join(df_lines))
        print()
        return

    print(df_lines[0])
    previous_lhs = object()  # Sentinel that never equals a real LHS tuple.
    data_rows = display_df.itertuples(index=False, name=None)
    for line, row in zip(islice(df_lines, 1, None), data_rows):
        # Missing values are mapped to None so that equal LHS tuples compare
        # equal (NaN is not equal to itself).
        current_lhs = tuple(None if pandas.isna(value) else value for value in row[1:-1])
        colour = Fore.RED if current_lhs == previous_lhs else Fore.GREEN
        print(colour + line + Style.RESET_ALL)
        previous_lhs = current_lhs
    print()

def get_typo_candidates_df(df, display_df):
    """Fetch one full row of ``df`` for every row of the cluster table.

    Args:
        df: the original table.
        display_df: cluster table whose first column is a row count and whose
            remaining columns are named like columns of ``df``.

    Returns:
        A frame with the columns of ``df`` containing, for each row of
        ``display_df``, the first row of ``df`` whose values match on all
        non-count columns. The original index labels are preserved.
    """
    def get_mask(attr_info):
        col_name, value = attr_info
        if pandas.isna(value):
            # A missing value never compares equal to anything, itself
            # included, so it has to be matched explicitly.
            return df[col_name].isna()
        return df[col_name] == value

    typo_candidate_rows = []
    typo_candidate_row_indices = []

    for _, row in display_df.iterrows():
        # Skip the leading count column; the rest are (column, value) pairs.
        mask = reduce(operator.and_, map(get_mask, islice(row.items(), 1, None)))
        found_rows = df[mask]
        typo_candidate_rows.append(found_rows.values[0])
        typo_candidate_row_indices.append(found_rows.index.values[0])
    return pandas.DataFrame(typo_candidate_rows, columns=df.columns, index=typo_candidate_row_indices)

## Printing dataset sample.

In [None]:
# Load the dataset using the CSV settings from the configuration cell and
# print it to give an impression of the data.
df = pandas.read_csv(DATASET_PATH, sep=SEPARATOR, header=HEADER)
print(df)


                                       id      worker_name supervisor_surname       workshop  salary                   job_post
0    404f50cb-caf0-4974-97f9-9463434537e1   Jennifer Moore        Galen Calla    Yogatacular     980    Client Solution Analyst
1    b5e38281-9c09-49bf-91f5-c55397df4d43       Edward Lee      Carrie Silvia    MonsterWorq     905  Front-End Loader Operator
2    972b299d-2f27-4d6d-81d2-8effbc543bf1        Brian Lee      Shena Desiree  Talkspiration     700             Farm Assistant
3    3241fb48-5a15-4638-bd68-d915834a3f89   Kenneth Turner        Paul Jeffry     Verbalthon     980    Client Solution Analyst
4    9cbb9026-f157-4a01-aace-a42b05ab2a28   Betty Campbell    Addyson Aaliyah     SpeakerAce     800            Physiotherapist
..                                    ...              ...                ...            ...     ...                        ...
940  9cd700bc-b3d9-439d-afe9-945c2a20bc37    Richard Lopez        Galen Calla    Yogatacular     845   S

## Starting typo discovery scenario with parameters:

In [None]:
# Show the parameters this scenario runs with.
print(CONFIG_STRING)

Starting typo discovery scenario with parameters:
RADIUS=3
RATIO=0.1
ERROR=0.005
DATASET_PATH='Workshop.csv'
EXACT_ALGORITHM='HyFD'
APPROXIMATE_ALGORITHM='Pyro'
HEADER=0
SEPARATOR=','


## Searching for almost holding FDs.


In [None]:
print('Searching for almost holding FDs...')
print()
# FDs found by the exact algorithm.
holding_fds = get_result_set(df, EXACT_ALGORITHM_TYPE, EXACT_ALGO_CONFIG)
# FDs found by the approximate algorithm with the configured error threshold.
close_fds = get_result_set(df, APPROXIMATE_ALGORITHM_TYPE, APPROXIMATE_ALGO_CONFIG)
# Keep the FDs reported only by the approximate search. The set difference
# is unordered, so it is sorted by index tuple to give FD_INDEX a
# reproducible meaning.
almost_holding_fds = sorted(close_fds - holding_fds, key=lambda fd: fd.to_index_tuple())
print('Found! Almost holding FDs:')
print('\n'.join(map(str, almost_holding_fds)))
print()

Searching for almost holding FDs...

Found! Almost holding FDs:
[supervisor_surname salary] -> job_post
[supervisor_surname job_post] -> salary
[workshop] -> supervisor_surname
[workshop salary] -> job_post
[workshop job_post] -> salary



## Selecting FD with index 2.

In [None]:
print(f'Selecting FD with index {FD_INDEX}:')
# Column positions of the chosen FD's left-hand side and right-hand side.
lhs_indices, rhs_index = almost_holding_fds[FD_INDEX].to_index_tuple()
# Build the violating clusters for that FD and apply the radius and ratio filters.
squashed_sorted_clusters = filter_squashed_sorted_clusters(
    get_squashed_sorted_clusters(df, lhs_indices, rhs_index))
if not squashed_sorted_clusters:
    print('Nothing found. Try another FD or relax restrictions (radius, ratio, error).')
else:
    # Show the clusters first, then one full table row per cluster entry as context.
    display_df = make_display_df(squashed_sorted_clusters, df, lhs_indices, rhs_index)
    print_display_df(display_df)
    print('Typo candidates and context:')
    print(get_typo_candidates_df(df, display_df).to_string())

Selecting FD with index 2:
 rows count    workshop supervisor_surname
[32m        198 Yogatacular        Galen Calla[0m
[31m          1 Yogatacular      Galen Calella[0m

Typo candidates and context:
                                     id       worker_name supervisor_surname     workshop  salary                 job_post
0  404f50cb-caf0-4974-97f9-9463434537e1    Jennifer Moore        Galen Calla  Yogatacular     980  Client Solution Analyst
7  ddba9118-ec89-472d-9f3f-bebd919f0e3a  William Robinson      Galen Calella  Yogatacular     975            Store Manager
