This collection of scenarios demonstrates how to solve various data quality problems by exploiting patterns found (or validated) by Desbordante.

In this scenario, we showcase a simple application that performs data deduplication in a table.

The idea of this scenario is described in the paper "Solving Data Quality Problems with Desbordante: a Demo" by G. Chernishev et al., available at https://arxiv.org/abs/2307.14935. There is also an interactive demo at https://desbordante.streamlit.app/.

# Data deduplication example using Desbordante algorithms.

In [None]:
!pip install desbordante==2.3.2
!wget https://raw.githubusercontent.com/Desbordante/desbordante-core/refs/heads/main/examples/datasets/duplicates.csv


from collections import defaultdict, deque

import desbordante
import pandas


def setup_pandas_print():
    """Set the pandas display options used when printing frames in this scenario."""
    display_options = {
        'display.max_columns': None,
        'display.width': None,
        'display.max_colwidth': None,
        'display.expand_frame_repr': False,
    }
    for option_name, option_value in display_options.items():
        pandas.set_option(option_name, option_value)

setup_pandas_print()

Collecting desbordante==2.3.2
  Downloading desbordante-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading desbordante-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.0/4.0 MB 23.6 MB/s eta 0:00:00
Installing collected packages: desbordante
Successfully installed desbordante-2.3.2
--2025-03-20 18:05:28--  https://raw.githubusercontent.com/Desbordante/desbordante-core/refs/heads/main/examples/datasets/duplicates.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4466 (4.4K) [text/plain]
Saving to: ‘duplicates.csv’


2025-03-20 18:05:29 (42.6 MB/s) - ‘duplicates.csv’ saved [4466/4466]



## Setting up various algorithm parameters.

In [None]:
# Approximate FD discovery algorithm and the options passed to it.
ALGORITHM_TYPE = desbordante.afd.algorithms.Default
ERROR = 0.001
CONFIG = dict(error=ERROR, max_lhs=1)

# Arguments forwarded to pandas.read_csv(...).
DATASET_PATH = 'duplicates.csv'
HEADER = 0
SEPARATOR = ','

# Destination file for the deduplicated dataset.
OUTPUT_FILE = 'output.csv'

# Starting size of the window in the sorted neighborhood method.
INITIAL_WINDOW_SIZE = 4

# Algorithm name kept in its own variable so that the self-documenting
# f-string field below renders it as ``ALGORITHM=...``.
ALGORITHM = ALGORITHM_TYPE.__name__

# Summary of every parameter of this usage scenario, shown to the user.
CONFIG_STRING = '\n'.join((
    'Deduplication parameters:',
    f'{ALGORITHM=}',
    f'{ERROR=:.5f}',
    f'{DATASET_PATH=}',
    f'{SEPARATOR=}',
    f'{INITIAL_WINDOW_SIZE=}',
))

## Defining necessary functions.

In [None]:
def get_1lhs_fds(df, algo_type, algo_config):
    """Run the given algorithm on ``df`` and return its single-column-LHS FDs.

    ``algo_config`` is passed as keyword arguments to both ``load_data`` and
    ``execute``. The result is a sorted list of ``(lhs_index, rhs_index)``
    pairs; dependencies whose left-hand side does not consist of exactly one
    column are skipped.
    """
    miner = algo_type()
    miner.load_data(table=df, **algo_config)
    miner.execute(**algo_config)

    pairs = []
    for dependency in miner.get_fds():
        lhs_indices = dependency.lhs_indices
        if len(lhs_indices) == 1:
            pairs.append((lhs_indices[0], dependency.rhs_index))
    pairs.sort()
    return pairs


def get_lhs_from_sorted_fds(fds):
    """Return the LHS indices of ``fds`` with consecutive repeats collapsed.

    ``fds`` is an iterable of ``(lhs, rhs)`` pairs. Only adjacent duplicates
    are removed, so each LHS appears once when the pairs are sorted by LHS.
    """
    lhs_sequence = [lhs for lhs, _ in fds]
    # Pair every LHS with the one before it (None ahead of the first) and
    # keep it only when it differs from that predecessor.
    shifted = [None, *lhs_sequence]
    return [current for previous, current in zip(shifted, lhs_sequence)
            if current != previous]


def count_matches(row1, row2, rhs: list[int]):
    """Count the positions from ``rhs`` at which the two rows hold equal values.

    ``rhs`` contains column *positions*. Rows may be pandas Series (as
    produced by ``DataFrame.iterrows``) or plain sequences.

    Returns the number of positions in ``rhs`` whose values compare equal.
    """
    # A Series coming from iterrows() is labelled by column name, so
    # ``row[0]`` relies on the deprecated "integer key treated as position"
    # fallback of Series.__getitem__. Use explicit positional access when the
    # object offers it and fall back to ordinary indexing otherwise.
    cells1 = getattr(row1, 'iloc', row1)
    cells2 = getattr(row2, 'iloc', row2)
    return sum(cells1[index] == cells2[index] for index in rhs)


def print_fd_info(df: pandas.DataFrame, fds: list[tuple[int, int]]):
    """Print each LHS column of ``fds`` with the names of the columns it determines.

    Every line has the form ``<lhs index>: <lhs name> -> ( <rhs names> )``.
    """
    rhs_names_by_lhs = defaultdict(list)
    for lhs_index, rhs_index in fds:
        rhs_names_by_lhs[lhs_index].append(df.columns[rhs_index])

    print('AFD info:')
    lines = []
    for lhs_index in get_lhs_from_sorted_fds(fds):
        rhs_names = " ".join(rhs_names_by_lhs[lhs_index])
        lines.append(f'{lhs_index}: {df.columns[lhs_index]} -> ( {rhs_names} )')
    print('\n'.join(lines))


def keepall_handler(df, new_rows, remaining_rows, used_rows):
    """Handle ``keepall``: keep every pending row as it is.

    All rows whose positions are in ``remaining_rows`` are appended to
    ``new_rows`` and ``remaining_rows`` is emptied. ``used_rows`` is ignored.
    """
    positions = list(remaining_rows)
    for record in df.iloc[positions].itertuples(index=False):
        new_rows.append(record)
    remaining_rows.clear()


def drop_handler(df, new_rows, remaining_rows, used_rows):
    """Handle ``drop``: discard the rows in ``used_rows`` and keep the rest.

    Pending rows that are not in ``used_rows`` are appended to ``new_rows``;
    afterwards ``remaining_rows`` is emptied.
    """
    survivors = remaining_rows.difference(used_rows)
    kept_records = df.iloc[list(survivors)].itertuples(index=False)
    for record in kept_records:
        new_rows.append(record)
    remaining_rows.clear()


def choose_index(col_name, distinct_values):
    """Ask the user which of the conflicting values of a column to keep.

    The candidates are listed with their indices and the user is prompted
    until a valid index is entered, so that a typo does not abort the whole
    interactive session.

    Args:
        col_name: name of the column, shown in the prompt.
        distinct_values: candidate values, listed in iteration order.

    Returns:
        The chosen index, always within ``range(len(distinct_values))``.

    Raises:
        ValueError: if there are no values to choose from.
    """
    options = list(distinct_values)
    if not options:
        raise ValueError('no values to choose from')

    print(f'Column: {col_name}. Which value to use?')
    print('\n'.join(f'{i}: {value}' for i, value in enumerate(options)))
    while True:
        answer = input('index: ')
        try:
            index = int(answer)
        except ValueError:
            # Not a number at all: treat like any other out-of-range answer.
            index = -1
        if 0 <= index < len(options):
            return index
        print(f'Please enter an integer between 0 and {len(options) - 1}.')


def merge_handler(df: pandas.DataFrame, new_rows, remaining_rows, used_rows):
    """Handle ``merge``: combine the rows in ``used_rows`` into a single row.

    For every column, the value is taken directly when all merged rows agree;
    otherwise the user is asked to pick one of the distinct values. The
    merged row (a list) is appended to ``new_rows`` and the merged positions
    are removed from ``remaining_rows``. Does nothing when ``used_rows`` is
    empty.
    """
    if not used_rows:
        return

    # Take the rows in table order so the candidates below are listed in the
    # same order as the rows were displayed to the user.
    merged_records = df.iloc[sorted(used_rows)].itertuples(index=False)

    new_row = []
    for col_name, values in zip(df.columns, zip(*merged_records)):
        # dict.fromkeys de-duplicates while keeping first-seen order. A set
        # would number the options in hash order, which for strings changes
        # between interpreter runs and makes recorded answers unrepeatable.
        distinct_values = list(dict.fromkeys(values))
        index = 0 if len(distinct_values) == 1 else choose_index(col_name, distinct_values)
        new_row.append(distinct_values[index])

    # In-place difference: the caller's set must shrink.
    remaining_rows -= used_rows
    new_rows.append(new_row)


def unknown_handler(df, new_rows, remaining_rows, used_rows):
    """Report an unrecognised command.

    Accepts the same arguments as the other command handlers but modifies
    none of them, so the pending rows stay as they were.
    """
    print('Unknown command.')


def ask_rows(df: pandas.DataFrame, window: deque[tuple[int, object]]) -> list:
    """Interactively resolve the rows of ``window`` into deduplicated rows.

    The pending rows are printed and the user is prompted for a command
    (``keepall``, ``drop <rows>`` or ``merge <rows>``) until no pending rows
    are left. Row arguments that are not currently pending are ignored.

    Args:
        df: frame the window rows belong to; rows are looked up by position.
        window: ``(row position, row)`` pairs to resolve.

    Returns:
        The rows produced by the command handlers.
    """
    commands = {
        'keepall': keepall_handler,
        'drop': drop_handler,
        'merge': merge_handler,
    }

    remaining_rows = {row_info[0] for row_info in window}
    new_rows = []
    while remaining_rows:
        print(df.iloc[sorted(remaining_rows)].to_string())
        command_args = input('Command: ').split()
        if not command_args:
            print('Please input a command!')
            continue
        command, *row_args = command_args
        try:
            requested_rows = {int(arg) for arg in row_args}
        except ValueError:
            # A typo in a row number must not abort the whole session and
            # throw away the decisions made so far; ask again instead.
            print('Row numbers must be integers.')
            continue
        used_rows = requested_rows & remaining_rows
        commands.get(command, unknown_handler)(df, new_rows, remaining_rows, used_rows)
    return new_rows


def is_similar(row_info, window, chosen_cols, matches_required):
    """Tell whether ``row_info`` is similar to at least one row in ``window``.

    Two rows are similar when at least ``matches_required`` of the columns in
    ``chosen_cols`` hold equal values. Both ``row_info`` and the window items
    carry the row itself at index 1.
    """
    for earlier_info in window:
        matched = count_matches(earlier_info[1], row_info[1], chosen_cols)
        if matched >= matches_required:
            return True
    return False


def get_deduped_rows(df: pandas.DataFrame, chosen_cols: list[int], matches_required: int,
                     fds: list[tuple[int, int]]):
    """Deduplicate ``df`` by scanning its sorted rows with a sliding window.

    NOTE: ``df`` is sorted and re-indexed in place, so the caller's frame is
    modified.

    Args:
        df: table to deduplicate.
        chosen_cols: column positions compared when testing rows for similarity.
        matches_required: number of equal ``chosen_cols`` values needed for two
            rows to be treated as similar.
        fds: ``(lhs, rhs)`` column-position pairs; their RHS columns that are
            also in ``chosen_cols`` provide the sort key.

    Returns:
        A list of rows: value arrays of the rows that passed through untouched
        plus whatever ``ask_rows`` returned for the windows resolved by the user.
    """
    # Sort by the chosen RHS columns; the scan below only compares a row with
    # the rows currently held in the window.
    df.sort_values([df.columns[rhs_col] for _, rhs_col in fds if rhs_col in chosen_cols],
                   inplace=True)
    # Renumber the rows so that index labels coincide with positions, which is
    # what ask_rows relies on when it looks rows up with ``iloc``.
    df.reset_index(inplace=True, drop=True)

    # Newest rows enter on the left, the oldest row sits on the right.
    window = deque()
    new_rows = []
    # True once some incoming row was found similar to a row in the window.
    has_duplicate = False
    for row_info in df.iterrows():
        if len(window) < INITIAL_WINDOW_SIZE:
            # Window not full yet: only look for a first similar pair.
            if not has_duplicate:
                has_duplicate = is_similar(row_info, window, chosen_cols, matches_required)
        # Full window without duplicates: emit the oldest row and slide on.
        elif not has_duplicate:
            new_rows.append(window.pop()[1].values)
            has_duplicate = is_similar(row_info, window, chosen_cols, matches_required)
        # Window holds duplicates and the new row matches none of its rows:
        # let the user resolve the window, then start over with an empty one.
        elif not is_similar(row_info, window, chosen_cols, matches_required):
            new_rows.extend(ask_rows(df, window))
            window.clear()
            has_duplicate = False
        # In every case the current row joins the window; when duplicates are
        # present and the row is similar, the window simply grows.
        window.appendleft(row_info)
    # Flush what is left: ask the user if duplicates were seen, otherwise emit
    # the rows as they are (in window order, newest first).
    new_rows.extend(
        ask_rows(df, window) if has_duplicate else (row_info[1].values for row_info in window))
    return new_rows

## Printing dataset sample.

In [None]:
# Load every column as text and do not use any column as the index.
df = pandas.read_csv(
    DATASET_PATH,
    sep=SEPARATOR,
    header=HEADER,
    dtype=str,
    index_col=False,
)
print(df)


      id             name address       city                          email phone country
0   5996        Kaede Sue      66      Pirus       Kaede.Sue4422@virtex.rum    39      EU
1     36       Licia Wolf      35  Pilington       Licia.Wolf1260@cmail.com    35      CM
2     17        Steve Doe      16     Syndye           Steve.Doe272@muli.ry    16      GZ
3     62      Lisa Tarski      61     Syndye     Lisa.Tarski3782@virtex.rum    61      JU
4      6      Mary Tarski       5     Lumdum       Mary.Tarski30@ferser.edu     5      PR
..   ...              ...     ...        ...                            ...   ...     ...
73    15        Ivan Dawn      14     Syndye      Ivan.Dawn210@atomlema.ocg    14      FC
74  5993       Lisa Honjo      63       Roit      Lisa.Honjo4032@virtex.rum    63      AI
75    59         Lisa Sue      58     Muxicu         Lisa.Sue3422@cmail.com    58      AI
76    21  Steve Shiramine      20  Pilington  Steve.Shiramine420@ferser.edu    20      GZ
77    44  

## Starting deduplication scenario with parameters:

In [None]:
# Show the parameter summary stored in CONFIG_STRING.
print(CONFIG_STRING)

Deduplication parameters:
ALGORITHM='Pyro'
ERROR=0.00100
DATASET_PATH='duplicates.csv'
SEPARATOR=','
INITIAL_WINDOW_SIZE=4


In [None]:
print(f'Original records: {len(df)}')
print()

# Discover the single-column-LHS dependencies and let the user pick an LHS.
fds = get_1lhs_fds(df, ALGORITHM_TYPE, CONFIG)
print_fd_info(df, fds)
lhs_column = int(input('LHS column index: '))
fds = [fd for fd in fds if fd[0] == lhs_column]
if fds:
    print('RHS columns:')
    rhs_lines = [f'{rhs}: {df.columns[rhs]}' for _, rhs in fds]
    print('\n'.join(rhs_lines))
    rhs_answer = input('RHS columns to use (indices): ')
    chosen_cols = sorted({int(token) for token in rhs_answer.split()})
    matches_required = int(input('Equal columns to consider duplicates: '))

    new_rows = get_deduped_rows(df, chosen_cols, matches_required, fds)
    print()

    removed_count = len(df) - len(new_rows)
    print(f'Resulting records: {len(new_rows)}. Duplicates found: {removed_count}')
    new_df = pandas.DataFrame(new_rows, columns=df.columns)

    # Show the deduplicated table and store it in OUTPUT_FILE.
    print(new_df)
    new_df.to_csv(OUTPUT_FILE, index=False)
else:
    print('No FDs with this LHS!')

Original records: 78

AFD info:
0: id -> ( name address city email phone country )
2: address -> ( name )
4: email -> ( name address phone country )
5: phone -> ( name )
LHS column index: 2
RHS columns:
1: name
RHS columns to use (indices): 1
Equal columns to consider duplicates: 1


  return sum(row1[index] == row2[index] for index in rhs)


      id          name address      city                       email phone country
5     27     Björn Sue      26      Roit      Björn.Sue702@cmail.com    26      CM
6     30  Björn Tarski      29    Lumdum  Björn.Tarski870@ferser.edu    29      PR
7  11886    Björn Wolf      28  Kustruma    Björn.Wolf756@virtex.rum    27      AI
8   5957    Björn Wolf      27       NaN    Björn.Wolf756@virtex.rum    27      AI
9     28    Björn Wolf      27       NaN    Björn.Wolf756@virtex.rum    27      AI
Command: drop 8


  return sum(row1[index] == row2[index] for index in rhs)


      id        name address       city                       email phone country
42    63   Lisa Dawn      62  Pilington  Lisa.Dawn3906@atomlema.ocg    62      EU
43    57    Lisa Doe      56       Roit     Lisa.Doe3192@virtex.rum    56      AI
44  5993  Lisa Honjo      63       Roit   Lisa.Honjo4032@virtex.rum    63      AI
45    64  Lisa Honjo      63      Pirus   Lisa.Honjo4032@virtex.rum    63      AI
Command: keepall


  return sum(row1[index] == row2[index] for index in rhs)


       id       name address      city                        email phone country
50     60  Lisa Wolf      59    Syndye      Lisa.Wolf3540@cmail.com    59      FC
51      7  Mary Dawn       6    Syndye     Mary.Dawn42@atomlema.ocg     6      PR
52  11859   Mary Doe     NaN    Lumdum     Mary.Doe-5926@ferser.edu     0      EU
53      1   Mary Doe     NaN    Lumdum            Mary.Doe0@muli.ry     4      EU
54  17788   Mary Doe       0  Kustruma  Mary.Doe35099692@virtex.rum     0      EU
55   5930   Mary Doe     NaN    Lumdum     Mary.Doe-5926@ferser.edu     0     NaN
Command: merge 52 55
Column: id. Which value to use?
0: 11859
1: 5930
index: 1
Column: country. Which value to use?
0: nan
1: EU
index: 1
       id       name address      city                        email phone country
50     60  Lisa Wolf      59    Syndye      Lisa.Wolf3540@cmail.com    59      FC
51      7  Mary Dawn       6    Syndye     Mary.Dawn42@atomlema.ocg     6      PR
53      1   Mary Doe     NaN    Lumdum    

  return sum(row1[index] == row2[index] for index in rhs)
