## 3.2 Anonymising your dataset
Goal: k-anonymity modifies a dataset so that every record is indistinguishable from at least k−1 other records with respect to a chosen set of "quasi-identifier" attributes.

quasi-identifiers: 'region', 'gender', 'age', 'height', 'weight', 'eat', 'schedule', 'howlong'

In [52]:
def get_spans(df, partition, scale=None):
    """
    Calculates and returns the spans (range of values) for each quasi-identifier column in a specified partition
    of a dataframe, with an option to scale these spans by provided values.

    For a column listed in the global ``categorical`` the span is the number of distinct values found in the
    partition; for any other column it is ``max - min`` over the partition.

    Only the entries of the global ``quasi_identifiers`` that are actually columns of ``df`` are measured. This
    lets the function be called on a column subset (``partition_dataset`` passes ``df[feature_columns]``)
    without raising ``KeyError`` for quasi-identifiers that were left out of that subset.

    :param        df: the dataframe for which to calculate the spans
    :param partition: the partition (row index labels) for which to calculate the spans
    :param     scale: if given, the spans of each column will be divided
                      by the value in scale for that column
    :returns        : A dict mapping each measured column name to its (optionally scaled) span
    """
    spans = {}
    for feature_column in quasi_identifiers:
        # Skip quasi-identifiers that the caller did not include in df.
        if feature_column not in df.columns:
            continue
        # Select the partition's rows once; both branches below reuse it.
        values = df[feature_column][partition]
        if feature_column in categorical:
            span = len(values.unique())
        else:
            span = values.max() - values.min()
        if scale is not None:
            span = span / scale[feature_column]
        spans[feature_column] = span
    return spans

In [53]:
# Spans of the quasi-identifier columns over the whole dataframe (every index label), unscaled.
full_spans = get_spans(df, df.index)
# Bare expression: displayed as this cell's output in the notebook.
full_spans

{'region': 18,
 'gender': 4,
 'howlong': 31,
 'eat': 48,
 'weight': 20174.0,
 'schedule': 135,
 'age': 112.0,
 'height': 8388607.0}

In [54]:
def split(df, partition, column):
    """
    Divides a specified partition of a dataframe into two parts based on the median or unique values of a given column,
    returning a tuple with the indices of these two parts.

    Categorical columns (those listed in the global ``categorical``) are split by assigning the first half of the
    distinct values to the left part and the remaining values to the right part. All other columns are split at
    the median: values strictly below it go left, everything else goes right.

    :param        df: The dataframe to split
    :param partition: The partition to split
    :param    column: The column along which to split
    :returns        : A tuple containing a split of the original partition
    """
    dfp = df[column][partition]
    if column in categorical:
        values = dfp.unique()
        half = len(values) // 2
        lv = set(values[:half])
        rv = set(values[half:])
        return dfp.index[dfp.isin(lv)], dfp.index[dfp.isin(rv)]

    median = dfp.median()
    below = dfp < median
    # The right part is the complement of the left one rather than `dfp >= median`:
    # both comparisons are False for NaN, so rows with a missing value would otherwise
    # fall into neither part and silently disappear from the partitioning.
    return (dfp.index[below], dfp.index[~below])

In [55]:
def is_k_anonymous(df, partition, sensitive_column, k=3):
    """
    Tells whether a partition holds enough entries to be k-anonymous.

    Only the size of the partition is inspected; ``df`` and ``sensitive_column`` are accepted so the function
    matches the validity-callback signature used by ``partition_dataset`` but are not read.

    :param               df: The dataframe on which to check the partition (unused).
    :param        partition: The partition of the dataframe to check.
    :param sensitive_column: The name of the sensitive column (unused).
    :param                k: The desired k
    :returns               : True if the partition contains at least k entries, False otherwise.
    """
    return len(partition) >= k

def partition_dataset(df, feature_columns, sensitive_column, scale, is_valid):
    """
    Partitions a dataframe into valid subsets based on specified feature columns, a sensitive column, and span scales,
    using a validity function to ensure each partition meets certain criteria.

    Partitions are processed first-in, first-out. For each one, the columns are tried in order of decreasing
    (scaled) span; the first column whose split yields two valid halves is used and both halves are queued.
    A partition for which no column gives a valid split is final.

    :param               df: The dataframe to be partitioned.
    :param  feature_columns: A list of column names along which to partition the dataset.
    :param sensitive_column: The name of the sensitive column (to be passed on to the is_valid function)
    :param            scale: Per-column values the spans are divided by (passed on to get_spans).
    :param         is_valid: A function called as is_valid(df, partition, sensitive_column) that returns True
                             if the partition is valid.
    :returns               : A list of the partitions that could not be split any further.
    """
    # Local import keeps this cell self-contained; deque gives O(1) removal from the front of the queue.
    from collections import deque

    finished_partitions = []
    pending = deque([df.index])
    while pending:
        partition = pending.popleft()
        spans = get_spans(df[feature_columns], partition, scale)
        # Widest column first; sorted() is stable, so ties keep the order of the spans dict.
        for column, _span in sorted(spans.items(), key=lambda item: -item[1]):
            lp, rp = split(df, partition, column)
            if not (is_valid(df, lp, sensitive_column) and is_valid(df, rp, sensitive_column)):
                continue
            pending.extend((lp, rp))
            break
        else:
            # No column produced two valid halves: keep the partition as it is.
            finished_partitions.append(partition)
    return finished_partitions

In [56]:
def agg_categorical_column(series):
    """
    Aggregates the values of a series with categorical values by concatenating them.

    Values are converted to strings, de-duplicated and sorted (lexicographically, as strings) before being
    joined, so the same set of values always produces the same aggregate regardless of set iteration order.

    :param           series: A series of categorical values that need to be aggregated.
    :returns               : A single-element list holding a string with all distinct values in the series
                             joined with a ',' (comma).
    """
    distinct_values = sorted(set(series.astype(str)))
    return [','.join(distinct_values)]

KeyError: 'region'

In [None]:
def agg_numerical_column(series):
    """
    Aggregates the values of a series with numerical values by taking their mean.

    :param           series: A series of numerical values that need to be aggregated.
    :returns               : A single-element list holding the mean of the values in the series.
    """
    return [series.mean()]