In [1]:
from tabular_anonymizer import DataFrameAnonymizer
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [2]:
# Load the dataset. The first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 index, so the label-based row slicing
# used later in the notebook lines up with row positions.
file1 = "adult.csv"
df = pd.read_csv(file1, sep=",", index_col=0).reset_index(drop=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
# Trim the dataset: keep the first 1000 rows, then remove unneeded columns
dropped_columns = [
    "fnlwgt", "workclass", "education-num", "occupation",
    "marital-status", "relationship", "race", "native-country",
]

# Label-based slice: rows labelled 0 through 999 inclusive
df = df.loc[:999]
print(df.shape)  # shape is reported before the columns are removed

df = df.drop(columns=dropped_columns)
df.head()


(1000, 15)


Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label
0,39,Bachelors,Male,2174,0,40,0
1,50,Bachelors,Male,0,0,13,0
2,38,HS-grad,Male,0,0,40,0
3,53,11th,Male,0,0,40,0
4,28,Bachelors,Female,0,0,40,0


In [14]:
# --- Anonymization settings ---
# k used for k-anonymity
k = 5
# Sensitive attributes: columns that should not be altered
sensitive_columns = ['label']
# Avg columns (currently disabled)
#avg_columns = ['capital-gain', 'capital-loss']

# Quasi-identifiers, used further down when checking k-anonymity
quasi_identifiers = [
    'age', 'education', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

# --- Run tabular_anonymizer ---
print("Run tabular_anonymizer. Sensitive columns: ", sensitive_columns, ", k=", k)
p = DataFrameAnonymizer(sensitive_columns, format_to_str=True)

# The anonymizer returns a new dataframe; `df` itself is not reassigned here
df_anonymized = p.anonymize_k_anonymity(df, k=k)
df_anonymized.head()

Run tabular_anonymizer. Sensitive columns:  ['label'] , k= 5


Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,17 - 36,"Assoc-voc, Assoc-acdm, 10th",Female,0 - 0,0 - 0,15 - 35,0,7
1,17 - 36,"Assoc-voc, Assoc-acdm, 10th",Female,0 - 0,0 - 0,15 - 35,1,1
2,17 - 18,"11th, 12th",Male,0 - 2176,0 - 0,10 - 18,0,8
3,17 - 34,"11th, Masters, 12th, 10th",Male,0 - 7298,0 - 0,20 - 35,0,7
4,17 - 34,"11th, Masters, 12th, 10th",Male,0 - 7298,0 - 0,20 - 35,1,1


In [15]:
# Distinct values of label_count in the k-anonymized result
pd.unique(df_anonymized['label_count'])

array([ 7,  1,  8,  4,  5,  6,  3,  2,  9, 10, 11, 17])

In [16]:
# Largest label_count values first
df_anonymized.sort_values('label_count', ascending=False)

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
144,26 - 28,"HS-grad, 9th",Male,0 - 0,0 - 1980,40 - 70,0,17
147,35 - 36,"HS-grad, 9th",Male,0 - 7298,0 - 0,40 - 60,0,11
143,24 - 25,"HS-grad, 9th",Male,0 - 4101,0 - 1721,40 - 60,0,11
149,23 - 24,"Some-college, Assoc-voc",Male,0 - 0,0 - 0,40 - 56,0,10
44,27 - 36,"Some-college, HS-grad, Bachelors",Female,0 - 0,0 - 0,32 - 38,0,10
...,...,...,...,...,...,...,...,...
194,31 - 33,Some-college,Male,0 - 0,0 - 0,40 - 50,1,1
69,58 - 76,"Some-college, HS-grad, Doctorate, Masters",Male,0 - 0,0 - 1816,1 - 15,1,1
67,50 - 55,"HS-grad, Bachelors",Male,0 - 0,0 - 0,30 - 38,1,1
62,38 - 43,"Masters, Bachelors",Male,0 - 0,0 - 0,24 - 38,1,1


In [31]:
# l-diversity run. `l` is now actually passed to the call instead of a
# duplicated literal. k is fixed at 2 for this run, independent of the
# notebook-level `k` used for the k-anonymity run.
l = 2

df_ldiv = p.anonymize_l_diversity(df, k=2, l=l)
df_ldiv.head()

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,17 - 36,"11th, Some-college, HS-grad, Bachelors",Female,0 - 4064,0 - 1719,5 - 38,0,60
1,17 - 36,"11th, Some-college, HS-grad, Bachelors",Female,0 - 4064,0 - 1719,5 - 38,1,1
2,17 - 36,"Assoc-voc, Assoc-acdm, 10th",Female,0 - 0,0 - 0,15 - 35,0,7
3,17 - 36,"Assoc-voc, Assoc-acdm, 10th",Female,0 - 0,0 - 0,15 - 35,1,1
4,22 - 36,"Assoc-voc, 10th, 1st-4th, 5th-6th, Assoc-acdm,...",Female,0 - 2174,0 - 1669,40 - 70,0,18


In [32]:
df_ldiv.sort_values(by='label_count', ascending=False)

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
30,19 - 36,"HS-grad, 9th",Male,0 - 7688,0 - 1980,40 - 72,0,85
32,19 - 36,"Some-college, Assoc-voc",Male,0 - 5178,0 - 2377,40 - 70,0,62
0,17 - 36,"11th, Some-college, HS-grad, Bachelors",Female,0 - 4064,0 - 1719,5 - 38,0,60
6,17 - 36,"HS-grad, Assoc-voc, Bachelors, Assoc-acdm, Som...",Male,0 - 7298,0 - 1721,6 - 38,0,45
26,18 - 28,"11th, Some-college, HS-grad",Female,0 - 7298,0 - 1762,40 - 50,0,33
...,...,...,...,...,...,...,...,...
89,43 - 44,Masters,Female,0 - 0,0 - 0,40 - 50,1,1
91,37 - 41,HS-grad,Female,0 - 0,0 - 0,40 - 60,1,1
93,37 - 43,Some-college,Female,0 - 0,0 - 1887,40 - 50,1,1
95,45 - 47,Bachelors,Female,0 - 0,0 - 0,40 - 55,1,1


In [23]:
# t-closeness run, reusing the notebook-level k
t = 1

df_tclose = p.anonymize_t_closeness(df, t=t, k=k)
df_tclose.head()

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,17 - 36,"11th, Masters, HS-grad, Bachelors","Female, Male",0 - 15024,0 - 1980,5 - 72,0,248
1,17 - 36,"11th, Masters, HS-grad, Bachelors","Female, Male",0 - 15024,0 - 1980,5 - 72,1,44
2,17 - 58,"Assoc-voc, 10th, 5th-6th, Doctorate, 12th",Male,0 - 0,0 - 0,1 - 38,0,7
3,17 - 58,"Assoc-voc, 10th, 5th-6th, Doctorate, 12th",Male,0 - 0,0 - 0,1 - 38,1,2
4,19 - 79,Some-college,"Female, Male",0 - 15024,0 - 2415,2 - 80,0,186


**Note:** t-closeness is broken in AnonyPy, so the t-closeness result above should not be relied on.

In [11]:
# Show the row(s) of the k-anonymized frame holding the largest label_count
count_max = df_anonymized['label_count'].max()
is_largest = df_anonymized['label_count'] == count_max
df_anonymized[is_largest]

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
144,26 - 28,"HS-grad, 9th",Male,0 - 0,0 - 1980,40 - 70,0,17


In [None]:
def is_k_anonymous(df, k, quasi_identifiers, count_column, debug=False):
    """Check whether an aggregated dataframe is k-anonymous.

    Each row of ``df`` carries a count in ``count_column``. For every row
    whose own count is below ``k``, the counts of all rows sharing the same
    values in ``quasi_identifiers`` (e.g. the same group with a different
    sensitive attribute) are summed; if that total is still below ``k`` the
    dataframe is not k-anonymous.

    Works only with numeric and string columns, not lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to check.
    k : int
        Required minimum total count per quasi-identifier combination.
    quasi_identifiers : list
        Column labels that together identify a group.
    count_column : str
        Column holding the per-row count.
    debug : bool, default False
        If True, print the match condition and the matching rows for every
        row that is inspected.

    Returns
    -------
    bool
        True if every inspected group reaches ``k``; False otherwise, or if a
        quasi-identifier column is neither numeric nor string (a message is
        printed in that case).
    """
    for _, row in df.iterrows():
        # Rows that already reach k on their own need no further inspection
        if not row[count_column] < k:
            continue

        # Narrow the frame down to the rows matching this row on every
        # quasi-identifier. Filtering with boolean masks (rather than a
        # string-built df.query expression) keeps values containing quotes
        # working and handles an empty quasi-identifier list.
        matches = df
        conditions = []  # human-readable form, used only for debug output
        for col in quasi_identifiers:
            if is_numeric_dtype(df[col]):
                conditions.append(f'`{col}` == {row[col]}')
            elif is_string_dtype(df[col]):
                conditions.append(f'`{col}` == "{row[col]}"')
            else:
                print(f"{col} not numeric or string")
                return False
            matches = matches[matches[col] == row[col]]

        if debug:
            print(' & '.join(conditions))
            print(matches)

        if sum(matches[count_column]) < k:
            return False
    return True



In [None]:

# Verify the k-anonymized frame with the same k that was used to build it,
# so the check cannot drift from the anonymization setting.
columns_to_check = quasi_identifiers

is_k_anonymous(df_anonymized, k, columns_to_check, 'label_count', debug=False)