In [1]:
from tabular_anonymizer import DataFrameAnonymizer
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [2]:
# Load the dataset; index_col=0 uses the first CSV column as the row index.
file1 = "adult.csv"
df = pd.read_csv(file1, sep=",", index_col=0)
# reset_index() returns a new frame rather than modifying df, so the result
# must be assigned. drop=True discards the old index and leaves a 0..n-1 index.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
# Reduce the frame: first restrict the rows, then remove unneeded columns.
# .loc slicing is label-based and inclusive, so labels 0..999 are kept.
df = df.loc[0:999, :]
print(df.shape)
# Remove the columns that are not used in the rest of the notebook.
df = df.drop(
    [
        "fnlwgt",
        "workclass",
        "education-num",
        "occupation",
        "marital-status",
        "relationship",
        "race",
        "native-country",
    ],
    axis="columns",
)
df.head()


(1000, 15)


Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label
0,39,Bachelors,Male,2174,0,40,0
1,50,Bachelors,Male,0,0,13,0
2,38,HS-grad,Male,0,0,40,0
3,53,11th,Male,0,0,40,0
4,28,Bachelors,Female,0,0,40,0


In [4]:
# Sensitive attributes: columns whose values should not be altered.
sensitive_columns = ["label"]
# NOTE: an `avg_columns` list ('capital-gain', 'capital-loss') is currently disabled.

# Anonymity parameter k.
k = 5

# Quasi-identifier columns, listed for checking k-anonymity.
# They are defined here but not passed to the anonymizer in this cell.
quasi_identifiers = [
    "age",
    "education",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

print("Run tabular_anonymizer. Sensitive columns: ", sensitive_columns, ", k=", k)
p = DataFrameAnonymizer(sensitive_columns, format_to_str=True)

# Produce a new, k-anonymized dataframe
# (the l-diversity alternative is p.anonymize_l_diversity(df, k=k, l=l)).
df_anonymized = p.anonymize_k_anonymity(df, k=k)
df_anonymized.head()

Run tabular_anonymizer. Sensitive columns:  ['label'] , k= 5


Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,17 - 36,"10th, Assoc-voc, Assoc-acdm",Female,0 - 0,0 - 0,15 - 35,0,7
1,17 - 36,"10th, Assoc-voc, Assoc-acdm",Female,0 - 0,0 - 0,15 - 35,1,1
2,17 - 18,"12th, 11th",Male,0 - 2176,0 - 0,10 - 18,0,8
3,17 - 34,"10th, Masters, 11th, 12th",Male,0 - 7298,0 - 0,20 - 35,0,7
4,17 - 34,"10th, Masters, 11th, 12th",Male,0 - 7298,0 - 0,20 - 35,1,1


In [5]:
# Order the anonymized rows by label_count, largest first.
df_anonymized.sort_values("label_count", ascending=False)

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
144,26 - 28,"HS-grad, 9th",Male,0 - 0,0 - 1980,40 - 70,0,17
147,35 - 36,"HS-grad, 9th",Male,0 - 7298,0 - 0,40 - 60,0,11
143,24 - 25,"HS-grad, 9th",Male,0 - 4101,0 - 1721,40 - 60,0,11
149,23 - 24,"Assoc-voc, Some-college",Male,0 - 0,0 - 0,40 - 56,0,10
44,27 - 36,"HS-grad, Some-college, Bachelors",Female,0 - 0,0 - 0,32 - 38,0,10
...,...,...,...,...,...,...,...,...
194,31 - 33,Some-college,Male,0 - 0,0 - 0,40 - 50,1,1
69,58 - 76,"Doctorate, Masters, Some-college, HS-grad",Male,0 - 0,0 - 1816,1 - 15,1,1
67,50 - 55,"HS-grad, Bachelors",Male,0 - 0,0 - 0,30 - 38,1,1
62,38 - 43,"Masters, Bachelors",Male,0 - 0,0 - 0,24 - 38,1,1


In [6]:
# l-diversity parameter.
l = 2

# Pass the variable defined above instead of repeating the literal, so that
# editing `l` actually changes the run.
# NOTE(review): k is hard-coded to 2 here rather than using the notebook-level
# `k` — confirm whether the smaller k is intentional.
df_ldiv = p.anonymize_l_diversity(df, k=2, l=l)
df_ldiv.head()

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,17 - 36,"HS-grad, 11th, Bachelors, Some-college",Female,0 - 4064,0 - 1719,5 - 38,0,60
1,17 - 36,"HS-grad, 11th, Bachelors, Some-college",Female,0 - 4064,0 - 1719,5 - 38,1,1
2,17 - 36,"10th, Assoc-voc, Assoc-acdm",Female,0 - 0,0 - 0,15 - 35,0,7
3,17 - 36,"10th, Assoc-voc, Assoc-acdm",Female,0 - 0,0 - 0,15 - 35,1,1
4,22 - 36,"Assoc-acdm, Doctorate, 10th, Assoc-voc, 1st-4t...",Female,0 - 2174,0 - 1669,40 - 70,0,18


In [7]:
# Order the l-diversity result by label_count, largest first.
df_ldiv.sort_values("label_count", ascending=False)

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
30,19 - 36,"HS-grad, 9th",Male,0 - 7688,0 - 1980,40 - 72,0,85
32,19 - 36,"Assoc-voc, Some-college",Male,0 - 5178,0 - 2377,40 - 70,0,62
0,17 - 36,"HS-grad, 11th, Bachelors, Some-college",Female,0 - 4064,0 - 1719,5 - 38,0,60
6,17 - 36,"Assoc-acdm, 9th, Assoc-voc, Bachelors, HS-grad...",Male,0 - 7298,0 - 1721,6 - 38,0,45
26,18 - 28,"HS-grad, Some-college, 11th",Female,0 - 7298,0 - 1762,40 - 50,0,33
...,...,...,...,...,...,...,...,...
89,43 - 44,Masters,Female,0 - 0,0 - 0,40 - 50,1,1
91,37 - 41,HS-grad,Female,0 - 0,0 - 0,40 - 60,1,1
93,37 - 43,Some-college,Female,0 - 0,0 - 1887,40 - 50,1,1
95,45 - 47,Bachelors,Female,0 - 0,0 - 0,40 - 55,1,1


In [16]:
# t-closeness parameter.
t = 0.3

# Run the t-closeness anonymization with the notebook-level k and the t above.
df_tclose = p.anonymize_t_closeness(df, t=t, k=k)
df_tclose.head(5)

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,17 - 36,"10th, Assoc-voc, Assoc-acdm",Female,0 - 0,0 - 0,15 - 35,0,7
1,17 - 36,"10th, Assoc-voc, Assoc-acdm",Female,0 - 0,0 - 0,15 - 35,1,1
2,37 - 67,"Prof-school, Assoc-voc, 7th-8th, 5th-6th",Male,0 - 20051,0 - 2415,25 - 65,0,16
3,37 - 67,"Prof-school, Assoc-voc, 7th-8th, 5th-6th",Male,0 - 20051,0 - 2415,25 - 65,1,19
4,17 - 18,"12th, 11th",Male,0 - 2176,0 - 0,10 - 18,0,8


Note: the t-closeness implementation in the upstream AnonyPy library is broken; this repository contains a fixed version of the function.

In [17]:
df_tclose.sort_values(by='label_count', ascending=False)

Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
35,37 - 45,"Doctorate, Assoc-acdm, 11th, Masters, Bachelor...",Male,0 - 15024,0 - 2051,44 - 99,1,36
37,46 - 52,"Assoc-acdm, Doctorate, 9th, 11th, Masters, Bac...",Male,0 - 15024,0 - 2415,40 - 98,1,35
34,37 - 45,"Doctorate, Assoc-acdm, 11th, Masters, Bachelor...",Male,0 - 15024,0 - 2051,44 - 99,0,32
36,46 - 52,"Assoc-acdm, Doctorate, 9th, 11th, Masters, Bac...",Male,0 - 15024,0 - 2415,40 - 98,0,31
3,37 - 67,"Prof-school, Assoc-voc, 7th-8th, 5th-6th",Male,0 - 20051,0 - 2415,25 - 65,1,19
...,...,...,...,...,...,...,...,...
149,59 - 59,"HS-grad, Some-college",Male,0 - 4064,0 - 0,40 - 48,1,1
113,45 - 47,Bachelors,Female,0 - 0,0 - 0,40 - 55,1,1
145,53 - 57,"Masters, Some-college",Male,0 - 15024,0 - 0,40 - 45,1,1
119,49 - 55,"Masters, Some-college, 9th",Female,0 - 6849,0 - 0,40 - 60,1,1
