In [1]:
from tabular_anonymizer import DataFrameAnonymizer
import pandas as pd

In [2]:
file1 = "adult.csv"
df = pd.read_csv(file1, sep=",", index_col=0)
df.reset_index()
df.index = range(len(df))
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
# Drop some columns and rows
# Keep only 1000 rows
df = df.loc[0:999,]
print(df.shape)
# Drop some columns
df = df.drop(columns = ["fnlwgt", "workclass", "education-num", "occupation", "race", "native-country"])
df.head()

(1000, 15)


Unnamed: 0,age,education,marital-status,relationship,sex,capital-gain,capital-loss,hours-per-week,label
0,39,Bachelors,Never-married,Not-in-family,Male,2174,0,40,0
1,50,Bachelors,Married-civ-spouse,Husband,Male,0,0,13,0
2,38,HS-grad,Divorced,Not-in-family,Male,0,0,40,0
3,53,11th,Married-civ-spouse,Husband,Male,0,0,40,0
4,28,Bachelors,Married-civ-spouse,Wife,Female,0,0,40,0


In [4]:
# Setup Sensitive attributes you dont want to alter
sensitive_columns = ['label']
# Avg columns
avg_columns = ['capital-gain', 'capital-loss']

# Run tabular_anonymizer
# Set k
k = 5

# List of quasi-identifiers for checking k-anonymity
quasi_identifiers = ['age',	'education', 'marital-status', 'relationship',
	'sex', 'capital-gain', 'capital-loss', 'hours-per-week']

print("Run tabular_anonymizer. Sensitive columns: ", sensitive_columns, ", k=", k)
p = DataFrameAnonymizer(df, sensitive_columns, avg_columns=avg_columns)

# New anonymized dataframe is formed
df_anonymized = p.anonymize_k_anonymity(k=k)
df_anonymized.head(10)

Run tabular_anonymizer. Sensitive columns:  ['label'] , k= 5


Unnamed: 0,age,education,marital-status,relationship,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,[23-33],"[Prof-school,10th,Assoc-voc,Bachelors]","[Divorced,Separated]","[Not-in-family,Unmarried,Own-child]",[Male],0.0,0.0,[40-56],0,5
1,[37-60],"[7th-8th,Assoc-voc,1st-4th,5th-6th]","[Separated,Never-married,Married-spouse-absent]","[Not-in-family,Unmarried]",[Male],0.0,230.0,[32-48],0,5
2,[37-60],"[7th-8th,Assoc-voc,1st-4th,5th-6th]","[Separated,Never-married,Married-spouse-absent]","[Not-in-family,Unmarried]",[Male],0.0,230.0,[32-48],1,1
3,[22-36],"[Some-college,11th,HS-grad]","[Separated,Married-spouse-absent]","[Not-in-family,Other-relative,Unmarried]",[Male],459.25,0.0,[35-55],0,8
4,[19-32],"[Some-college,HS-grad]","[Divorced,Separated,Married-spouse-absent]","[Not-in-family,Own-child]",[Female],0.0,0.0,[20-38],0,5
5,[27-36],"[10th,HS-grad,Assoc-acdm]","[Separated,Married-spouse-absent]",[Unmarried],[Female],0.0,0.0,[25-42],0,5
6,[41-72],"[9th,Some-college,11th,HS-grad]","[Widowed,Married-spouse-absent,Never-married]","[Unmarried,Own-child]",[Female],66.0,0.0,[20-40],0,9
7,[42-71],"[Bachelors,Masters,9th,Some-college,11th,HS-grad]","[Widowed,Separated,Married-spouse-absent]","[Other-relative,Unmarried]",[Male],0.0,411.5,[2-50],0,5
8,[42-71],"[Bachelors,Masters,9th,Some-college,11th,HS-grad]","[Widowed,Separated,Married-spouse-absent]","[Other-relative,Unmarried]",[Male],0.0,411.5,[2-50],1,1
9,[39-58],"[12th,10th,Preschool,1st-4th]",[Married-civ-spouse],[Husband],[Male],0.0,209.666667,[40-60],0,7


In [17]:
def is_k_anonymous(df, k, quasi_identifiers, count_column):
    for index, row in df.iterrows():
        # If row count < k, search for other identical rows with different sensitive attribute
        if row[count_column] < k:
            query = ' & '.join([f'`{col}` == {row[col]}' for col in quasi_identifiers])
            print(query)
            rows = df.query(query)
            print(rows)
            if sum(rows[count_column]) < k:
                return False
    return True

# Miksi query ei toimi?


In [18]:
columns_to_check = ['sex']

is_k_anonymous(df_anonymized, 5, columns_to_check, 'label_count')

`sex` == ['Male']
Empty DataFrame
Columns: [age, education, marital-status, relationship, sex, capital-gain, capital-loss, hours-per-week, label, label_count]
Index: []


False

Cells below are for testing only (i.e. trash)

In [9]:
#df_anonymized['sex']
df_anonymized.loc[0, 'sex'] == ['Male']

True

In [15]:
df_anonymized.query("`sex` == ['Male']")

Unnamed: 0,age,education,marital-status,relationship,sex,capital-gain,capital-loss,hours-per-week,label,label_count


In [19]:
# Only numeric columns
num_columns = ['capital-gain', 'capital-loss']

is_k_anonymous(df_anonymized, 5, num_columns, 'label_count')

`capital-gain` == 0.0 & `capital-loss` == 230.0
        age                                          education  \
1   [37-60]                [7th-8th,Assoc-voc,1st-4th,5th-6th]   
2   [37-60]                [7th-8th,Assoc-voc,1st-4th,5th-6th]   
52  [31-33]  [Assoc-voc,Assoc-acdm,Bachelors,Some-college,H...   
53  [31-33]  [Assoc-voc,Assoc-acdm,Bachelors,Some-college,H...   

                                     marital-status  \
1   [Separated,Never-married,Married-spouse-absent]   
2   [Separated,Never-married,Married-spouse-absent]   
52                               [Divorced,Widowed]   
53                               [Divorced,Widowed]   

                  relationship       sex  capital-gain  capital-loss  \
1    [Not-in-family,Unmarried]    [Male]           0.0         230.0   
2    [Not-in-family,Unmarried]    [Male]           0.0         230.0   
52  [Unmarried,Other-relative]  [Female]           0.0         230.0   
53  [Unmarried,Other-relative]  [Female]           0.0   

True

In [None]:
# Original
#def isKAnonymized(df, k):
#    for index, row in df.iterrows():
#        query = ' & '.join([f'{col} == "{row[col]}"' for col in df.columns])
#        rows = df.query(query)
#        if rows.shape[0] < k:
#            return False
#    return True