In [51]:
from tabular_anonymizer import DataFrameAnonymizer
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [52]:
# Load the data set; the first CSV column is read as the row index.
file1 = "adult.csv"
df = pd.read_csv(file1, sep=",", index_col=0)
# Replace the index read from the file with a fresh 0..n-1 range.
# (A bare `df.reset_index()` returns a new frame and leaves `df` untouched,
# so the result has to be assigned back.)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [53]:
# Shrink the data set for the example: fewer rows, fewer columns.
# Rows: keep index labels 0..999 (label-based slice, both ends inclusive).
df = df.loc[0:999]
print(df.shape)
# Columns: remove the attributes that are not used in this example.
dropped_columns = [
    "fnlwgt",
    "workclass",
    "education-num",
    "occupation",
    "marital-status",
    "relationship",
    "race",
    "native-country",
]
df = df.drop(columns=dropped_columns)
df.head()

(1000, 15)


Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label
0,39,Bachelors,Male,2174,0,40,0
1,50,Bachelors,Male,0,0,13,0
2,38,HS-grad,Male,0,0,40,0
3,53,11th,Male,0,0,40,0
4,28,Bachelors,Female,0,0,40,0


In [54]:
# --- Anonymization settings ---
# k used for the k-anonymity run
k = 5
# Sensitive attributes: columns you do not want to alter
sensitive_columns = ["label"]
# Columns handed to the anonymizer as `avg_columns`
avg_columns = ["capital-gain", "capital-loss"]

# Quasi-identifiers, used when checking k-anonymity of the result
quasi_identifiers = [
    "age",
    "education",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# --- Run tabular_anonymizer ---
print("Run tabular_anonymizer. Sensitive columns: ", sensitive_columns, ", k=", k)
p = DataFrameAnonymizer(sensitive_columns, avg_columns=avg_columns, format_to_str=True)

# The anonymizer returns a new, anonymized dataframe
df_anonymized = p.anonymize_k_anonymity(df, k=k)
df_anonymized.head()

Run tabular_anonymizer. Sensitive columns:  ['label'] , k= 5


Unnamed: 0,age,education,sex,capital-gain,capital-loss,hours-per-week,label,label_count
0,17 - 36,"Assoc-voc, 10th, Assoc-acdm",Female,0.0,0.0,15 - 35,0,7
1,17 - 36,"Assoc-voc, 10th, Assoc-acdm",Female,0.0,0.0,15 - 35,1,1
2,17 - 18,"12th, 11th",Male,272.0,0.0,10 - 18,0,8
3,17 - 34,"Masters, 10th, 12th, 11th",Male,912.25,0.0,20 - 35,0,7
4,17 - 34,"Masters, 10th, 12th, 11th",Male,912.25,0.0,20 - 35,1,1


In [55]:
# Function to check if a dataframe is k-anonymous. Works only with numeric and string data types, not lists.
def is_k_anonymous(df, k, quasi_identifiers, count_column, debug=False):
    """Check whether an aggregated dataframe is k-anonymous.

    Each row of ``df`` stands for ``row[count_column]`` records. Rows that
    share the same values in every quasi-identifier column form one
    equivalence class. A class violates k-anonymity when it contains a row
    whose count is below ``k`` and the counts of the whole class add up to
    less than ``k``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the quasi-identifier columns and the count column.
    k : int
        Minimum number of records required per equivalence class.
    quasi_identifiers : list of str
        Column names that define an equivalence class. With an empty list
        all rows are treated as one class.
    count_column : str
        Column holding the number of records each row represents.
    debug : bool, default False
        When True, print the per-class minimum and summed counts.

    Returns
    -------
    bool
        True when no class violates k-anonymity. False when one does, or
        when a row with count below ``k`` exists and a quasi-identifier
        column is neither numeric nor string (a message is printed).
    """
    quasi_identifiers = list(quasi_identifiers)
    counts = df[count_column]

    # A row whose own count already reaches k cannot violate k-anonymity,
    # so without any smaller row there is nothing left to check.
    if not (counts < k).any():
        return True

    # Only numeric and string quasi-identifier columns are supported.
    for col in quasi_identifiers:
        if not (is_numeric_dtype(df[col]) or is_string_dtype(df[col])):
            print(col + " not numeric or string")
            return False

    if not quasi_identifiers:
        # Nothing distinguishes the rows: the whole frame is a single class.
        return bool(counts.sum() >= k)

    # Aggregate once per class instead of building a query string per row.
    # Grouping compares the values directly, so quotes inside strings are
    # harmless; dropna=False keeps rows with missing values as a class.
    per_class = df.groupby(quasi_identifiers, dropna=False)[count_column].agg(["min", "sum"])
    if debug:
        print(per_class)

    # Only classes containing a row below k are checked against the sum.
    satisfied = (per_class["min"] >= k) | (per_class["sum"] >= k)
    return bool(satisfied.all())



In [56]:
# Verify the anonymized frame against the same k that was given to the
# anonymizer, so this check cannot go stale when k is changed.
columns_to_check = quasi_identifiers

is_k_anonymous(df_anonymized, k, columns_to_check, 'label_count', debug=False)

True