# Testing the Performance of an ML Model before and after K-anonymity - exploring the privacy-utility tradeoff

## Dataset importing and preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

df = pd.read_csv('./adult/adult.data', header=None, names=column_names, na_values=' ?', skipinitialspace=True)

print(df.head())


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [3]:
categorical_columns = [
    "workclass", "education", "marital-status", "occupation",
    "relationship", "race", "sex", "native-country", "income"
]

df[categorical_columns] = df[categorical_columns].astype('category')

for col in df.columns:
    print(f"{col}: {df[col].dtype}")

df.replace(' ?', pd.NA, inplace=True)
df.dropna(inplace=True)

X = df.drop('income', axis=1)
y = df['income']

X_encoded = pd.get_dummies(X, drop_first=True)
y_encoded = y.cat.codes

age: int64
workclass: category
fnlwgt: int64
education: category
education-num: int64
marital-status: category
occupation: category
relationship: category
race: category
sex: category
capital-gain: int64
capital-loss: int64
hours-per-week: int64
native-country: category
income: category


## Training before K-Anonymization

In [4]:
def train_and_evaluate(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall   : {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score : {f1_score(y_test, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [5]:
print(X_encoded[X_encoded['age'].isna()])


Empty DataFrame
Columns: [age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week, workclass_Federal-gov, workclass_Local-gov, workclass_Never-worked, workclass_Private, workclass_Self-emp-inc, workclass_Self-emp-not-inc, workclass_State-gov, workclass_Without-pay, education_11th, education_12th, education_1st-4th, education_5th-6th, education_7th-8th, education_9th, education_Assoc-acdm, education_Assoc-voc, education_Bachelors, education_Doctorate, education_HS-grad, education_Masters, education_Preschool, education_Prof-school, education_Some-college, marital-status_Married-AF-spouse, marital-status_Married-civ-spouse, marital-status_Married-spouse-absent, marital-status_Never-married, marital-status_Separated, marital-status_Widowed, occupation_Adm-clerical, occupation_Armed-Forces, occupation_Craft-repair, occupation_Exec-managerial, occupation_Farming-fishing, occupation_Handlers-cleaners, occupation_Machine-op-inspct, occupation_Other-service, occupation_Priv-hous

In [6]:
print("Original Model:")
train_and_evaluate(X_encoded, y_encoded)

Original Model:
Accuracy : 0.8478
Precision: 0.7393
Recall   : 0.5703
F1 Score : 0.6439
Confusion Matrix:
[[4626  316]
 [ 675  896]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Training after K-Anonymization

In [7]:
# Ensure age is numeric
X_encoded['age'] = pd.to_numeric(X_encoded['age'], errors='coerce')

# Drop rows with NaN in age, if any
X_encoded.dropna(subset=['age'], inplace=True)

# Round down to the nearest multiple of 10
X_encoded['age'] = (X_encoded['age'] // 10) * 10

# Convert to int type (optional, for cleanliness)
X_encoded['age'] = X_encoded['age'].astype(int)

# Print to verify
print(X_encoded['age'].value_counts().sort_index())


age
10    1657
20    8054
30    8613
40    7175
50    4418
60    2015
70     508
80      78
90      43
Name: count, dtype: int64


In [8]:
print(X_encoded[X_encoded['age'].isna()])


Empty DataFrame
Columns: [age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week, workclass_Federal-gov, workclass_Local-gov, workclass_Never-worked, workclass_Private, workclass_Self-emp-inc, workclass_Self-emp-not-inc, workclass_State-gov, workclass_Without-pay, education_11th, education_12th, education_1st-4th, education_5th-6th, education_7th-8th, education_9th, education_Assoc-acdm, education_Assoc-voc, education_Bachelors, education_Doctorate, education_HS-grad, education_Masters, education_Preschool, education_Prof-school, education_Some-college, marital-status_Married-AF-spouse, marital-status_Married-civ-spouse, marital-status_Married-spouse-absent, marital-status_Never-married, marital-status_Separated, marital-status_Widowed, occupation_Adm-clerical, occupation_Armed-Forces, occupation_Craft-repair, occupation_Exec-managerial, occupation_Farming-fishing, occupation_Handlers-cleaners, occupation_Machine-op-inspct, occupation_Other-service, occupation_Priv-hous

In [9]:
print("\nModel After K-Anonymization:")
train_and_evaluate(X_encoded, y_encoded)


Model After K-Anonymization:
Accuracy : 0.8477
Precision: 0.7260
Recall   : 0.5920
F1 Score : 0.6522
Confusion Matrix:
[[4591  351]
 [ 641  930]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
