In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [2]:
# 下载成人数据集
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

# 读取数据
adult_data = pd.read_csv(url, names=column_names, header=None, na_values=" ?")
adult_data.dropna(inplace=True)

print("Adult Dataset:")
print(adult_data.head())

Adult Dataset:
   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   Unite

In [3]:
def mondrian_k_anonymity(data, k):
    def split_attribute(attribute, data):
        unique_vals = data[attribute].unique()
        if len(unique_vals) > 1:
            median = np.median(data[attribute])
            return data[data[attribute] <= median], data[data[attribute] > median]
        return data, pd.DataFrame()

    def partition(data, k):
        if len(data) < 2 * k:
            return [data]

        best_attr = None
        best_sets = None
        max_size = 0

        for attribute in data.columns:
            if attribute == 'income':
                continue
            set1, set2 = split_attribute(attribute, data)
            min_size = min(len(set1), len(set2))
            if min_size >= k and min_size > max_size:
                best_attr = attribute
                best_sets = (set1, set2)
                max_size = min_size

        if best_attr is None:
            return [data]

        return partition(best_sets[0], k) + partition(best_sets[1], k)

    partitions = partition(data, k)
    anonymized_data = []

    for partition in partitions:
        for column in partition.columns:
            if column == 'income':
                continue
            if partition[column].dtype == 'object':
                most_common_value = partition[column].mode()[0]
                partition[column] = most_common_value
            else:
                mean_value = partition[column].mean()
                partition[column] = mean_value

        anonymized_data.append(partition)

    return pd.concat(anonymized_data, ignore_index=True)

In [4]:
# 对数据进行编码
encoded_data = adult_data.copy()
for column in encoded_data.columns:
    if encoded_data[column].dtype == 'object':
        le = LabelEncoder()
        encoded_data[column] = le.fit_transform(encoded_data[column])

# 对标签进行编码（-1和1）
encoded_data['income'] = encoded_data['income'].apply(lambda x: 1 if x == 1 else -1)

# 标准化数据
scaler = StandardScaler()
encoded_data[encoded_data.columns[:-1]] = scaler.fit_transform(encoded_data[encoded_data.columns[:-1]])

In [5]:
k = 5
anonymized_data = mondrian_k_anonymity(encoded_data, k)

print("Anonymized Adult Dataset:")
print(anonymized_data.head())

Anonymized Adult Dataset:
        age  workclass    fnlwgt  education  education-num  marital-status  \
0 -1.241994   0.053125 -1.612375   0.043606       -0.63582        0.447176   
1 -1.241994   0.053125 -1.612375   0.043606       -0.63582        0.447176   
2 -1.241994   0.053125 -1.612375   0.043606       -0.63582        0.447176   
3 -1.241994   0.053125 -1.612375   0.043606       -0.63582        0.447176   
4 -1.241994   0.053125 -1.612375   0.043606       -0.63582        0.447176   

   occupation  relationship     race       sex  capital-gain  capital-loss  \
0    0.196091        0.4413 -1.56177 -0.909352     -0.078229     -0.218586   
1    0.196091        0.4413 -1.56177 -0.909352     -0.078229     -0.218586   
2    0.196091        0.4413 -1.56177 -0.909352     -0.078229     -0.218586   
3    0.196091        0.4413 -1.56177 -0.909352     -0.078229     -0.218586   
4    0.196091        0.4413 -1.56177 -0.909352     -0.078229     -0.218586   

   hours-per-week  native-country  i

In [6]:
# 将数据分为训练集和测试集
X_original = encoded_data.drop('income', axis=1)
y_original = encoded_data['income']
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(X_original, y_original, test_size=0.3, random_state=42)

X_anonymized = anonymized_data.drop('income', axis=1)
y_anonymized = anonymized_data['income']
X_train_anonymized, X_test_anonymized, y_train_anonymized, y_test_anonymized = train_test_split(X_anonymized, y_anonymized, test_size=0.3, random_state=42)

# 转换数据为PyTorch张量
X_train_original = torch.tensor(X_train_original.values, dtype=torch.float32)
y_train_original = torch.tensor(y_train_original.values, dtype=torch.float32).unsqueeze(1)
X_test_original = torch.tensor(X_test_original.values, dtype=torch.float32)
y_test_original = torch.tensor(y_test_original.values, dtype=torch.float32).unsqueeze(1)

X_train_anonymized = torch.tensor(X_train_anonymized.values, dtype=torch.float32)
y_train_anonymized = torch.tensor(y_train_anonymized.values, dtype=torch.float32).unsqueeze(1)
X_test_anonymized = torch.tensor(X_test_anonymized.values, dtype=torch.float32)
y_test_anonymized = torch.tensor(y_test_anonymized.values, dtype=torch.float32).unsqueeze(1)

In [7]:
# 定义SVM模型
class SVM(nn.Module):
    def __init__(self, input_dim):
        super(SVM, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)

# 定义损失函数和优化器
def hinge_loss(output, target):
    return torch.mean(torch.clamp(1 - output * target, min=0))

def train(model, X_train, y_train, epochs=100, lr=0.01):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = hinge_loss(output, y_train)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

Training Original data


In [8]:
# 训练和测试原始数据上的SVM模型
model_original = SVM(X_train_original.shape[1])
train(model_original, X_train_original, y_train_original)

model_original.eval()
with torch.no_grad():
    y_pred_original = model_original(X_test_original)
    y_pred_original = torch.sign(y_pred_original)

Epoch [10/100], Loss: 0.8653
Epoch [20/100], Loss: 0.6865
Epoch [30/100], Loss: 0.5845
Epoch [40/100], Loss: 0.5419
Epoch [50/100], Loss: 0.5106
Epoch [60/100], Loss: 0.4844
Epoch [70/100], Loss: 0.4647
Epoch [80/100], Loss: 0.4506
Epoch [90/100], Loss: 0.4407
Epoch [100/100], Loss: 0.4337


Training anonymized data


In [9]:
# 训练和测试匿名化数据上的SVM模型
model_anonymized = SVM(X_train_anonymized.shape[1])
train(model_anonymized, X_train_anonymized, y_train_anonymized)

model_anonymized.eval()
with torch.no_grad():
    y_pred_anonymized = model_anonymized(X_test_anonymized)
    y_pred_anonymized = torch.sign(y_pred_anonymized)

Epoch [10/100], Loss: 0.8872
Epoch [20/100], Loss: 0.7807
Epoch [30/100], Loss: 0.7160
Epoch [40/100], Loss: 0.6814
Epoch [50/100], Loss: 0.6534
Epoch [60/100], Loss: 0.6262
Epoch [70/100], Loss: 0.6001
Epoch [80/100], Loss: 0.5764
Epoch [90/100], Loss: 0.5553
Epoch [100/100], Loss: 0.5376


In [10]:
# 计算评估指标
y_pred_original_np = y_pred_original.numpy()
y_test_original_np = y_test_original.numpy()

y_pred_anonymized_np = y_pred_anonymized.numpy()
y_test_anonymized_np = y_test_anonymized.numpy()

results = {
    'Original Data': {
        'Accuracy': accuracy_score(y_test_original_np, y_pred_original_np),
        'Precision': precision_score(y_test_original_np, y_pred_original_np),
        'Recall': recall_score(y_test_original_np, y_pred_original_np),
        'AUC': roc_auc_score(y_test_original_np, y_pred_original_np)
    },
    'Anonymized Data': {
        'Accuracy': accuracy_score(y_test_anonymized_np, y_pred_anonymized_np),
        'Precision': precision_score(y_test_anonymized_np, y_pred_anonymized_np),
        'Recall': recall_score(y_test_anonymized_np, y_pred_anonymized_np),
        'AUC': roc_auc_score(y_test_anonymized_np, y_pred_anonymized_np)
    }
}

results_df = pd.DataFrame(results)
print("Evaluation Results:")
print(results_df)

Evaluation Results:
           Original Data  Anonymized Data
Accuracy        0.811913         0.747817
Precision       0.764599         0.537500
Recall          0.367222         0.074588
AUC             0.664548         0.526320
