In [1]:
import syft as sy
import torch
from torch import nn, optim
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [2]:
def load_uci_adult_dataset():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    columns = [
        "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
        "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", 
        "hours-per-week", "native-country", "income"
    ]
    data = pd.read_csv(url, header=None, names=columns, na_values=" ?", skipinitialspace=True)
    data.dropna(inplace=True)

    features = ["age", "education-num", "hours-per-week", "capital-gain", "capital-loss"]
    target = "income"
    data = data[features + [target]]

    data["income_binary"] = LabelEncoder().fit_transform(data["income"])

    data = data[features + ["income_binary"]]
    
    print(data.head())
    print(data.shape)
    
    return data

uci_data = load_uci_adult_dataset()

   age  education-num  hours-per-week  capital-gain  capital-loss  \
0   39             13              40          2174             0   
1   50             13              13             0             0   
2   38              9              40             0             0   
3   53              7              40             0             0   
4   28             13              40             0             0   

   income_binary  
0              0  
1              0  
2              0  
3              0  
4              0  
(32561, 6)


In [3]:
# Split data into features and target
X = uci_data.drop(columns=["income_binary"]).values
y = uci_data["income_binary"].values

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train, X_test = torch.tensor(X_train, dtype=torch.float32), torch.tensor(X_test, dtype=torch.float32)
y_train, y_test = torch.tensor(y_train, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)


In [4]:
# Simulate two data holders by splitting the data
X_train_part1, y_train_part1 = X_train[:len(X_train) // 2], y_train[:len(y_train) // 2]
X_train_part2, y_train_part2 = X_train[len(X_train) // 2:], y_train[len(y_train) // 2:]


In [5]:
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

# Initialize the model
input_dim = X_train.shape[1]
model = LogisticRegressionModel(input_dim)

# Differential Privacy: Noise addition function
def add_noise(tensor, epsilon=0.1):
    noise = torch.normal(0, epsilon, tensor.size())
    return tensor + noise



In [6]:
def train_model(model, data, labels, epsilon=0.1, epochs=10):
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(data).squeeze()
        loss = criterion(predictions, labels)
        
        # Backward pass with differential privacy
        loss.backward()
        
        # Add noise to gradients for differential privacy
        for param in model.parameters():
            param.grad = add_noise(param.grad, epsilon=epsilon)
        
        optimizer.step()

    print(f"Training completed.")

# Train model on each data partition
train_model(model, X_train_part1, y_train_part1, epsilon=0.1, epochs=10)
train_model(model, X_train_part2, y_train_part2, epsilon=0.1, epochs=10)



Training completed.
Training completed.


In [7]:
# Make predictions on the test set
with torch.no_grad():
    y_pred = model(X_test).squeeze()
    y_pred = (y_pred > 0.5).float()  # Apply threshold for binary classification

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy after differential privacy training: {accuracy * 100:.2f}%")



Model accuracy after differential privacy training: 26.72%
