In [1]:
# !pip install adversarial-robustness-toolbox

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Task-1

In [None]:
from art.estimators.classification.scikitlearn import ScikitlearnLogisticRegression
from art.attacks.inference.reconstruction import DatabaseReconstruction

In [None]:
# Load the Iris dataset and pull out the feature array X and label array y
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a plain (non-private) logistic regression on the training split only.
# This fitted `model` is what the reconstruction attack is later built around.
model = LogisticRegression(max_iter=512)
model.fit(X_train, y_train)

In [None]:
# Display the shapes of the training features and labels as a quick check on the split
X_train.shape, y_train.shape

In [None]:
# Wrap the fitted scikit-learn model in ART's logistic-regression estimator,
# then build the database-reconstruction attack on top of that wrapper
non_private_art = ScikitlearnLogisticRegression(model)
dbrecon = DatabaseReconstruction(non_private_art)

In [None]:
def get_inference_rmse(X_train, x_pred, target_row):
    """Return the RMSE between a reconstructed row and the true training row.

    Parameters
    ----------
    X_train : 2-D array holding the ground-truth rows; its column count
        (``X_train.shape[1]``) is the divisor of the mean.
    x_pred : reconstructed feature values, subtracted element-wise from
        ``X_train[target_row]`` (must broadcast against that row).
    target_row : index of the row of ``X_train`` to compare against.

    Returns
    -------
    The square root of the summed squared differences divided by the number
    of columns of ``X_train``.
    """
    residual = X_train[target_row] - x_pred
    n_features = X_train.shape[1]
    mean_squared_error = (residual ** 2).sum() / n_features
    return np.sqrt(mean_squared_error)

In [None]:
def launch_attack(X_train, y_train, dbrecon, target_rows=None):
    """Run a leave-one-out database-reconstruction attack on training rows.

    For every target row, that row is deleted from the training data, the
    attack is asked to reconstruct it from the remaining rows, and the guess
    is compared with the deleted row.

    Parameters
    ----------
    X_train : training feature array (one sample per row).
    y_train : training labels aligned with ``X_train``.
    dbrecon : attack object exposing ``reconstruct(x, y)`` that returns a
        ``(features_guess, label_guess)`` pair.
    target_rows : optional iterable of row indices of ``X_train`` to attack.
        ``None`` (the default) attacks every row, which is the original
        behavior; pass a subset to bound the runtime on larger datasets.

    Returns
    -------
    (recon_attempts, rmse_scores) : two lists with one entry per attacked
        row: whether the guessed label matched the true label, and the RMSE
        between the guessed and true feature values.
    """
    if target_rows is None:
        target_rows = range(len(X_train))

    recon_attempts = []
    rmse_scores = []

    for target_row in tqdm(target_rows):
        # The attacker only sees the dataset with the target row removed.
        X_train_removed = np.delete(X_train, target_row, axis=0)
        y_train_removed = np.delete(y_train, target_row, axis=0)

        x_guess, y_guess = dbrecon.reconstruct(X_train_removed, y_train_removed)

        # The label guess is reduced with argmax before comparison
        # (presumably a one-hot / score vector; confirm against the ART docs).
        success = (np.argmax(y_guess) == y_train[target_row])
        rmse = get_inference_rmse(X_train, x_guess, target_row)

        recon_attempts.append(success)
        rmse_scores.append(rmse)

    return recon_attempts, rmse_scores

In [None]:
def analyze_attack(recon_attempts, rmse_scores):
    """Print a two-line summary of a reconstruction attack.

    Parameters
    ----------
    recon_attempts : sequence of truthy/falsy flags, one per attacked row;
        the flags are summed to count the successes.
    rmse_scores : sequence of per-row RMSE values. An empty sequence is
        reported as an average of 0 instead of dividing by zero.

    Returns
    -------
    None. The summary is written to stdout.
    """
    n_success = sum(recon_attempts)
    n_total = len(recon_attempts)

    if rmse_scores:
        mean_rmse = sum(rmse_scores) / len(rmse_scores)
    else:
        mean_rmse = 0

    print(f"Total Successful Reconstructions: {n_success}/{n_total}")
    print(f"Average RMSE Score: {mean_rmse:.10f}")

In [None]:
# Run the reconstruction attack on the training rows; returns per-row
# label-match flags and per-row feature RMSE scores
recon_attempts, rmse_scores = launch_attack(X_train, y_train, dbrecon)

In [None]:
# Print the count of successful label reconstructions and the average RMSE
analyze_attack(recon_attempts, rmse_scores)

# Task-2

In [None]:
from sklearn.datasets import load_breast_cancer

# Load Breast Cancer Wisconsin (Diagnostic) dataset
# NOTE: from here on X, y, the train/test splits and `model` are rebound,
# replacing the Iris objects created for Task-1.
data = load_breast_cancer()
X, y = data.data, data.target

# Split dataset (same 80/20 split and seed as Task-1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model (a much larger max_iter than the 512 used for Iris)
model = LogisticRegression(max_iter=10240)
model.fit(X_train, y_train)

In [None]:
# Display the shapes of the breast-cancer training features and labels
X_train.shape, y_train.shape

In [None]:
# Rebuild the ART wrapper and the reconstruction attack around the newly
# fitted `model`; this rebinds non_private_art and dbrecon from Task-1
non_private_art = ScikitlearnLogisticRegression(model)
dbrecon = DatabaseReconstruction(non_private_art)

In [None]:
# Run the reconstruction attack on the breast-cancer training rows.
# NOTE: reuses the names recon_attempts / rmse_scores, overwriting the Task-1 results.
recon_attempts, rmse_scores = launch_attack(X_train, y_train, dbrecon)

In [None]:
# Print the success count and average RMSE for the breast-cancer attack
analyze_attack(recon_attempts, rmse_scores)

Reasons for Failure:
A reconstruction attempt can fail for several reasons:

Overlap between classes: If the classes are not well-separated in the feature space, removing a sample and trying to reconstruct it may result in misclassification, as the decision boundary might not be clear-cut.

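Highly correlated features: If the features are highly correlated, many different feature vectors shift the decision boundary in almost the same way, so removing one sample might not significantly affect the decision boundary and the attack cannot pin down that sample's exact values, making reconstruction more difficult. If the features are not highly correlated, removing a sample might have a larger and more distinctive impact on the decision boundary, which gives the attack more information to reconstruct it.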

Non-linear decision boundary: Logistic regression assumes a linear decision boundary. If the true decision boundary is non-linear, removing samples and attempting reconstruction may fail, as logistic regression cannot capture complex relationships between features.

Imbalanced classes: If the classes are imbalanced, removing samples from the majority class might have less impact on the decision boundary compared to removing samples from the minority class.

Noise in the data: If the dataset contains noise or outliers, removing a sample may not significantly affect the decision boundary. The trained model then carries little information about the removed sample, so reconstruction can fail even if that sample is important for classification.

It's important to consider these factors when interpreting the results of the reconstruction attack and assessing the security of the dataset.

In [None]:
# Convert to DataFrame for easier manipulation and visualization:
# one named column per feature, built from the full (unsplit) arrays X and y
df = pd.DataFrame(data=X, columns=data.feature_names)
df['target'] = y  # Adding the target variable to the DataFrame (mutates df in place)

In [None]:
# Display the first few rows of the dataframe (features plus the target column)
df.head()

In [None]:
# Summary statistics for the numerical columns; the target column is part of df, so it is summarized too
df.describe()

In [None]:
# Check for null values: per-column count of missing entries
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Countplot for the target variable: number of rows per target value,
# to see how balanced the classes are
sns.countplot(x='target', data=df)
plt.title('Distribution of Target Variable')
plt.show()

In [None]:
# Histograms for features to understand distributions
# (20 bins each; the target column is dropped so only features are plotted)
df.drop('target', axis=1).hist(bins=20, figsize=(20, 15))
plt.show()

In [None]:
# Box plots to check for outliers (feature columns only; target is dropped)
plt.figure(figsize=(20, 10))
df.drop('target', axis=1).boxplot()
# Rotate the x tick labels so the feature names stay readable
plt.xticks(rotation=45)
plt.show()

In [None]:
# Correlation matrix to understand relationships between variables.
# df.corr() is taken over the whole frame, so the target column is included;
# annot=False leaves the cells unlabeled.
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=False, cmap='coolwarm')
plt.show()

In [None]:
# Pairplot for a subset of features: the first five columns of df, colored by target
sns.pairplot(df, vars=df.columns[:5], hue='target')
plt.show()

# Task-3

In [None]:
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from art.estimators.classification.scikitlearn import ScikitlearnLogisticRegression, ScikitlearnGaussianNB

In [None]:
# Generate a synthetic dataset for a four-class classification problem:
# 1000 samples, 20 features (15 informative + 5 redundant), fixed seed.
# NOTE: rebinds X and y, replacing the breast-cancer arrays from Task-2.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, n_classes=4, random_state=42)

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of all four splits as a sanity check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Initialize the classifiers
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
gaussian_nb_model = GaussianNB()

# Train the Logistic Regression model
logistic_model.fit(X_train, y_train)

# Train the Gaussian Naive Bayes model
gaussian_nb_model.fit(X_train, y_train)

# Evaluate the training accuracy of both models.
# These scores are computed on X_train itself, so they measure fit to the
# training data, not generalization (X_test is not used here).
logistic_accuracy = logistic_model.score(X_train, y_train)
gaussian_nb_accuracy = gaussian_nb_model.score(X_train, y_train)

print(f"Logistic Regression Training Accuracy: {logistic_accuracy*100:.2f}%")
print(f"Gaussian Naive Bayes Training Accuracy: {gaussian_nb_accuracy*100:.2f}%")

In [None]:
# Attack 1: ART wrapper + reconstruction attack for the logistic regression model
non_private_art1 = ScikitlearnLogisticRegression(logistic_model)
dbrecon1 = DatabaseReconstruction(non_private_art1)

# Attack 2: ART wrapper + reconstruction attack for the Gaussian Naive Bayes model
non_private_art2 = ScikitlearnGaussianNB(gaussian_nb_model)
dbrecon2 = DatabaseReconstruction(non_private_art2)

In [None]:
def launch_attack(X_train, y_train, dbrecon1, dbrecon2=None, target_rows=None):
    """Run leave-one-out reconstruction attacks with one or two attackers.

    This definition replaces the earlier single-attacker ``launch_attack`` in
    the kernel namespace. To keep the earlier call cells working when they are
    re-run, ``dbrecon2`` and ``target_rows`` are optional and the
    single-attacker call form ``launch_attack(X_train, y_train, dbrecon)``
    returns the same two lists as before.

    For every target row, that row is deleted from the training data, each
    attacker is asked to reconstruct it from the remaining rows, and the
    guess is compared with the deleted row.

    Parameters
    ----------
    X_train : training feature array (one sample per row).
    y_train : training labels aligned with ``X_train``.
    dbrecon1 : first attack object, exposing ``reconstruct(x, y)`` that
        returns a ``(features_guess, label_guess)`` pair.
    dbrecon2 : optional second attack object with the same interface.
    target_rows : optional iterable of row indices of ``X_train`` to attack;
        ``None`` attacks every row.

    Returns
    -------
    With ``dbrecon2`` given:
        ``(attempts1, rmse1, attempts2, rmse2)``, four lists with one entry
        per attacked row (label-match flags and RMSE for each attacker).
    With ``dbrecon2`` omitted:
        ``(attempts1, rmse1)`` only.
    """
    if target_rows is None:
        target_rows = range(len(X_train))

    attackers = [dbrecon1] if dbrecon2 is None else [dbrecon1, dbrecon2]
    attempts = [[] for _ in attackers]
    rmse_scores = [[] for _ in attackers]

    for target_row in tqdm(target_rows):
        # Both attackers see the same dataset with the target row removed.
        X_train_removed = np.delete(X_train, target_row, axis=0)
        y_train_removed = np.delete(y_train, target_row, axis=0)

        for slot, attacker in enumerate(attackers):
            x_guess, y_guess = attacker.reconstruct(X_train_removed, y_train_removed)
            # The label guess is reduced with argmax before comparison
            # (presumably a one-hot / score vector; confirm against the ART docs).
            attempts[slot].append(np.argmax(y_guess) == y_train[target_row])
            rmse_scores[slot].append(get_inference_rmse(X_train, x_guess, target_row))

    if dbrecon2 is None:
        return attempts[0], rmse_scores[0]
    return attempts[0], rmse_scores[0], attempts[1], rmse_scores[1]

In [None]:
# Select 100 distinct random training rows to attack.
# The indices must be drawn from X_train, because launch_attack deletes from
# and indexes into X_train / y_train. Sampling from range(len(X)) would also
# produce indices of held-out test rows, which are out of range for X_train
# and make the attack loop fail.
np.random.seed(42)
target_rows = np.random.choice(range(len(X_train)), size=100, replace=False)

In [None]:
# Attack the selected target rows with both attackers; results come back as
# (flags, RMSE) for logistic regression followed by (flags, RMSE) for naive Bayes
lr_recon_attempts, lr_rmse_scores, nb_recon_attempts, nb_rmse_scores = launch_attack(X_train, y_train, dbrecon1, dbrecon2, target_rows)

In [None]:
# Summary for the attack against the logistic regression model
analyze_attack(lr_recon_attempts, lr_rmse_scores)

In [None]:
# Summary for the attack against the Gaussian Naive Bayes model
analyze_attack(nb_recon_attempts, nb_rmse_scores)

# Task-4

In [None]:
# !pip install diffprivlib

In [None]:
from diffprivlib.models import GaussianNB as DPGaussianNB

# Initialize the differentially private GaussianNB model
# Note: `epsilon` controls the privacy guarantee. A smaller epsilon means more privacy but potentially less accuracy.
# `data_norm` is not a parameter for DPGaussianNB, as it works differently from DPLogisticRegression.
# NOTE(review): no `bounds` argument is passed. Presumably diffprivlib then derives the
# feature bounds from the training data and raises a PrivacyLeakWarning, which the
# notebook-wide warnings.filterwarnings('ignore') would hide. TODO confirm against the diffprivlib docs.
dp_gaussian_nb_model = DPGaussianNB(epsilon=1.0)

# Train the model with differential privacy
dp_gaussian_nb_model.fit(X_train, y_train)

# Evaluate its accuracy on the test set
dp_gaussian_nb_accuracy = dp_gaussian_nb_model.score(X_test, y_test)
print(f"Differentially Private GaussianNB Test Accuracy: {dp_gaussian_nb_accuracy*100:.2f}%")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from diffprivlib.models import GaussianNB as DPGaussianNB

# Assuming X_train, X_test, y_train, y_test are already defined
# Sweep 50 epsilon values, log-spaced from 1e-2 to 1e2
epsilons = np.logspace(-2, 2, 50)
accuracies_gnb = []

# One fresh model is fitted and scored per epsilon.
# NOTE(review): each point comes from a single fit with no seed passed, so the
# curve presumably varies from run to run. TODO confirm / consider averaging repeats.
for eps in epsilons:
    dp_gnb_model = DPGaussianNB(epsilon=eps)
    dp_gnb_model.fit(X_train, y_train)
    accuracy = dp_gnb_model.score(X_test, y_test)
    accuracies_gnb.append(accuracy)

# Plot the accuracy vs epsilon for GaussianNB (log-scaled x axis)
plt.figure(figsize=(10, 6))
plt.semilogx(epsilons, accuracies_gnb, label='DP GaussianNB', color='orange')
plt.title('Differentially Private GaussianNB: Accuracy vs. Epsilon')
plt.xlabel('Epsilon')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from diffprivlib.models import LogisticRegression as DPLogisticRegression
from diffprivlib.models import GaussianNB as DPGaussianNB

# Initialize the differentially private logistic regression model
# NOTE(review): data_norm=12.0 is a hard-coded value; presumably it is the assumed
# upper bound on the norm of each row of X_train. TODO confirm that it suits this
# dataset and how diffprivlib treats rows that exceed it.
dp_logistic_model = DPLogisticRegression(epsilon=1.0, data_norm=12.0)

# Train the model with differential privacy
dp_logistic_model.fit(X_train, y_train)

# Evaluate its accuracy on the test set
dp_logistic_accuracy = dp_logistic_model.score(X_test, y_test)
print(f"Differentially Private Logistic Regression Test Accuracy: {dp_logistic_accuracy*100:.2f}%")

In [None]:
import matplotlib.pyplot as plt

# Sweep 50 epsilon values, log-spaced from 1e-2 to 1e2
# (rebinds `epsilons`; the values match the GaussianNB sweep)
epsilons = np.logspace(-2, 2, 50)
accuracies = []

# One fresh DP logistic regression is fitted and scored on the test set per epsilon,
# with the same data_norm as the single-model cell
for eps in epsilons:
    dp_model = DPLogisticRegression(epsilon=eps, data_norm=12.0)
    dp_model.fit(X_train, y_train)
    accuracy = dp_model.score(X_test, y_test)
    accuracies.append(accuracy)

# Plot the accuracy vs epsilon (log-scaled x axis)
plt.figure(figsize=(10, 6))
plt.semilogx(epsilons, accuracies)
plt.title('Accuracy vs. Epsilon')
plt.xlabel('Epsilon')
plt.ylabel('Accuracy')
plt.show()