In [1]:
import random

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    BertConfig,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

from model.bert_configs import shadow, target

SEED = 42

# Seed every random number generator used in this notebook (Python's `random`,
# NumPy's global generator, and PyTorch on CPU and on all CUDA devices) with
# the same value.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

In [2]:
# Pool the official train and test splits of Yelp Review Full, then cut the
# pool into two label-stratified halves: `dataset["train"]` is later split for
# the target model and `dataset["test"]` for the shadow models.
# `split="train+test"` returns the concatenated Dataset directly (no need to
# request a one-element list and index it), and the explicit seed makes this
# cell produce the same split every time it is re-run.
dataset = load_dataset("yelp_review_full", split="train+test").train_test_split(
    test_size=0.5, stratify_by_column="label", seed=SEED
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [3]:
# Tokenize every review to a fixed length of 512 tokens.
# `padding="max_length"` is used instead of `padding=True`: the latter pads
# only to the longest row of each `map` batch, so rows from different map
# batches could end up with different lengths and fail to stack once the data
# is shuffled/split and batched again. `return_tensors` is omitted because
# `map` stores plain lists; `set_format` below handles the tensor conversion.
tokenized_dataset = dataset.map(
    lambda x: tokenizer(
        x["text"], padding="max_length", max_length=512, truncation=True
    ),
    batched=True,
)
# Return rows as torch tensors when the dataset is indexed.
tokenized_dataset.set_format(type="torch")

In [4]:
# Split the target half into the data the target model is trained on
# (`target_subset["train"]`) and held-out data (`target_subset["test"]`).
# The explicit seed keeps the split identical when this cell is re-run,
# independent of how far the global NumPy RNG has advanced.
target_subset = tokenized_dataset["train"].train_test_split(
    test_size=0.5, stratify_by_column="label", seed=SEED
)

In [5]:
N_SHADOWS = 10

# One independent, label-stratified 50/50 split of the shadow half per shadow
# model. Each split gets its own explicit seed (SEED + i): the seeds must
# differ so the shadows do not all share one split, and being explicit makes
# the cell reproducible when re-run on its own.
shadow_subsets = [
    tokenized_dataset["test"].train_test_split(
        test_size=0.5, stratify_by_column="label", seed=SEED + i
    )
    for i in range(N_SHADOWS)
]

In [6]:
def compute_metrics(eval_preds):
    """Compute classification accuracy for a `Trainer` evaluation pass.

    Args:
        eval_preds: A pair ``(logits, labels)``. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        Whatever the ``accuracy`` metric's ``compute`` returns for the
        predicted classes against ``labels``.
    """
    # Load the metric once and cache it on the function object; evaluation is
    # run every epoch and re-loading the metric each time is wasted work.
    metric = getattr(compute_metrics, "_accuracy_metric", None)
    if metric is None:
        metric = evaluate.load("accuracy")
        compute_metrics._accuracy_metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Train Target

In [7]:
# Build the target classifier from the project-defined `target` settings.
# The model is constructed from a config object, not loaded from a checkpoint.
target_bert_config = BertConfig(**target)
target_classifier = BertForSequenceClassification(target_bert_config)

In [8]:
# Training configuration for the target model: 10 epochs, batch size 64 for
# both training and evaluation, with an evaluation pass and a checkpoint at
# the end of every epoch, and no external experiment reporting.
target_training_kwargs = dict(
    output_dir="saved_models/target",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
)
training_args = TrainingArguments(**target_training_kwargs)

# Train on the target's member half and evaluate on its held-out half.
trainer = Trainer(
    target_classifier,
    training_args,
    compute_metrics=compute_metrics,
    eval_dataset=target_subset["test"],
    train_dataset=target_subset["train"],
)

In [9]:
train_result = trainer.train()
# With save_strategy="epoch" the Trainer only writes `checkpoint-<step>/`
# sub-directories. Save the final model at the top level of the output
# directory so that `from_pretrained("saved_models/target")` can load it later.
trainer.save_model()
train_result

  0%|          | 0/27350 [00:00<?, ?it/s]

{'loss': 1.3388, 'grad_norm': 8.534919738769531, 'learning_rate': 4.9085923217550275e-05, 'epoch': 0.18}
{'loss': 1.0463, 'grad_norm': 18.86072540283203, 'learning_rate': 4.817184643510055e-05, 'epoch': 0.37}
{'loss': 1.0083, 'grad_norm': 6.08780574798584, 'learning_rate': 4.725776965265082e-05, 'epoch': 0.55}
{'loss': 0.9996, 'grad_norm': 5.470260143280029, 'learning_rate': 4.63436928702011e-05, 'epoch': 0.73}
{'loss': 0.973, 'grad_norm': 11.012569427490234, 'learning_rate': 4.542961608775137e-05, 'epoch': 0.91}


  0%|          | 0/2735 [00:00<?, ?it/s]

# Train Shadows

In [None]:
# Train N_SHADOWS shadow classifiers. All of them share the project-defined
# `shadow` configuration but each one is trained and evaluated on its own
# split of the shadow data (`shadow_subsets[i]`).
shadow_bert_config = BertConfig(**shadow)
for i in range(N_SHADOWS):
    # A new model is constructed from the config for every shadow.
    shadow_classifier = BertForSequenceClassification(config=shadow_bert_config)
    training_args = TrainingArguments(
        output_dir=f"saved_models/shadow_{i}",
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to="none",
    )
    trainer = Trainer(
        model=shadow_classifier,
        args=training_args,
        train_dataset=shadow_subsets[i]["train"],
        eval_dataset=shadow_subsets[i]["test"],
        compute_metrics=compute_metrics,
    )
    trainer.train()
    # Epoch checkpoints land in `checkpoint-<step>/` sub-directories only.
    # Save the final model at the top level of the output directory so that
    # `from_pretrained(f"saved_models/shadow_{i}")` works afterwards.
    trainer.save_model()

# Train Attacker

## Dataset Creation

In [None]:
import os

from transformers import AutoModelForSequenceClassification
from torch.nn import functional as F
from torch.utils.data import DataLoader

# Columns that are passed to the model. The formatted dataset rows also carry
# the raw "text" and the "label" column, which must not be forwarded as
# keyword arguments to the model call.
MODEL_INPUT_COLUMNS = ("input_ids", "token_type_ids", "attention_mask")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

attack_dataset_X = []  # shadow model prediction scores
attack_dataset_y = []  # membership / non-membership

for i in range(N_SHADOWS):
    shadow_model = AutoModelForSequenceClassification.from_pretrained(
        f"saved_models/shadow_{i}"
    ).to(device)
    shadow_model.eval()
    # Rows shadow i was trained on are members (1); its held-out rows are
    # non-members (0). Members are processed first, then non-members.
    for subset, is_member in (
        (shadow_subsets[i]["train"], 1.0),
        (shadow_subsets[i]["test"], 0.0),
    ):
        for batch in DataLoader(subset, batch_size=100):
            model_inputs = {
                name: batch[name].to(device)
                for name in MODEL_INPUT_COLUMNS
                if name in batch
            }
            with torch.inference_mode():
                logits = shadow_model(**model_inputs)["logits"]
            probs = F.softmax(logits, dim=-1).cpu().numpy()
            attack_dataset_X.append(probs)
            attack_dataset_y.append(np.full(len(probs), is_member))

# Each entry of attack_dataset_X is (batch_size, n_classes): stack the batches
# along the row axis so every example is one row of class probabilities.
# (hstack would glue batches side by side and fail on a smaller final batch.)
attack_X_train = np.vstack(attack_dataset_X)
attack_y_train = np.hstack(attack_dataset_y)

# Make sure the target directory exists before writing the archive.
os.makedirs("saved_datasets", exist_ok=True)
np.savez_compressed(
    "saved_datasets/attack_train_dataset",
    attack_X_train=attack_X_train,
    attack_y_train=attack_y_train,
)

## Training

In [None]:
import os

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Binary attacker: predicts member (1) vs non-member (0) from the shadow
# models' class-probability vectors.
base_xgb_attacker = XGBClassifier(objective="binary:logistic", eval_metric="auc")
gridsearch_clf = GridSearchCV(
    base_xgb_attacker,
    {
        "max_depth": [1, 2, 3, 4, 5],
        "n_estimators": [2, 5, 10, 50],
    },
    # Select hyper-parameters by ROC AUC, the metric this notebook reports for
    # the attacker; without `scoring` the search ranks candidates by accuracy.
    scoring="roc_auc",
    verbose=1,
)

gridsearch_clf.fit(attack_X_train, attack_y_train)
best_xgb_attacker = gridsearch_clf.best_estimator_

# Make sure the target directory exists before writing the model file.
os.makedirs("saved_models/attacker", exist_ok=True)
best_xgb_attacker.save_model("saved_models/attacker/XGB_attacker.json")

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the selected attacker on the same data it was fitted on, using
# the predicted probability of class 1 (member) as the score.
attack_train_member_scores = best_xgb_attacker.predict_proba(attack_X_train)[:, 1]
roc_auc_score(attack_y_train, attack_train_member_scores)

# Inference

## Test Dataset Creation

In [None]:
import os

from transformers import AutoModelForSequenceClassification
from torch.nn import functional as F
from torch.utils.data import DataLoader

# Columns that are passed to the model. The formatted dataset rows also carry
# the raw "text" and the "label" column, which must not be forwarded as
# keyword arguments to the model call.
MODEL_INPUT_COLUMNS = ("input_ids", "token_type_ids", "attention_mask")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

attack_dataset_X = []  # target model prediction scores
attack_dataset_y = []  # membership / non-membership

target_model = AutoModelForSequenceClassification.from_pretrained(
    "saved_models/target"
).to(device)
target_model.eval()
# Rows the target was trained on are members (1); its held-out rows are
# non-members (0). Members are processed first, then non-members.
for subset, is_member in (
    (target_subset["train"], 1.0),
    (target_subset["test"], 0.0),
):
    for batch in DataLoader(subset, batch_size=100):
        model_inputs = {
            name: batch[name].to(device)
            for name in MODEL_INPUT_COLUMNS
            if name in batch
        }
        with torch.inference_mode():
            logits = target_model(**model_inputs)["logits"]
        probs = F.softmax(logits, dim=-1).cpu().numpy()
        attack_dataset_X.append(probs)
        attack_dataset_y.append(np.full(len(probs), is_member))

# Each entry of attack_dataset_X is (batch_size, n_classes): stack the batches
# along the row axis so every example is one row of class probabilities.
# (hstack would glue batches side by side and fail on a smaller final batch.)
attack_X_test = np.vstack(attack_dataset_X)
attack_y_test = np.hstack(attack_dataset_y)

# Make sure the target directory exists before writing the archive.
os.makedirs("saved_datasets", exist_ok=True)
np.savez_compressed(
    "saved_datasets/attack_test_dataset",
    attack_X_test=attack_X_test,
    attack_y_test=attack_y_test,
)

## Run Attack