#### unzip data from tar.gz

In [2]:

!tar -xzvf "/content/drive/MyDrive/Python/Diplomatiki/MIA_purchase_dataset/dataset_purchase.tgz" -C "/content/"

!mv "/content/dataset_purchase" "/content/dataset_purchase.csv"


dataset_purchase


#### For purposes of reproducability set fixed random seeds for everything

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR
from tqdm.notebook import tqdm
import numpy as np
import random
import torch
import os

def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

### Data
Purchase Data

Each record consists
of 600 binary features. Each feature corresponds to a product
and represents whether the user has purchased it or not

In [None]:
# code block for unlearning

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# load a portion of the data for speed purposes
df = pd.read_csv('/content/dataset_purchase.csv')

set_seed(0)

target_dataset_head_rows = 50_000

train_df, _ = train_test_split(df.head(target_dataset_head_rows), test_size=0.2)

retain_df, forget_df = train_test_split(train_df, test_size=0.1)

# target_model_df = retain_df.copy()

# df.head(5)

print(target_model_df.shape)
print(retain_df.shape, forget_df.shape)
forget_df

In [8]:
import pandas as pd
import numpy as np

set_seed(42)
# load a portion of the data for speed purposes
df = pd.read_csv('/content/dataset_purchase.csv')
target_dataset_head_rows = 45_000

target_model_df = df.head(target_dataset_head_rows).copy()

# df.head(5)
print(target_model_df.shape)
target_model_df

Random seed set as 42
(45000, 601)


Unnamed: 0,63,0,1,0.1,1.1,0.2,0.3,1.2,1.3,0.4,...,0.406,0.407,0.408,0.409,1.184,0.410,0.411,0.412,0.413,0.414
0,70,0,1,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,67,0,1,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,55,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,16,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,84,0,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,6,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44996,34,0,1,0,1,1,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
44997,23,0,1,0,1,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
44998,88,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


### Split the Data

In [9]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

set_seed(42)

# drop target column (here we use the 1st column)
X = target_model_df.drop(columns=target_model_df.columns[0], axis=1)

# WARNING: because the classes are from 1 to 100 you will get an error of
# IndexError: Target 100 is out of bounds.
# In order to bring your labels to range of 0 to 99 use the follwing expression
# which subtracts 1 from every value in every row
y = target_model_df.iloc[:, 0] - 1

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.shape, X_test.shape)


# # plot the occurances of each class in the dataset

# y.max()
# plt.figure(figsize=(18, 10))  # Adjust the values (width, height) as per your requirement

# counts = y.value_counts()
# counts.plot(kind='bar')
# plt.show()
X_test

Random seed set as 42
(36000, 600) (9000, 600)


Unnamed: 0,0,1,0.1,1.1,0.2,0.3,1.2,1.3,0.4,0.5,...,0.406,0.407,0.408,0.409,1.184,0.410,0.411,0.412,0.413,0.414
37979,0,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9911,0,1,1,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
43386,0,1,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
13822,0,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44810,0,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14572,0,1,0,1,1,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
30384,0,1,1,1,0,0,1,1,0,1,...,1,0,0,1,0,0,0,0,0,1
36665,0,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
30569,0,0,0,1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0


## Train the Target Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ExponentialLR
from tqdm.notebook import tqdm

class TargetModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(TargetModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.tanh = nn.Tanh()
        self.output = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.tanh(x)
        x = self.output(x)
        return x

def train_target_nn(model, train_loader, val_loader, criterion, optimizer, epochs, patience, early_stopping = True):

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in tqdm(range(epochs), desc='Epochs'):
        model.train()
        running_loss =  0.0

        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        epoch_loss = running_loss / len(train_loader)
        current_lr = scheduler.get_last_lr()[0]

        scheduler.step()

        if early_stopping:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for inputs, labels in val_loader:
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()
            val_loss /= len(val_loader)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("\nEarly stopping triggered.")
                    break
            print(f'\rTrain Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}, Learning Rate: {current_lr:.8f}', end='', flush=True)
        else:
            print(f'\rTrain Loss: {epoch_loss:.4f}, Learning Rate: {current_lr:.8f}', end='', flush=True)



if __name__ == "__main__":
    set_seed(42)

    # Set the parameters
    n_features = X_train.shape[1]
    n_classes = len(set(y_train))

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

    hidden_size = 128
    learning_rate = 0.01
    lr_decay = 1e-7
    epochs = 150
    batch_size = 32
    early_stoping_patience = 3
    early_stopping = False

    train_data = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    test_data = TensorDataset(X_test_tensor, y_test_tensor)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

    target_model = TargetModel(n_features, hidden_size, n_classes)

    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(target_model.parameters(), lr=learning_rate, momentum=0.9)
    optimizer = optim.SGD(target_model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-7)
    scheduler = ExponentialLR(optimizer, gamma= 1 -  lr_decay)

    train_target_nn(target_model, train_loader, test_loader, criterion,
                     optimizer, epochs, early_stoping_patience, early_stopping)


Random seed set as 42


Epochs:   0%|          | 0/150 [00:00<?, ?it/s]

Train Loss: 0.0034, Learning Rate: 0.00999985

## Evaluate Target Model

##### test stored models

In [None]:
# test code for loading and evaluating saved models
# FOLDER_NAME = 'target_model_pd_2b'

# folder_path = f'/content/drive/MyDrive/Python/Diplomatiki/MIA_purchase_dataset/{FOLDER_NAME}'
# target_model = torch.jit.load(f'{folder_path}/target_model.pt')

# X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
# X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)
# learning_rate = 0.01
# lr_decay = 1e-7


# criterion = nn.CrossEntropyLoss()
# # optimizer = optim.SGD(target_model.parameters(), lr=learning_rate, momentum=0.9)
# optimizer = optim.SGD(target_model.parameters(), lr=learning_rate, momentum=0.9)
# scheduler = ExponentialLR(optimizer, gamma= 1 -  lr_decay)

##### eval model

In [None]:
import torch
import numpy as np
from sklearn.metrics import precision_score, recall_score

target_model.to(X_test_tensor.device)

target_model.eval()

with torch.no_grad():
    target_train_outputs = target_model(X_train_tensor)
    target_train_loss = criterion(target_train_outputs, y_train_tensor)
    target_train_predictions = torch.argmax(target_train_outputs, dim=1)
    target_train_accuracy = torch.sum(target_train_predictions == y_train_tensor).item() / len(y_train_tensor)

print(f'Training Loss: {target_train_loss.item():.4f}, Training Accuracy: {target_train_accuracy:.4f}')

with torch.no_grad():
    target_test_outputs = target_model(X_test_tensor)
    target_test_loss = criterion(target_test_outputs, y_test_tensor)

    target_test_predictions = torch.argmax(target_test_outputs, dim=1).cpu().numpy()
    y_test_array = y_test_tensor.cpu().numpy()

    # Calculate accuracy
    target_test_accuracy = (target_test_predictions == y_test_array).mean()

    # Calculate precision and recall
    precision = precision_score(y_test_array, target_test_predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test_array, target_test_predictions, average='weighted', zero_division=0)

print(f'Test Loss: {target_test_loss.item():.4f}, Accuracy: {target_test_accuracy:.4f}')
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}')


Training Loss: 0.0033, Training Accuracy: 1.0000
Test Loss: 0.3679, Accuracy: 0.8834
Precision: 0.8837, Recall: 0.8834


#### Save Target Model

In [None]:
target_model.eval()

typical_input = next(iter(train_loader))[0]  # Get a typical input tensor
traced_model = torch.jit.trace(target_model, typical_input)

# folder_name = f'target_model_with_{target_dataset_head_rows}_data'
OUTPUT_FOLDER_NAME = f'target_model_pd_1a_150_000_rows'

folder_path = f"/content/drive/MyDrive/Python/Diplomatiki/MIA_purchase_dataset/{OUTPUT_FOLDER_NAME}"
os.makedirs(folder_path, exist_ok=True)
torch.jit.save(traced_model, folder_path + "/target_model.pt")

X_train.to_csv(folder_path + "/train_dataset.csv", index=False)
# X_test.to_csv(folder_path + "/test_dataset.csv", index=False)

# save only the indexes of the rows used for training
# indexes = X_train.index.tolist()
# with open(folder_path + '/train_rows.txt', 'w') as file:
#     for idx in indexes:
#         file.write(str(idx) + '\n')

print('model and it\'s train/test datasets are saved on folder ', OUTPUT_FOLDER_NAME)

model and it's train/test datasets are saved on folder  target_model_pd_1a_150_000_rows


In [None]:
# code block for unlearning
set_seed(42)
target_model.eval()

typical_input = next(iter(train_loader))[0]  # Get a typical input tensor
traced_model = torch.jit.trace(target_model, typical_input)

# folder_name = f'target_model_with_{target_dataset_head_rows}_data'
OUTPUT_FOLDER_NAME = f'Unlearning/full_data_target_model_seed_150'

folder_path = f"/content/drive/MyDrive/Python/Diplomatiki/MIA_purchase_dataset/{OUTPUT_FOLDER_NAME}"
os.makedirs(folder_path, exist_ok=True)
torch.jit.save(traced_model, folder_path + "/full_data_target_model_seed_150.pt")

X_train.to_csv(folder_path + "/train_dataset.csv", index=False)

X_forget = forget_df.drop(columns=forget_df.columns[0], axis=1)

# X_forget.to_csv(folder_path + "/forget_dataset.csv", index=False)
# X_test.to_csv(folder_path + "/test_dataset.csv", index=False)

# save only the indexes of the rows used for training
# indexes = X_train.index.tolist()
# with open(folder_path + '/train_rows.txt', 'w') as file:
#     for idx in indexes:
#         file.write(str(idx) + '\n')

print('model and it\'s train/test datasets are saved on folder ', OUTPUT_FOLDER_NAME)
print(forget_df.shape, X_forget.shape )

model and it's train/test datasets are saved on folder  Unlearning/full_data_target_model_seed_150
(4000, 601) (4000, 600)
