说明：这个notebook演示了如何使用quantification方法（目前实现了4种方法进行隐私量化）
1. dFIL (batchsize = 1)
2. distance correlation (batchsize>=2)
3. mutual information (batchsize>=8)
4. ULoss (batchsize = 1)

注意：切换不同方法时需要重新设置批大小（即 args 中的批大小设置，如 'oneData_bs'、'test_bs'、'train_bs'）。

因为在整个测试集上进行隐私量化耗时太长（可能要跑好几天），所以这里设计了一个 get_one_data() 函数：取测试集的前 k 个数据作为一个数据集，并令 batch_size=k，因此只需要迭代一次。

In [1]:
# 导包
import torch
import os
import argparse
import pandas as pd
import tqdm
import numpy as np
from torch.nn.functional import avg_pool2d
# os.environ['NUMEXPR_MAX_THREADS'] = '48'


# 导入各个指标
import sys
sys.path.append('/home/dengruijun/data/FinTech/PP-Split/')
from ppsplit.quantification.distance_correlation.distCor import distCorMetric
from ppsplit.quantification.fisher_information.dFIL_inverse import dFILInverseMetric
from ppsplit.quantification.shannon_information.mutual_information import MuInfoMetric
from ppsplit.quantification.shannon_information.ULoss import ULossMetric
from ppsplit.quantification.rep_reading.rep_reader import PCA_Reader


# 模型、数据集获取
from target_model.task_select import get_dataloader_and_model,get_dataloader_and_model, get_dataloader,get_models, get_infotopo_para

# utils
from ppsplit.utils.utils import create_dir

In [9]:
'''
Author: Ruijun Deng
Date: 2024-08-24 00:41:30
LastEditTime: 2024-08-28 05:27:58
LastEditors: Ruijun Deng
FilePath: /PP-Split/examples/InvMetrics/quantification.ipynb
Description: 
'''
# Experiment configuration shared by the cells below.
# Every cell reads its settings from this single `args` dict.
args = {
    # Hardware: first CUDA device when one is available, otherwise the CPU.
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # Dataset name. Other values the author listed: 'bank', 'credit',
    # 'purchase', 'Iris'.
    'dataset': 'CIFAR10',
    # Model name. 'ResNet18' is the other listed option.
    'model': 'VGG5',
    # Results sub-directory. '20240702-FIL/' is the other listed option.
    'result_dir': 'InvMetric-202403/',
    # Batch sizes.
    'oneData_bs': 500,
    'test_bs': 500,
    'train_bs': 1,
    'noise_scale': 0,  # defence measure
    'split_layer': 5,
    # Listed metric names: MI, invdFIL, distCor, ULoss.
    # Author's note: split layers [2,3,5,7,9,11] for ResNet18.
    'test_num': 'MI',
    'no_dense': True,
    # Read by the RepE reader cells as args['num_pairs']; without this key
    # those cells fail with KeyError. 10000 is the value used by the earlier
    # RepE configuration.
    'num_pairs': 10000,
}

print(args['device'])
print(args)

cuda:1


# 数据集及其模型加载

In [10]:
# Build the three project bundles (data loaders, models, infotopo settings)
# from the shared configuration.
data_msg = get_dataloader(args)
model_msg = get_models(args)
infotopo_msg = get_infotopo_para(args)

# Merge into one lookup; on duplicate keys the later bundle wins
# (model, then data, then infotopo).
msg = {}
for bundle in (model_msg, data_msg, infotopo_msg):
    msg.update(bundle)

# Data loaders
one_data_loader = data_msg['one_data_loader']
trainloader = data_msg['trainloader']
testloader = data_msg['testloader']

# infotopo flag; the mutual-information cell uses it to decide whether the
# inputs are average-pooled first.
conv = msg['conv']

# Models and paths
client_net = model_msg['client_net']
decoder_net = model_msg['decoder_net']
decoder_route = model_msg['decoder_route']
image_deprocess = model_msg['image_deprocess']

split_layer = args['split_layer']
results_dir = model_msg['results_dir']
inverse_dir = f"{results_dir}layer{split_layer}/"
data_type = int(args['dataset'] == 'CIFAR10')  # 1 for CIFAR10, 0 otherwise

for caption, location in (
    ('results_dir:', results_dir),
    ('inverse_dir:', inverse_dir),
    ('decoder_route:', decoder_route),
):
    print(caption, location)

create_dir(results_dir)

# Move the client-side network to the target device and switch it to
# inference mode. The bare `.eval()` stays last so the notebook still displays
# the module.
client_net = client_net.to(args['device'])
client_net.eval()

features.0.weight
features.0.bias
features.1.weight
features.1.bias
features.1.running_mean
features.1.running_var
features.1.num_batches_tracked
train decoder model...
results_dir: ../../results/InvMetric-202403//VGG5/MI/
inverse_dir: ../../results/InvMetric-202403//VGG5/MI/layer1/
decoder_route: ../../results/InvMetric-202403//VGG5/MI//Decoder-layer1.pth


VGG(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Tanh()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (denses): Sequential()
)

# 各种指标计算

## 1.dFIL-inverse
注意：batchsize 需要等于1

In [32]:
# Inverse dFIL metric

metric = dFILInverseMetric()
eta_same_layer_list = []  # one entry per batch at the current split layer
eta_diff_layer_list = []  # one inner list per split layer (a single one here)

# Only `one_data_loader` is scored here; iterate `testloader` instead to cover
# the whole test set.
for j, (inputs, labels) in enumerate(tqdm.tqdm(one_data_loader)):
    inputs = inputs.to(args['device'])
    labels = labels.to(args['device'])
    inputs.requires_grad_(True)  # gradients w.r.t. the inputs are needed

    # inference
    outputs = client_net(inputs)

    eta_same_layer_list.append(
        metric.quantify(model=client_net, inputs=inputs, outputs=outputs, with_outputs=True)
    )
eta_diff_layer_list.append(eta_same_layer_list)

# Save to CSV: after the transpose each row is one record and each column is
# one layer.
matrix = np.array(eta_diff_layer_list)
transpose = matrix.T
dfil_frame = pd.DataFrame(data=transpose, columns=[split_layer])
dfil_frame.to_csv(results_dir + f'dFIL-layer{split_layer}.csv', index=False)


100%|██████████| 1/1 [00:00<00:00,  6.44it/s]


## 2. distance correlation
注意：batchsize >=2 

In [None]:
# Distance-correlation metric between the raw inputs and the client outputs

metric = distCorMetric()
distCorr_same_layer_list = []  # one value per batch at the current split layer
distCorr_diff_layer_list = []  # one inner list per split layer

# Only `one_data_loader` is scored here; iterate `testloader` instead to cover
# the whole test set.
with torch.no_grad():
    for j, (tab, labels) in enumerate(tqdm.tqdm(one_data_loader)):
        tab = tab.to(args['device'])
        labels = labels.to(args['device'])

        # The metric receives CPU copies of both tensors.
        pred = client_net(tab).cpu().detach()
        inputs = tab.cpu().detach()

        distCorr = metric.quantify(inputs=inputs, outputs=pred)  # x, z
        distCorr_same_layer_list.append(distCorr)

avg_distCorr = sum(distCorr_same_layer_list) / len(distCorr_same_layer_list)
print(f"Layer {args['split_layer']} Avg distCorr: {avg_distCorr}")
distCorr_diff_layer_list.append(distCorr_same_layer_list)

# Save to CSV: after the transpose each row is one record and each column is
# one layer.
matrix = np.array(distCorr_diff_layer_list)
transpose = matrix.T
dloss_csv = results_dir + 'DLoss.csv'
pd.DataFrame(data=transpose, columns=[args['split_layer']]).to_csv(dloss_csv, index=False)
print(dloss_csv)

## 3. mutual information
注意：batchsize>=8

In [11]:
# Mutual-information metric between the inputs and the client outputs

MI_diff_layer_list = []  # one inner list per split layer
MI_same_layer_list = []  # one value per batch at the current split layer
metric = MuInfoMetric()
avg_MI = []  # NOTE(review): never filled in this cell

# Only `one_data_loader` is scored here; iterate `testloader` instead to cover
# the whole test set.
for j, data in enumerate(tqdm.tqdm(one_data_loader)):
    images, labels = data
    images, labels = images.to(args['device']), labels.to(args['device'])
    with torch.no_grad():
        # inference
        if conv:
            # Down-sample with 4x4 average pooling.
            # NOTE(review): the pooled tensor is also what is fed to
            # client_net below - confirm this is intended.
            print('images: ', images.shape)
            images = avg_pool2d(images, kernel_size=4)
            print('images_pooled: ', images.shape)

        outputs = client_net(images).clone().detach()
        inputs = images.cpu().detach()
        mi = metric.quantify(inputs=inputs, outputs=outputs)
        MI_same_layer_list.append(mi)

print(f"Layer {args['split_layer']} MI: {sum(MI_same_layer_list)/len(MI_same_layer_list)}")
MI_diff_layer_list.append(MI_same_layer_list)


# Save to CSV: after the transpose each row is one record and each column is
# one layer.
matrix = np.array(MI_diff_layer_list)
transpose = matrix.T
save_route = results_dir + 'MI.csv'

# pd.read_csv always returns string column labels. Using the raw (int) split
# layer as the key would therefore append a second column with the same
# printed name instead of overwriting the existing one when a layer is re-run.
# A string label in both branches keeps the file to one column per layer.
layer_column = str(args['split_layer'])
if os.path.exists(save_route):
    # Add or overwrite this layer's column next to the existing ones.
    df = pd.read_csv(save_route)
    df[layer_column] = transpose
    df.to_csv(save_route, index=False)
else:
    pd.DataFrame(data=transpose, columns=[layer_column]).to_csv(save_route, index=False)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [10:49<00:00, 649.47s/it]

Layer 1 MI: 878.9487438395895





## 4. Uncertainty Loss
注意：batchsize=1

In [9]:
# Uncertainty-loss (ULoss) metric

ULoss_diff_layer_list = []  # one inner list per split layer
ULoss_same_layer_list = []  # one value per batch at the current split layer
metric = ULossMetric()
# map_location remaps the stored tensors onto the configured device, so a
# decoder saved on another CUDA index (or on a GPU-only machine) still loads.
decoder_net = torch.load(decoder_route, map_location=args['device'])
decoder_net.to(args['device'])
decoder_net.eval()


# Only `one_data_loader` is scored here; iterate `testloader` instead to cover
# the whole test set.
for j, data in enumerate(tqdm.tqdm(one_data_loader)):
    images, labels = data
    images, labels = images.to(args['device']), labels.to(args['device'])
    with torch.no_grad():
        # inference
        outputs = client_net(images).clone().detach()
        uloss = metric.quantify(output = outputs, decoder_net=decoder_net)
        ULoss_same_layer_list.append(uloss)

print(f"Layer {split_layer} ULoss: {sum(ULoss_same_layer_list)/len(ULoss_same_layer_list)}")
ULoss_diff_layer_list.append(ULoss_same_layer_list)


# Save to CSV: after the transpose each row is one record and each column is
# one layer.
matrix = np.array(ULoss_diff_layer_list)
transpose = matrix.T
pd.DataFrame(data=transpose, columns=[split_layer]).to_csv(results_dir + f'ULoss.csv',index=False)


FileNotFoundError: [Errno 2] No such file or directory: '../../results/InvMetric-202403//Iris/MI//Decoder-layer3.pth'

## 5. RepE Reader
对 batch size 没有要求。

1 格式化检查

2 实例化一个finder

3 模型推理数据得到hidden state（中途有些处理）

4 训练pca得到direction

5 转换，nparray转换成浮点数

6 如果有train label，就get sign一下

评估维度性，对label的贡献（偏泄漏隐私，还是偏不泄漏隐私）

directionality 反映的是表示相对于泄漏方向的偏移，以及它与真实标签的关系？

7 分析测试数据的隐私泄漏程度（在每一层的隐私泄漏程度）


In [None]:
# 导包
from target_model.data_preprocessing.dataset import pair_smashed_data,diff_pair_data
from target_model.data_preprocessing.preprocess_cifar10 import get_cifar10_normalize_two_train
import random
import time
import pickle
from torch.utils.data import DataLoader

In [None]:
# 1. designing stimulus and test
# The paired dataset is cached as four pickle files under `dataset_route`.
# They are loaded when present; otherwise the pairs are built and then written.
dataset_route = f"../results/{args['result_dir']}/VGG5/quantification/{args['num_pairs']}pairs/"


def _read_pickle(path):
    """Return the object unpickled from the file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(file=handle)


def _write_pickle(obj, path):
    """Pickle *obj* into the file at *path*."""
    with open(path, 'wb') as handle:
        pickle.dump(obj=obj, file=handle)


if os.path.isfile(dataset_route + 'train_feature.pkl'):
    # Cached copy found: load the pre-processed dataset directly.
    print(f"=> loading paired dataset from {dataset_route}")
    train_feature = _read_pickle(dataset_route + 'train_feature.pkl')
    test_feature = _read_pickle(dataset_route + 'test_feature.pkl')
    train_labels = _read_pickle(dataset_route + 'train_label.pkl')
    test_labels = _read_pickle(dataset_route + 'test_label.pkl')
    train_loader = DataLoader(train_feature, shuffle=False, batch_size=1)
    test_loader = DataLoader(test_feature, shuffle=False, batch_size=1)
else:
    # No cache yet: pre-process, then store the result for the next run.
    seen_loader, unseen_loader, _ = get_cifar10_normalize_two_train(batch_size=1)

    train_loader, train_labels, test_loader, test_labels = pair_smashed_data(
        seen_loader,
        unseen_loader,
        num_pairs=args['num_pairs'],
    )
    create_dir(dataset_route)
    _write_pickle(train_loader.dataset, dataset_route + 'train_feature.pkl')
    _write_pickle(test_loader.dataset, dataset_route + 'test_feature.pkl')
    _write_pickle(train_labels, dataset_route + 'train_label.pkl')
    _write_pickle(test_labels, dataset_route + 'test_label.pkl')

print(train_labels[0])
print(test_labels[0].index(1))


In [None]:
# 2. collecting neural activity

# #Picking the top X probabilities 
def clipDataTopX(dataToClip, top=3):
    """Return the `top` largest values of every row, in descending order.

    Args:
        dataToClip: 2-D tensor; the ranking runs along dim 1.
        top: Number of leading values kept per row (default 3). When it
            exceeds the row width, the whole row is returned sorted.

    Returns:
        Tensor with as many rows as `dataToClip` and at most `top` columns.
    """
    # Slice by `top`: a fixed `:3` here would make the parameter a no-op.
    sorted_indices = torch.argsort(dataToClip, dim=1, descending=True)[:, :top]
    return torch.gather(dataToClip, 1, sorted_indices)

# Collect the client-side output ("smashed data") for every item of the
# training loader.
i = 1  # NOTE(review): not read anywhere in this cell
train_smashed_data_list = []
with torch.no_grad():
    for j, data in enumerate(tqdm.tqdm(train_loader)):
        train_smashed_data_list.append(client_net(data.to(args['device'])))

# Stack the per-item outputs, drop size-1 dims, then flatten each item into a
# row of a 2-D [n_items, vector_size] matrix.
stacked = torch.stack(train_smashed_data_list).squeeze()
flattened = stacked.reshape(stacked.shape[0], -1)
train_smashed_data_list = clipDataTopX(flattened, top=10)

# Relative distance (original author's note: the result is an np.array).
diff_data = diff_pair_data(train_smashed_data_list)
print("diff_data.shape: ", diff_data.shape)

In [None]:

# 3. constructing a linear model
# Fit the direction finder. Original author's open question: does it need
# NumPy input, or are tensors accepted?
reader = PCA_Reader(n_components=1)
directions = reader.get_rep_direction(diff_data)
signs = reader.get_sign(
    hidden_states=train_smashed_data_list,
    train_labels=train_labels,
)
for caption, value in (
    ('direction shape of first layer: ', reader.direction.shape),
    ('signs of first layer: ', reader.direction_signs),
):
    print(caption, value)


In [None]:
# 4. Test
# Collect the client-side output for every item of the test loader.
test_smashed_data_list = []
for j, data in enumerate(tqdm.tqdm(test_loader)):  # iterate over the test loader
    features = data.to(args['device'])
    with torch.no_grad():
        pred = client_net(features)
        test_smashed_data_list.append(pred)

# Same post-processing as for the training data: stack, drop size-1 dims,
# flatten each item into a row, then apply clipDataTopX with top=10.
test_smashed_data_list = torch.stack(test_smashed_data_list).squeeze()
test_smashed_data_list = test_smashed_data_list.reshape(test_smashed_data_list.shape[0], -1)
test_smashed_data_list = clipDataTopX(test_smashed_data_list, top=10)
acc = reader.quantify_acc(hidden_states=test_smashed_data_list, test_labels=test_labels)
print(f"quantified accuracy(privacy leakage): {acc} ")

In [None]:
# Scratch cell: small experiments with lists of tensors.
x = [torch.Tensor([1, 2]), torch.Tensor([3, 4])]
y = [torch.Tensor([5, 6]), torch.Tensor([7, 8])]
l = [x, y]
print(l)

# Stack every inner list into a single tensor.
l1 = list(map(torch.stack, l))
print(l1)

# Flatten the nested list by one level.
l2 = []
for sublist in l:
    l2.extend(sublist)
print(l2)

# list[tensor] -> tensor
tensor_list = [torch.tensor(row) for row in ([1, 2, 3], [4, 5, 6], [7, 8, 9])]
tensor_stack = torch.stack(tensor_list)
print(tensor_stack)

# Element-wise comparison of two tensors
x, y = torch.Tensor([1, 2]), torch.Tensor([3, 4])
print(x == y)
row_view = x.reshape(1, -1)  # reshape returns a new tensor; x itself is unchanged
print(row_view)